diff --git a/app.py b/app.py
index 41271739..98e3bd0d 100644
--- a/app.py
+++ b/app.py
@@ -405,6 +405,16 @@ async def lifespan(app: FastAPI):
except Exception as e:
logger.warning(f"BackgroundJob recovery failed (non-critical): {e}")
+ # Subscribe knowledge ingestion to connection lifecycle events so OAuth
+ # connect/disconnect reliably trigger bootstrap/purge.
+ try:
+ from modules.serviceCenter.services.serviceKnowledge.subConnectorIngestConsumer import (
+ registerKnowledgeIngestionConsumer,
+ )
+ registerKnowledgeIngestionConsumer()
+ except Exception as e:
+ logger.warning(f"KnowledgeIngestionConsumer registration failed (non-critical): {e}")
+
yield
# --- Stop Managers ---
@@ -672,6 +682,9 @@ app.include_router(navigationRouter)
from modules.routes.routeWorkflowDashboard import router as workflowDashboardRouter
app.include_router(workflowDashboardRouter)
+from modules.routes.routeAutomationWorkspace import router as automationWorkspaceRouter
+app.include_router(automationWorkspaceRouter)
+
# ============================================================================
# PLUG&PLAY FEATURE ROUTERS
# Dynamically load routers from feature containers in modules/features/
diff --git a/modules/connectors/providerClickup/connectorClickup.py b/modules/connectors/providerClickup/connectorClickup.py
index f8b4fae1..10517db2 100644
--- a/modules/connectors/providerClickup/connectorClickup.py
+++ b/modules/connectors/providerClickup/connectorClickup.py
@@ -210,6 +210,9 @@ class ClickupListsAdapter(ServiceAdapter):
data = await self._svc.getTask(task_id)
if isinstance(data, dict) and data.get("error"):
return json.dumps(data).encode("utf-8")
+ returnedId = data.get("id", "") if isinstance(data, dict) else ""
+ if returnedId and returnedId != task_id:
+ logger.warning(f"ClickUp download: requested task_id={task_id} but API returned id={returnedId}")
payload = json.dumps(data, indent=2).encode("utf-8")
return DownloadResult(data=payload, fileName=f"task-{task_id}.json", mimeType="application/json")
diff --git a/modules/connectors/providerMsft/connectorMsft.py b/modules/connectors/providerMsft/connectorMsft.py
index bf290eca..49f6fdaa 100644
--- a/modules/connectors/providerMsft/connectorMsft.py
+++ b/modules/connectors/providerMsft/connectorMsft.py
@@ -126,6 +126,11 @@ def _stripGraphBase(url: str) -> str:
def _graphItemToExternalEntry(item: Dict[str, Any], basePath: str = "") -> ExternalEntry:
isFolder = "folder" in item
+ # Graph exposes two change tags on a driveItem: ``eTag`` (often quoted; covers
+ # metadata and content) and ``cTag`` (content only). We take the first one
+ # present as a "revision" string so callers can use it as a ``contentVersion``
+ # for idempotent ingestion without re-downloading file bytes.
+ revision = item.get("eTag") or item.get("cTag")
return ExternalEntry(
name=item.get("name", ""),
path=f"{basePath}/{item.get('name', '')}" if basePath else item.get("name", ""),
@@ -137,6 +142,9 @@ def _graphItemToExternalEntry(item: Dict[str, Any], basePath: str = "") -> Exter
"id": item.get("id"),
"webUrl": item.get("webUrl"),
"childCount": item.get("folder", {}).get("childCount") if isFolder else None,
+ "revision": revision,
+ "lastModifiedDateTime": item.get("lastModifiedDateTime"),
+ "parentReference": item.get("parentReference", {}),
},
)
@@ -167,21 +175,36 @@ class SharepointAdapter(_GraphApiMixin, ServiceAdapter):
return await self._discoverSites()
if not folderPath or folderPath == "/":
- endpoint = f"sites/{siteId}/drive/root/children"
+ endpoint: Optional[str] = f"sites/{siteId}/drive/root/children?$top=200"
else:
cleanPath = folderPath.lstrip("/")
- endpoint = f"sites/{siteId}/drive/root:/{cleanPath}:/children"
+ endpoint = f"sites/{siteId}/drive/root:/{cleanPath}:/children?$top=200"
- result = await self._graphGet(endpoint)
- if "error" in result:
- logger.warning(f"SharePoint browse failed: {result['error']}")
- return []
+ # Follow @odata.nextLink so large libraries are enumerated beyond the first
+ # page (required for bootstrap); paging stops at ``limit`` or once the hard cap
+ # is reached. A large ``$top`` page size keeps the number of round-trips low.
+ effectiveLimit = int(limit) if limit is not None else None
+ items: List[Dict[str, Any]] = []
+ hardCap = 5000
+ while endpoint and len(items) < hardCap:
+ result = await self._graphGet(endpoint)
+ if "error" in result:
+ logger.warning(f"SharePoint browse failed: {result['error']}")
+ break
+ for raw in result.get("value", []) or []:
+ items.append(raw)
+ if effectiveLimit is not None and len(items) >= effectiveLimit:
+ break
+ if effectiveLimit is not None and len(items) >= effectiveLimit:
+ break
+ nextLink = result.get("@odata.nextLink")
+ endpoint = _stripGraphBase(nextLink) if nextLink else None
- entries = [_graphItemToExternalEntry(item, path) for item in result.get("value", [])]
+ entries = [_graphItemToExternalEntry(item, path) for item in items]
if filter:
entries = [e for e in entries if _matchFilter(e, filter)]
- if limit is not None:
- entries = entries[: max(1, int(limit))]
+ if effectiveLimit is not None:
+ entries = entries[: max(1, effectiveLimit)]
return entries
async def _discoverSites(self) -> List[ExternalEntry]:
diff --git a/modules/datamodels/datamodelAi.py b/modules/datamodels/datamodelAi.py
index cfc10db2..786eea7d 100644
--- a/modules/datamodels/datamodelAi.py
+++ b/modules/datamodels/datamodelAi.py
@@ -162,6 +162,7 @@ class AiCallOptions(BaseModel):
# Provider filtering (from UI multiselect or automation config)
allowedProviders: Optional[List[str]] = Field(default=None, description="List of allowed AI providers to use (empty = all RBAC-permitted)")
+ allowedModels: Optional[List[str]] = Field(default=None, description="Whitelist of allowed model names (AND-filter with allowedProviders). None/empty = all allowed.")
class AiCallRequest(BaseModel):
diff --git a/modules/datamodels/datamodelDocref.py b/modules/datamodels/datamodelDocref.py
index 27ba5e2b..f4ce09aa 100644
--- a/modules/datamodels/datamodelDocref.py
+++ b/modules/datamodels/datamodelDocref.py
@@ -110,11 +110,13 @@ class DocumentReferenceList(BaseModel):
# docItem:documentId
references.append(DocumentItemReference(documentId=parts[0]))
- # Unknown format - skip or log warning
else:
- # Try to parse as simple string (backward compatibility)
- # Assume it's a label if it doesn't match known patterns
- if refStr:
+ if not refStr:
+ continue
+ import re
+ if re.match(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', refStr, re.I):
+ references.append(DocumentItemReference(documentId=refStr))
+ else:
references.append(DocumentListReference(label=refStr))
return cls(references=references)
@@ -153,9 +155,12 @@ def coerceDocumentReferenceList(value: Any) -> DocumentReferenceList:
return coerceDocumentReferenceList(value[innerKey])
docId = value.get("documentId") or value.get("id")
if docId:
+ docIdStr = str(docId)
+ if docIdStr.startswith("docItem:") or docIdStr.startswith("docList:"):
+ return DocumentReferenceList.from_string_list([docIdStr])
return DocumentReferenceList(references=[
DocumentItemReference(
- documentId=str(docId),
+ documentId=docIdStr,
fileName=value.get("fileName") or value.get("name"),
)
])
@@ -178,10 +183,15 @@ def coerceDocumentReferenceList(value: Any) -> DocumentReferenceList:
continue
docId = item.get("documentId") or item.get("id")
if docId:
- references.append(DocumentItemReference(
- documentId=str(docId),
- fileName=item.get("fileName") or item.get("name"),
- ))
+ docIdStr = str(docId)
+ if docIdStr.startswith("docItem:") or docIdStr.startswith("docList:"):
+ parsed = DocumentReferenceList.from_string_list([docIdStr])
+ references.extend(parsed.references)
+ else:
+ references.append(DocumentItemReference(
+ documentId=docIdStr,
+ fileName=item.get("fileName") or item.get("name"),
+ ))
elif item.get("label"):
references.append(DocumentListReference(
label=str(item["label"]),
diff --git a/modules/datamodels/datamodelExtraction.py b/modules/datamodels/datamodelExtraction.py
index 0aaaffd8..38fd1d27 100644
--- a/modules/datamodels/datamodelExtraction.py
+++ b/modules/datamodels/datamodelExtraction.py
@@ -95,7 +95,14 @@ class ExtractionOptions(BaseModel):
imageQuality: int = Field(default=85, ge=1, le=100, description="Image quality (1-100)")
# Merging strategy
- mergeStrategy: MergeStrategy = Field(default_factory=MergeStrategy, description="Strategy for merging extraction results")
+ mergeStrategy: Optional[MergeStrategy] = Field(
+ default_factory=MergeStrategy,
+ description=(
+ "Strategy for merging extraction results. Pass None to skip merging entirely "
+ "(required for per-chunk ingestion pipelines like RAG, where per-page/per-section "
+ "granularity must be preserved for embedding)."
+ ),
+ )
# Optional chunking parameters (for backward compatibility)
chunkAllowed: Optional[bool] = Field(default=None, description="Whether chunking is allowed")
diff --git a/modules/datamodels/datamodelFileFolder.py b/modules/datamodels/datamodelFileFolder.py
deleted file mode 100644
index 4829385e..00000000
--- a/modules/datamodels/datamodelFileFolder.py
+++ /dev/null
@@ -1,82 +0,0 @@
-# Copyright (c) 2025 Patrick Motsch
-# All rights reserved.
-"""FileFolder: hierarchical folder structure for file organization."""
-
-from typing import Optional
-from pydantic import BaseModel, Field
-from modules.datamodels.datamodelBase import PowerOnModel
-from modules.shared.i18nRegistry import i18nModel
-import uuid
-
-
-@i18nModel("Dateiordner")
-class FileFolder(PowerOnModel):
- """Hierarchischer Ordner fuer die Dateiverwaltung."""
- id: str = Field(
- default_factory=lambda: str(uuid.uuid4()),
- description="Primary key",
- json_schema_extra={"label": "ID", "frontend_type": "text", "frontend_readonly": True, "frontend_required": False},
- )
- name: str = Field(
- description="Folder name",
- json_schema_extra={"label": "Name", "frontend_type": "text", "frontend_readonly": False, "frontend_required": True},
- )
- parentId: Optional[str] = Field(
- default=None,
- description="Parent folder ID (null = root)",
- json_schema_extra={
- "label": "Uebergeordneter Ordner",
- "frontend_type": "text",
- "frontend_readonly": False,
- "frontend_required": False,
- "fk_target": {"db": "poweron_management", "table": "FileFolder", "labelField": "name"},
- },
- )
- mandateId: Optional[str] = Field(
- default=None,
- description="Mandate context",
- json_schema_extra={
- "label": "Mandanten-ID",
- "frontend_type": "text",
- "frontend_readonly": True,
- "frontend_required": False,
- "fk_target": {"db": "poweron_app", "table": "Mandate", "labelField": "label"},
- },
- )
- featureInstanceId: Optional[str] = Field(
- default=None,
- description="Feature instance context",
- json_schema_extra={
- "label": "Feature-Instanz-ID",
- "frontend_type": "text",
- "frontend_readonly": True,
- "frontend_required": False,
- "fk_target": {"db": "poweron_app", "table": "FeatureInstance", "labelField": "label"},
- },
- )
- scope: str = Field(
- default="personal",
- description="Data visibility scope: personal, featureInstance, mandate, global. Inherited by files in this folder.",
- json_schema_extra={
- "label": "Sichtbarkeit",
- "frontend_type": "select",
- "frontend_readonly": False,
- "frontend_required": False,
- "frontend_options": [
- {"value": "personal", "label": "Persönlich"},
- {"value": "featureInstance", "label": "Feature-Instanz"},
- {"value": "mandate", "label": "Mandant"},
- {"value": "global", "label": "Global"},
- ],
- },
- )
- neutralize: bool = Field(
- default=False,
- description="Whether files in this folder should be neutralized before AI processing. Inherited by new/moved files.",
- json_schema_extra={
- "label": "Neutralisieren",
- "frontend_type": "checkbox",
- "frontend_readonly": False,
- "frontend_required": False,
- },
- )
diff --git a/modules/datamodels/datamodelFiles.py b/modules/datamodels/datamodelFiles.py
index 82628e0c..6adf6642 100644
--- a/modules/datamodels/datamodelFiles.py
+++ b/modules/datamodels/datamodelFiles.py
@@ -10,6 +10,69 @@ import uuid
import base64
+@i18nModel("Ordner")
+class FileFolder(PowerOnModel):
+ """Persistenter Datei-Ordner im Management-DB-Kontext (RBAC wie FileItem)."""
+
+ id: str = Field(
+ default_factory=lambda: str(uuid.uuid4()),
+ description="Primary key",
+ json_schema_extra={"label": "ID", "frontend_type": "text", "frontend_readonly": True, "frontend_required": False},
+ )
+ name: str = Field(
+ description="Display name of the folder",
+ json_schema_extra={"label": "Name", "frontend_type": "text", "frontend_readonly": False, "frontend_required": True},
+ )
+ parentId: Optional[str] = Field(
+ default=None,
+ description="Parent folder id; empty or None for root",
+ json_schema_extra={
+ "label": "Uebergeordneter Ordner",
+ "frontend_type": "text",
+ "frontend_readonly": False,
+ "frontend_required": False,
+ "fk_target": {"db": "poweron_management", "table": "FileFolder", "labelField": "name"},
+ },
+ )
+ mandateId: Optional[str] = Field(
+ default="",
+ description="ID of the mandate this folder belongs to",
+ json_schema_extra={
+ "label": "Mandant",
+ "frontend_type": "text",
+ "frontend_readonly": True,
+ "frontend_required": False,
+ "fk_target": {"db": "poweron_app", "table": "Mandate", "labelField": "label"},
+ },
+ )
+ featureInstanceId: Optional[str] = Field(
+ default="",
+ description="ID of the feature instance this folder belongs to",
+ json_schema_extra={
+ "label": "Feature-Instanz",
+ "frontend_type": "text",
+ "frontend_readonly": True,
+ "frontend_required": False,
+ "fk_target": {"db": "poweron_app", "table": "FeatureInstance", "labelField": "label"},
+ },
+ )
+ scope: str = Field(
+ default="personal",
+ description="Data visibility scope: personal, featureInstance, mandate, global",
+ json_schema_extra={"label": "Sichtbarkeit", "frontend_type": "select", "frontend_readonly": False, "frontend_required": False, "frontend_options": [
+ {"value": "personal", "label": "Persönlich"},
+ {"value": "featureInstance", "label": "Feature-Instanz"},
+ {"value": "mandate", "label": "Mandant"},
+ {"value": "global", "label": "Global"},
+ ]},
+ )
+ neutralize: bool = Field(
+ default=False,
+ description="Whether files in this folder should be neutralized before AI processing",
+ json_schema_extra={"label": "Neutralisieren", "frontend_type": "checkbox", "frontend_readonly": False, "frontend_required": False},
+ )
+
+
@i18nModel("Datei")
class FileItem(PowerOnModel):
"""Metadaten einer gespeicherten Datei."""
@@ -44,6 +107,17 @@ class FileItem(PowerOnModel):
"fk_target": {"db": "poweron_app", "table": "FeatureInstance", "labelField": "label"},
},
)
+ folderId: Optional[str] = Field(
+ default=None,
+ description="ID of the folder containing this file (if any)",
+ json_schema_extra={
+ "label": "Ordner",
+ "frontend_type": "text",
+ "frontend_readonly": False,
+ "frontend_required": False,
+ "fk_target": {"db": "poweron_management", "table": "FileFolder", "labelField": "name"},
+ },
+ )
mimeType: str = Field(
description="MIME type of the file",
json_schema_extra={"label": "MIME-Typ", "frontend_type": "text", "frontend_readonly": True, "frontend_required": False},
@@ -68,17 +142,6 @@ class FileItem(PowerOnModel):
description="Tags for categorization and search",
json_schema_extra={"label": "Tags", "frontend_type": "tags", "frontend_readonly": False, "frontend_required": False},
)
- folderId: Optional[str] = Field(
- default=None,
- description="ID of the parent folder",
- json_schema_extra={
- "label": "Ordner-ID",
- "frontend_type": "text",
- "frontend_readonly": False,
- "frontend_required": False,
- "fk_target": {"db": "poweron_management", "table": "FileFolder", "labelField": "name"},
- },
- )
description: Optional[str] = Field(
default=None,
description="User-provided description of the file",
diff --git a/modules/datamodels/datamodelJson.py b/modules/datamodels/datamodelJson.py
index 784cc042..0228fbad 100644
--- a/modules/datamodels/datamodelJson.py
+++ b/modules/datamodels/datamodelJson.py
@@ -6,7 +6,7 @@ Unified JSON document schema and helpers used by both generation prompts and ren
This defines a single canonical template and the supported section types.
"""
-from typing import List
+from typing import List, Literal, TypedDict
# Canonical list of supported section types across the system
supportedSectionTypes: List[str] = [
@@ -18,6 +18,21 @@ supportedSectionTypes: List[str] = [
"image",
]
+class InlineRun(TypedDict, total=False):
+ """Single inline content run. Every paragraph/cell/list-item is a List[InlineRun]."""
+ type: Literal["text", "image", "link", "bold", "italic", "code"]
+ value: str # text content (for text/bold/italic/code/link-label)
+ fileId: str # for type=image: reference to FileItem
+ base64Data: str # for type=image: resolved base64 (post-processing)
+ mimeType: str # for type=image: e.g. "image/png"
+ widthPt: int # for type=image: optional render width
+ href: str # for type=link: URL target
+
+supportedInlineRunTypes: List[str] = [
+ "text", "image", "link", "bold", "italic", "code",
+]
+
+
# Canonical JSON template used for AI generation (documents array + sections)
# This template is used for STRUCTURE generation - sections have empty elements arrays.
# For content generation, elements arrays will be populated later.
diff --git a/modules/datamodels/datamodelKnowledge.py b/modules/datamodels/datamodelKnowledge.py
index 163328a4..d0af2216 100644
--- a/modules/datamodels/datamodelKnowledge.py
+++ b/modules/datamodels/datamodelKnowledge.py
@@ -90,6 +90,16 @@ class FileContentIndex(PowerOnModel):
description="Data visibility scope: personal, featureInstance, mandate, global",
json_schema_extra={"label": "Sichtbarkeit"},
)
+ sourceKind: str = Field(
+ default="file",
+ description="Origin of the indexed content: file, sharepoint_item, outlook_message, outlook_attachment, ...",
+ json_schema_extra={"label": "Quellenart"},
+ )
+ connectionId: Optional[str] = Field(
+ default=None,
+ description="UserConnection ID if this index entry originates from an external connector",
+ json_schema_extra={"label": "Connection-ID"},
+ )
neutralizationStatus: Optional[str] = Field(
default=None,
description="Neutralization status: completed, failed, skipped, None = not required",
diff --git a/modules/datamodels/datamodelPagination.py b/modules/datamodels/datamodelPagination.py
index 2719327b..7bda7717 100644
--- a/modules/datamodels/datamodelPagination.py
+++ b/modules/datamodels/datamodelPagination.py
@@ -13,6 +13,42 @@ import math
T = TypeVar('T')
+# ---------------------------------------------------------------------------
+# Table Grouping models
+# ---------------------------------------------------------------------------
+
+class TableGroupNode(BaseModel):
+ """
+ A single node in a user-defined group tree for a FormGeneratorTable.
+
+ Items belong to exactly one group (no multi-membership).
+ Groups can be nested to arbitrary depth via subGroups.
+ """
+ id: str
+ name: str
+ itemIds: List[str] = Field(default_factory=list)
+ subGroups: List['TableGroupNode'] = Field(default_factory=list)
+ order: int = 0
+ isExpanded: bool = True
+
+TableGroupNode.model_rebuild()
+
+
+class TableGrouping(BaseModel):
+ """
+ Persisted grouping configuration for one (user, contextKey) pair.
+ Stored in table_groupings in poweron_app (auto-created).
+
+ contextKey convention: API path without /api/ prefix and without trailing slash.
+ Examples: "connections", "prompts", "admin/users", "trustee/{instanceId}/documents"
+ """
+ id: str
+ userId: str
+ contextKey: str
+ rootGroups: List[TableGroupNode] = Field(default_factory=list)
+ updatedAt: Optional[float] = None
+
+
class SortField(BaseModel):
"""
Single sort field configuration.
@@ -24,12 +60,23 @@ class SortField(BaseModel):
class PaginationParams(BaseModel):
"""
Complete pagination state including page, sorting, and filters.
+
+ Grouping extensions (both optional — omit when not using grouping):
+ groupId — Scope the request to items belonging to this group.
+ The backend resolves it to an itemIds IN-filter before
+ applying normal pagination/search/filter logic.
+ Also applied for mode=ids and mode=filterValues so that
+ bulk-select and filter-dropdowns respect the group scope.
+ saveGroupTree — If present the backend persists this tree for the current
+ (user, contextKey) pair *before* fetching, then returns
+ the confirmed tree in the response groupTree field.
+ Omit on every request that does not change the group tree.
"""
page: int = Field(ge=1, description="Current page number (1-based)")
pageSize: int = Field(ge=1, le=1000, description="Number of items per page")
sort: List[SortField] = Field(default_factory=list, description="List of sort fields in priority order")
filters: Optional[Dict[str, Any]] = Field(
- default=None,
+ default=None,
description="""Filter criteria dictionary. Supports:
- General search: {"search": "text"} - searches across all text fields (case-insensitive)
- Field-specific filters:
@@ -38,6 +85,14 @@ class PaginationParams(BaseModel):
- Supported operators: equals/eq, contains, startsWith, endsWith, gt, gte, lt, lte, in, notIn
- Multiple filters are combined with AND logic"""
)
+ groupId: Optional[str] = Field(
+ default=None,
+ description="Scope request to items of this group (resolved server-side to itemIds IN-filter)",
+ )
+ saveGroupTree: Optional[List[Dict[str, Any]]] = Field(
+ default=None,
+ description="If set, persist this group tree before fetching (optimistic save)",
+ )
class PaginationRequest(BaseModel):
@@ -74,10 +129,19 @@ class PaginationMetadata(BaseModel):
class PaginatedResponse(BaseModel, Generic[T]):
"""
Response containing paginated data and metadata.
+
+ groupTree is included when the endpoint supports table grouping and the
+ current user has a saved group tree for the requested contextKey.
+ It is None when grouping is not configured for the endpoint or the user
+ has not created any groups yet. Frontend must treat None as an empty tree.
"""
items: List[T] = Field(..., description="Array of items for current page")
pagination: Optional[PaginationMetadata] = Field(..., description="Pagination metadata (None if pagination not applied)")
-
+ groupTree: Optional[List[TableGroupNode]] = Field(
+ default=None,
+ description="Current group tree for this (user, contextKey) pair — None if no grouping configured",
+ )
+
model_config = ConfigDict(arbitrary_types_allowed=True)
@@ -85,29 +149,33 @@ def normalize_pagination_dict(pagination_dict: Dict[str, Any]) -> Dict[str, Any]
"""
Normalize pagination dictionary to handle frontend variations.
Moves top-level "search" field into filters if present.
-
+ Grouping fields (groupId, saveGroupTree) are passed through as-is.
+
Args:
pagination_dict: Raw pagination dictionary from frontend
-
+
Returns:
Normalized pagination dictionary ready for PaginationParams parsing
"""
if not pagination_dict:
return pagination_dict
-
+
# Create a copy to avoid modifying the original
normalized = dict(pagination_dict)
-
+
# Ensure required fields have sensible defaults
if "page" not in normalized:
normalized["page"] = 1
if "pageSize" not in normalized:
normalized["pageSize"] = 25
-
+
# Move top-level "search" into filters if present
if "search" in normalized:
if "filters" not in normalized or normalized["filters"] is None:
normalized["filters"] = {}
normalized["filters"]["search"] = normalized.pop("search")
-
+
+ # groupId / saveGroupTree are valid PaginationParams fields — pass through unchanged.
+ # No transformation needed; Pydantic will validate them.
+
return normalized
diff --git a/modules/datamodels/datamodelUam.py b/modules/datamodels/datamodelUam.py
index 0f7fe6b8..6aba24eb 100644
--- a/modules/datamodels/datamodelUam.py
+++ b/modules/datamodels/datamodelUam.py
@@ -475,7 +475,23 @@ class UserConnection(PowerOnModel):
description="OAuth scopes granted for this connection",
json_schema_extra={"frontend_type": "list", "frontend_readonly": True, "frontend_required": False, "label": "Gewährte Berechtigungen"},
)
-
+ knowledgeIngestionEnabled: bool = Field(
+ default=False,
+ description="Whether the user has consented to knowledge ingestion for this connection",
+ json_schema_extra={"frontend_type": "boolean", "frontend_readonly": False, "frontend_required": False, "label": "Wissensdatenbank aktiv"},
+ )
+ knowledgePreferences: Optional[Dict[str, Any]] = Field(
+ default=None,
+ description=(
+ "Per-connection knowledge ingestion preferences. schemaVersion=1 keys: "
+ "neutralizeBeforeEmbed (bool), mailContentDepth (metadata|snippet|full), "
+ "mailIndexAttachments (bool), filesIndexBinaries (bool), mimeAllowlist (list[str]), "
+ "clickupScope (titles|title_description|with_comments), "
+ "surfaceToggles (dict per authority), maxAgeDays (int)."
+ ),
+ json_schema_extra={"frontend_type": "json", "frontend_readonly": False, "frontend_required": False, "label": "Wissenspräferenzen"},
+ )
+
@computed_field
@property
def connectionReference(self) -> str:
diff --git a/modules/features/commcoach/serviceCommcoachIndexer.py b/modules/features/commcoach/serviceCommcoachIndexer.py
index b43764a1..2f042795 100644
--- a/modules/features/commcoach/serviceCommcoachIndexer.py
+++ b/modules/features/commcoach/serviceCommcoachIndexer.py
@@ -174,14 +174,26 @@ async def indexSessionData(
for c in chunks
]
- await knowledgeService.indexFile(
- fileId=syntheticFileId,
- fileName=f"coaching-session-{sessionId[:8]}",
- mimeType="application/x-coaching-session",
- userId=userId,
- featureInstanceId=featureInstanceId,
- mandateId=mandateId,
- contentObjects=contentObjects,
+ from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
+
+ await knowledgeService.requestIngestion(
+ IngestionJob(
+ sourceKind="coaching_session",
+ sourceId=syntheticFileId,
+ fileName=f"coaching-session-{sessionId[:8]}",
+ mimeType="application/x-coaching-session",
+ userId=userId,
+ featureInstanceId=featureInstanceId,
+ mandateId=mandateId,
+ contentObjects=contentObjects,
+ provenance={
+ "lane": "feature",
+ "feature": "commcoach",
+ "sessionId": sessionId,
+ "contextId": contextId,
+ "messageCount": len(messages or []),
+ },
+ )
)
logger.info(f"Successfully indexed coaching session {sessionId} ({len(chunks)} chunks)")
except Exception as e:
diff --git a/modules/features/graphicalEditor/datamodelFeatureGraphicalEditor.py b/modules/features/graphicalEditor/datamodelFeatureGraphicalEditor.py
index 5ebf629e..10d1f47f 100644
--- a/modules/features/graphicalEditor/datamodelFeatureGraphicalEditor.py
+++ b/modules/features/graphicalEditor/datamodelFeatureGraphicalEditor.py
@@ -72,7 +72,7 @@ class AutoWorkflow(PowerOnModel):
},
)
featureInstanceId: str = Field(
- description="Feature instance ID",
+ description="Feature instance ID (GE owner instance / RBAC scope)",
json_schema_extra={
"frontend_type": "text",
"frontend_readonly": True,
@@ -81,6 +81,17 @@ class AutoWorkflow(PowerOnModel):
"fk_target": {"db": "poweron_app", "table": "FeatureInstance", "labelField": "label"},
},
)
+ targetFeatureInstanceId: Optional[str] = Field(
+ default=None,
+ description="Target feature instance for execution data scope. NULL for templates, mandatory for non-templates.",
+ json_schema_extra={
+ "frontend_type": "select",
+ "frontend_readonly": False,
+ "frontend_required": False,
+ "label": "Ziel-Instanz",
+ "fk_target": {"db": "poweron_app", "table": "FeatureInstance", "labelField": "label"},
+ },
+ )
label: str = Field(
description="User-friendly workflow name",
json_schema_extra={"frontend_type": "text", "frontend_required": True, "label": "Bezeichnung"},
diff --git a/modules/features/graphicalEditor/interfaceFeatureGraphicalEditor.py b/modules/features/graphicalEditor/interfaceFeatureGraphicalEditor.py
index c84db9d3..3b665981 100644
--- a/modules/features/graphicalEditor/interfaceFeatureGraphicalEditor.py
+++ b/modules/features/graphicalEditor/interfaceFeatureGraphicalEditor.py
@@ -217,6 +217,8 @@ class GraphicalEditorObjects:
data["id"] = str(uuid.uuid4())
data["mandateId"] = self.mandateId
data["featureInstanceId"] = self.featureInstanceId
+ if not data.get("targetFeatureInstanceId") and not data.get("isTemplate"):
+ data["targetFeatureInstanceId"] = self.featureInstanceId
if "active" not in data or data.get("active") is None:
data["active"] = True
data["invocations"] = normalize_invocations_list(data.get("invocations"))
diff --git a/modules/features/graphicalEditor/nodeDefinitions/ai.py b/modules/features/graphicalEditor/nodeDefinitions/ai.py
index d0e0eb22..65e97654 100644
--- a/modules/features/graphicalEditor/nodeDefinitions/ai.py
+++ b/modules/features/graphicalEditor/nodeDefinitions/ai.py
@@ -3,6 +3,15 @@
from modules.shared.i18nRegistry import t
+_AI_COMMON_PARAMS = [
+ {"name": "requireNeutralization", "type": "boolean", "required": False,
+ "frontendType": "checkbox", "default": False,
+ "description": t("Eingaben fuer diesen Call neutralisieren")},
+ {"name": "allowedModels", "type": "array", "required": False,
+ "frontendType": "modelMultiSelect", "default": [],
+ "description": t("Erlaubte LLM-Modelle (leer = alle erlaubten)")},
+]
+
AI_NODES = [
{
"id": "ai.prompt",
@@ -10,16 +19,21 @@ AI_NODES = [
"label": t("Prompt"),
"description": t("Prompt eingeben und KI führt aus"),
"parameters": [
- {"name": "aiPrompt", "type": "string", "required": True, "frontendType": "textarea",
+ {"name": "aiPrompt", "type": "string", "required": True, "frontendType": "templateTextarea",
"description": t("KI-Prompt")},
{"name": "resultType", "type": "string", "required": False, "frontendType": "select",
"frontendOptions": {"options": ["txt", "json", "md", "csv", "xml", "html", "pdf", "docx", "xlsx", "pptx", "png", "jpg"]},
"description": t("Ausgabeformat"), "default": "txt"},
- {"name": "documentList", "type": "string", "required": False, "frontendType": "hidden",
- "description": t("Dokumentenliste (via Wire oder DataRef)"), "default": ""},
+ {"name": "documentList", "type": "DocumentList", "required": False, "frontendType": "dataRef",
+ "description": t("Dokumentenliste (Upstream-Output binden)"), "default": ""},
+ {"name": "context", "type": "string", "required": False, "frontendType": "dataRef",
+ "description": t("Kontextdaten fuer den Prompt (Upstream-Output binden)"), "default": ""},
+ {"name": "documentTheme", "type": "string", "required": False, "frontendType": "select",
+ "frontendOptions": {"options": ["general", "finance", "legal", "technical", "hr"]},
+ "description": t("Dokument-Thema (Style-Hinweis fuer den Renderer)"), "default": "general"},
{"name": "simpleMode", "type": "boolean", "required": False, "frontendType": "checkbox",
"description": t("Einfacher Modus"), "default": True},
- ],
+ ] + _AI_COMMON_PARAMS,
"inputs": 1,
"outputs": 1,
"inputPorts": {0: {"accepts": [
@@ -38,7 +52,7 @@ AI_NODES = [
"parameters": [
{"name": "prompt", "type": "string", "required": True, "frontendType": "textarea",
"description": t("Recherche-Anfrage")},
- ],
+ ] + _AI_COMMON_PARAMS,
"inputs": 1,
"outputs": 1,
"inputPorts": {0: {"accepts": ["Transit"]}},
@@ -53,12 +67,12 @@ AI_NODES = [
"label": t("Dokument zusammenfassen"),
"description": t("Dokumentinhalt zusammenfassen"),
"parameters": [
- {"name": "documentList", "type": "string", "required": True, "frontendType": "hidden",
- "description": t("Dokumentenliste (via Wire oder DataRef)"), "default": ""},
+ {"name": "documentList", "type": "DocumentList", "required": True, "frontendType": "dataRef",
+ "description": t("Dokumentenliste (Upstream-Output binden)"), "default": ""},
{"name": "summaryLength", "type": "string", "required": False, "frontendType": "select",
"frontendOptions": {"options": ["brief", "medium", "detailed"]},
"description": t("Kurz, mittel oder ausführlich"), "default": "medium"},
- ],
+ ] + _AI_COMMON_PARAMS,
"inputs": 1,
"outputs": 1,
"inputPorts": {0: {"accepts": ["DocumentList", "Transit"]}},
@@ -73,11 +87,11 @@ AI_NODES = [
"label": t("Dokument übersetzen"),
"description": t("Dokument in Zielsprache übersetzen"),
"parameters": [
- {"name": "documentList", "type": "string", "required": True, "frontendType": "hidden",
- "description": t("Dokumentenliste (via Wire oder DataRef)"), "default": ""},
+ {"name": "documentList", "type": "DocumentList", "required": True, "frontendType": "dataRef",
+ "description": t("Dokumentenliste (Upstream-Output binden)"), "default": ""},
{"name": "targetLanguage", "type": "string", "required": True, "frontendType": "text",
"description": t("Zielsprache (z.B. de, en, French)")},
- ],
+ ] + _AI_COMMON_PARAMS,
"inputs": 1,
"outputs": 1,
"inputPorts": {0: {"accepts": ["DocumentList", "Transit"]}},
@@ -92,12 +106,12 @@ AI_NODES = [
"label": t("Dokument konvertieren"),
"description": t("Dokument in anderes Format konvertieren"),
"parameters": [
- {"name": "documentList", "type": "string", "required": True, "frontendType": "hidden",
- "description": t("Dokumentenliste (via Wire oder DataRef)"), "default": ""},
+ {"name": "documentList", "type": "DocumentList", "required": True, "frontendType": "dataRef",
+ "description": t("Dokumentenliste (Upstream-Output binden)"), "default": ""},
{"name": "targetFormat", "type": "string", "required": True, "frontendType": "select",
"frontendOptions": {"options": ["docx", "pdf", "xlsx", "csv", "txt", "html", "json", "md"]},
"description": t("Zielformat")},
- ],
+ ] + _AI_COMMON_PARAMS,
"inputs": 1,
"outputs": 1,
"inputPorts": {0: {"accepts": ["DocumentList", "Transit"]}},
@@ -114,7 +128,7 @@ AI_NODES = [
"parameters": [
{"name": "prompt", "type": "string", "required": True, "frontendType": "textarea",
"description": t("Generierungs-Prompt")},
- ],
+ ] + _AI_COMMON_PARAMS,
"inputs": 1,
"outputs": 1,
"inputPorts": {0: {"accepts": ["Transit"]}},
@@ -134,7 +148,7 @@ AI_NODES = [
{"name": "resultType", "type": "string", "required": False, "frontendType": "select",
"frontendOptions": {"options": ["py", "js", "ts", "html", "java", "cpp", "txt", "json", "csv", "xml"]},
"description": t("Datei-Endung der erzeugten Code-Datei"), "default": "py"},
- ],
+ ] + _AI_COMMON_PARAMS,
"inputs": 1,
"outputs": 1,
"inputPorts": {0: {"accepts": ["Transit"]}},
@@ -154,7 +168,7 @@ AI_NODES = [
"description": t("Konsolidierungsmodus"), "default": "summarize"},
{"name": "prompt", "type": "string", "required": False, "frontendType": "textarea",
"description": t("Optionaler Prompt für die Konsolidierung"), "default": ""},
- ],
+ ] + _AI_COMMON_PARAMS,
"inputs": 1,
"outputs": 1,
"inputPorts": {0: {"accepts": ["AggregateResult", "Transit"]}},
diff --git a/modules/features/graphicalEditor/nodeDefinitions/email.py b/modules/features/graphicalEditor/nodeDefinitions/email.py
index 11ff9895..270b8d63 100644
--- a/modules/features/graphicalEditor/nodeDefinitions/email.py
+++ b/modules/features/graphicalEditor/nodeDefinitions/email.py
@@ -62,7 +62,7 @@ EMAIL_NODES = [
{"name": "connectionReference", "type": "string", "required": True, "frontendType": "userConnection",
"frontendOptions": {"authority": "msft"},
"description": t("E-Mail-Konto")},
- {"name": "context", "type": "string", "required": False, "frontendType": "textarea",
+ {"name": "context", "type": "string", "required": False, "frontendType": "templateTextarea",
"description": t("Kontext / Brief-Beschreibung für die KI-Komposition"), "default": ""},
{"name": "to", "type": "string", "required": False, "frontendType": "text",
"description": t("Empfänger (komma-separiert, optional für Entwurf)"), "default": ""},
diff --git a/modules/features/graphicalEditor/portTypes.py b/modules/features/graphicalEditor/portTypes.py
index e8d5b48d..f1513f9e 100644
--- a/modules/features/graphicalEditor/portTypes.py
+++ b/modules/features/graphicalEditor/portTypes.py
@@ -83,7 +83,7 @@ PORT_TYPE_CATALOG: Dict[str, PortSchema] = {
PortField(name="listId", type="str", description="ClickUp-Listen-ID"),
PortField(name="name", type="str", required=False, description="Listenname"),
PortField(name="spaceId", type="str", required=False, description="Space-ID"),
- PortField(name="folderId", type="str", required=False, description="Ordner-ID"),
+ PortField(name="groupId", type="str", required=False, description="Gruppen-ID für die Gruppierungszuordnung"),
PortField(name="connection", type="ConnectionRef", required=False,
description="ClickUp-Verbindung"),
]),
diff --git a/modules/features/graphicalEditor/routeFeatureGraphicalEditor.py b/modules/features/graphicalEditor/routeFeatureGraphicalEditor.py
index aed94a68..4748f39a 100644
--- a/modules/features/graphicalEditor/routeFeatureGraphicalEditor.py
+++ b/modules/features/graphicalEditor/routeFeatureGraphicalEditor.py
@@ -111,6 +111,44 @@ def _validateInstanceAccess(instanceId: str, context: RequestContext) -> str:
return str(instance.mandateId) if instance.mandateId else ""
+def _validateTargetInstance(
+ workflowData: Dict[str, Any],
+ ownerInstanceId: str,
+ context: RequestContext,
+) -> None:
+ """Enforce targetFeatureInstanceId rules for non-template workflows.
+
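+ - Templates (isTemplate=True) are not checked and may omit targetFeatureInstanceId.
+ - Non-templates without a targetFeatureInstanceId are currently accepted unchanged; a missing value is NOT rejected here.
+ - If the targetFeatureInstanceId differs from the GE owner instance, the target instance must exist (else HTTP 400)
+ and the user must have an enabled FeatureAccess on that target instance (else HTTP 403).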
+ """
+ if workflowData.get("isTemplate"):
+ return
+
+ targetId = workflowData.get("targetFeatureInstanceId")
+ if not targetId:
+ return
+
+ if targetId == ownerInstanceId:
+ return
+
+ from modules.interfaces.interfaceDbApp import getRootInterface
+ rootInterface = getRootInterface()
+ targetInstance = rootInterface.getFeatureInstance(targetId)
+ if not targetInstance:
+ raise HTTPException(
+ status_code=400,
+ detail=routeApiMsg("targetFeatureInstanceId refers to a non-existent feature instance"),
+ )
+ targetAccess = rootInterface.getFeatureAccess(str(context.user.id), targetId)
+ if not targetAccess or not targetAccess.enabled:
+ raise HTTPException(
+ status_code=403,
+ detail=routeApiMsg("Access denied to target feature instance"),
+ )
+
+
@router.get("/{instanceId}/node-types")
@limiter.limit("60/minute")
def get_node_types(
@@ -318,9 +356,12 @@ async def post_execute(
workflowId = body.get("workflowId")
req_nodes = graph.get("nodes") or []
workflow_for_envelope: Optional[Dict[str, Any]] = None
+ targetFeatureInstanceId: Optional[str] = None
if workflowId and not str(workflowId).startswith("transient-"):
iface = getGraphicalEditorInterface(context.user, mandateId, instanceId)
workflow_for_envelope = iface.getWorkflow(workflowId)
+ if workflow_for_envelope:
+ targetFeatureInstanceId = workflow_for_envelope.get("targetFeatureInstanceId")
if workflowId and len(req_nodes) == 0:
iface = getGraphicalEditorInterface(context.user, mandateId, instanceId)
wf = iface.getWorkflow(workflowId)
@@ -328,10 +369,18 @@ async def post_execute(
graph = wf["graph"]
logger.info("graphicalEditor execute: loaded graph from workflow %s", workflowId)
workflow_for_envelope = wf
+ targetFeatureInstanceId = wf.get("targetFeatureInstanceId")
if not workflowId:
import uuid
workflowId = f"transient-{uuid.uuid4().hex[:12]}"
logger.info("graphicalEditor execute: using transient workflowId=%s", workflowId)
+
+ if targetFeatureInstanceId and targetFeatureInstanceId != instanceId:
+ _validateTargetInstance(
+ {"targetFeatureInstanceId": targetFeatureInstanceId},
+ instanceId,
+ context,
+ )
nodes_count = len(graph.get("nodes") or [])
connections_count = len(graph.get("connections") or [])
logger.info(
@@ -363,6 +412,7 @@ async def post_execute(
automation2_interface=ge_interface,
run_envelope=run_env,
label=_wfLabel,
+ targetFeatureInstanceId=targetFeatureInstanceId,
)
logger.info(
"graphicalEditor execute result: success=%s error=%s nodeOutputs_keys=%s failedNode=%s paused=%s",
@@ -1371,6 +1421,7 @@ def create_workflow(
) -> dict:
"""Create a new workflow."""
mandateId = _validateInstanceAccess(instanceId, context)
+ _validateTargetInstance(body, instanceId, context)
iface = getGraphicalEditorInterface(context.user, mandateId, instanceId)
created = iface.createWorkflow(body)
return created
@@ -1388,6 +1439,11 @@ def update_workflow(
"""Update a workflow."""
mandateId = _validateInstanceAccess(instanceId, context)
iface = getGraphicalEditorInterface(context.user, mandateId, instanceId)
+ existing = iface.getWorkflow(workflowId)
+ if not existing:
+ raise HTTPException(status_code=404, detail=routeApiMsg("Workflow not found"))
+ merged = {**existing, **body}
+ _validateTargetInstance(merged, instanceId, context)
updated = iface.updateWorkflow(workflowId, body)
if not updated:
raise HTTPException(status_code=404, detail=routeApiMsg("Workflow not found"))
diff --git a/modules/features/trustee/mainTrustee.py b/modules/features/trustee/mainTrustee.py
index fba4346a..b8ab853d 100644
--- a/modules/features/trustee/mainTrustee.py
+++ b/modules/features/trustee/mainTrustee.py
@@ -361,6 +361,17 @@ QUICK_ACTIONS = [
# The placeholder {{featureInstanceId}} is replaced by _copyTemplateWorkflows.
# ---------------------------------------------------------------------------
+_FINANCE_STYLE_HINT = (
+ "\n\nWenn du ein Dokument erstellst, verwende einen professionellen Finanz-Stil:\n"
+ "- Schriftart: Calibri\n"
+ "- Primaerfarbe: #1F3864 (Dunkelblau)\n"
+ "- Akzentfarbe: #2980B9\n"
+ "- Tabellen mit dunklem Header (#1F3864, weisse Schrift)\n"
+ "- Konservatives, seriöses Layout\n"
+ "Nutze den style-Parameter von renderDocument um diese Vorgaben umzusetzen."
+)
+
+
def _buildAnalysisWorkflowGraph(prompt: str) -> Dict[str, Any]:
"""Build a standard analysis graph: trigger -> refreshAccountingData -> ai.prompt."""
return {
@@ -370,8 +381,9 @@ def _buildAnalysisWorkflowGraph(prompt: str) -> Dict[str, Any]:
"parameters": {"featureInstanceId": "{{featureInstanceId}}", "forceRefresh": False}, "position": {"x": 250, "y": 0}},
{"id": "analyse", "type": "ai.prompt", "label": "Analyse", "_method": "ai", "_action": "process",
"parameters": {
- "aiPrompt": prompt,
+ "aiPrompt": prompt + _FINANCE_STYLE_HINT,
"context": {"type": "ref", "nodeId": "refresh", "path": ["data", "accountingData"]},
+ "requireNeutralization": False,
"simpleMode": False,
}, "position": {"x": 500, "y": 0}},
],
@@ -440,15 +452,33 @@ TEMPLATE_WORKFLOWS = [
{"id": "analyse", "type": "ai.prompt", "label": "Budget-Analyse", "_method": "ai", "_action": "process",
"parameters": {
"aiPrompt": (
- "Fuehre einen Budget-Soll/Ist-Vergleich durch.\n"
- "Die Budget-Datei (Excel) wurde als Dokument uebergeben. "
- "Die aktuellen Buchhaltungsdaten sind im Kontext verfuegbar.\n"
- "1. Lies die Soll-Werte aus dem uebergebenen Budget-Dokument\n"
- "2. Vergleiche sie mit den Ist-Werten aus der Buchhaltung pro Konto\n"
- "3. Berechne die Abweichung (absolut und prozentual)\n"
- "4. Erstelle ein Abweichungs-Chart (Balkendiagramm: Soll vs. Ist pro Konto)\n"
- "5. Markiere kritische Abweichungen (>10%) und gib eine kurze Einschaetzung"
+ "Fuehre einen Budget-Soll/Ist-Vergleich durch und liefere EIN Excel-Dokument "
+ "mit folgender Struktur:\n\n"
+ "1. Tabelle \"Konten-Vergleich\" -- EINE Tabelle, EINE Zeile pro Konto:\n"
+ " Spalten: Konto-Nr | Konto-Name | Soll | Ist | Abweichung absolut | "
+ "Abweichung % | Status (OK / Warnung / Kritisch).\n"
+ "2. EINE Visualisierung \"Soll vs. Ist gesamt\" -- ein einziges "
+ "Balkendiagramm UNTER der Tabelle, das ALLE Konten in einer Grafik "
+ "gegenueberstellt (gruppierte Balken: Soll und Ist je Konto).\n"
+ "3. Kurzer Management-Summary-Absatz (3-5 Saetze) UNTER dem Chart "
+ "mit den 3 groessten Abweichungen (>10%) und einer fachlichen "
+ "Einschaetzung.\n\n"
+ "Verwende die uebergebene Budget-Datei als Soll-Quelle und die im "
+ "Kontext bereitgestellten Buchhaltungsdaten als Ist-Quelle.\n"
+ "WICHTIG: Erstelle KEINEN separaten Chart pro Konto. Nur EIN "
+ "Uebersichts-Chart ueber alle Konten ist gewuenscht.\n\n"
+ "Hinweis: Das documentTheme ist 'finance'. Wenn du ein Dokument erstellst, "
+ "verwende einen professionellen Finanz-Stil:\n"
+ "- Schriftart: Calibri\n"
+ "- Primaerfarbe: #1F3864 (Dunkelblau)\n"
+ "- Akzentfarbe: #2980B9\n"
+ "- Tabellen mit dunklem Header (#1F3864, weisse Schrift)\n"
+ "- Konservatives, seriöses Layout\n"
+ "Nutze den style-Parameter von renderDocument um diese Vorgaben umzusetzen."
),
+ "resultType": "xlsx",
+ "documentTheme": "finance",
+ "requireNeutralization": False,
"documentList": {"type": "ref", "nodeId": "trigger", "path": ["payload", "documentList"]},
"context": {"type": "ref", "nodeId": "refresh", "path": ["data", "accountingData"]},
"simpleMode": False,
diff --git a/modules/features/workspace/datamodelFeatureWorkspace.py b/modules/features/workspace/datamodelFeatureWorkspace.py
index b12d4b84..4e32702c 100644
--- a/modules/features/workspace/datamodelFeatureWorkspace.py
+++ b/modules/features/workspace/datamodelFeatureWorkspace.py
@@ -2,8 +2,8 @@
# All rights reserved.
"""Workspace feature data models — WorkspaceUserSettings."""
-from typing import Optional
-from pydantic import BaseModel, Field
+from typing import List, Optional
+from pydantic import Field
from modules.datamodels.datamodelBase import PowerOnModel
from modules.shared.i18nRegistry import i18nModel
import uuid
@@ -52,3 +52,18 @@ class WorkspaceUserSettings(PowerOnModel):
description="Max agent rounds override (None = instance default)",
json_schema_extra={"label": "Max. Agenten-Runden", "frontend_type": "number", "frontend_readonly": False, "frontend_required": False},
)
+ requireNeutralization: bool = Field(
+ default=False,
+ description="Default neutralization setting for this user",
+ json_schema_extra={"label": "Neutralisierung", "frontend_type": "checkbox", "frontend_readonly": False, "frontend_required": False},
+ )
+ allowedProviders: List[str] = Field(
+ default_factory=list,
+ description="Allowed AI providers (empty = all permitted by RBAC)",
+ json_schema_extra={"label": "Erlaubte Provider", "frontend_type": "multiselect", "frontend_readonly": False, "frontend_required": False},
+ )
+ allowedModels: List[str] = Field(
+ default_factory=list,
+ description="Allowed AI models (empty = all permitted)",
+ json_schema_extra={"label": "Erlaubte Modelle", "frontend_type": "modelMultiSelect", "frontend_readonly": False, "frontend_required": False},
+ )
diff --git a/modules/features/workspace/routeFeatureWorkspace.py b/modules/features/workspace/routeFeatureWorkspace.py
index 3e1a54b7..9595fee4 100644
--- a/modules/features/workspace/routeFeatureWorkspace.py
+++ b/modules/features/workspace/routeFeatureWorkspace.py
@@ -110,6 +110,7 @@ class WorkspaceInputRequest(BaseModel):
workflowId: Optional[str] = Field(default=None, description="Continue existing workflow")
userLanguage: str = Field(default="en", description="User language code")
allowedProviders: List[str] = Field(default_factory=list, description="Restrict AI to these providers")
+ allowedModels: List[str] = Field(default_factory=list, description="Restrict AI to these models")
requireNeutralization: Optional[bool] = Field(default=None, description="Per-request neutralization override")
@@ -635,6 +636,7 @@ async def streamWorkspaceStart(
userLanguage=userInput.userLanguage,
instanceConfig=instanceConfig,
allowedProviders=userInput.allowedProviders,
+ allowedModels=userInput.allowedModels,
requireNeutralization=userInput.requireNeutralization,
billingFeatureCode=wsBillingFeatureCode,
)
@@ -692,6 +694,7 @@ async def _runWorkspaceAgent(
userLanguage: str = "en",
instanceConfig: Dict[str, Any] = None,
allowedProviders: List[str] = None,
+ allowedModels: List[str] = None,
requireNeutralization: Optional[bool] = None,
billingFeatureCode: Optional[str] = None,
):
@@ -715,6 +718,9 @@ async def _runWorkspaceAgent(
logger.info(f"Workspace agent: allowedProviders={allowedProviders}")
else:
logger.debug("Workspace agent: no allowedProviders in request")
+ if allowedModels:
+ aiService.services.allowedModels = allowedModels
+ logger.info(f"Workspace agent: allowedModels={allowedModels}")
if requireNeutralization is not None:
ctx.requireNeutralization = requireNeutralization
@@ -1202,7 +1208,7 @@ async def patchWorkspaceWorkflowAttachments(
# ---------------------------------------------------------------------------
-# File and folder list endpoints
+# File endpoints
# ---------------------------------------------------------------------------
@router.get("/{instanceId}/files")
@@ -1210,7 +1216,6 @@ async def patchWorkspaceWorkflowAttachments(
async def listWorkspaceFiles(
request: Request,
instanceId: str = Path(...),
- folderId: Optional[str] = Query(None),
tags: Optional[str] = Query(None),
search: Optional[str] = Query(None),
context: RequestContext = Depends(getRequestContext),
@@ -1265,30 +1270,6 @@ async def getFileContent(
return Response(content=content, media_type=mimeType)
-@router.get("/{instanceId}/folders")
-@limiter.limit("300/minute")
-async def listWorkspaceFolders(
- request: Request,
- instanceId: str = Path(...),
- parentId: Optional[str] = Query(None),
- context: RequestContext = Depends(getRequestContext),
-):
- _mandateId, _ = _validateInstanceAccess(instanceId, context)
- try:
- from modules.serviceCenter import getService
- from modules.serviceCenter.context import ServiceCenterContext
- ctx = ServiceCenterContext(
- user=context.user,
- mandate_id=_mandateId or "",
- feature_instance_id=instanceId,
- )
- chatService = getService("chat", ctx)
- folders = chatService.listFolders(parentId=parentId)
- return JSONResponse({"folders": folders or []})
- except Exception:
- return JSONResponse({"folders": []})
-
-
@router.get("/{instanceId}/datasources")
@limiter.limit("300/minute")
async def listWorkspaceDataSources(
@@ -2139,6 +2120,76 @@ async def updateGeneralSettings(
return await getGeneralSettings(request, instanceId, context)
+# =========================================================================
+# User-level AI settings (neutralization, providers, models)
+# =========================================================================
+
+@router.get("/{instanceId}/user-settings")
+@limiter.limit("120/minute")
+async def getWorkspaceUserSettings(
+ request: Request,
+ instanceId: str = Path(...),
+ context: RequestContext = Depends(getRequestContext),
+):
+ """Get the current user's workspace AI settings (auto-creates with defaults if not exists)."""
+ _mandateId, _ = _validateInstanceAccess(instanceId, context)
+ wsInterface = _getWorkspaceInterface(context, instanceId)
+ userId = str(context.user.id)
+
+ settings = wsInterface.getWorkspaceUserSettings(userId)
+ if settings:
+ return JSONResponse({
+ "requireNeutralization": settings.requireNeutralization,
+ "allowedProviders": settings.allowedProviders,
+ "allowedModels": settings.allowedModels,
+ })
+
+ data = {
+ "userId": userId,
+ "mandateId": str(context.mandateId) if context.mandateId else "",
+ "featureInstanceId": instanceId,
+ }
+ created = wsInterface.saveWorkspaceUserSettings(data)
+ return JSONResponse({
+ "requireNeutralization": created.requireNeutralization,
+ "allowedProviders": created.allowedProviders,
+ "allowedModels": created.allowedModels,
+ })
+
+
+@router.put("/{instanceId}/user-settings")
+@limiter.limit("120/minute")
+async def putWorkspaceUserSettings(
+ request: Request,
+ instanceId: str = Path(...),
+ body: dict = Body(...),
+ context: RequestContext = Depends(getRequestContext),
+):
+ """Save the current user's workspace AI settings."""
+ _mandateId, _ = _validateInstanceAccess(instanceId, context)
+ wsInterface = _getWorkspaceInterface(context, instanceId)
+ userId = str(context.user.id)
+
+ data = {
+ "userId": userId,
+ "mandateId": str(context.mandateId) if context.mandateId else "",
+ "featureInstanceId": instanceId,
+ }
+ if "requireNeutralization" in body:
+ data["requireNeutralization"] = bool(body["requireNeutralization"])
+ if "allowedProviders" in body:
+ data["allowedProviders"] = body["allowedProviders"]
+ if "allowedModels" in body:
+ data["allowedModels"] = body["allowedModels"]
+
+ saved = wsInterface.saveWorkspaceUserSettings(data)
+ return JSONResponse({
+ "requireNeutralization": saved.requireNeutralization,
+ "allowedProviders": saved.allowedProviders,
+ "allowedModels": saved.allowedModels,
+ })
+
+
# =========================================================================
# RAG / Knowledge — anonymised instance statistics (presentation / KPIs)
# =========================================================================
diff --git a/modules/interfaces/_legacyMigrationTelemetry.py b/modules/interfaces/_legacyMigrationTelemetry.py
new file mode 100644
index 00000000..4a0db04c
--- /dev/null
+++ b/modules/interfaces/_legacyMigrationTelemetry.py
@@ -0,0 +1,198 @@
+# Copyright (c) 2025 Patrick Motsch
+# All rights reserved.
+"""Lightweight Bootstrap-Telemetrie fuer entfernte Migrationsroutinen.
+
+Wenn eine idempotente Bootstrap-Migration (z.B. ``_migrateAndDropSysAdminRole``)
+aus dem Boot-Pfad entfernt wird, koennte ein theoretischer Edge-Case (alte
+DB-Restore, manueller INSERT) wieder Legacy-Daten ins System bringen. Damit das
+nicht still bleibt, ruft ``initBootstrap`` nach Abschluss aller Init-Schritte
+einmalig ``runLegacyDataChecks`` auf -- das logged WARN bei Restbestand.
+
+Designprinzipien:
+- KEINE Schreibzugriffe (rein lesend).
+- Process-lokal gecached (``_cache``), damit identische Boots/Reloads den Check
+ nur einmal laufen lassen.
+- Pro Check eine Recordset-Abfrage; Ausnahmen werden als WARN geloggt, nicht
+ re-raised, damit Telemetrie den Boot nie crasht.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+from modules.connectors.connectorDbPostgre import DatabaseConnector
+from modules.datamodels.datamodelRbac import Role
+from modules.datamodels.datamodelUam import Mandate
+from modules.shared.mandateNameUtils import isValidMandateName
+
+logger = logging.getLogger(__name__)
+
+_alreadyRan: bool = False
+
+
+def runLegacyDataChecks(db: DatabaseConnector) -> None:
+ """Logged WARN, falls noch Legacy-Daten existieren, die durch entfernte
+ Migrationsroutinen behandelt wurden. Prozessweit nur einmal aktiv.
+
+ Aufruf: am Ende von ``initBootstrap``.
+ """
+ global _alreadyRan
+ if _alreadyRan:
+ return
+ _alreadyRan = True
+
+ _checkMandateDescription(db)
+ _checkMandateSlugRules(db)
+ _checkLegacyRootMandate(db)
+ _checkSysadminRole(db)
+ _backfillTargetFeatureInstanceId()
+
+
+def _safe(checkName: str, fn) -> Any:
+ try:
+ return fn()
+ except Exception as exc:
+ logger.warning(
+ "Legacy-data telemetry check '%s' failed: %s: %s",
+ checkName, type(exc).__name__, exc,
+ )
+ return None
+
+
+def _checkMandateDescription(db: DatabaseConnector) -> None:
+ def _do() -> None:
+ rows = db.getRecordset(Mandate)
+ bad = [
+ r.get("id") for r in rows
+ if r.get("description") and not r.get("label")
+ ]
+ if bad:
+ logger.warning(
+ "Legacy-data check: %d Mandate row(s) still have description "
+ "but empty label (removed migration: _migrateMandateDescriptionToLabel). "
+ "Run scripts/script_db_audit_legacy_state.py for details. IDs: %s",
+ len(bad), bad[:5],
+ )
+
+ _safe("mandate-description", _do)
+
+
+def _checkMandateSlugRules(db: DatabaseConnector) -> None:
+ def _do() -> None:
+ rows = db.getRecordset(Mandate)
+ seen: set[str] = set()
+ bad: list[str] = []
+ for r in sorted(rows, key=lambda x: str(x.get("id", ""))):
+ mid = r.get("id")
+ if not mid:
+ continue
+ name = (r.get("name") or "").strip()
+ labelRaw = r.get("label")
+ labelEmpty = not (labelRaw or "").strip() if labelRaw is not None else True
+ invalid = not isValidMandateName(name)
+ collides = name in seen
+ if not invalid and not collides:
+ seen.add(name)
+ if labelEmpty or invalid or collides:
+ bad.append(str(mid))
+ if bad:
+ logger.warning(
+ "Legacy-data check: %d Mandate row(s) violate slug/label rules "
+ "(removed migration: _migrateMandateNameLabelSlugRules). "
+ "Run scripts/script_db_audit_legacy_state.py for details. IDs: %s",
+ len(bad), bad[:5],
+ )
+
+ _safe("mandate-slug-rules", _do)
+
+
+def _checkLegacyRootMandate(db: DatabaseConnector) -> None:
+ def _do() -> None:
+ legacy = db.getRecordset(Mandate, recordFilter={"name": "Root"})
+ rootRows = db.getRecordset(Mandate, recordFilter={"name": "root"})
+ legacyByFlag = [r for r in rootRows if not r.get("isSystem")]
+ all_ = list(legacy) + legacyByFlag
+ if all_:
+ logger.warning(
+ "Legacy-data check: %d Root-Mandate row(s) still in legacy form "
+ "(removed migration: initRootMandate-legacy-branch). IDs: %s",
+ len(all_), [r.get("id") for r in all_][:5],
+ )
+
+ _safe("root-mandate-legacy", _do)
+
+
+def _checkSysadminRole(db: DatabaseConnector) -> None:
+ def _do() -> None:
+ rootMandates = db.getRecordset(
+ Mandate, recordFilter={"name": "root", "isSystem": True}
+ )
+ if not rootMandates:
+ return
+ rootId = str(rootMandates[0].get("id"))
+ rows = db.getRecordset(
+ Role,
+ recordFilter={
+ "roleLabel": "sysadmin",
+ "mandateId": rootId,
+ "featureInstanceId": None,
+ },
+ )
+ if rows:
+ logger.warning(
+ "Legacy-data check: %d 'sysadmin' role(s) still present in root mandate "
+ "(removed migration: _migrateAndDropSysAdminRole). "
+ "Authority is now User.isPlatformAdmin -- migrate manually. IDs: %s",
+ len(rows), [r.get("id") for r in rows],
+ )
+
+ _safe("sysadmin-role", _do)
+
+
+def _backfillTargetFeatureInstanceId() -> None:
+ """Idempotent backfill: set targetFeatureInstanceId = featureInstanceId
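+ for all non-template AutoWorkflow rows where it is still empty (NULL or empty string); rows without a featureInstanceId are skipped.
+
+ Opens its own connection to ``poweron_graphicaleditor``; errors are logged by ``_safe`` and not re-raised.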
+ """
+ def _do() -> None:
+ from modules.shared.configuration import APP_CONFIG
+ from modules.features.graphicalEditor.datamodelFeatureGraphicalEditor import AutoWorkflow
+
+ dbHost = APP_CONFIG.get("DB_HOST", "localhost")
+ dbUser = APP_CONFIG.get("DB_USER")
+ dbPassword = APP_CONFIG.get("DB_PASSWORD_SECRET") or APP_CONFIG.get("DB_PASSWORD")
+ dbPort = int(APP_CONFIG.get("DB_PORT", 5432))
+ geDb = DatabaseConnector(
+ dbHost=dbHost,
+ dbDatabase="poweron_graphicaleditor",
+ dbUser=dbUser,
+ dbPassword=dbPassword,
+ dbPort=dbPort,
+ userId=None,
+ )
+ if not geDb._ensureTableExists(AutoWorkflow):
+ return
+
+ rows = geDb.getRecordset(AutoWorkflow) or []
+ backfilled = 0
+ for r in rows:
+ if r.get("isTemplate"):
+ continue
+ if r.get("targetFeatureInstanceId"):
+ continue
+ srcId = r.get("featureInstanceId")
+ if not srcId:
+ continue
+ geDb.recordModify(AutoWorkflow, r["id"], {"targetFeatureInstanceId": srcId})
+ backfilled += 1
+
+ if backfilled:
+ logger.info(
+ "targetFeatureInstanceId backfill: set %d non-template AutoWorkflow row(s) "
+ "to their featureInstanceId",
+ backfilled,
+ )
+
+ _safe("backfill-targetFeatureInstanceId", _do)
diff --git a/modules/interfaces/interfaceAiObjects.py b/modules/interfaces/interfaceAiObjects.py
index a859ffa7..dcf819cc 100644
--- a/modules/interfaces/interfaceAiObjects.py
+++ b/modules/interfaces/interfaceAiObjects.py
@@ -111,6 +111,19 @@ class AiObjects:
processingTime=0.0, bytesSent=0, bytesReceived=0, errorCount=1,
)
+ allowedModels = getattr(options, 'allowedModels', None) if options else None
+ if allowedModels:
+ filteredModels = [m for m in availableModels if m.name in allowedModels]
+ if filteredModels:
+ availableModels = filteredModels
+ else:
+ errorMsg = f"No models match allowedModels {allowedModels} (providers={allowedProviders}) for operation {options.operationType}"
+ logger.error(errorMsg)
+ return AiCallResponse(
+ content=errorMsg, modelName="error", priceCHF=0.0,
+ processingTime=0.0, bytesSent=0, bytesReceived=0, errorCount=1,
+ )
+
failoverModelList = modelSelector.getFailoverModelList(prompt, context, options, availableModels)
if not failoverModelList:
@@ -364,6 +377,19 @@ class AiObjects:
)
return
+ allowedModels = getattr(options, 'allowedModels', None) if options else None
+ if allowedModels:
+ filtered = [m for m in availableModels if m.name in allowedModels]
+ if filtered:
+ availableModels = filtered
+ else:
+ yield AiCallResponse(
+ content=f"No models match allowedModels {allowedModels} (providers={allowedProviders}) for operation {options.operationType}",
+ modelName="error", priceCHF=0.0, processingTime=0.0,
+ bytesSent=0, bytesReceived=0, errorCount=1,
+ )
+ return
+
failoverModelList = modelSelector.getFailoverModelList(
request.prompt, request.context or "", options, availableModels
)
@@ -516,6 +542,14 @@ class AiObjects:
else:
logger.warning(f"No embedding models match allowedProviders {allowedProviders}")
+ allowedModels = getattr(options, 'allowedModels', None) if options else None
+ if allowedModels:
+ filtered = [m for m in availableModels if m.name in allowedModels]
+ if filtered:
+ availableModels = filtered
+ else:
+ logger.warning(f"No embedding models match allowedModels {allowedModels}")
+
failoverModelList = modelSelector.getFailoverModelList(
combinedText, "", options, availableModels
)
diff --git a/modules/interfaces/interfaceBootstrap.py b/modules/interfaces/interfaceBootstrap.py
index a6ae0052..b7a56a02 100644
--- a/modules/interfaces/interfaceBootstrap.py
+++ b/modules/interfaces/interfaceBootstrap.py
@@ -56,14 +56,8 @@ def initBootstrap(db: DatabaseConnector) -> None:
logger.info("Starting system bootstrap")
- # Initialize root mandate
mandateId = initRootMandate(db)
- # Migrate existing mandate records: description -> label
- _migrateMandateDescriptionToLabel(db)
- _migrateMandateNameLabelSlugRules(db)
-
- # Clean up duplicate roles and fix corrupted templates FIRST
_deduplicateRoles(db)
# Initialize system role TEMPLATES (mandateId=None, isSystemRole=True)
@@ -76,14 +70,6 @@ def initBootstrap(db: DatabaseConnector) -> None:
# This also serves as migration for existing mandates that don't have instance roles yet
_ensureAllMandatesHaveSystemRoles(db)
- # Migration: eliminate the legacy ``sysadmin`` role in root mandate
- # (replaced by ``User.isPlatformAdmin`` flag — see
- # wiki/c-work/4-done/2026-04-sysadmin-authority-split.md).
- # Idempotent: noop after first successful run.
- if mandateId:
- _migrateAndDropSysAdminRole(db, mandateId)
-
- # Ensure UI rules for navigation items (admin/user/viewer roles)
_ensureUiContextRules(db)
# Initialize admin user
@@ -129,9 +115,22 @@ def initBootstrap(db: DatabaseConnector) -> None:
# Bootstrap system workflow templates for graphical editor
_bootstrapSystemTemplates(db)
+ # Sync feature template workflows (update graph of existing instance workflows
+ # whose templateSourceId matches a current code-defined template)
+ _syncFeatureTemplateWorkflows()
+
# Ensure billing settings and accounts exist for all mandates
_bootstrapBilling()
+ # Telemetry: warn if leftovers of the removed idempotent migrations
+ # reappear (edge case: old DB restore or similar). The checks themselves are read-only, but the
+ # call also runs the targetFeatureInstanceId backfill (which writes). Never fails the boot.
+ try:
+ from modules.interfaces._legacyMigrationTelemetry import runLegacyDataChecks
+ runLegacyDataChecks(db)
+ except Exception as e:
+ logger.warning(f"Legacy-data telemetry skipped: {e}")
+
def _bootstrapBilling() -> None:
"""
@@ -195,6 +194,97 @@ def _bootstrapSystemTemplates(db: DatabaseConnector) -> None:
logger.warning(f"System workflow template bootstrap failed: {e}")
+def _syncFeatureTemplateWorkflows() -> None:
+    """Sync existing instance-scoped workflows with current code-defined templates.
+
+    For each feature main module that exposes getTemplateWorkflows(), look up all
+    non-template AutoWorkflow rows whose templateSourceId matches a template id
+    and overwrite their graph when it differs from the code-defined one (after
+    substituting the ``{{featureInstanceId}}`` placeholder with the row's
+    targetFeatureInstanceId). Only the ``graph`` field is written; all other
+    fields of the row are left untouched.
+
+    Best effort: failures are logged as warnings and never raised to the caller.
+    The database connection opened here is closed on every exit path.
+    """
+    import json
+
+    greenfieldDb = None
+    try:
+        from modules.system.registry import loadFeatureMainModules
+        from modules.features.graphicalEditor.datamodelFeatureGraphicalEditor import AutoWorkflow
+        from modules.features.graphicalEditor.interfaceFeatureGraphicalEditor import graphicalEditorDatabase
+
+        mainModules = loadFeatureMainModules()
+
+        # Collect templates keyed by id; on duplicate ids the last one seen wins.
+        templatesBySourceId: dict = {}
+        for featureCode, mod in mainModules.items():
+            getTemplateWorkflows = getattr(mod, "getTemplateWorkflows", None)
+            if not getTemplateWorkflows:
+                continue
+            try:
+                templates = getTemplateWorkflows() or []
+            except Exception as e:
+                # One broken feature must not block the others, but leave a trace.
+                logger.warning(
+                    f"_syncFeatureTemplateWorkflows: getTemplateWorkflows() failed for feature {featureCode}: {e}"
+                )
+                continue
+            for tpl in templates:
+                tplId = tpl.get("id")
+                if tplId:
+                    templatesBySourceId[tplId] = tpl
+
+        if not templatesBySourceId:
+            logger.info("_syncFeatureTemplateWorkflows: no templates found, skipping")
+            return
+        logger.info(f"_syncFeatureTemplateWorkflows: found {len(templatesBySourceId)} template(s): {list(templatesBySourceId.keys())}")
+
+        greenfieldDb = DatabaseConnector(
+            dbHost=APP_CONFIG.get("DB_HOST", "localhost"),
+            dbDatabase=graphicalEditorDatabase,
+            dbUser=APP_CONFIG.get("DB_USER"),
+            dbPassword=APP_CONFIG.get("DB_PASSWORD_SECRET") or APP_CONFIG.get("DB_PASSWORD"),
+        )
+
+        updated = 0
+        for sourceId, tpl in templatesBySourceId.items():
+            instances = greenfieldDb.getRecordset(AutoWorkflow, recordFilter={
+                "templateSourceId": sourceId,
+                "isTemplate": False,
+            })
+            if not instances:
+                continue
+
+            # Serialise once per template; only the placeholder value differs per instance.
+            canonicalJson = json.dumps(tpl.get("graph", {}))
+
+            for inst in instances:
+                instId = inst.get("id") if isinstance(inst, dict) else getattr(inst, "id", None)
+                targetInstanceId = (
+                    inst.get("targetFeatureInstanceId") if isinstance(inst, dict)
+                    else getattr(inst, "targetFeatureInstanceId", None)
+                ) or ""
+
+                # The placeholder lives inside JSON strings, so the substituted value
+                # must be JSON-escaped (quotes, backslashes) to keep the document valid.
+                escapedInstanceId = json.dumps(str(targetInstanceId))[1:-1]
+                newGraph = json.loads(canonicalJson.replace("{{featureInstanceId}}", escapedInstanceId))
+
+                existingGraph = inst.get("graph") if isinstance(inst, dict) else getattr(inst, "graph", None)
+                if isinstance(existingGraph, str):
+                    try:
+                        existingGraph = json.loads(existingGraph)
+                    except Exception:
+                        # An unparsable stored graph counts as "differs" and is overwritten.
+                        existingGraph = None
+
+                if existingGraph == newGraph:
+                    logger.debug(f"_syncFeatureTemplateWorkflows: graph unchanged for workflow {instId} (template={sourceId})")
+                    continue
+                logger.debug(f"_syncFeatureTemplateWorkflows: graph DIFFERS for workflow {instId} (template={sourceId}), updating")
+
+                greenfieldDb.recordModify(AutoWorkflow, instId, {"graph": newGraph})
+                updated += 1
+                logger.info(f"_syncFeatureTemplateWorkflows: updated graph for workflow {instId} (template={sourceId})")
+
+        if updated:
+            logger.info(f"_syncFeatureTemplateWorkflows: synced {updated} workflow(s) with current templates")
+        else:
+            logger.info("_syncFeatureTemplateWorkflows: all instance graphs already match current templates")
+    except Exception as e:
+        logger.warning(f"Feature template workflow sync failed: {e}")
+    finally:
+        # Release the connection even when the sync aborted half-way.
+        if greenfieldDb is not None:
+            try:
+                greenfieldDb.close()
+            except Exception as e:
+                logger.warning(f"_syncFeatureTemplateWorkflows: closing database connection failed: {e}")
+
+
def _buildSystemTemplates():
"""Build the graph definitions for platform system templates."""
return [
@@ -396,21 +486,12 @@ def initRootMandate(db: DatabaseConnector) -> Optional[str]:
Returns:
Mandate ID if created or found, None otherwise
"""
- # Find existing root mandate by name AND isSystem flag
existingMandates = db.getRecordset(Mandate, recordFilter={"name": "root", "isSystem": True})
if existingMandates:
mandateId = existingMandates[0].get("id")
logger.info(f"Root mandate already exists with ID {mandateId}")
return mandateId
-
- # Check for legacy root mandates (name="Root" without isSystem flag) and migrate
- legacyMandates = db.getRecordset(Mandate, recordFilter={"name": "Root"})
- if legacyMandates:
- mandateId = legacyMandates[0].get("id")
- logger.info(f"Migrating legacy Root mandate {mandateId}: setting name='root', isSystem=True")
- db.recordModify(Mandate, mandateId, {"name": "root", "isSystem": True})
- return mandateId
-
+
logger.info("Creating Root mandate")
rootMandate = Mandate(name="root", label="Root", isSystem=True, enabled=True)
createdMandate = db.recordCreate(Mandate, rootMandate)
@@ -419,98 +500,6 @@ def initRootMandate(db: DatabaseConnector) -> Optional[str]:
return mandateId
-def _migrateMandateDescriptionToLabel(db: DatabaseConnector) -> None:
- """
- Migration: Rename 'description' field to 'label' in all Mandate records.
- Copies existing 'description' values to 'label' and removes the old field.
- Safe to run multiple times (idempotent).
- """
- allMandates = db.getRecordset(Mandate)
- migratedCount = 0
- for mandateRecord in allMandates:
- mandateId = mandateRecord.get("id")
- hasDescription = "description" in mandateRecord and mandateRecord.get("description") is not None
- hasLabel = "label" in mandateRecord and mandateRecord.get("label") is not None
-
- if hasDescription and not hasLabel:
- # Copy description to label
- updateData = {"label": mandateRecord["description"]}
- db.recordModify(Mandate, mandateId, updateData)
- migratedCount += 1
- logger.info(f"Migrated mandate {mandateId}: description -> label")
-
- if migratedCount > 0:
- logger.info(f"Migrated {migratedCount} mandate(s) from description to label")
- else:
- logger.debug("No mandate description->label migration needed")
-
-
-def _migrateMandateNameLabelSlugRules(db: DatabaseConnector) -> None:
- """
- Migration: normalize Mandate.name to the slug rules ([a-z0-9-], length 2..32, single
- hyphen segments) and ensure Mandate.label is non-empty.
-
- Rules (see wiki/c-work/1-plan/2026-04-mandate-name-label-logic.md):
- 1. If ``label`` is empty/None → set ``label := name`` (or "Mandate" when both empty).
- 2. If ``name`` is not a valid slug, or collides with an earlier mandate in stable id
- order, allocate a unique slug from the (now non-empty) ``label`` using
- ``slugifyMandateName`` + ``allocateUniqueMandateSlug``.
-
- Idempotent: a second run is a no-op because all valid names stay valid and stay unique.
- Each rename and label fill-in is logged for audit.
- """
- from modules.shared.mandateNameUtils import (
- allocateUniqueMandateSlug,
- isValidMandateName,
- slugifyMandateName,
- )
-
- allRows = db.getRecordset(Mandate)
- if not allRows:
- return
- sortedRows = sorted(allRows, key=lambda r: str(r.get("id", "")))
-
- used: set[str] = set()
- labelFills = 0
- nameRenames: list[tuple[str, str, str]] = []
-
- for rec in sortedRows:
- mid = rec.get("id")
- if not mid:
- continue
- name = (rec.get("name") or "").strip()
- labelRaw = rec.get("label")
- label = (labelRaw or "").strip() if labelRaw is not None else ""
-
- if not label:
- label = name if name else "Mandate"
- db.recordModify(Mandate, mid, {"label": label})
- labelFills += 1
- logger.info(f"Mandate {mid}: filled empty label with '{label}'")
-
- nameFits = isValidMandateName(name)
- nameCollides = name in used
- if nameFits and not nameCollides:
- used.add(name)
- continue
-
- base = slugifyMandateName(label) or "mn"
- newName = allocateUniqueMandateSlug(base, used)
- used.add(newName)
- if newName != name:
- db.recordModify(Mandate, mid, {"name": newName})
- nameRenames.append((str(mid), name, newName))
- logger.info(f"Mandate {mid}: renamed name '{name}' -> '{newName}'")
-
- if labelFills or nameRenames:
- logger.info(
- "Mandate name/label slug migration: %d label fill-in(s), %d name rename(s)",
- labelFills, len(nameRenames),
- )
- else:
- logger.debug("No mandate name/label slug migration needed")
-
-
def initAdminUser(db: DatabaseConnector, mandateId: Optional[str]) -> Optional[str]:
"""
Creates the Admin user if it doesn't exist.
@@ -837,101 +826,6 @@ def copySystemRolesToMandate(db: DatabaseConnector, mandateId: str) -> int:
return copiedCount
-def _migrateAndDropSysAdminRole(db: DatabaseConnector, mandateId: str) -> None:
- """
- One-shot migration: eliminate the legacy ``sysadmin`` role in the root mandate.
-
- Authority semantics moved to two orthogonal flags on User:
- - ``isSysAdmin`` → Infrastructure-Operator (RBAC bypass)
- - ``isPlatformAdmin`` → Cross-Mandate-Governance (no bypass)
-
- Migration steps (idempotent):
- 1. Find sysadmin role(s) in root mandate. If none exist → done.
- 2. For every UserMandateRole row referencing such a role: set
- ``user.isPlatformAdmin = True`` (preserves cross-mandate authority).
- 3. Delete those UserMandateRole rows.
- 4. Delete AccessRules attached to the sysadmin role.
- 5. Delete the sysadmin Role record.
-
- Args:
- db: Database connector instance
- mandateId: Root mandate ID
- """
- sysadminRoles = db.getRecordset(
- Role,
- recordFilter={"roleLabel": "sysadmin", "mandateId": mandateId, "featureInstanceId": None},
- )
- if not sysadminRoles:
- logger.debug("Sysadmin role migration: no legacy sysadmin role present, nothing to do")
- return
-
- sysadminRoleIds = [str(r.get("id")) for r in sysadminRoles if r.get("id")]
- logger.warning(
- f"Sysadmin role migration: found {len(sysadminRoleIds)} legacy sysadmin role(s) "
- f"in root mandate, migrating to isPlatformAdmin flag"
- )
-
- # 1) Promote every holder to isPlatformAdmin=True
- promoted = 0
- for sysadminRoleId in sysadminRoleIds:
- umRoleRows = db.getRecordset(
- UserMandateRole, recordFilter={"roleId": sysadminRoleId}
- )
- userMandateIds = [str(r.get("userMandateId")) for r in umRoleRows if r.get("userMandateId")]
- if not userMandateIds:
- continue
-
- # Resolve userIds via UserMandate
- userIds = set()
- for umId in userMandateIds:
- ums = db.getRecordset(UserMandate, recordFilter={"id": umId})
- for um in ums:
- uid = um.get("userId") if isinstance(um, dict) else getattr(um, "userId", None)
- if uid:
- userIds.add(str(uid))
-
- for userId in userIds:
- users = db.getRecordset(UserInDB, recordFilter={"id": userId})
- if not users:
- continue
- current = users[0].get("isPlatformAdmin", False)
- if not current:
- db.recordModify(UserInDB, userId, {"isPlatformAdmin": True})
- promoted += 1
- logger.warning(
- f"Sysadmin role migration: granted isPlatformAdmin=True to user {userId}"
- )
-
- # 2) Delete UserMandateRole rows
- for umRow in umRoleRows:
- rowId = umRow.get("id") if isinstance(umRow, dict) else getattr(umRow, "id", None)
- if rowId:
- try:
- db.recordDelete(UserMandateRole, str(rowId))
- except Exception as e:
- logger.error(f"Sysadmin role migration: failed to drop UserMandateRole {rowId}: {e}")
-
- # 3) Delete AccessRules
- accessRules = db.getRecordset(AccessRule, recordFilter={"roleId": sysadminRoleId})
- for ar in accessRules:
- arId = ar.get("id") if isinstance(ar, dict) else getattr(ar, "id", None)
- if arId:
- try:
- db.recordDelete(AccessRule, str(arId))
- except Exception as e:
- logger.error(f"Sysadmin role migration: failed to drop AccessRule {arId}: {e}")
-
- # 4) Delete the Role
- try:
- db.recordDelete(Role, sysadminRoleId)
- except Exception as e:
- logger.error(f"Sysadmin role migration: failed to drop Role {sysadminRoleId}: {e}")
-
- logger.warning(
- f"Sysadmin role migration: completed; promoted {promoted} user(s) to isPlatformAdmin"
- )
-
-
def _getRoleId(db: DatabaseConnector, roleLabel: str) -> Optional[str]:
"""
Get role ID by label, using cache or database lookup.
diff --git a/modules/interfaces/interfaceDbApp.py b/modules/interfaces/interfaceDbApp.py
index 51519a29..6f1d9487 100644
--- a/modules/interfaces/interfaceDbApp.py
+++ b/modules/interfaces/interfaceDbApp.py
@@ -1268,19 +1268,7 @@ class AppObjects:
result = []
for conn_dict in connections:
try:
- # Create UserConnection object
- connection = UserConnection(
- id=conn_dict["id"],
- userId=conn_dict["userId"],
- authority=conn_dict.get("authority"),
- externalId=conn_dict.get("externalId", ""),
- externalUsername=conn_dict.get("externalUsername", ""),
- externalEmail=conn_dict.get("externalEmail"),
- status=conn_dict.get("status", "pending"),
- connectedAt=conn_dict.get("connectedAt"),
- lastChecked=conn_dict.get("lastChecked"),
- expiresAt=conn_dict.get("expiresAt"),
- )
+ connection = UserConnection.model_validate(conn_dict)
result.append(connection)
except Exception as e:
logger.error(
@@ -1293,6 +1281,28 @@ class AppObjects:
logger.error(f"Error getting user connections: {str(e)}")
return []
+ def getActiveKnowledgeConnections(self) -> List[UserConnection]:
+     """List every active UserConnection that has knowledge ingestion enabled.
+
+     Filters on knowledgeIngestionEnabled=True and status=ACTIVE. Dict rows are
+     validated into UserConnection models; rows that fail validation are logged
+     and skipped. If the query itself fails, the error is logged and an empty
+     list is returned.
+     """
+     try:
+         activeFilter = {
+             "knowledgeIngestionEnabled": True,
+             "status": ConnectionStatus.ACTIVE.value,
+         }
+         records = self.db.getRecordset(UserConnection, recordFilter=activeFilter)
+         connections = []
+         for record in records or []:
+             try:
+                 if isinstance(record, dict):
+                     parsed = UserConnection.model_validate(record)
+                 else:
+                     parsed = record
+             except Exception as _e:
+                 logger.warning(f"getActiveKnowledgeConnections: could not parse row: {_e}")
+                 continue
+             connections.append(parsed)
+         return connections
+     except Exception as e:
+         logger.error(f"getActiveKnowledgeConnections failed: {e}")
+         return []
+
def getUserConnectionById(self, connectionId: str) -> Optional[UserConnection]:
"""Get a single UserConnection by ID or by reference string (connection:authority:username)."""
try:
@@ -1317,18 +1327,21 @@ class AppObjects:
if connections:
conn_dict = connections[0]
- return UserConnection(
- id=conn_dict["id"],
- userId=conn_dict["userId"],
- authority=conn_dict.get("authority"),
- externalId=conn_dict.get("externalId", ""),
- externalUsername=conn_dict.get("externalUsername", ""),
- externalEmail=conn_dict.get("externalEmail"),
- status=conn_dict.get("status", "pending"),
- connectedAt=conn_dict.get("connectedAt"),
- lastChecked=conn_dict.get("lastChecked"),
- expiresAt=conn_dict.get("expiresAt"),
- )
+ try:
+ return UserConnection.model_validate(conn_dict)
+ except Exception:
+ return UserConnection(
+ id=conn_dict["id"],
+ userId=conn_dict["userId"],
+ authority=conn_dict.get("authority"),
+ externalId=conn_dict.get("externalId", ""),
+ externalUsername=conn_dict.get("externalUsername", ""),
+ externalEmail=conn_dict.get("externalEmail"),
+ status=conn_dict.get("status", "pending"),
+ connectedAt=conn_dict.get("connectedAt"),
+ lastChecked=conn_dict.get("lastChecked"),
+ expiresAt=conn_dict.get("expiresAt"),
+ )
return None
except Exception as e:
logger.error(f"Error getting user connection by ID: {str(e)}")
@@ -4014,6 +4027,59 @@ class AppObjects:
logger.error(f"Error deleting role {roleId}: {str(e)}")
raise
+ # -------------------------------------------------------------------------
+ # Table Grouping (user-defined groups for FormGeneratorTable instances)
+ # -------------------------------------------------------------------------
+
+ def getTableGrouping(self, contextKey: str):
+     """
+     Fetch the saved group tree of the current user for one table context.
+
+     contextKey names the table instance (e.g. "connections", "prompts",
+     "admin/users", "trustee/{instanceId}/documents"). Returns the first
+     matching TableGrouping, or None when nothing is stored or the lookup fails
+     (failures are logged).
+     """
+     from modules.datamodels.datamodelPagination import TableGrouping
+     try:
+         matches = self.db.getRecordset(
+             TableGrouping,
+             recordFilter={"userId": str(self.userId), "contextKey": contextKey},
+         )
+         if matches:
+             first = matches[0]
+             if isinstance(first, dict):
+                 return TableGrouping.model_validate(first)
+             return first
+         return None
+     except Exception as e:
+         logger.error(f"getTableGrouping failed for user={self.userId} key={contextKey}: {e}")
+         return None
+
+ def upsertTableGrouping(self, contextKey: str, rootGroups: list):
+     """
+     Store the full group tree for the current user and contextKey.
+
+     When a grouping already exists for this user/contextKey its record is
+     modified in place (keeping its id); otherwise a new record with a fresh
+     UUID is created. rootGroups is the complete tree as a list of
+     TableGroupNode-compatible dicts.
+
+     Returns the saved TableGrouping; errors are logged and re-raised.
+     """
+     from modules.datamodels.datamodelPagination import TableGrouping
+     from modules.shared.timeUtils import getUtcTimestamp
+     try:
+         current = self.getTableGrouping(contextKey)
+         recordId = current.id if current else str(uuid.uuid4())
+         payload = {
+             "id": recordId,
+             "userId": str(self.userId),
+             "contextKey": contextKey,
+             "rootGroups": rootGroups,
+             "updatedAt": getUtcTimestamp(),
+         }
+         if not current:
+             self.db.recordCreate(TableGrouping, payload)
+         else:
+             self.db.recordModify(TableGrouping, current.id, payload)
+         return TableGrouping.model_validate(payload)
+     except Exception as e:
+         logger.error(f"upsertTableGrouping failed for user={self.userId} key={contextKey}: {e}")
+         raise
+
# Public Methods
diff --git a/modules/interfaces/interfaceDbKnowledge.py b/modules/interfaces/interfaceDbKnowledge.py
index f819615e..c2f79b67 100644
--- a/modules/interfaces/interfaceDbKnowledge.py
+++ b/modules/interfaces/interfaceDbKnowledge.py
@@ -93,6 +93,46 @@ class KnowledgeObjects:
self.db.recordModify(FileContentIndex, fileId, {"status": status})
return True
+ def deleteFileContentIndexByConnectionId(self, connectionId: str) -> Dict[str, int]:
+     """Delete all FileContentIndex rows (and their ContentChunks) for a connection.
+
+     For every index row with the given connectionId, its chunks are deleted
+     first, then the index row itself. Afterwards storage billing is reconciled
+     once per mandate that lost at least one index row; a failing
+     reconciliation is logged and does not abort the purge.
+
+     Args:
+         connectionId: Id of the connection whose indexed content is purged.
+             An empty value is a no-op.
+
+     Returns:
+         {"indexRows": <deleted index rows>, "chunks": <deleted chunks>}.
+     """
+     if not connectionId:
+         return {"indexRows": 0, "chunks": 0}
+
+     def _field(record, key):
+         # Recordsets may hold dicts or model instances; read both the same way.
+         return record.get(key) if isinstance(record, dict) else getattr(record, key, None)
+
+     rows = self.db.getRecordset(
+         FileContentIndex, recordFilter={"connectionId": connectionId}
+     )
+     mandateIds: set = set()
+     chunkCount = 0
+     indexCount = 0
+     for row in rows or []:
+         fid = _field(row, "id")
+         mid = _field(row, "mandateId")
+         if not fid:
+             continue
+         chunks = self.db.getRecordset(ContentChunk, recordFilter={"fileId": fid})
+         for chunk in chunks or []:
+             chunkId = _field(chunk, "id")
+             if chunkId and self.db.recordDelete(ContentChunk, chunkId):
+                 chunkCount += 1
+         if self.db.recordDelete(FileContentIndex, fid):
+             indexCount += 1
+             # Only mandates that actually lost data need a billing refresh.
+             if mid:
+                 mandateIds.add(str(mid))
+
+     for mid in mandateIds:
+         try:
+             from modules.interfaces.interfaceDbBilling import _getRootInterface
+
+             _getRootInterface().reconcileMandateStorageBilling(mid)
+         except Exception as ex:
+             logger.warning("reconcileMandateStorageBilling after connection purge failed: %s", ex)
+
+     return {"indexRows": indexCount, "chunks": chunkCount}
+
def deleteFileContentIndex(self, fileId: str) -> bool:
"""Delete a FileContentIndex and all associated ContentChunks."""
existing = self.getFileContentIndex(fileId)
@@ -603,41 +643,10 @@ def aggregateMandateRagTotalBytes(mandateId: str) -> int:
if rid and str(rid) not in byId:
byId[str(rid)] = row
- # DEPRECATED: file-ID-correlation fallback from poweron_management.
- # Only needed for pre-migration data where mandateId/featureInstanceId on the
- # FileContentIndex are empty. Safe to remove once all environments are migrated.
- _fallbackCount = 0
- try:
- from modules.datamodels.datamodelFiles import FileItem
- from modules.interfaces.interfaceDbManagement import ComponentObjects
- mgmtDb = ComponentObjects().db
- knowledgeIf = getInterface(None)
-
- fileIds: set = set()
- for f in mgmtDb.getRecordset(FileItem, recordFilter={"mandateId": mandateId}):
- fid = f.get("id") if isinstance(f, dict) else getattr(f, "id", None)
- if fid:
- fileIds.add(str(fid))
- for instId in instIds:
- for f in mgmtDb.getRecordset(FileItem, recordFilter={"featureInstanceId": instId}):
- fid = f.get("id") if isinstance(f, dict) else getattr(f, "id", None)
- if fid:
- fileIds.add(str(fid))
-
- for fid in fileIds:
- if fid in byId:
- continue
- row = knowledgeIf.getFileContentIndex(fid)
- if row:
- byId[fid] = row
- _fallbackCount += 1
- except Exception as e:
- logger.warning("aggregateMandateRagTotalBytes fallback failed: %s", e)
-
total = sum(int(r.get("totalSize") or 0) for r in byId.values())
logger.info(
- "aggregateMandateRagTotalBytes(%s): %d indexes, %d bytes (fallback: %d)",
- mandateId, len(byId), total, _fallbackCount,
+ "aggregateMandateRagTotalBytes(%s): %d indexes, %d bytes",
+ mandateId, len(byId), total,
)
return total
diff --git a/modules/interfaces/interfaceDbManagement.py b/modules/interfaces/interfaceDbManagement.py
index f72597b3..120aecce 100644
--- a/modules/interfaces/interfaceDbManagement.py
+++ b/modules/interfaces/interfaceDbManagement.py
@@ -19,8 +19,7 @@ from modules.interfaces.interfaceRbac import getRecordsetWithRBAC, getRecordsetP
from modules.security.rbac import RbacClass
from modules.datamodels.datamodelRbac import AccessRuleContext
from modules.datamodels.datamodelUam import AccessLevel
-from modules.datamodels.datamodelFiles import FilePreview, FileItem, FileData
-from modules.datamodels.datamodelFileFolder import FileFolder
+from modules.datamodels.datamodelFiles import FilePreview, FileItem, FileData, FileFolder
from modules.datamodels.datamodelUtils import Prompt
from modules.datamodels.datamodelMessaging import (
MessagingSubscription,
@@ -1068,7 +1067,242 @@ class ComponentObjects:
except Exception as e:
logger.error(f"Error converting file record: {str(e)}")
return None
-
+
+ # ── Folder methods ─────────────────────────────────────────────────────────
+
+ def getOwnFolderTree(self) -> List[Dict[str, Any]]:
+     """Return the RBAC-filtered folders whose sysCreatedBy is the current user."""
+     ownedFilter = {"sysCreatedBy": self.userId}
+     ownFolders = getRecordsetWithRBAC(
+         self.db,
+         FileFolder,
+         self.currentUser,
+         recordFilter=ownedFilter,
+         mandateId=self.mandateId,
+         featureInstanceId=self.featureInstanceId,
+     )
+     return ownFolders
+
+ def getSharedFolderTree(self) -> List[Dict[str, Any]]:
+     """Return RBAC-visible folders that were NOT created by the current user.
+
+     Each returned dict gets a ``contextOrphan`` flag: True when the folder has
+     a parentId that is not itself part of the returned set.
+     """
+     visible = getRecordsetWithRBAC(
+         self.db, FileFolder, self.currentUser,
+         mandateId=self.mandateId,
+         featureInstanceId=self.featureInstanceId,
+     )
+     foreign = []
+     for entry in visible:
+         if entry.get("sysCreatedBy") != self.userId:
+             foreign.append(entry)
+     foreignIds = {entry["id"] for entry in foreign}
+     for entry in foreign:
+         parent = entry.get("parentId")
+         entry["contextOrphan"] = bool(parent and parent not in foreignIds)
+     return foreign
+
+ def getFolder(self, folderId: str) -> Optional[Dict[str, Any]]:
+     """Look up one folder by id through RBAC; None when nothing is returned."""
+     matches = getRecordsetWithRBAC(
+         self.db, FileFolder, self.currentUser,
+         recordFilter={"id": folderId},
+         mandateId=self.mandateId,
+         featureInstanceId=self.featureInstanceId,
+     )
+     if not matches:
+         return None
+     return matches[0]
+
+ def _isFolderOwner(self, folder) -> bool:
+     """Return True when the folder's sysCreatedBy equals the current user id.
+
+     Accepts a model instance or a dict: the attribute is tried first, the dict
+     key is the fallback.
+     """
+     owner = getattr(folder, "sysCreatedBy", None)
+     if not owner:
+         owner = folder.get("sysCreatedBy") if isinstance(folder, dict) else None
+     return owner == self.userId
+
+ def _requireFolderWriteAccess(self, folder, folderId: str, operation: str = "update"):
+     """Ensure the current user may mutate the folder, else raise PermissionError.
+
+     The folder's owner always passes. Anyone else needs RBAC level ALL for the
+     given operation on the FileFolder data object.
+     """
+     if not self._isFolderOwner(folder):
+         from modules.interfaces.interfaceRbac import buildDataObjectKey
+         folderKey = buildDataObjectKey("FileFolder")
+         granted = self.rbac.getUserPermissions(
+             self.currentUser,
+             AccessRuleContext.DATA,
+             folderKey,
+             mandateId=self.mandateId,
+             featureInstanceId=self.featureInstanceId,
+         )
+         level = getattr(granted, operation, None)
+         if level != AccessLevel.ALL:
+             raise PermissionError(
+                 f"No permission to {operation} folder {folderId} (not owner, access level: {level})"
+             )
+
+ def createFolder(self, name: str, parentId: Optional[str] = None) -> Dict[str, Any]:
+     """Create a personal-scope folder in the current mandate / feature instance.
+
+     Raises PermissionError when the user lacks the RBAC "create" permission on
+     FileFolder. Returns the new folder as a dict (model_dump).
+     """
+     mayCreate = self.checkRbacPermission(FileFolder, "create")
+     if not mayCreate:
+         raise PermissionError("No permission to create folders")
+     newFolder = FileFolder(
+         name=name,
+         parentId=parentId,
+         mandateId=self.mandateId or "",
+         featureInstanceId=self.featureInstanceId or "",
+         scope="personal",
+         neutralize=False,
+     )
+     self.db.recordCreate(FileFolder, newFolder)
+     asDict = newFolder.model_dump()
+     return asDict
+
+ def renameFolder(self, folderId: str, newName: str) -> Dict[str, Any]:
+     """Rename a folder and return its dict with the new name applied.
+
+     Raises FileNotFoundError when the folder cannot be looked up and
+     PermissionError when the user may not update it.
+     """
+     target = self.getFolder(folderId)
+     if not target:
+         raise FileNotFoundError(f"Folder {folderId} not found")
+     self._requireFolderWriteAccess(target, folderId, "update")
+     changes = {"name": newName}
+     self.db.recordModify(FileFolder, folderId, changes)
+     target["name"] = newName
+     return target
+
+ def moveFolder(self, folderId: str, newParentId: Optional[str] = None) -> Dict[str, Any]:
+     """Re-parent a folder; a falsy newParentId moves it to the top level.
+
+     Update access is required on the folder and, if given, on the new parent.
+     Raises FileNotFoundError for an unknown folder or parent, PermissionError
+     for missing access, and ValueError when the new parent is the folder
+     itself or one of its descendants.
+     """
+     target = self.getFolder(folderId)
+     if not target:
+         raise FileNotFoundError(f"Folder {folderId} not found")
+     self._requireFolderWriteAccess(target, folderId, "update")
+
+     if newParentId:
+         destination = self.getFolder(newParentId)
+         if not destination:
+             raise FileNotFoundError(f"Target parent folder {newParentId} not found")
+         self._requireFolderWriteAccess(destination, newParentId, "update")
+         # Guard against cycles: the destination must not sit inside the moved subtree.
+         wouldCycle = self._isDescendant(newParentId, folderId)
+         if wouldCycle:
+             raise ValueError("Cannot move folder into its own subtree (circular reference)")
+
+     self.db.recordModify(FileFolder, folderId, {"parentId": newParentId})
+     target["parentId"] = newParentId
+     return target
+
+ def _isDescendant(self, candidateId: str, ancestorId: str) -> bool:
+     """Tell whether candidateId equals ancestorId or lies somewhere below it.
+
+     Walks the parentId chain upwards from candidateId via getFolder(). The walk
+     stops at a missing folder, an empty parentId, or an id seen before (which
+     protects against already-cyclic data).
+     """
+     seen = set()
+     cursorId = candidateId
+     while cursorId and cursorId not in seen:
+         if cursorId == ancestorId:
+             return True
+         seen.add(cursorId)
+         node = self.getFolder(cursorId)
+         cursorId = node.get("parentId") if node else None
+     return False
+
+ def deleteFolderCascade(self, folderId: str) -> Dict[str, Any]:
+     """Delete a folder together with its sub-folders and the files inside them.
+
+     The caller needs delete access on the folder itself. Every sub-folder and
+     every contained file must have been created by the caller; otherwise
+     nothing is deleted and PermissionError is raised. FileData, FileItem and
+     FileFolder rows are removed on one connection and committed together; on
+     any error the work is rolled back and the exception re-raised.
+
+     Raises:
+         FileNotFoundError: the folder cannot be looked up.
+         PermissionError: missing delete access, or a foreign sub-folder/file.
+
+     Returns:
+         {"deletedFolders": <folders in the tree>, "deletedFiles": <files found>}.
+     """
+     folder = self.getFolder(folderId)
+     if not folder:
+         raise FileNotFoundError(f"Folder {folderId} not found")
+     self._requireFolderWriteAccess(folder, folderId, "delete")
+
+     folderIds = self._collectChildFolderIds(folderId)
+
+     # Descendants are collected without RBAC, so ownership has to be checked on
+     # the raw rows as well: a sub-folder the caller cannot even see via RBAC
+     # must not slip through the check and be deleted anyway.
+     for fid in folderIds:
+         if fid == folderId:
+             continue
+         childRows = self.db.getRecordset(FileFolder, recordFilter={"id": fid})
+         if childRows and not self._isFolderOwner(childRows[0]):
+             raise PermissionError(f"Cannot delete folder tree: sub-folder {fid} is not owned by you")
+
+     # Collect files in those folders
+     fileRows = []
+     for fid in folderIds:
+         fileRows.extend(self.db.getRecordset(FileItem, recordFilter={"folderId": fid}))
+
+     for item in fileRows:
+         itemOwner = item.get("sysCreatedBy") if isinstance(item, dict) else getattr(item, "sysCreatedBy", None)
+         if itemOwner != self.userId:
+             itemId = item.get("id") if isinstance(item, dict) else getattr(item, "id", None)
+             raise PermissionError(f"Cannot delete folder tree: file {itemId} is not owned by you")
+
+     fileIds = [
+         (item.get("id") if isinstance(item, dict) else getattr(item, "id", None))
+         for item in fileRows
+     ]
+
+     # Single transaction: delete FileData, FileItem, then all FileFolder rows.
+     self.db._ensure_connection()
+     try:
+         with self.db.connection.cursor() as cursor:
+             if fileIds:
+                 cursor.execute('DELETE FROM "FileData" WHERE "id" = ANY(%s)', (fileIds,))
+                 cursor.execute('DELETE FROM "FileItem" WHERE "id" = ANY(%s)', (fileIds,))
+             # One statement removes the whole tree, so the order of ids is irrelevant.
+             cursor.execute('DELETE FROM "FileFolder" WHERE "id" = ANY(%s)', (list(folderIds),))
+         self.db.connection.commit()
+     except Exception:
+         self.db.connection.rollback()
+         raise
+
+     return {"deletedFolders": len(folderIds), "deletedFiles": len(fileIds)}
+
+ def _collectChildFolderIds(self, folderId: str) -> List[str]:
+     """Return folderId followed by the ids of all its descendant folders.
+
+     Breadth-first walk over FileFolder.parentId using direct database reads.
+     No ownership or RBAC filtering happens here; callers must check
+     permissions on the returned ids themselves. Each id appears once, so
+     cyclic parent links cannot cause an endless loop.
+     """
+     from collections import deque
+
+     orderedIds = [folderId]
+     seen = {folderId}  # O(1) membership instead of scanning the result list
+     pending = deque([folderId])
+     while pending:
+         parentId = pending.popleft()
+         children = self.db.getRecordset(FileFolder, recordFilter={"parentId": parentId})
+         for child in children:
+             cid = child.get("id") if isinstance(child, dict) else getattr(child, "id", None)
+             if cid and cid not in seen:
+                 seen.add(cid)
+                 orderedIds.append(cid)
+                 pending.append(cid)
+     return orderedIds
+
+ def patchFolderScope(self, folderId: str, scope: str, cascadeToFiles: bool = False) -> Dict[str, Any]:
+     """Change a folder's scope and optionally push it to the caller's files in it.
+
+     scope must be one of personal, featureInstance, mandate, global (ValueError
+     otherwise). Update access on the folder is required; "global" additionally
+     requires RBAC update level ALL on FileFolder. With cascadeToFiles=True the
+     files directly in the folder that were created by the current user get the
+     same scope.
+
+     Returns {"folderId", "scope", "filesUpdated"}.
+     """
+     validScopes = {"personal", "featureInstance", "mandate", "global"}
+     if scope not in validScopes:
+         raise ValueError(f"Invalid scope: {scope}. Must be one of {validScopes}")
+
+     target = self.getFolder(folderId)
+     if not target:
+         raise FileNotFoundError(f"Folder {folderId} not found")
+     self._requireFolderWriteAccess(target, folderId, "update")
+
+     if scope == "global":
+         from modules.interfaces.interfaceRbac import buildDataObjectKey
+         folderKey = buildDataObjectKey("FileFolder")
+         granted = self.rbac.getUserPermissions(
+             self.currentUser,
+             AccessRuleContext.DATA,
+             folderKey,
+             mandateId=self.mandateId,
+             featureInstanceId=self.featureInstanceId,
+         )
+         updateLevel = getattr(granted, "update", None)
+         if updateLevel != AccessLevel.ALL:
+             raise PermissionError("Setting global scope requires ALL permission")
+
+     self.db.recordModify(FileFolder, folderId, {"scope": scope})
+
+     touched = 0
+     if cascadeToFiles:
+         entries = self.db.getRecordset(FileItem, recordFilter={"folderId": folderId})
+         for entry in entries:
+             isDict = isinstance(entry, dict)
+             creator = entry.get("sysCreatedBy") if isDict else getattr(entry, "sysCreatedBy", None)
+             if creator != self.userId:
+                 continue
+             entryId = entry.get("id") if isDict else getattr(entry, "id", None)
+             self.db.recordModify(FileItem, entryId, {"scope": scope})
+             touched += 1
+
+     return {"folderId": folderId, "scope": scope, "filesUpdated": touched}
+
+ def patchFolderNeutralize(self, folderId: str, neutralize: bool) -> Dict[str, Any]:
+     """Set the neutralize flag on a folder and on the caller's files inside it.
+
+     Requires update access on the folder. Only files directly in the folder
+     that were created by the current user are changed.
+
+     Returns {"folderId", "neutralize", "filesUpdated"}.
+     """
+     target = self.getFolder(folderId)
+     if not target:
+         raise FileNotFoundError(f"Folder {folderId} not found")
+     self._requireFolderWriteAccess(target, folderId, "update")
+
+     self.db.recordModify(FileFolder, folderId, {"neutralize": neutralize})
+
+     entries = self.db.getRecordset(FileItem, recordFilter={"folderId": folderId})
+     touched = 0
+     for entry in entries:
+         isDict = isinstance(entry, dict)
+         creator = entry.get("sysCreatedBy") if isDict else getattr(entry, "sysCreatedBy", None)
+         if creator != self.userId:
+             continue
+         entryId = entry.get("id") if isDict else getattr(entry, "id", None)
+         self.db.recordModify(FileItem, entryId, {"neutralize": neutralize})
+         touched += 1
+
+     return {"folderId": folderId, "neutralize": neutralize, "filesUpdated": touched}
+
def _isfileNameUnique(self, fileName: str, excludeFileId: Optional[str] = None) -> bool:
"""Checks if a fileName is unique for the current user."""
# Get all files filtered by RBAC (will be filtered by user's access level)
@@ -1103,15 +1337,12 @@ class ComponentObjects:
return newfileName
counter += 1
- def createFile(self, name: str, mimeType: str, content: bytes, folderId: Optional[str] = None) -> FileItem:
+ def createFile(self, name: str, mimeType: str, content: bytes) -> FileItem:
"""Creates a new file entry if user has permission. Computes fileHash and fileSize from content.
Duplicate check: if a file with the same user + fileHash + fileName already exists,
the existing file is returned instead of creating a new one.
Same hash with different name is allowed (intentional copy by user).
-
- Args:
- folderId: Optional parent folder ID. None/empty means the root folder.
"""
if not self.checkRbacPermission(FileItem, "create"):
raise PermissionError("No permission to create files")
@@ -1139,11 +1370,6 @@ class ComponentObjects:
else:
scope = "personal"
- # Normalize folderId: treat empty string as "no folder" (= root) – NULL in DB
- normalizedFolderId: Optional[str] = folderId
- if isinstance(normalizedFolderId, str) and not normalizedFolderId.strip():
- normalizedFolderId = None
-
fileItem = FileItem(
mandateId=mandateId,
featureInstanceId=featureInstanceId,
@@ -1152,7 +1378,6 @@ class ComponentObjects:
mimeType=mimeType,
fileSize=fileSize,
fileHash=fileHash,
- folderId=normalizedFolderId,
)
# Store in database
@@ -1277,382 +1502,47 @@ class ComponentObjects:
self.db.connection.rollback()
raise FileDeletionError(f"Error deleting files in batch: {str(e)}")
- # ---- Folder methods ----
-
- _RESERVED_FOLDER_NAMES = {"(Global)"}
-
- def _validateFolderName(self, name: str, parentId: Optional[str], excludeFolderId: Optional[str] = None):
- """Ensures folder name is not reserved and is unique within parent."""
- if name in self._RESERVED_FOLDER_NAMES:
- raise ValueError(f"Folder name '{name}' is reserved")
- if not name or not name.strip():
- raise ValueError("Folder name cannot be empty")
- existingFolders = self.db.getRecordset(FileFolder, recordFilter={"parentId": parentId or ""})
- for f in existingFolders:
- if f.get("name") == name and f.get("id") != excludeFolderId:
- raise ValueError(f"Folder '{name}' already exists in this directory")
-
- def _isDescendantOf(self, folderId: str, ancestorId: str) -> bool:
- """Checks if folderId is a descendant of ancestorId (circular reference check)."""
- visited = set()
- currentId = folderId
- while currentId:
- if currentId == ancestorId:
- return True
- if currentId in visited:
- break
- visited.add(currentId)
- folders = self.db.getRecordset(FileFolder, recordFilter={"id": currentId})
- if not folders:
- break
- currentId = folders[0].get("parentId")
- return False
-
- def _ensureFeatureInstanceFolder(self, featureInstanceId: str, mandateId: str = "") -> Optional[str]:
- """Return the folder ID for a feature instance, creating it on first use.
- The folder is named after the feature instance label."""
- existing = self.db.getRecordset(
- FileFolder,
- recordFilter={
- "featureInstanceId": featureInstanceId,
- "sysCreatedBy": self.userId or "",
- },
- )
- if existing:
- return existing[0].get("id")
-
- # Resolve the instance label for the folder name
- folderName = featureInstanceId[:8]
+ def _ensureFeatureInstanceGroup(self, featureInstanceId: str, contextKey: str = "files/list") -> Optional[str]:
+ """Return the groupId of the default group for a feature instance.
+ Creates the group if it doesn't exist yet."""
try:
- from modules.datamodels.datamodelFeatures import FeatureInstance
- from modules.security.rootAccess import getRootDbAppConnector
- dbApp = getRootDbAppConnector()
- instances = dbApp.getRecordset(FeatureInstance, recordFilter={"id": featureInstanceId})
- if instances:
- folderName = instances[0].get("label") or folderName
+ import modules.interfaces.interfaceDbApp as _appIface
+ appInterface = _appIface.getInterface(self._currentUser)
+ existing = appInterface.getTableGrouping(contextKey)
+ nodes = [n.model_dump() if hasattr(n, 'model_dump') else (n if isinstance(n, dict) else vars(n)) for n in (existing.rootGroups if existing else [])]
+ # Look for group with name matching featureInstanceId
+ def _find(nds):
+ for nd in nds:
+ nid = nd.get("id") if isinstance(nd, dict) else getattr(nd, "id", None)
+ nmeta = nd.get("meta", {}) if isinstance(nd, dict) else getattr(nd, "meta", {})
+ if (nmeta or {}).get("featureInstanceId") == featureInstanceId:
+ return nid
+ subs = nd.get("subGroups", []) if isinstance(nd, dict) else getattr(nd, "subGroups", [])
+ result = _find(subs)
+ if result:
+ return result
+ return None
+ found = _find(nodes)
+ if found:
+ return found
+ # Create new group
+ import uuid
+ newId = str(uuid.uuid4())
+ newGroup = {
+ "id": newId,
+ "name": featureInstanceId,
+ "itemIds": [],
+ "subGroups": [],
+ "meta": {"featureInstanceId": featureInstanceId},
+ }
+ nodes.append(newGroup)
+ appInterface.upsertTableGrouping(contextKey, nodes)
+ return newId
except Exception as e:
- logger.warning(f"Could not resolve feature instance label: {e}")
+ logger.error(f"_ensureFeatureInstanceGroup failed: {e}")
+ return None
- folder = FileFolder(
- name=folderName,
- parentId=None,
- mandateId=mandateId,
- featureInstanceId=featureInstanceId,
- )
- created = self.db.recordCreate(FileFolder, folder)
- return created.get("id") if isinstance(created, dict) else getattr(created, "id", None)
-
- def getFolder(self, folderId: str) -> Optional[Dict[str, Any]]:
- """Returns a folder by ID if it belongs to the current user."""
- folders = self.db.getRecordset(FileFolder, recordFilter={"id": folderId, "sysCreatedBy": self.userId or ""})
- return folders[0] if folders else None
-
- def listFolders(self, parentId: Optional[str] = None) -> List[Dict[str, Any]]:
- """List folders visible to the current user.
- Own folders are always returned. Other users' folders are only
- returned when they contain files visible to the current user.
- Each folder is enriched with ``fileCount``."""
- recordFilter = {}
- if parentId is not None:
- recordFilter["parentId"] = parentId
- folders = self.db.getRecordset(FileFolder, recordFilter=recordFilter if recordFilter else None)
-
- if not folders:
- return folders
-
- folderIds = [f["id"] for f in folders if f.get("id")]
- fileCounts: Dict[str, int] = {}
- try:
- from modules.interfaces.interfaceRbac import buildFilesScopeWhereClause
- scopeClause = buildFilesScopeWhereClause(
- self.currentUser, "FileItem", self.db,
- self.mandateId, self.featureInstanceId,
- [], [],
- )
-
- self.db._ensure_connection()
- with self.db.connection.cursor() as cursor:
- baseQuery = (
- 'SELECT "folderId", COUNT(*) AS cnt '
- 'FROM "FileItem" '
- 'WHERE "folderId" = ANY(%s)'
- )
- queryValues: list = [folderIds]
-
- if scopeClause:
- baseQuery += ' AND (' + scopeClause["condition"] + ')'
- queryValues.extend(scopeClause["values"])
-
- baseQuery += ' GROUP BY "folderId"'
- cursor.execute(baseQuery, queryValues)
- for row in cursor.fetchall():
- fileCounts[row["folderId"]] = row["cnt"]
- except Exception as e:
- logger.warning(f"Could not count files per folder: {e}")
-
- userId = self.userId or ""
- result = []
- for folder in folders:
- fc = fileCounts.get(folder.get("id", ""), 0)
- folder["fileCount"] = fc
- isOwn = folder.get("sysCreatedBy") == userId
- if isOwn or fc > 0:
- result.append(folder)
-
- return result
-
- def createFolder(self, name: str, parentId: Optional[str] = None) -> Dict[str, Any]:
- """Create a new folder with unique name validation."""
- self._validateFolderName(name, parentId)
- folder = FileFolder(
- name=name,
- parentId=parentId,
- mandateId=self.mandateId or "",
- featureInstanceId=self.featureInstanceId or "",
- )
- return self.db.recordCreate(FileFolder, folder)
-
- def renameFolder(self, folderId: str, newName: str) -> bool:
- """Rename a folder with unique name validation."""
- folder = self.getFolder(folderId)
- if not folder:
- raise FileNotFoundError(f"Folder {folderId} not found")
- self._validateFolderName(newName, folder.get("parentId"), excludeFolderId=folderId)
- return self.db.recordModify(FileFolder, folderId, {"name": newName})
-
- def updateFolder(self, folderId: str, updateData: Dict[str, Any]) -> bool:
- """
- Update folder metadata (e.g. ``scope``, ``neutralize``). Owner-only,
- same access model as renameFolder/moveFolder. Use ``renameFolder`` for
- ``name`` changes (uniqueness validation) and ``moveFolder`` for
- ``parentId`` changes (cycle/uniqueness validation).
- """
- if not updateData:
- return True
- folder = self.getFolder(folderId)
- if not folder:
- raise FileNotFoundError(f"Folder {folderId} not found")
- forbiddenKeys = {"id", "sysCreatedBy", "sysCreatedAt", "sysUpdatedAt"}
- cleaned: Dict[str, Any] = {k: v for k, v in updateData.items() if k not in forbiddenKeys}
- if "name" in cleaned:
- self._validateFolderName(cleaned["name"], folder.get("parentId"), excludeFolderId=folderId)
- return self.db.recordModify(FileFolder, folderId, cleaned)
-
- def moveFolder(self, folderId: str, targetParentId: Optional[str] = None) -> bool:
- """Move a folder to a new parent, with circular reference and unique name checks."""
- folder = self.getFolder(folderId)
- if not folder:
- raise FileNotFoundError(f"Folder {folderId} not found")
- if targetParentId and self._isDescendantOf(targetParentId, folderId):
- raise ValueError("Cannot move folder into its own subtree")
- self._validateFolderName(folder.get("name", ""), targetParentId, excludeFolderId=folderId)
- return self.db.recordModify(FileFolder, folderId, {"parentId": targetParentId})
-
- def moveFilesBatch(self, fileIds: List[str], targetFolderId: Optional[str] = None) -> Dict[str, Any]:
- """Move multiple files with one SQL update.
- Owner can always move; non-owners need RBAC ALL level."""
- uniqueIds = [str(fid) for fid in dict.fromkeys(fileIds or []) if fid]
- if not uniqueIds:
- return {"movedFiles": 0}
-
- if targetFolderId:
- targetFolder = self.getFolder(targetFolderId)
- if not targetFolder:
- raise FileNotFoundError(f"Target folder {targetFolderId} not found")
-
- try:
- self.db._ensure_connection()
- with self.db.connection.cursor() as cursor:
- cursor.execute(
- 'SELECT "id", "sysCreatedBy" FROM "FileItem" WHERE "id" = ANY(%s)',
- (uniqueIds,),
- )
- rows = cursor.fetchall()
- foundIds = {row["id"] for row in rows}
- missing = sorted(set(uniqueIds) - foundIds)
- if missing:
- raise FileNotFoundError(f"Files not found: {missing}")
-
- for row in rows:
- self._requireFileWriteAccess(row, row["id"], "update")
-
- accessibleIds = [row["id"] for row in rows]
- cursor.execute(
- 'UPDATE "FileItem" SET "folderId" = %s, "sysModifiedAt" = %s, "sysModifiedBy" = %s '
- 'WHERE "id" = ANY(%s)',
- (targetFolderId, getUtcTimestamp(), self.userId or "", accessibleIds),
- )
- movedFiles = cursor.rowcount
-
- self.db.connection.commit()
- return {"movedFiles": movedFiles}
- except Exception as e:
- logger.error(f"Error moving files in batch: {e}")
- self.db.connection.rollback()
- raise FileError(f"Error moving files in batch: {str(e)}")
-
- def moveFoldersBatch(self, folderIds: List[str], targetParentId: Optional[str] = None) -> Dict[str, Any]:
- """Move multiple folders with one SQL update after validation."""
- uniqueIds = [str(fid) for fid in dict.fromkeys(folderIds or []) if fid]
- if not uniqueIds:
- return {"movedFolders": 0}
-
- foldersToMove: List[Dict[str, Any]] = []
- for folderId in uniqueIds:
- folder = self.getFolder(folderId)
- if not folder:
- raise FileNotFoundError(f"Folder {folderId} not found")
- if targetParentId and self._isDescendantOf(targetParentId, folderId):
- raise ValueError("Cannot move folder into its own subtree")
- foldersToMove.append(folder)
-
- existingInTarget = self.db.getRecordset(
- FileFolder,
- recordFilter={"parentId": targetParentId or "", "sysCreatedBy": self.userId or ""},
- )
- existingNames = {f.get("name"): f.get("id") for f in existingInTarget}
- movingNames: Dict[str, str] = {}
- movingIds = set(uniqueIds)
-
- for folder in foldersToMove:
- name = folder.get("name", "")
- folderId = folder.get("id")
- if name in movingNames and movingNames[name] != folderId:
- raise ValueError(f"Folder '{name}' already exists in this move batch")
- movingNames[name] = folderId
-
- existingId = existingNames.get(name)
- if existingId and existingId not in movingIds:
- raise ValueError(f"Folder '{name}' already exists in target directory")
-
- try:
- self.db._ensure_connection()
- with self.db.connection.cursor() as cursor:
- cursor.execute(
- 'UPDATE "FileFolder" SET "parentId" = %s, "sysModifiedAt" = %s, "sysModifiedBy" = %s '
- 'WHERE "id" = ANY(%s) AND "sysCreatedBy" = %s',
- (targetParentId, getUtcTimestamp(), self.userId or "", uniqueIds, self.userId or ""),
- )
- movedFolders = cursor.rowcount
-
- self.db.connection.commit()
- return {"movedFolders": movedFolders}
- except Exception as e:
- logger.error(f"Error moving folders in batch: {e}")
- self.db.connection.rollback()
- raise FileError(f"Error moving folders in batch: {str(e)}")
-
- def deleteFolder(self, folderId: str, recursive: bool = False) -> Dict[str, Any]:
- """Delete a folder. If recursive, deletes all contents. Returns summary of deletions."""
- folder = self.getFolder(folderId)
- if not folder:
- raise FileNotFoundError(f"Folder {folderId} not found")
-
- childFolders = self.db.getRecordset(FileFolder, recordFilter={"parentId": folderId, "sysCreatedBy": self.userId or ""})
- childFiles = self._getFilesByCurrentUser(recordFilter={"folderId": folderId})
-
- if not recursive and (childFolders or childFiles):
- raise ValueError(
- f"Folder '{folder.get('name')}' is not empty "
- f"({len(childFiles)} files, {len(childFolders)} subfolders). "
- f"Use recursive=true to delete contents."
- )
-
- deletedFiles = 0
- deletedFolders = 0
-
- if recursive:
- for subFolder in childFolders:
- subResult = self.deleteFolder(subFolder["id"], recursive=True)
- deletedFiles += subResult.get("deletedFiles", 0)
- deletedFolders += subResult.get("deletedFolders", 0)
- for childFile in childFiles:
- try:
- self.deleteFile(childFile["id"])
- deletedFiles += 1
- except Exception as e:
- logger.warning(f"Failed to delete file {childFile['id']} during folder deletion: {e}")
-
- self.db.recordDelete(FileFolder, folderId)
- deletedFolders += 1
-
- return {"deletedFiles": deletedFiles, "deletedFolders": deletedFolders}
-
- def deleteFoldersBatch(self, folderIds: List[str], recursive: bool = True) -> Dict[str, Any]:
- """Delete multiple folders and their content in batched SQL calls."""
- uniqueIds = [str(fid) for fid in dict.fromkeys(folderIds or []) if fid]
- if not uniqueIds:
- return {"deletedFiles": 0, "deletedFolders": 0}
-
- if not recursive:
- deletedFiles = 0
- deletedFolders = 0
- for folderId in uniqueIds:
- result = self.deleteFolder(folderId, recursive=False)
- deletedFiles += result.get("deletedFiles", 0)
- deletedFolders += result.get("deletedFolders", 0)
- return {"deletedFiles": deletedFiles, "deletedFolders": deletedFolders}
-
- try:
- self.db._ensure_connection()
- with self.db.connection.cursor() as cursor:
- cursor.execute(
- 'SELECT "id" FROM "FileFolder" WHERE "id" = ANY(%s) AND "sysCreatedBy" = %s',
- (uniqueIds, self.userId or ""),
- )
- rootAccessibleIds = [row["id"] for row in cursor.fetchall()]
- if len(rootAccessibleIds) != len(uniqueIds):
- missingIds = sorted(set(uniqueIds) - set(rootAccessibleIds))
- raise FileNotFoundError(f"Folders not found or not accessible: {missingIds}")
-
- cursor.execute(
- """
- WITH RECURSIVE folder_tree AS (
- SELECT "id"
- FROM "FileFolder"
- WHERE "id" = ANY(%s) AND "sysCreatedBy" = %s
- UNION ALL
- SELECT child."id"
- FROM "FileFolder" child
- INNER JOIN folder_tree ft ON child."parentId" = ft."id"
- WHERE child."sysCreatedBy" = %s
- )
- SELECT DISTINCT "id" FROM folder_tree
- """,
- (rootAccessibleIds, self.userId or "", self.userId or ""),
- )
- allFolderIds = [row["id"] for row in cursor.fetchall()]
-
- cursor.execute(
- 'SELECT "id" FROM "FileItem" WHERE "folderId" = ANY(%s) AND "sysCreatedBy" = %s',
- (allFolderIds, self.userId or ""),
- )
- allFileIds = [row["id"] for row in cursor.fetchall()]
-
- if allFileIds:
- cursor.execute('DELETE FROM "FileData" WHERE "id" = ANY(%s)', (allFileIds,))
- cursor.execute(
- 'DELETE FROM "FileItem" WHERE "id" = ANY(%s) AND "sysCreatedBy" = %s',
- (allFileIds, self.userId or ""),
- )
- deletedFiles = cursor.rowcount
- else:
- deletedFiles = 0
-
- cursor.execute(
- 'DELETE FROM "FileFolder" WHERE "id" = ANY(%s) AND "sysCreatedBy" = %s',
- (allFolderIds, self.userId or ""),
- )
- deletedFolders = cursor.rowcount
-
- self.db.connection.commit()
- return {"deletedFiles": deletedFiles, "deletedFolders": deletedFolders}
- except Exception as e:
- logger.error(f"Error deleting folders in batch: {e}")
- self.db.connection.rollback()
- raise FileDeletionError(f"Error deleting folders in batch: {str(e)}")
-
- def copyFile(self, sourceFileId: str, targetFolderId: Optional[str] = None, newFileName: Optional[str] = None) -> FileItem:
+ def copyFile(self, sourceFileId: str, newFileName: Optional[str] = None) -> FileItem:
"""Create a full duplicate of a file (FileItem + FileData)."""
sourceFile = self.getFile(sourceFileId)
if not sourceFile:
@@ -1665,11 +1555,6 @@ class ComponentObjects:
fileName = newFileName or sourceFile.fileName
copiedFile = self.createFile(fileName, sourceFile.mimeType, sourceData)
- if targetFolderId:
- self.updateFile(copiedFile.id, {"folderId": targetFolderId})
- elif sourceFile.folderId:
- self.updateFile(copiedFile.id, {"folderId": sourceFile.folderId})
-
self.createFileData(copiedFile.id, sourceData)
return copiedFile
@@ -1884,18 +1769,14 @@ class ComponentObjects:
logger.error(f"Error getting file content: {str(e)}")
return None
- def saveUploadedFile(self, fileContent: bytes, fileName: str, folderId: Optional[str] = None) -> tuple[FileItem, str]:
- """Saves an uploaded file if user has permission.
-
- Args:
- folderId: Optional parent folder ID. None means root folder.
- """
+ def saveUploadedFile(self, fileContent: bytes, fileName: str) -> tuple[FileItem, str]:
+ """Saves an uploaded file if user has permission."""
try:
# Check file creation permission
if not self.checkRbacPermission(FileItem, "create"):
raise PermissionError("No permission to upload files")
- logger.debug(f"Starting upload process for file: {fileName} (folderId={folderId!r})")
+ logger.debug(f"Starting upload process for file: {fileName}")
if not isinstance(fileContent, bytes):
logger.error(f"Invalid fileContent type: {type(fileContent)}")
@@ -1921,7 +1802,6 @@ class ComponentObjects:
name=fileName,
mimeType=mimeType,
content=fileContent,
- folderId=folderId,
)
# Save binary data
diff --git a/modules/interfaces/interfaceFeatures.py b/modules/interfaces/interfaceFeatures.py
index ccb64a53..c965edb2 100644
--- a/modules/interfaces/interfaceFeatures.py
+++ b/modules/interfaces/interfaceFeatures.py
@@ -347,6 +347,7 @@ class FeatureInterface:
"templateSourceId": templateId,
"templateScope": "instance",
"active": True,
+ "targetFeatureInstanceId": instanceId,
})
copied += 1
except Exception as e:
diff --git a/modules/interfaces/interfaceRbac.py b/modules/interfaces/interfaceRbac.py
index ad2ac6b5..42a32b82 100644
--- a/modules/interfaces/interfaceRbac.py
+++ b/modules/interfaces/interfaceRbac.py
@@ -529,8 +529,7 @@ def getRecordsetPaginatedWithRBAC(
if val is None:
# val=None in pagination.filters means "match empty/null"
# (same convention as connectorDbPostgre._buildPaginationClauses).
- # Covers both historical empty-string values and true NULLs
- # e.g. root-folder files where folderId may be "" or NULL.
+ # Covers both historical empty-string values and true NULLs.
whereConditions.append(f'("{key}" IS NULL OR "{key}"::TEXT = \'\')')
continue
if isinstance(val, dict):
@@ -689,8 +688,7 @@ def getDistinctColumnValuesWithRBAC(
if val is None:
# val=None in pagination.filters means "match empty/null"
# (same convention as connectorDbPostgre._buildPaginationClauses).
- # Covers both historical empty-string values and true NULLs
- # e.g. root-folder files where folderId may be "" or NULL.
+ # Covers both historical empty-string values and true NULLs.
whereConditions.append(f'("{key}" IS NULL OR "{key}"::TEXT = \'\')')
continue
if isinstance(val, dict):
diff --git a/modules/migrations/__init__.py b/modules/migrations/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modules/migrations/_archive/README.md b/modules/migrations/_archive/README.md
new file mode 100644
index 00000000..c488801a
--- /dev/null
+++ b/modules/migrations/_archive/README.md
@@ -0,0 +1,11 @@
+# Archived one-off migrations
+
+`migrate_folders_to_groups.py` copies the `FileFolder` tree and the `FileItem.folderId` assignments into `TableGrouping` rows (context key `files/list`). It was used for an experimental UI path; the **product decision** is to keep physical folders (`FileFolder`, `folderId`) and recover `FormGeneratorTree` (see `wiki/c-work/1-plan/2026-05-formgenerator-tree-and-folder-recovery.md`).
+
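+Run it only if you need a historical data rescue. The first command is a dry run (the default; nothing is written), the second one, with `--execute`, writes to the database: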
+
+```bash
+cd gateway
+python -m modules.migrations._archive.migrate_folders_to_groups --verbose
+python -m modules.migrations._archive.migrate_folders_to_groups --execute --verbose
+```
diff --git a/modules/migrations/_archive/__init__.py b/modules/migrations/_archive/__init__.py
new file mode 100644
index 00000000..a733bae9
--- /dev/null
+++ b/modules/migrations/_archive/__init__.py
@@ -0,0 +1 @@
+# Subpackage for archived one-off migration scripts (not part of normal app startup).
diff --git a/modules/migrations/_archive/migrate_folders_to_groups.py b/modules/migrations/_archive/migrate_folders_to_groups.py
new file mode 100644
index 00000000..6beed744
--- /dev/null
+++ b/modules/migrations/_archive/migrate_folders_to_groups.py
@@ -0,0 +1,261 @@
+"""
+One-time migration: Convert FileFolder tree + FileItem.folderId to table_groupings.
+
+Archived per wiki plan 2026-05-formgenerator-tree-and-folder-recovery (Stage 1.A).
+Product direction: keep FileFolder + folderId; do not run DROP migrations.
+This script remains for audit / one-off data rescue only.
+
+Run this BEFORE dropping the physical FileFolder table and FileItem.folderId column
+from the database (those would be separate Alembic/SQL steps -- not part of current product path).
+
+Usage (from gateway working directory):
+ python -m modules.migrations._archive.migrate_folders_to_groups [--dry-run] [--verbose]
+ python -m modules.migrations._archive.migrate_folders_to_groups --execute --verbose
+
+Steps:
+ 1. For each distinct (userId, mandateId) combination that has FileFolder records or FileItem folder assignments:
+ a. Build the full folder tree (recursive)
+ b. Write it as a TableGroupNode tree into table_groupings (contextKey='files/list')
+ – merges with any existing groups rather than overwriting
+ c. For each FileItem with a folderId that maps into this tree,
+ add its id to the matching group's itemIds
+ 2. Print a summary (rows migrated, groups created, files assigned)
+ 3. If not --dry-run: commits the inserts/updates
+ NOTE: Schema changes (ALTER TABLE DROP COLUMN, DROP TABLE) are intentionally
+ NOT performed by this script; they would be a separate Alembic migration
+ (migrations/versions/xxxx_drop_folder_columns.py), not part of the current product path.
+"""
+
+import argparse
+import json
+import logging
+import uuid
+from typing import Optional
+
+logger = logging.getLogger(__name__)
+
+
+def _scalarRow(row):
+ if row is None:
+ return None
+ if isinstance(row, dict):
+ return next(iter(row.values()))
+ return row[0]
+
+
+# ── Helpers ──────────────────────────────────────────────────────────────────
+
+def _build_tree(folders: list, parent_id: Optional[str]) -> list:
+ """Recursively build TableGroupNode-compatible dicts from a flat folder list."""
+ children = [f for f in folders if f.get("parentId") == parent_id]
+ result = []
+ for folder in children:
+ node = {
+ "id": str(uuid.uuid4()),
+ "name": folder["name"],
+ "itemIds": [],
+ "subGroups": _build_tree(folders, folder["id"]),
+ "meta": {"migratedFromFolderId": folder["id"]},
+ }
+ result.append(node)
+ return result
+
+
+def _assign_files_to_nodes(nodes: list, files_by_folder: dict) -> list:
+ """Recursively assign file IDs to group nodes based on folder mapping."""
+ for node in nodes:
+ folder_id = (node.get("meta") or {}).get("migratedFromFolderId")
+ if folder_id and folder_id in files_by_folder:
+ node["itemIds"] = list(files_by_folder[folder_id])
+ node["subGroups"] = _assign_files_to_nodes(node.get("subGroups", []), files_by_folder)
+ return nodes
+
+
+def _count_items(nodes: list) -> int:
+ total = 0
+ for node in nodes:
+ total += len(node.get("itemIds", []))
+ total += _count_items(node.get("subGroups", []))
+ return total
+
+
+def _now_ts() -> str:
+ from modules.shared.timeUtils import getUtcTimestamp
+ return getUtcTimestamp()
+
+
+# ── Main migration ────────────────────────────────────────────────────────────
+
+def run_migration(dry_run: bool = True, verbose: bool = False):
+ """Main migration entry point."""
+ logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)
+ logger.info(f"Starting folder to group migration (dry_run={dry_run})")
+
+ from modules.connectors.connectorDbPostgre import getCachedConnector
+ from modules.shared.configuration import APP_CONFIG
+
+ connector = getCachedConnector(
+ dbHost=APP_CONFIG.get("DB_HOST", "_no_config_default_data"),
+ dbDatabase="poweron_management",
+ dbUser=APP_CONFIG.get("DB_USER"),
+ dbPassword=APP_CONFIG.get("DB_PASSWORD_SECRET"),
+ dbPort=int(APP_CONFIG.get("DB_PORT", 5432)),
+ userId=None,
+ )
+ if not connector or not connector.connection:
+ logger.error("Could not obtain a DB connection. Aborting.")
+ return
+
+ conn = connector.connection
+ cur = conn.cursor()
+
+ # ── 1. Check that the source tables still exist ───────────────────────────
+ cur.execute("""
+ SELECT EXISTS (
+ SELECT 1 FROM information_schema.tables
+ WHERE table_name = 'FileFolder'
+ ) AS ok
+ """)
+ folder_table_exists = bool(_scalarRow(cur.fetchone()))
+
+ cur.execute("""
+ SELECT EXISTS (
+ SELECT 1 FROM information_schema.columns
+ WHERE table_name = 'FileItem' AND column_name = 'folderId'
+ ) AS ok
+ """)
+ folder_column_exists = bool(_scalarRow(cur.fetchone()))
+
+ if not folder_table_exists and not folder_column_exists:
+ logger.info("FileFolder table and FileItem.folderId column not found — migration already applied or not needed.")
+ return
+
+ if not folder_table_exists:
+ logger.warning("FileFolder table missing but FileItem.folderId column still present. Only file assignments will be migrated.")
+ if not folder_column_exists:
+ logger.warning("FileItem.folderId column missing but FileFolder table still present. Only group tree structure will be migrated.")
+
+ # ── 2. Load all folders ───────────────────────────────────────────────────
+ folders_by_user: dict = {}
+ if folder_table_exists:
+ cur.execute('SELECT "id", "name", "parentId", "sysCreatedBy", "mandateId" FROM "FileFolder"')
+ for row in cur.fetchall():
+ fid, fname, parent_id, user_id, mandate_id = row
+ key = (str(user_id), str(mandate_id) if mandate_id else "")
+ folders_by_user.setdefault(key, []).append({
+ "id": fid, "name": fname, "parentId": parent_id,
+ })
+ logger.info(f"Loaded folders for {len(folders_by_user)} (user, mandate) combinations")
+
+ # ── 3. Load file to folder assignments ────────────────────────────────────
+ files_by_key: dict = {}
+ if folder_column_exists:
+ cur.execute(
+ 'SELECT "id", "folderId", "sysCreatedBy", "mandateId" FROM "FileItem" WHERE "folderId" IS NOT NULL AND "folderId" != \'\''
+ )
+ for row in cur.fetchall():
+ file_id, folder_id, user_id, mandate_id = row
+ key = (str(user_id), str(mandate_id) if mandate_id else "")
+ files_by_key.setdefault(key, {}).setdefault(folder_id, []).append(file_id)
+ total_files = sum(
+ sum(len(v) for v in d.values()) for d in files_by_key.values()
+ )
+ logger.info(f"Found {total_files} file to folder assignments across {len(files_by_key)} (user, mandate) combos")
+
+ # ── 4. Combine and upsert groupings ──────────────────────────────────────
+ all_keys = set(folders_by_user.keys()) | set(files_by_key.keys())
+ stats = {"groups_created": 0, "groupings_upserted": 0, "files_assigned": 0}
+
+ for key in all_keys:
+ user_id, mandate_id = key
+ folders = folders_by_user.get(key, [])
+ files_by_folder = files_by_key.get(key, {})
+
+ # Build tree
+ roots = _build_tree(folders, None)
+ roots = _assign_files_to_nodes(roots, files_by_folder)
+
+ # Handle files in unknown folders (folder no longer in tree)
+ known_folder_ids = {f["id"] for f in folders}
+ for folder_id, file_ids in files_by_folder.items():
+ if folder_id not in known_folder_ids:
+ # Orphaned files: put them in an "Orphaned" group
+ roots.append({
+ "id": str(uuid.uuid4()),
+ "name": f"Orphaned (folder {folder_id[:8]}…)",
+ "itemIds": file_ids,
+ "subGroups": [],
+ "meta": {"migratedFromFolderId": folder_id, "orphaned": True},
+ })
+
+ if not roots:
+ continue
+
+ n_items = _count_items(roots)
+ stats["groups_created"] += len(roots)
+ stats["files_assigned"] += n_items
+
+ context_key = "files/list"
+ if verbose:
+ logger.debug(f" user={user_id} mandate={mandate_id}: {len(roots)} root groups, {n_items} files")
+
+ if not dry_run:
+ # Check for existing grouping
+ cur.execute(
+ 'SELECT "id", "rootGroups" FROM "TableGrouping" WHERE "userId" = %s AND "contextKey" = %s',
+ (user_id, context_key),
+ )
+ existing_row = cur.fetchone()
+
+ if existing_row:
+ existing_id, existing_raw = existing_row
+ existing_roots = json.loads(existing_raw) if isinstance(existing_raw, str) else (existing_raw or [])
+ # Merge: append migrated groups (avoid duplicates by migratedFromFolderId)
+ existing_meta_ids = {
+ (n.get("meta") or {}).get("migratedFromFolderId")
+ for n in existing_roots
+ if (n.get("meta") or {}).get("migratedFromFolderId")
+ }
+ new_roots = existing_roots + [
+ r for r in roots
+ if (r.get("meta") or {}).get("migratedFromFolderId") not in existing_meta_ids
+ ]
+ cur.execute(
+ 'UPDATE "TableGrouping" SET "rootGroups" = %s, "updatedAt" = %s WHERE "id" = %s',
+ (json.dumps(new_roots), _now_ts(), existing_id),
+ )
+ else:
+ new_id = str(uuid.uuid4())
+ cur.execute(
+ 'INSERT INTO "TableGrouping" ("id", "userId", "contextKey", "rootGroups", "updatedAt") VALUES (%s, %s, %s, %s, %s)',
+ (new_id, user_id, context_key, json.dumps(roots), _now_ts()),
+ )
+ stats["groupings_upserted"] += 1
+
+ # ── 5. Summary ────────────────────────────────────────────────────────────
+ if not dry_run:
+ conn.commit()
+ logger.info("Migration committed.")
+ else:
+ logger.info("DRY RUN — no changes written.")
+
+ logger.info(
+ f"Summary: groupings_upserted={stats['groupings_upserted']}, "
+ f"groups_created={stats['groups_created']}, "
+ f"files_assigned={stats['files_assigned']}"
+ )
+ logger.info(
+ "Next steps (run after verifying data):\n"
+ " 1. Run Alembic migration to DROP COLUMN FileItem.folderId\n"
+ " 2. Run Alembic migration to DROP TABLE FileFolder"
+ )
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="Migrate FileFolder tree to table_groupings (archived script)")
+ parser.add_argument("--dry-run", action="store_true", default=True, help="Preview only, no DB writes (default)")
+ parser.add_argument("--execute", action="store_true", help="Actually write to DB (disables dry-run)")
+ parser.add_argument("--verbose", action="store_true", help="Show per-user details")
+ args = parser.parse_args()
+ dry_run = not args.execute
+ run_migration(dry_run=dry_run, verbose=args.verbose)
diff --git a/modules/routes/routeAutomationWorkspace.py b/modules/routes/routeAutomationWorkspace.py
new file mode 100644
index 00000000..b742d7ea
--- /dev/null
+++ b/modules/routes/routeAutomationWorkspace.py
@@ -0,0 +1,305 @@
+# Copyright (c) 2025 Patrick Motsch
+# All rights reserved.
+"""
+User-facing Automation Workspace API.
+
+Lists workflow runs the user can access (via FeatureAccess on
+targetFeatureInstanceId) and provides detail views with step logs
+and linked files. Designed for the "Workspace" tab under
+Nutzung > Automation.
+"""
+
+import logging
+import math
+from typing import Optional
+
+from fastapi import APIRouter, Depends, Request, Query, Path, HTTPException
+from slowapi import Limiter
+from slowapi.util import get_remote_address
+
+from modules.auth.authentication import getRequestContext, RequestContext
+from modules.connectors.connectorDbPostgre import DatabaseConnector
+from modules.shared.configuration import APP_CONFIG
+from modules.features.graphicalEditor.datamodelFeatureGraphicalEditor import (
+ AutoRun,
+ AutoStepLog,
+ AutoWorkflow,
+)
+from modules.features.graphicalEditor.interfaceFeatureGraphicalEditor import graphicalEditorDatabase
+from modules.shared.i18nRegistry import apiRouteContext
+
+routeApiMsg = apiRouteContext("routeAutomationWorkspace")
+logger = logging.getLogger(__name__)
+limiter = Limiter(key_func=get_remote_address)
+
+router = APIRouter(prefix="/api/automations/runs", tags=["AutomationWorkspace"])
+
+
+def _getDb() -> DatabaseConnector:
+    """Build a DatabaseConnector for the graphical-editor database.
+
+    Connection settings are read from APP_CONFIG; the password prefers
+    DB_PASSWORD_SECRET and falls back to DB_PASSWORD. No user id is bound.
+    """
+    connectorArgs = {
+        "dbHost": APP_CONFIG.get("DB_HOST", "localhost"),
+        "dbDatabase": graphicalEditorDatabase,
+        "dbUser": APP_CONFIG.get("DB_USER"),
+        "dbPassword": APP_CONFIG.get("DB_PASSWORD_SECRET") or APP_CONFIG.get("DB_PASSWORD"),
+        "dbPort": int(APP_CONFIG.get("DB_PORT", 5432)),
+        "userId": None,
+    }
+    return DatabaseConnector(**connectorArgs)
+
+
+def _getUserAccessibleInstanceIds(userId: str) -> list[str]:
+    """Collect the featureInstanceIds of the user's enabled FeatureAccess records.
+
+    Records without a featureInstanceId or with a falsy ``enabled`` flag are
+    left out. Returns an empty list when the root interface yields nothing.
+    """
+    from modules.interfaces.interfaceDbApp import getRootInterface
+
+    accessRecords = getRootInterface().getFeatureAccessesForUser(userId) or []
+    instanceIds: list[str] = []
+    for access in accessRecords:
+        if not access.featureInstanceId:
+            continue
+        if access.enabled:
+            instanceIds.append(access.featureInstanceId)
+    return instanceIds
+
+
+# Dict keys whose values are treated as file references when scanning step
+# snapshots / outputs (see _extractFileIdsFromValue).
+_FILE_REF_KEYS = ("fileId", "documentId", "fileIds", "documents")
+
+
+def _extractFileIdsFromValue(value, accumulator: set[str]) -> None:
+    """Walk nested dicts/lists and collect file ids into ``accumulator``.
+
+    A dict value stored under one of the _FILE_REF_KEYS is handed to
+    _collectFileIdsFromRef; every other dict value and every list item is
+    scanned recursively. Values that are neither dict nor list are ignored.
+    """
+    if isinstance(value, list):
+        for element in value:
+            _extractFileIdsFromValue(element, accumulator)
+        return
+    if not isinstance(value, dict):
+        return
+    for name, child in value.items():
+        # Known reference keys end the descent; anything else is searched further.
+        visit = _collectFileIdsFromRef if name in _FILE_REF_KEYS else _extractFileIdsFromValue
+        visit(child, accumulator)
+
+
+def _collectFileIdsFromRef(val, accumulator: set[str]) -> None:
+    """Add file ids from a value located under a known file-reference key.
+
+    Accepted shapes: a non-empty string id, a dict carrying the id under
+    "id", or a list mixing both. Only non-empty *string* ids are collected:
+    step snapshots are free-form JSON, and an unhashable "id" (list/dict)
+    would otherwise raise TypeError on ``set.add`` and fail the whole request.
+    Nested lists inside a list are not descended into.
+    """
+    # A single reference and a list of references follow the same per-item rules.
+    candidates = val if isinstance(val, list) else [val]
+    for candidate in candidates:
+        if isinstance(candidate, dict):
+            candidate = candidate.get("id")
+        if isinstance(candidate, str) and candidate:
+            accumulator.add(candidate)
+
+
+@router.get("")
+@limiter.limit("60/minute")
+def listWorkspaceRuns(
+    request: Request,
+    scope: str = Query("mine", description="mine = own runs, mandate = all accessible"),
+    status: Optional[str] = Query(None, description="Filter by run status"),
+    targetInstanceId: Optional[str] = Query(None, description="Filter by targetFeatureInstanceId"),
+    workflowId: Optional[str] = Query(None, description="Filter by workflow"),
+    limit: int = Query(50, ge=1, le=200),
+    offset: int = Query(0, ge=0),
+    context: RequestContext = Depends(getRequestContext),
+) -> dict:
+    """List workflow runs visible to the user, newest first.
+
+    scope=mine: only runs owned by the user.
+    scope=mandate: all runs where the user has FeatureAccess on the
+    workflow's targetFeatureInstanceId.
+    Any other scope value applies no owner restriction (same as mandate).
+
+    Returns a dict with ``runs`` (the requested page, enriched with workflow,
+    instance and mandate labels), ``total``, ``limit`` and ``offset``.
+
+    Raises:
+        HTTPException(401): no authenticated user on the request context.
+        HTTPException(403): targetInstanceId is not accessible to the user.
+    """
+    # Authenticate before touching the database so anonymous callers always
+    # get 401, independent of whether the run table exists yet.
+    userId = str(context.user.id) if context.user else None
+    if not userId:
+        raise HTTPException(status_code=401, detail=routeApiMsg("Authentication required"))
+
+    def _emptyPage() -> dict:
+        return {"runs": [], "total": 0, "limit": limit, "offset": offset}
+
+    db = _getDb()
+    if not db._ensureTableExists(AutoRun):
+        return _emptyPage()
+
+    # Set instead of list: membership is tested once per workflow below.
+    accessibleInstanceIds = set(_getUserAccessibleInstanceIds(userId))
+    if not accessibleInstanceIds:
+        return _emptyPage()
+
+    if not db._ensureTableExists(AutoWorkflow):
+        return _emptyPage()
+
+    wfFilter: dict = {}
+    if targetInstanceId:
+        if targetInstanceId not in accessibleInstanceIds:
+            raise HTTPException(status_code=403, detail=routeApiMsg("Access denied to target instance"))
+        wfFilter["targetFeatureInstanceId"] = targetInstanceId
+    workflows = db.getRecordset(AutoWorkflow, recordFilter=wfFilter or None) or []
+
+    # Keep only workflows whose (target) feature instance the user may access.
+    visibleWfIds: set[str] = set()
+    wfMap: dict = {}
+    for wf in workflows:
+        wfDict = dict(wf)
+        tid = wfDict.get("targetFeatureInstanceId") or wfDict.get("featureInstanceId")
+        if tid and tid in accessibleInstanceIds:
+            wfId = wfDict.get("id")
+            if wfId:
+                visibleWfIds.add(wfId)
+                wfMap[wfId] = wfDict
+
+    if workflowId:
+        if workflowId not in visibleWfIds:
+            return _emptyPage()
+        visibleWfIds = {workflowId}
+
+    if not visibleWfIds:
+        return _emptyPage()
+
+    # Push the equality filters down to the connector instead of loading every
+    # run of every tenant. The in-memory checks below are kept, so the
+    # visibility rules do not depend on how the connector applies the filter.
+    runFilter: dict = {}
+    if scope == "mine":
+        runFilter["ownerId"] = userId
+    if status:
+        runFilter["status"] = status
+    if workflowId:
+        runFilter["workflowId"] = workflowId
+    candidateRuns = db.getRecordset(AutoRun, recordFilter=runFilter) or []
+
+    filtered = []
+    for record in candidateRuns:
+        row = dict(record)
+        if row.get("workflowId") not in visibleWfIds:
+            continue
+        if scope == "mine" and row.get("ownerId") != userId:
+            continue
+        if status and row.get("status") != status:
+            continue
+        filtered.append(row)
+
+    # NOTE(review): assumes startedAt / sysCreatedAt are numeric timestamps;
+    # the 0 fallback would not compare against string timestamps.
+    filtered.sort(
+        key=lambda x: x.get("startedAt") or x.get("sysCreatedAt") or 0,
+        reverse=True,
+    )
+    total = len(filtered)
+    page = filtered[offset: offset + limit]
+
+    from modules.routes.routeHelpers import enrichRowsWithFkLabels, resolveMandateLabels, resolveInstanceLabels
+
+    for row in page:
+        wf = wfMap.get(row.get("workflowId"), {})
+        row["workflowLabel"] = row.get("label") or wf.get("label") or row.get("workflowId", "")
+        row["targetFeatureInstanceId"] = wf.get("targetFeatureInstanceId") or wf.get("featureInstanceId")
+
+    enrichRowsWithFkLabels(
+        page,
+        labelResolvers={
+            "mandateId": resolveMandateLabels,
+            "targetFeatureInstanceId": resolveInstanceLabels,
+        },
+    )
+    # Rename the generated "<field>Label" keys to the names the client expects.
+    for row in page:
+        row["targetInstanceLabel"] = row.pop("targetFeatureInstanceIdLabel", None)
+        row["mandateLabel"] = row.pop("mandateIdLabel", None)
+
+    return {"runs": page, "total": total, "limit": limit, "offset": offset}
+
+
+@router.get("/{runId}/detail")
+@limiter.limit("60/minute")
+def getWorkspaceRunDetail(
+    request: Request,
+    runId: str = Path(..., description="Run ID"),
+    context: RequestContext = Depends(getRequestContext),
+) -> dict:
+    """Get full detail for a single run: metadata, step logs, linked files.
+
+    Access is granted to the run owner, to users with FeatureAccess on the
+    workflow's target feature instance, and to platform admins.
+
+    Returns a dict with ``run``, ``workflow`` (None when the workflow record
+    is missing), ``steps`` (sorted by startedAt, each with inputFiles /
+    outputFiles), ``files`` and ``unassignedFiles`` (files referenced only
+    in the run's nodeOutputs, not in any step).
+
+    Raises:
+        HTTPException(401): no authenticated user.
+        HTTPException(404): run table or run record does not exist.
+        HTTPException(403): caller may not see this run.
+    """
+    userId = str(context.user.id) if context.user else None
+    if not userId:
+        raise HTTPException(status_code=401, detail=routeApiMsg("Authentication required"))
+
+    db = _getDb()
+    if not db._ensureTableExists(AutoRun):
+        raise HTTPException(status_code=404, detail=routeApiMsg("Run not found"))
+
+    runs = db.getRecordset(AutoRun, recordFilter={"id": runId})
+    if not runs:
+        raise HTTPException(status_code=404, detail=routeApiMsg("Run not found"))
+    run = dict(runs[0])
+
+    wfId = run.get("workflowId")
+    workflow: dict = {}
+    if wfId and db._ensureTableExists(AutoWorkflow):
+        wfs = db.getRecordset(AutoWorkflow, recordFilter={"id": wfId})
+        if wfs:
+            workflow = dict(wfs[0])
+
+    tid = workflow.get("targetFeatureInstanceId") or workflow.get("featureInstanceId")
+    isOwner = run.get("ownerId") == userId
+
+    # Owners and platform admins are always allowed; only everyone else needs
+    # the FeatureAccess lookup, so it is skipped when it cannot change the result.
+    if not isOwner and not context.isPlatformAdmin:
+        if not tid or tid not in _getUserAccessibleInstanceIds(userId):
+            raise HTTPException(status_code=403, detail=routeApiMsg("Access denied"))
+
+    steps: list = []
+    if db._ensureTableExists(AutoStepLog):
+        stepRecords = db.getRecordset(AutoStepLog, recordFilter={"runId": runId}) or []
+        steps = [dict(s) for s in stepRecords]
+        steps.sort(key=lambda s: s.get("startedAt") or 0)
+
+    # Collect file ids per step (input vs. output) and for the run as a whole.
+    allFileIds: set[str] = set()
+    perStepFileIds: list[tuple[set[str], set[str]]] = []
+    for step in steps:
+        inputIds: set[str] = set()
+        outputIds: set[str] = set()
+        _extractFileIdsFromValue(step.get("inputSnapshot") or {}, inputIds)
+        _extractFileIdsFromValue(step.get("output") or {}, outputIds)
+        perStepFileIds.append((inputIds, outputIds))
+        allFileIds.update(inputIds)
+        allFileIds.update(outputIds)
+
+    nodeOutputs = run.get("nodeOutputs") or {}
+    runLevelIds: set[str] = set()
+    _extractFileIdsFromValue(nodeOutputs, runLevelIds)
+    allFileIds.update(runLevelIds)
+
+    # File metadata is best-effort: a failed lookup must not fail the detail
+    # view, but it is logged instead of being swallowed silently.
+    fileMetaById: dict[str, dict] = {}
+    try:
+        from modules.datamodels.datamodelFiles import FileItem
+        from modules.interfaces.interfaceDbManagement import ComponentObjects
+        mgmtDb = ComponentObjects().db
+        if mgmtDb._ensureTableExists(FileItem):
+            for fid in allFileIds:
+                try:
+                    rec = mgmtDb.getRecord(FileItem, fid)
+                    if rec:
+                        recDict = dict(rec)
+                        fileMetaById[fid] = {
+                            "id": fid,
+                            "fileName": recDict.get("fileName") or recDict.get("name"),
+                        }
+                except Exception as lookupErr:
+                    logger.debug("getWorkspaceRunDetail: lookup of file %s failed: %s", fid, lookupErr)
+    except Exception as e:
+        logger.warning("getWorkspaceRunDetail: file lookup failed: %s", e)
+
+    def _resolveFileList(ids: set[str]) -> list[dict]:
+        # Ids without a resolvable FileItem record are dropped.
+        return [fileMetaById[fid] for fid in ids if fid in fileMetaById]
+
+    assignedFileIds: set[str] = set()
+    for step, (inputIds, outputIds) in zip(steps, perStepFileIds):
+        step["inputFiles"] = _resolveFileList(inputIds)
+        step["outputFiles"] = _resolveFileList(outputIds)
+        assignedFileIds.update(inputIds)
+        assignedFileIds.update(outputIds)
+
+    unassignedFiles = _resolveFileList(allFileIds - assignedFileIds)
+    allFiles = _resolveFileList(allFileIds)
+
+    run["workflowLabel"] = run.get("label") or workflow.get("label") or wfId
+    run["targetFeatureInstanceId"] = tid
+
+    targetInstanceLabel = None
+    if tid:
+        try:
+            from modules.routes.routeHelpers import resolveInstanceLabels
+            labelMap = resolveInstanceLabels([tid])
+            targetInstanceLabel = labelMap.get(tid)
+        except Exception as labelErr:
+            # The label is cosmetic; fall back to None but leave a trace.
+            logger.debug("getWorkspaceRunDetail: instance label lookup failed for %s: %s", tid, labelErr)
+    run["targetInstanceLabel"] = targetInstanceLabel
+
+    return {
+        "run": run,
+        "workflow": {
+            "id": workflow.get("id"),
+            "label": workflow.get("label"),
+            "targetFeatureInstanceId": tid,
+            "featureInstanceId": workflow.get("featureInstanceId"),
+            "tags": workflow.get("tags", []),
+        } if workflow else None,
+        "steps": steps,
+        "files": allFiles,
+        "unassignedFiles": unassignedFiles,
+    }
diff --git a/modules/routes/routeClickup.py b/modules/routes/routeClickup.py
index ccf1c481..c3f4b976 100644
--- a/modules/routes/routeClickup.py
+++ b/modules/routes/routeClickup.py
@@ -57,8 +57,8 @@ def _svc_for_connection(current_user: User, connection: UserConnection):
services = getServices(current_user, None)
if not services.clickup.setAccessTokenFromConnection(connection):
raise HTTPException(
- status_code=status.HTTP_401_UNAUTHORIZED,
- detail=routeApiMsg("Failed to set ClickUp access token"),
+ status_code=status.HTTP_502_BAD_GATEWAY,
+ detail=routeApiMsg("Failed to set ClickUp access token. Connection may be expired or invalid."),
)
return services.clickup
diff --git a/modules/routes/routeDataConnections.py b/modules/routes/routeDataConnections.py
index 8e7a730d..124d2fb4 100644
--- a/modules/routes/routeDataConnections.py
+++ b/modules/routes/routeDataConnections.py
@@ -152,10 +152,28 @@ async def get_connections(
- GET /api/connections/?mode=filterValues&column=status
- GET /api/connections/?mode=ids
"""
- from modules.routes.routeHelpers import handleFilterValuesInMemory, handleIdsInMemory, enrichRowsWithFkLabels
+ from modules.routes.routeHelpers import (
+ handleFilterValuesInMemory, handleIdsInMemory, enrichRowsWithFkLabels,
+ handleGroupingInRequest, applyGroupScopeFilter,
+ )
+
+ CONTEXT_KEY = "connections"
+
+ # Parse pagination params early — needed for grouping in all modes
+ paginationParams = None
+ if pagination:
+ try:
+ paginationDict = json.loads(pagination)
+ if paginationDict:
+ paginationDict = normalize_pagination_dict(paginationDict)
+ paginationParams = PaginationParams(**paginationDict)
+ except (json.JSONDecodeError, ValueError) as e:
+ raise HTTPException(status_code=400, detail=f"Invalid pagination parameter: {str(e)}")
+
+ interface = getInterface(currentUser)
+ groupCtx = handleGroupingInRequest(paginationParams, interface, CONTEXT_KEY)
def _buildEnhancedItems():
- interface = getInterface(currentUser)
connections = interface.getUserConnections(currentUser.id)
items = []
for connection in connections:
@@ -182,6 +200,7 @@ async def get_connections(
try:
items = _buildEnhancedItems()
enrichRowsWithFkLabels(items, UserConnection)
+ items = applyGroupScopeFilter(items, groupCtx.itemIds)
return handleFilterValuesInMemory(items, column, pagination)
except Exception as e:
logger.error(f"Error getting filter values for connections: {str(e)}")
@@ -189,63 +208,40 @@ async def get_connections(
if mode == "ids":
try:
- return handleIdsInMemory(_buildEnhancedItems(), pagination)
+ items = applyGroupScopeFilter(_buildEnhancedItems(), groupCtx.itemIds)
+ return handleIdsInMemory(items, pagination)
except Exception as e:
logger.error(f"Error getting IDs for connections: {str(e)}")
raise HTTPException(status_code=500, detail=str(e))
try:
- interface = getInterface(currentUser)
-
# NOTE: Cannot use db.getRecordsetPaginated() here because each connection
# is enriched with computed tokenStatus/tokenExpiresAt (requires per-row DB lookup).
# Token refresh also may trigger re-fetch. Connections per user are typically < 10,
# so in-memory pagination is acceptable.
-
- # Parse pagination parameter
- paginationParams = None
- if pagination:
- try:
- paginationDict = json.loads(pagination)
- if paginationDict:
- # Normalize pagination dict (handles top-level "search" field)
- paginationDict = normalize_pagination_dict(paginationDict)
- paginationParams = PaginationParams(**paginationDict)
- except (json.JSONDecodeError, ValueError) as e:
- raise HTTPException(
- status_code=400,
- detail=f"Invalid pagination parameter: {str(e)}"
- )
-
+
# SECURITY FIX: All users (including admins) can only see their own connections
- # This prevents admin from seeing other users' connections and causing confusion
connections = interface.getUserConnections(currentUser.id)
-
+
# Perform silent token refresh for expired OAuth connections
try:
refresh_result = await token_refresh_service.refresh_expired_tokens(currentUser.id)
if refresh_result.get("refreshed", 0) > 0:
logger.info(f"Silently refreshed {refresh_result['refreshed']} tokens for user {currentUser.id}")
- # Re-fetch connections to get updated token status
connections = interface.getUserConnections(currentUser.id)
except Exception as e:
logger.warning(f"Silent token refresh failed for user {currentUser.id}: {str(e)}")
- # Continue with original connections even if refresh fails
-
- # Enhance each connection with token status information and convert to dict
+
enhanced_connections_dict = []
for connection in connections:
- # Get token status for this connection
tokenStatus, tokenExpiresAt = getTokenStatusForConnection(interface, connection.id)
-
- # Convert to dict for filtering/sorting
connection_dict = {
"id": connection.id,
"userId": connection.userId,
"authority": connection.authority.value if hasattr(connection.authority, 'value') else str(connection.authority),
"externalId": connection.externalId,
"externalUsername": connection.externalUsername or "",
- "externalEmail": connection.externalEmail, # Keep None instead of converting to empty string
+ "externalEmail": connection.externalEmail,
"status": connection.status.value if hasattr(connection.status, 'value') else str(connection.status),
"connectedAt": connection.connectedAt,
"lastChecked": connection.lastChecked,
@@ -254,24 +250,26 @@ async def get_connections(
"tokenExpiresAt": tokenExpiresAt
}
enhanced_connections_dict.append(connection_dict)
-
+
enrichRowsWithFkLabels(enhanced_connections_dict, UserConnection)
+ enhanced_connections_dict = applyGroupScopeFilter(enhanced_connections_dict, groupCtx.itemIds)
if paginationParams is None:
return {
"items": enhanced_connections_dict,
"pagination": None,
+ "groupTree": groupCtx.groupTree,
}
-
+
# Apply filtering if provided
if paginationParams.filters:
component_interface = ComponentObjects()
component_interface.setUserContext(currentUser)
enhanced_connections_dict = component_interface._applyFilters(
- enhanced_connections_dict,
+ enhanced_connections_dict,
paginationParams.filters
)
-
+
# Apply sorting if provided
if paginationParams.sort:
component_interface = ComponentObjects()
@@ -280,14 +278,14 @@ async def get_connections(
enhanced_connections_dict,
paginationParams.sort
)
-
+
totalItems = len(enhanced_connections_dict)
totalPages = math.ceil(totalItems / paginationParams.pageSize) if totalItems > 0 else 0
-
+
startIdx = (paginationParams.page - 1) * paginationParams.pageSize
endIdx = startIdx + paginationParams.pageSize
paged_connections = enhanced_connections_dict[startIdx:endIdx]
-
+
return {
"items": paged_connections,
"pagination": PaginationMetadata(
@@ -298,6 +296,7 @@ async def get_connections(
sort=paginationParams.sort,
filters=paginationParams.filters
).model_dump(),
+ "groupTree": groupCtx.groupTree,
}
except HTTPException:
@@ -351,11 +350,18 @@ def create_connection(
externalUsername="", # Will be set after OAuth
status=ConnectionStatus.PENDING # Start with PENDING status
)
-
+
+ # Apply knowledge consent + preferences from request body before persisting
+ knowledge_enabled = connection_data.get("knowledgeIngestionEnabled")
+ if isinstance(knowledge_enabled, bool):
+ connection.knowledgeIngestionEnabled = knowledge_enabled
+ knowledge_prefs = connection_data.get("knowledgePreferences")
+ if isinstance(knowledge_prefs, dict):
+ connection.knowledgePreferences = knowledge_prefs
+
# Save connection record - models now handle timestamp serialization automatically
interface.db.recordModify(UserConnection, connection.id, connection.model_dump())
-
-
+
return connection
except HTTPException:
@@ -586,8 +592,25 @@ def disconnect_service(
detail=routeApiMsg("Connection not found")
)
- # Update connection status
- connection.status = ConnectionStatus.INACTIVE
+ # Fire revoked event BEFORE DB status change so knowledge purge and
+ # status mutation form one logical step; subscribers see the
+ # connection as it was. INACTIVE does not exist on the enum — REVOKED
+ # is the correct terminal-but-retained state (deleted rows are
+ # handled in DELETE /{id}).
+ try:
+ from modules.shared.callbackRegistry import callbackRegistry
+
+ callbackRegistry.trigger(
+ "connection.revoked",
+ connectionId=connectionId,
+ authority=str(getattr(connection.authority, "value", connection.authority) or ""),
+ userId=str(currentUser.id),
+ reason="disconnected",
+ )
+ except Exception as _cbErr:
+ logger.warning("connection.revoked callback failed for %s: %s", connectionId, _cbErr)
+
+ connection.status = ConnectionStatus.REVOKED
connection.lastChecked = getUtcTimestamp()
# Update connection record - models now handle timestamp serialization automatically
@@ -636,6 +659,23 @@ def delete_connection(
detail=routeApiMsg("Connection not found")
)
+ # Fire revoked event BEFORE the row disappears so consumers still
+ # have authority/connection context for observability; purge itself
+ # targets FileContentIndex rows by connectionId which are unaffected
+ # by the UserConnection delete.
+ try:
+ from modules.shared.callbackRegistry import callbackRegistry
+
+ callbackRegistry.trigger(
+ "connection.revoked",
+ connectionId=connectionId,
+ authority=str(getattr(connection.authority, "value", connection.authority) or ""),
+ userId=str(currentUser.id),
+ reason="deleted",
+ )
+ except Exception as _cbErr:
+ logger.warning("connection.revoked callback failed for %s: %s", connectionId, _cbErr)
+
# Remove the connection - only need connectionId since permissions are verified
interface.removeUserConnection(connectionId)
diff --git a/modules/routes/routeDataFiles.py b/modules/routes/routeDataFiles.py
index 90431ba2..3394b5c5 100644
--- a/modules/routes/routeDataFiles.py
+++ b/modules/routes/routeDataFiles.py
@@ -11,8 +11,7 @@ from modules.auth import limiter, getCurrentUser, getRequestContext, RequestCont
# Import interfaces
import modules.interfaces.interfaceDbManagement as interfaceDbManagement
-from modules.datamodels.datamodelFiles import FileItem, FilePreview
-from modules.datamodels.datamodelFileFolder import FileFolder
+from modules.datamodels.datamodelFiles import FileItem, FilePreview, FileFolder
from modules.shared.attributeUtils import getModelAttributeDefinitions
from modules.datamodels.datamodelUam import User
from modules.datamodels.datamodelPagination import PaginationParams, PaginatedResponse, PaginationMetadata, normalize_pagination_dict
@@ -73,14 +72,18 @@ def _resolveFileWithScope(currentUser: User, context: RequestContext, fileId: st
return scopedMgmt, fileItem
-async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user):
+async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user, *, mandateId: str = None, featureInstanceId: str = None):
"""Background task: pre-scan + extraction + knowledge indexing.
Step 1: Structure Pre-Scan (AI-free) -> FileContentIndex (persisted)
Step 2: Content extraction via runExtraction -> ContentParts
- Step 3: KnowledgeService.indexFile -> chunking + embedding -> Knowledge Store"""
+ Step 3: KnowledgeService.requestIngestion -> idempotent chunking + embedding -> Knowledge Store"""
userId = user.id if hasattr(user, "id") else str(user)
try:
- mgmtInterface = interfaceDbManagement.getInterface(user)
+ mgmtInterface = interfaceDbManagement.getInterface(
+ user,
+ mandateId=mandateId or None,
+ featureInstanceId=featureInstanceId or None,
+ )
mgmtInterface.updateFile(fileId, {"status": "processing"})
rawBytes = mgmtInterface.getFileData(fileId)
@@ -122,9 +125,30 @@ async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user):
f"{contentIndex.totalObjects} objects"
)
- # Persist FileContentIndex immediately
+ # Persist FileContentIndex immediately.
+ # IMPORTANT: preserve `_ingestion` metadata and `status="indexed"` from any
+ # prior successful run — otherwise this upsert wipes the idempotency cache
+ # and requestIngestion cannot detect duplicates (AC4 breaks).
from modules.interfaces.interfaceDbKnowledge import getInterface as getKnowledgeInterface
knowledgeDb = getKnowledgeInterface()
+ try:
+ _existing = knowledgeDb.getFileContentIndex(fileId)
+ except Exception:
+ _existing = None
+ if _existing:
+ _existingStruct = (
+ _existing.get("structure") if isinstance(_existing, dict)
+ else getattr(_existing, "structure", {})
+ ) or {}
+ _existingStatus = (
+ _existing.get("status") if isinstance(_existing, dict)
+ else getattr(_existing, "status", "")
+ ) or ""
+ if "_ingestion" in _existingStruct:
+ contentIndex.structure = dict(contentIndex.structure or {})
+ contentIndex.structure["_ingestion"] = _existingStruct["_ingestion"]
+ if _existingStatus == "indexed":
+ contentIndex.status = "indexed"
knowledgeDb.upsertFileContentIndex(contentIndex)
# Step 2: Content extraction (AI-free, produces ContentParts)
@@ -134,7 +158,10 @@ async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user):
extractorRegistry = ExtractorRegistry()
chunkerRegistry = ChunkerRegistry()
- options = ExtractionOptions()
+ # mergeStrategy=None: keep per-page / per-section granularity for RAG ingestion.
+ # The default MergeStrategy concatenates all text parts into a single blob, which
+ # collapses a 500-page PDF into one ContentChunk and destroys semantic retrieval.
+ options = ExtractionOptions(mergeStrategy=None)
extracted = runExtraction(
extractorRegistry, chunkerRegistry,
@@ -181,15 +208,21 @@ async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user):
)
knowledgeService = getService("knowledge", ctx)
- await knowledgeService.indexFile(
- fileId=fileId,
- fileName=fileName,
- mimeType=mimeType,
- userId=userId,
- featureInstanceId=str(feature_instance_id) if feature_instance_id else "",
- mandateId=str(mandate_id) if mandate_id else "",
- contentObjects=contentObjects,
- structure=contentIndex.structure,
+ from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
+
+ await knowledgeService.requestIngestion(
+ IngestionJob(
+ sourceKind="file",
+ sourceId=fileId,
+ fileName=fileName,
+ mimeType=mimeType,
+ userId=userId,
+ featureInstanceId=str(feature_instance_id) if feature_instance_id else "",
+ mandateId=str(mandate_id) if mandate_id else "",
+ contentObjects=contentObjects,
+ structure=contentIndex.structure,
+ provenance={"lane": "upload", "route": "routeDataFiles._autoIndexFile"},
+ )
)
# Re-acquire interface after await to avoid stale user context from the singleton
@@ -221,6 +254,213 @@ router = APIRouter(
}
)
+
+@router.get("/folders/tree")
+@limiter.limit("120/minute")
+def get_folder_tree(
+    request: Request,
+    owner: str = Query("me", description="'me' | 'shared'"),
+    currentUser: User = Depends(getCurrentUser),
+    context: RequestContext = Depends(getRequestContext),
+):
+    # Returns the caller's own folder tree (owner=me) or the shared one
+    # (owner=shared); any other owner value yields 400.
+    try:
+        mandateId = str(context.mandateId) if context.mandateId else None
+        featureInstanceId = str(context.featureInstanceId) if context.featureInstanceId else None
+        managementInterface = interfaceDbManagement.getInterface(
+            currentUser,
+            mandateId=mandateId,
+            featureInstanceId=featureInstanceId,
+        )
+        ownerKey = (owner or "me").strip().lower()
+        if ownerKey not in ("me", "shared"):
+            raise HTTPException(status_code=400, detail="owner must be 'me' or 'shared'")
+        if ownerKey == "shared":
+            return managementInterface.getSharedFolderTree()
+        return managementInterface.getOwnFolderTree()
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"get_folder_tree error: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.post("/folders", status_code=status.HTTP_201_CREATED)
+@limiter.limit("30/minute")
+def create_folder(
+    request: Request,
+    body: Dict[str, Any] = Body(...),
+    currentUser: User = Depends(getCurrentUser),
+    context: RequestContext = Depends(getRequestContext),
+):
+    # Creates a folder named body["name"] (trimmed) below body["parentId"];
+    # a missing/empty parentId is passed on as None.
+    try:
+        rawName = body.get("name")
+        if not rawName or not str(rawName).strip():
+            raise HTTPException(status_code=400, detail="name is required")
+        parentId = body.get("parentId") or None
+        mandateId = str(context.mandateId) if context.mandateId else None
+        featureInstanceId = str(context.featureInstanceId) if context.featureInstanceId else None
+        managementInterface = interfaceDbManagement.getInterface(
+            currentUser,
+            mandateId=mandateId,
+            featureInstanceId=featureInstanceId,
+        )
+        folderName = str(rawName).strip()
+        return managementInterface.createFolder(folderName, parentId)
+    except PermissionError as e:
+        raise HTTPException(status_code=403, detail=str(e))
+    except interfaceDbManagement.FileNotFoundError as e:
+        raise HTTPException(status_code=404, detail=str(e))
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"create_folder error: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.patch("/folders/{folderId}")
+@limiter.limit("30/minute")
+def rename_folder(
+    request: Request,
+    folderId: str = Path(...),
+    body: Dict[str, Any] = Body(...),
+    currentUser: User = Depends(getCurrentUser),
+    context: RequestContext = Depends(getRequestContext),
+):
+    # Renames the folder to body["name"] (trimmed); a blank name yields 400.
+    try:
+        rawName = body.get("name")
+        if not rawName or not str(rawName).strip():
+            raise HTTPException(status_code=400, detail="name is required")
+        mandateId = str(context.mandateId) if context.mandateId else None
+        featureInstanceId = str(context.featureInstanceId) if context.featureInstanceId else None
+        managementInterface = interfaceDbManagement.getInterface(
+            currentUser,
+            mandateId=mandateId,
+            featureInstanceId=featureInstanceId,
+        )
+        newName = str(rawName).strip()
+        return managementInterface.renameFolder(folderId, newName)
+    except PermissionError as e:
+        raise HTTPException(status_code=403, detail=str(e))
+    except interfaceDbManagement.FileNotFoundError as e:
+        raise HTTPException(status_code=404, detail=str(e))
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"rename_folder error: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.post("/folders/{folderId}/move")
+@limiter.limit("30/minute")
+def move_folder(
+    request: Request,
+    folderId: str = Path(...),
+    body: Dict[str, Any] = Body(...),
+    currentUser: User = Depends(getCurrentUser),
+    context: RequestContext = Depends(getRequestContext),
+):
+    # Moves the folder below body["parentId"]; a missing/empty parentId is
+    # passed on as None.
+    try:
+        targetParentId = body.get("parentId") or None
+        mandateId = str(context.mandateId) if context.mandateId else None
+        featureInstanceId = str(context.featureInstanceId) if context.featureInstanceId else None
+        managementInterface = interfaceDbManagement.getInterface(
+            currentUser,
+            mandateId=mandateId,
+            featureInstanceId=featureInstanceId,
+        )
+        return managementInterface.moveFolder(folderId, targetParentId)
+    except ValueError as e:
+        raise HTTPException(status_code=400, detail=str(e))
+    except PermissionError as e:
+        raise HTTPException(status_code=403, detail=str(e))
+    except interfaceDbManagement.FileNotFoundError as e:
+        raise HTTPException(status_code=404, detail=str(e))
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"move_folder error: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.delete("/folders/{folderId}")
+@limiter.limit("30/minute")
+def delete_folder(
+    request: Request,
+    folderId: str = Path(...),
+    cascade: bool = Query(True, description="Cascade delete sub-folders and files"),
+    currentUser: User = Depends(getCurrentUser),
+    context: RequestContext = Depends(getRequestContext),
+):
+    """Delete a folder together with its sub-folders and files.
+
+    Only cascading deletion is implemented. ``cascade=false`` used to be
+    ignored, so a caller asking for a non-cascading delete still lost the
+    whole subtree; such requests are now rejected with 400.
+
+    Raises:
+        HTTPException(400): cascade=false was requested.
+        HTTPException(403): the interface raised PermissionError.
+        HTTPException(404): the interface raised its FileNotFoundError.
+        HTTPException(500): any other failure.
+    """
+    try:
+        if not cascade:
+            raise HTTPException(
+                status_code=400,
+                detail="Non-cascading folder delete is not supported; use cascade=true",
+            )
+        managementInterface = interfaceDbManagement.getInterface(
+            currentUser,
+            mandateId=str(context.mandateId) if context.mandateId else None,
+            featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
+        )
+        return managementInterface.deleteFolderCascade(folderId)
+    except PermissionError as e:
+        raise HTTPException(status_code=403, detail=str(e))
+    except interfaceDbManagement.FileNotFoundError as e:
+        raise HTTPException(status_code=404, detail=str(e))
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"delete_folder error: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.patch("/folders/{folderId}/scope")
+@limiter.limit("30/minute")
+def patch_folder_scope(
+    request: Request,
+    folderId: str = Path(...),
+    body: Dict[str, Any] = Body(...),
+    currentUser: User = Depends(getCurrentUser),
+    context: RequestContext = Depends(getRequestContext),
+):
+    # Sets the folder's scope from body["scope"]; body["cascadeToFiles"]
+    # (default False) is forwarded unchanged to the interface.
+    try:
+        requestedScope = body.get("scope")
+        if not requestedScope:
+            raise HTTPException(status_code=400, detail="scope is required")
+        cascadeToFiles = body.get("cascadeToFiles", False)
+        mandateId = str(context.mandateId) if context.mandateId else None
+        featureInstanceId = str(context.featureInstanceId) if context.featureInstanceId else None
+        managementInterface = interfaceDbManagement.getInterface(
+            currentUser,
+            mandateId=mandateId,
+            featureInstanceId=featureInstanceId,
+        )
+        return managementInterface.patchFolderScope(folderId, requestedScope, cascadeToFiles)
+    except ValueError as e:
+        raise HTTPException(status_code=400, detail=str(e))
+    except PermissionError as e:
+        raise HTTPException(status_code=403, detail=str(e))
+    except interfaceDbManagement.FileNotFoundError as e:
+        raise HTTPException(status_code=404, detail=str(e))
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"patch_folder_scope error: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.patch("/folders/{folderId}/neutralize")
+@limiter.limit("30/minute")
+def patch_folder_neutralize(
+    request: Request,
+    folderId: str = Path(...),
+    body: Dict[str, Any] = Body(...),
+    currentUser: User = Depends(getCurrentUser),
+    context: RequestContext = Depends(getRequestContext),
+):
+    """Set the neutralize flag of a folder from body["neutralize"].
+
+    The value must be a JSON boolean (0/1 integers are still accepted).
+    Other types are rejected: ``bool("false")`` is True, so coercing a
+    string would switch neutralization ON for a caller asking for OFF.
+
+    Raises:
+        HTTPException(400): neutralize is missing or not a boolean.
+        HTTPException(403): the interface raised PermissionError.
+        HTTPException(404): the interface raised its FileNotFoundError.
+        HTTPException(500): any other failure.
+    """
+    try:
+        neutralize = body.get("neutralize")
+        if neutralize is None:
+            raise HTTPException(status_code=400, detail="neutralize is required")
+        # bool is a subclass of int, so this admits True/False and 0/1 only by type.
+        if not isinstance(neutralize, int):
+            raise HTTPException(status_code=400, detail="neutralize must be a boolean")
+        managementInterface = interfaceDbManagement.getInterface(
+            currentUser,
+            mandateId=str(context.mandateId) if context.mandateId else None,
+            featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
+        )
+        return managementInterface.patchFolderNeutralize(folderId, bool(neutralize))
+    except PermissionError as e:
+        raise HTTPException(status_code=403, detail=str(e))
+    except interfaceDbManagement.FileNotFoundError as e:
+        raise HTTPException(status_code=404, detail=str(e))
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"patch_folder_neutralize error: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
@router.get("/list")
@limiter.limit("120/minute")
def get_files(
@@ -249,7 +489,6 @@ def get_files(
try:
paginationDict = json.loads(pagination)
if paginationDict:
- # Normalize pagination dict (handles top-level "search" field)
paginationDict = normalize_pagination_dict(paginationDict)
paginationParams = PaginationParams(**paginationDict)
except (json.JSONDecodeError, ValueError) as e:
@@ -257,51 +496,43 @@ def get_files(
status_code=400,
detail=f"Invalid pagination parameter: {str(e)}"
)
-
+
from modules.routes.routeHelpers import (
handleIdsMode,
handleFilterValuesInMemory,
+ handleGroupingInRequest, applyGroupScopeFilter,
)
+ import modules.interfaces.interfaceDbApp as _appIface
managementInterface = interfaceDbManagement.getInterface(
currentUser,
mandateId=str(context.mandateId) if context.mandateId else None,
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None
)
+ appInterface = _appIface.getInterface(currentUser)
+ groupCtx = handleGroupingInRequest(paginationParams, appInterface, "files/list")
+
+ def _filesToDicts(fileItems):
+ return [f.model_dump() if hasattr(f, "model_dump") else (dict(f) if not isinstance(f, dict) else f) for f in fileItems]
if mode == "filterValues":
if not column:
raise HTTPException(status_code=400, detail="column parameter required for mode=filterValues")
allFiles = managementInterface.getAllFiles()
items = allFiles if isinstance(allFiles, list) else (allFiles.items if hasattr(allFiles, "items") else [])
- itemDicts = [f.model_dump() if hasattr(f, "model_dump") else (dict(f) if not isinstance(f, dict) else f) for f in items]
+ itemDicts = _filesToDicts(items)
enrichRowsWithFkLabels(itemDicts, FileItem)
+ itemDicts = applyGroupScopeFilter(itemDicts, groupCtx.itemIds)
return handleFilterValuesInMemory(itemDicts, column, pagination)
if mode == "ids":
recordFilter = {"sysCreatedBy": managementInterface.userId}
return handleIdsMode(managementInterface.db, FileItem, pagination, recordFilter)
- recordFilter = None
- if paginationParams and paginationParams.filters and "folderId" in paginationParams.filters:
- fVal = paginationParams.filters.get("folderId")
- # For a concrete folderId we use recordFilter (exact equality).
- # For null / empty (= "root") we keep it in pagination.filters so the
- # connector applies `IS NULL OR = ''` – files predating the folderId
- # fix were stored with an empty string instead of NULL.
- if fVal is None or (isinstance(fVal, str) and fVal.strip() == ""):
- paginationParams.filters["folderId"] = None
- else:
- paginationParams.filters.pop("folderId")
- recordFilter = {"folderId": fVal}
-
- result = managementInterface.getAllFiles(pagination=paginationParams, recordFilter=recordFilter)
-
- def _filesToDicts(items):
- return [f.model_dump() if hasattr(f, "model_dump") else (dict(f) if not isinstance(f, dict) else f) for f in items]
+ result = managementInterface.getAllFiles(pagination=paginationParams)
if paginationParams:
- enriched = enrichRowsWithFkLabels(_filesToDicts(result.items), FileItem)
+ enriched = applyGroupScopeFilter(enrichRowsWithFkLabels(_filesToDicts(result.items), FileItem), groupCtx.itemIds)
return {
"items": enriched,
"pagination": PaginationMetadata(
@@ -312,11 +543,12 @@ def get_files(
sort=paginationParams.sort,
filters=paginationParams.filters
).model_dump(),
+ "groupTree": groupCtx.groupTree,
}
else:
items = result if isinstance(result, list) else (result.items if hasattr(result, "items") else [result])
- enriched = enrichRowsWithFkLabels(_filesToDicts(items), FileItem)
- return {"items": enriched, "pagination": None}
+ enriched = applyGroupScopeFilter(enrichRowsWithFkLabels(_filesToDicts(items), FileItem), groupCtx.itemIds)
+ return {"items": enriched, "pagination": None, "groupTree": groupCtx.groupTree}
except HTTPException:
raise
except Exception as e:
@@ -327,6 +559,36 @@ def get_files(
)
+def _addFileToGroup(appInterface, fileId: str, groupId: str, contextKey: str = "files/list"):
+    """Add a file to a group in the persisted groupTree (best effort).
+
+    Loads the grouping stored under ``contextKey``, searches the group nodes
+    (including nested ``subGroups``) for ``groupId`` and appends ``fileId`` to
+    that node's ``itemIds``. The tree is written back with
+    ``upsertTableGrouping`` only when it actually changed. Any exception is
+    logged as a warning and swallowed, so the caller is never interrupted.
+    """
+    def _get(node, key, default=None):
+        # Nodes may be plain dicts (after model_dump) or attribute objects.
+        return node.get(key, default) if isinstance(node, dict) else getattr(node, key, default)
+
+    def _add(nodes):
+        # Returns None when the group is absent, otherwise whether the tree changed.
+        for node in nodes or []:
+            if _get(node, "id") == groupId:
+                itemIds = list(_get(node, "itemIds") or [])
+                if fileId in itemIds:
+                    return False
+                itemIds.append(fileId)
+                if isinstance(node, dict):
+                    node["itemIds"] = itemIds
+                else:
+                    node.itemIds = itemIds
+                return True
+            changed = _add(_get(node, "subGroups"))
+            if changed is not None:
+                return changed
+        return None
+
+    try:
+        existing = appInterface.getTableGrouping(contextKey)
+        if not existing:
+            return
+        nodes = [n.model_dump() if hasattr(n, 'model_dump') else n for n in existing.rootGroups]
+        changed = _add(nodes)
+        if changed is None:
+            logger.warning(f"_addFileToGroup: group {groupId} not found in '{contextKey}'")
+        elif changed:
+            # Skip the write when the file was already a member of the group.
+            appInterface.upsertTableGrouping(contextKey, nodes)
+    except Exception as e:
+        logger.warning(f"_addFileToGroup failed: {e}")
+
+
@router.post("/upload", status_code=status.HTTP_201_CREATED)
@limiter.limit("10/minute")
async def upload_file(
@@ -334,7 +596,7 @@ async def upload_file(
file: UploadFile = File(...),
workflowId: Optional[str] = Form(None),
featureInstanceId: Optional[str] = Form(None),
- folderId: Optional[str] = Form(None),
+ groupId: Optional[str] = Form(None),
currentUser: User = Depends(getCurrentUser),
context: RequestContext = Depends(getRequestContext),
) -> JSONResponse:
@@ -358,31 +620,22 @@ async def upload_file(
status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE,
detail=f"File too large. Maximum size: {interfaceDbManagement.APP_CONFIG.get('File_Management_MAX_UPLOAD_SIZE_MB')}MB"
)
-
- # Normalize folderId: empty string / "null" / "root" → None (root folder)
- normalizedFolderId: Optional[str] = folderId
- if isinstance(normalizedFolderId, str):
- trimmed = normalizedFolderId.strip()
- if not trimmed or trimmed.lower() in {"null", "none", "root"}:
- normalizedFolderId = None
- else:
- normalizedFolderId = trimmed
# Save file via LucyDOM interface in the database
fileItem, duplicateType = managementInterface.saveUploadedFile(
- fileContent, file.filename, folderId=normalizedFolderId
+ fileContent, file.filename
)
if featureInstanceId and not fileItem.featureInstanceId:
managementInterface.updateFile(fileItem.id, {"featureInstanceId": featureInstanceId})
fileItem.featureInstanceId = featureInstanceId
- # For exact duplicates we keep the existing record, but move it into the
- # target folder so the user actually sees their upload land where they expect.
- if duplicateType == "exact_duplicate" and normalizedFolderId != getattr(fileItem, "folderId", None):
- managementInterface.updateFile(fileItem.id, {"folderId": normalizedFolderId})
- fileItem.folderId = normalizedFolderId
-
+ # Add to group if groupId was provided
+ if groupId:
+ import modules.interfaces.interfaceDbApp as _appIface
+ appInterface = _appIface.getInterface(currentUser)
+ _addFileToGroup(appInterface, fileItem.id, groupId)
+
# Determine response message based on duplicate type
if duplicateType == "exact_duplicate":
message = f"File '{file.filename}' already exists with identical content. Reusing existing file."
@@ -420,6 +673,8 @@ async def upload_file(
fileName=fileItem.fileName,
mimeType=fileItem.mimeType,
user=currentUser,
+ mandateId=str(context.mandateId) if context.mandateId else None,
+ featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
))
except Exception as indexErr:
logger.warning(f"Auto-index trigger failed (non-blocking): {indexErr}")
@@ -447,347 +702,6 @@ async def upload_file(
detail=f"Error during file upload: {str(e)}"
)
-# ── Folder endpoints (MUST be before /{fileId} catch-all) ─────────────────────
-
-@router.get("/folders", response_model=List[Dict[str, Any]])
-@limiter.limit("30/minute")
-def list_folders(
- request: Request,
- parentId: Optional[str] = Query(None, description="Parent folder ID (omit for all folders)"),
- currentUser: User = Depends(getCurrentUser),
- context: RequestContext = Depends(getRequestContext)
-) -> List[Dict[str, Any]]:
- """List folders for the current user."""
- try:
- mgmt = interfaceDbManagement.getInterface(
- currentUser,
- mandateId=str(context.mandateId) if context.mandateId else None,
- featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
- )
- if parentId is not None:
- return mgmt.listFolders(parentId=parentId)
- return mgmt.listFolders()
- except Exception as e:
- logger.error(f"Error listing folders: {e}")
- raise HTTPException(status_code=500, detail=str(e))
-
-
-@router.post("/folders", status_code=status.HTTP_201_CREATED)
-@limiter.limit("10/minute")
-def create_folder(
- request: Request,
- body: Dict[str, Any] = Body(...),
- currentUser: User = Depends(getCurrentUser),
- context: RequestContext = Depends(getRequestContext)
-) -> Dict[str, Any]:
- """Create a new folder."""
- name = body.get("name", "")
- parentId = body.get("parentId")
- if not name:
- raise HTTPException(status_code=400, detail=routeApiMsg("name is required"))
- try:
- mgmt = interfaceDbManagement.getInterface(
- currentUser,
- mandateId=str(context.mandateId) if context.mandateId else None,
- featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
- )
- return mgmt.createFolder(name=name, parentId=parentId)
- except ValueError as e:
- raise HTTPException(status_code=400, detail=str(e))
- except Exception as e:
- logger.error(f"Error creating folder: {e}")
- raise HTTPException(status_code=500, detail=str(e))
-
-
-@router.put("/folders/{folderId}")
-@limiter.limit("10/minute")
-def rename_folder(
- request: Request,
- folderId: str = Path(...),
- body: Dict[str, Any] = Body(...),
- currentUser: User = Depends(getCurrentUser),
- context: RequestContext = Depends(getRequestContext)
-) -> Dict[str, Any]:
- """Rename a folder."""
- newName = body.get("name", "")
- if not newName:
- raise HTTPException(status_code=400, detail=routeApiMsg("name is required"))
- try:
- mgmt = interfaceDbManagement.getInterface(
- currentUser,
- mandateId=str(context.mandateId) if context.mandateId else None,
- featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
- )
- mgmt.renameFolder(folderId, newName)
- return {"success": True, "folderId": folderId, "name": newName}
- except ValueError as e:
- raise HTTPException(status_code=400, detail=str(e))
- except Exception as e:
- logger.error(f"Error renaming folder: {e}")
- raise HTTPException(status_code=500, detail=str(e))
-
-
-@router.delete("/folders/{folderId}")
-@limiter.limit("10/minute")
-def delete_folder(
- request: Request,
- folderId: str = Path(...),
- recursive: bool = Query(False, description="Delete folder contents recursively"),
- currentUser: User = Depends(getCurrentUser),
- context: RequestContext = Depends(getRequestContext)
-) -> Dict[str, Any]:
- """Delete a folder. Use recursive=true to delete non-empty folders."""
- try:
- mgmt = interfaceDbManagement.getInterface(
- currentUser,
- mandateId=str(context.mandateId) if context.mandateId else None,
- featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
- )
- return mgmt.deleteFolder(folderId, recursive=recursive)
- except ValueError as e:
- raise HTTPException(status_code=400, detail=str(e))
- except Exception as e:
- logger.error(f"Error deleting folder: {e}")
- raise HTTPException(status_code=500, detail=str(e))
-
-
-@router.post("/folders/{folderId}/move")
-@limiter.limit("10/minute")
-def move_folder(
- request: Request,
- folderId: str = Path(...),
- body: Dict[str, Any] = Body(...),
- currentUser: User = Depends(getCurrentUser),
- context: RequestContext = Depends(getRequestContext)
-) -> Dict[str, Any]:
- """Move a folder to a new parent."""
- targetParentId = body.get("targetParentId")
- try:
- mgmt = interfaceDbManagement.getInterface(
- currentUser,
- mandateId=str(context.mandateId) if context.mandateId else None,
- featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
- )
- mgmt.moveFolder(folderId, targetParentId)
- return {"success": True, "folderId": folderId, "parentId": targetParentId}
- except ValueError as e:
- raise HTTPException(status_code=400, detail=str(e))
- except Exception as e:
- logger.error(f"Error moving folder: {e}")
- raise HTTPException(status_code=500, detail=str(e))
-
-
-@router.patch("/folders/{folderId}/scope")
-@limiter.limit("10/minute")
-def _updateFolderScope(
- request: Request,
- folderId: str = Path(..., description="ID of the folder"),
- scope: str = Body(..., embed=True),
- context: RequestContext = Depends(getRequestContext),
-) -> Dict[str, Any]:
- """Update the scope of a folder. Propagates to all files inside (recursively). Global scope requires sysAdmin."""
- validScopes = {"personal", "featureInstance", "mandate", "global"}
- if scope not in validScopes:
- raise HTTPException(status_code=400, detail=f"Invalid scope: {scope}. Must be one of {validScopes}")
- if scope == "global" and not context.isSysAdmin:
- raise HTTPException(status_code=403, detail=routeApiMsg("Only sysadmins can set global scope"))
- try:
- mgmt = interfaceDbManagement.getInterface(
- context.user,
- mandateId=str(context.mandateId) if context.mandateId else None,
- featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
- )
- folder = mgmt.getFolder(folderId)
- if not folder:
- raise HTTPException(status_code=404, detail=routeApiMsg("Folder not found"))
- mgmt.updateFolder(folderId, {"scope": scope})
- fileIds = _collectFolderFileIds(mgmt, folderId)
- for fid in fileIds:
- try:
- mgmt.updateFile(fid, {"scope": scope})
- except Exception as e:
- logger.error("Folder scope propagation: failed to update file %s: %s", fid, e)
- logger.info("Updated scope=%s for folder %s: %d files affected", scope, folderId, len(fileIds))
- return {"folderId": folderId, "scope": scope, "filesUpdated": len(fileIds)}
- except HTTPException:
- raise
- except Exception as e:
- logger.error(f"Error updating folder scope: {e}")
- raise HTTPException(status_code=500, detail=str(e))
-
-
-@router.patch("/folders/{folderId}/neutralize")
-@limiter.limit("10/minute")
-def updateFolderNeutralize(
- request: Request,
- background_tasks: BackgroundTasks,
- folderId: str = Path(..., description="ID of the folder"),
- neutralize: bool = Body(..., embed=True),
- context: RequestContext = Depends(getRequestContext),
-) -> Dict[str, Any]:
- """Toggle neutralization on a folder. Propagates to all files inside (recursively).
-
- When turning ON: all files in the folder get ``neutralize=True``, their
- knowledge indexes are purged synchronously, and background re-indexing
- is triggered.
- When turning OFF: files revert to ``neutralize=False`` unless they were
- individually marked (not implemented yet -- all are reverted).
- """
- try:
- mgmt = interfaceDbManagement.getInterface(
- context.user,
- mandateId=str(context.mandateId) if context.mandateId else None,
- featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
- )
-
- folder = mgmt.getFolder(folderId)
- if not folder:
- raise HTTPException(status_code=404, detail=routeApiMsg("Folder not found"))
-
- mgmt.updateFolder(folderId, {"neutralize": neutralize})
-
- fileIds = _collectFolderFileIds(mgmt, folderId)
- logger.info("Folder neutralize toggle %s for folder %s: %d files affected", neutralize, folderId, len(fileIds))
-
- from modules.interfaces.interfaceDbKnowledge import getInterface as getKnowledgeInterface
- knowledgeDb = getKnowledgeInterface()
-
- for fid in fileIds:
- try:
- mgmt.updateFile(fid, {"neutralize": neutralize})
- if neutralize:
- try:
- knowledgeDb.deleteFileContentIndex(fid)
- except Exception as e:
- logger.warning("Folder neutralize: failed to purge index for file %s: %s", fid, e)
- else:
- try:
- from modules.datamodels.datamodelKnowledge import FileContentIndex
- indices = knowledgeDb.db.getRecordset(FileContentIndex, recordFilter={"id": fid})
- for idx in indices:
- idxId = idx.get("id") if isinstance(idx, dict) else getattr(idx, "id", None)
- if idxId:
- knowledgeDb.db.recordModify(FileContentIndex, idxId, {
- "neutralizationStatus": "original",
- "isNeutralized": False,
- })
- except Exception as e:
- logger.warning("Folder neutralize OFF: metadata update failed for %s: %s", fid, e)
- except Exception as e:
- logger.error("Folder neutralize: failed to update file %s: %s", fid, e)
-
- for fid in fileIds:
- fileMeta = mgmt.getFile(fid)
- if fileMeta:
- fn = fileMeta.fileName if hasattr(fileMeta, "fileName") else fileMeta.get("fileName", "")
- mt = fileMeta.mimeType if hasattr(fileMeta, "mimeType") else fileMeta.get("mimeType", "")
-
- async def _reindex(fileId=fid, fileName=fn, mimeType=mt):
- try:
- await _autoIndexFile(fileId=fileId, fileName=fileName, mimeType=mimeType, user=context.user)
- except Exception as ex:
- logger.error("Folder neutralize re-index failed for %s: %s", fileId, ex)
-
- background_tasks.add_task(_reindex)
-
- return {"folderId": folderId, "neutralize": neutralize, "filesUpdated": len(fileIds)}
- except HTTPException:
- raise
- except Exception as e:
- logger.error(f"Error updating folder neutralize flag: {e}")
- raise HTTPException(status_code=500, detail=str(e))
-
-
-def _collectFolderFileIds(mgmt, folderId: str) -> List[str]:
- """Recursively collect all file IDs in a folder and its sub-folders."""
- fileIds = []
- try:
- files = mgmt.listFiles(folderId=folderId)
- if isinstance(files, dict):
- files = files.get("files", [])
- for f in (files or []):
- fid = f.get("id") if isinstance(f, dict) else getattr(f, "id", None)
- if fid:
- fileIds.append(fid)
- except Exception as e:
- logger.warning("_collectFolderFileIds: listFiles failed for folder %s: %s", folderId, e)
-
- try:
- subFolders = mgmt.listFolders(parentId=folderId)
- for sf in (subFolders or []):
- sfId = sf.get("id") if isinstance(sf, dict) else getattr(sf, "id", None)
- if sfId:
- fileIds.extend(_collectFolderFileIds(mgmt, sfId))
- except Exception as e:
- logger.warning("_collectFolderFileIds: listFolders failed for folder %s: %s", folderId, e)
-
- return fileIds
-
-
-@router.get("/folders/{folderId}/download")
-@limiter.limit("10/minute")
-def download_folder(
- request: Request,
- folderId: str = Path(..., description="ID of the folder to download as ZIP"),
- currentUser: User = Depends(getCurrentUser),
- context: RequestContext = Depends(getRequestContext)
-) -> Response:
- """Download a folder (including subfolders) as a ZIP archive."""
- import io
- import zipfile
- import urllib.parse
-
- try:
- mgmt = interfaceDbManagement.getInterface(
- currentUser,
- mandateId=str(context.mandateId) if context.mandateId else None,
- featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
- )
-
- folder = mgmt.getFolder(folderId)
- if not folder:
- raise HTTPException(status_code=404, detail=f"Folder {folderId} not found")
-
- folderName = folder.get("name", "download")
-
- def _collectFiles(parentId: str, pathPrefix: str):
- """Recursively collect (zipPath, fileId) tuples."""
- entries = []
- for f in mgmt._getFilesByCurrentUser(recordFilter={"folderId": parentId}):
- fname = f.get("fileName") or f.get("name") or f.get("id", "file")
- entries.append((f"{pathPrefix}{fname}", f["id"]))
- for sub in mgmt.listFolders(parentId=parentId):
- subName = sub.get("name", sub["id"])
- entries.extend(_collectFiles(sub["id"], f"{pathPrefix}{subName}/"))
- return entries
-
- fileEntries = _collectFiles(folderId, "")
- if not fileEntries:
- raise HTTPException(status_code=404, detail=routeApiMsg("Folder is empty"))
-
- buf = io.BytesIO()
- with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf:
- for zipPath, fileId in fileEntries:
- data = mgmt.getFileData(fileId)
- if data:
- zf.writestr(zipPath, data)
-
- buf.seek(0)
- zipBytes = buf.getvalue()
- encodedName = urllib.parse.quote(f"{folderName}.zip")
-
- return Response(
- content=zipBytes,
- media_type="application/zip",
- headers={
- "Content-Disposition": f"attachment; filename*=UTF-8''{encodedName}"
- }
- )
- except HTTPException:
- raise
- except Exception as e:
- logger.error(f"Error downloading folder as ZIP: {e}")
- raise HTTPException(status_code=500, detail=f"Error downloading folder: {str(e)}")
@router.post("/batch-delete")
@@ -798,13 +712,11 @@ def batch_delete_items(
currentUser: User = Depends(getCurrentUser),
context: RequestContext = Depends(getRequestContext)
) -> Dict[str, Any]:
- """Batch delete files/folders with a single SQL-backed operation per type."""
+ """Batch delete files."""
fileIds = body.get("fileIds") or []
- folderIds = body.get("folderIds") or []
- recursiveFolders = bool(body.get("recursiveFolders", True))
- if not isinstance(fileIds, list) or not isinstance(folderIds, list):
- raise HTTPException(status_code=400, detail=routeApiMsg("fileIds and folderIds must be arrays"))
+ if not isinstance(fileIds, list):
+ raise HTTPException(status_code=400, detail=routeApiMsg("fileIds must be an array"))
try:
mgmt = interfaceDbManagement.getInterface(
@@ -813,17 +725,12 @@ def batch_delete_items(
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
)
- result = {"deletedFiles": 0, "deletedFolders": 0}
+ result = {"deletedFiles": 0}
if fileIds:
fileResult = mgmt.deleteFilesBatch(fileIds)
result["deletedFiles"] += fileResult.get("deletedFiles", 0)
- if folderIds:
- folderResult = mgmt.deleteFoldersBatch(folderIds, recursive=recursiveFolders)
- result["deletedFiles"] += folderResult.get("deletedFiles", 0)
- result["deletedFolders"] += folderResult.get("deletedFolders", 0)
-
return result
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
@@ -832,22 +739,23 @@ def batch_delete_items(
raise HTTPException(status_code=500, detail=str(e))
-@router.post("/batch-move")
+@router.post("/batch-download")
@limiter.limit("10/minute")
-def batch_move_items(
+def batchDownload(
request: Request,
body: Dict[str, Any] = Body(...),
currentUser: User = Depends(getCurrentUser),
- context: RequestContext = Depends(getRequestContext)
-) -> Dict[str, Any]:
- """Batch move files/folders with a single SQL-backed operation per type."""
+ context: RequestContext = Depends(getRequestContext),
+):
+ """Download multiple files and/or folders as a single ZIP archive,
+ preserving the folder hierarchy as ZIP paths."""
+ import io, zipfile
+
fileIds = body.get("fileIds") or []
folderIds = body.get("folderIds") or []
- targetFolderId = body.get("targetFolderId")
- targetParentId = body.get("targetParentId")
- if not isinstance(fileIds, list) or not isinstance(folderIds, list):
- raise HTTPException(status_code=400, detail=routeApiMsg("fileIds and folderIds must be arrays"))
+ if not fileIds and not folderIds:
+ raise HTTPException(status_code=400, detail="fileIds or folderIds required")
try:
mgmt = interfaceDbManagement.getInterface(
@@ -856,21 +764,268 @@ def batch_move_items(
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
)
- result = {"movedFiles": 0, "movedFolders": 0}
+ folderCache: dict[str, dict] = {}
- if fileIds:
- fileResult = mgmt.moveFilesBatch(fileIds, targetFolderId=targetFolderId)
- result["movedFiles"] += fileResult.get("movedFiles", 0)
+ def _getFolder(fid: str):
+ if fid not in folderCache:
+ f = mgmt.getFolder(fid)
+ folderCache[fid] = f if f else {}
+ return folderCache[fid]
- if folderIds:
- folderResult = mgmt.moveFoldersBatch(folderIds, targetParentId=targetParentId)
- result["movedFolders"] += folderResult.get("movedFolders", 0)
+ def _folderPath(fid: str) -> str:
+ """Build the full path for a folder by walking up parentId."""
+ parts: list[str] = []
+ current = fid
+ visited: set[str] = set()
+ while current and current not in visited:
+ visited.add(current)
+ folder = _getFolder(current)
+ if not folder:
+ break
+ parts.append(folder.get("name", current))
+ current = folder.get("parentId")
+ parts.reverse()
+ return "/".join(parts)
- return result
- except ValueError as e:
- raise HTTPException(status_code=400, detail=str(e))
+ # Collect files from requested folders (recursive)
+ fileEntries: list[tuple[str, str]] = []
+ seenFileIds: set[str] = set()
+
+ for fid in folderIds:
+ childFolderIds = mgmt._collectChildFolderIds(fid)
+ for cfid in childFolderIds:
+ prefix = _folderPath(cfid)
+ items = mgmt.db.getRecordset(FileItem, recordFilter={"folderId": cfid})
+ for item in items:
+ itemId = item.get("id") if isinstance(item, dict) else getattr(item, "id", None)
+ if itemId and itemId not in seenFileIds:
+ seenFileIds.add(itemId)
+ fileEntries.append((itemId, prefix))
+
+ # Loose files (not via folder selection)
+ for fid in fileIds:
+ if fid in seenFileIds:
+ continue
+ seenFileIds.add(fid)
+ fileMeta = mgmt.getFile(fid)
+ if not fileMeta:
+ continue
+ fileFolderId = fileMeta.get("folderId") if isinstance(fileMeta, dict) else getattr(fileMeta, "folderId", None)
+ prefix = _folderPath(fileFolderId) if fileFolderId else ""
+ fileEntries.append((fid, prefix))
+
+ if not fileEntries:
+ raise HTTPException(status_code=404, detail="No downloadable files found")
+
+ buf = io.BytesIO()
+ with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf:
+ for fid, prefix in fileEntries:
+ try:
+ fileMeta = mgmt.getFile(fid)
+ fileData = mgmt.getFileData(fid)
+ if fileMeta and fileData:
+ name = (fileMeta.get("fileName") if isinstance(fileMeta, dict) else getattr(fileMeta, "fileName", fid)) or fid
+ zipPath = f"{prefix}/{name}" if prefix else name
+ zf.writestr(zipPath, fileData)
+ except Exception as fe:
+ logger.warning(f"batch_download: skipping file {fid}: {fe}")
+ buf.seek(0)
+ from fastapi.responses import StreamingResponse
+ return StreamingResponse(
+ buf,
+ media_type="application/zip",
+ headers={"Content-Disposition": 'attachment; filename="download.zip"'},
+ )
+ except HTTPException:
+ raise
except Exception as e:
- logger.error(f"Error in batch move: {e}")
+ logger.error(f"batch_download error: {e}")
+ raise HTTPException(status_code=500, detail=str(e))
+
+
+# ── Group bulk endpoints ──────────────────────────────────────────────────────
+
+def _get_group_item_ids(contextKey: str, groupId: str, appInterface) -> set:
+    """Return the item IDs stored for ``groupId`` in the persisted group tree.
+
+    The grouping saved under ``contextKey`` is loaded through ``appInterface``
+    and handed to ``_collectItemIds``, which does the actual lookup (including
+    sub-groups, per the helper's contract). An empty set is returned when no
+    grouping exists, when the helper yields nothing, or when any error occurs
+    (the error is logged).
+    """
+    from modules.routes.routeHelpers import _collectItemIds
+    try:
+        grouping = appInterface.getTableGrouping(contextKey)
+        if not grouping:
+            return set()
+        rootNodes = []
+        for root in grouping.rootGroups:
+            rootNodes.append(root.model_dump() if hasattr(root, 'model_dump') else root)
+        collected = _collectItemIds(rootNodes, groupId)
+        if collected:
+            return collected
+        return set()
+    except Exception as e:
+        logger.error(f"_get_group_item_ids failed for groupId={groupId}: {e}")
+        return set()
+
+
+@router.patch("/groups/{groupId}/scope")
+@limiter.limit("60/minute")
+def patch_group_scope(
+    request: Request,
+    groupId: str = Path(..., description="Group ID"),
+    body: dict = Body(...),
+    currentUser: User = Depends(getCurrentUser),
+    context: RequestContext = Depends(getRequestContext),
+):
+    """Set the scope of every file that belongs to a group.
+
+    The file IDs come from ``_get_group_item_ids`` for the ``files/list``
+    grouping; each file is updated individually and failures are logged
+    without aborting the remaining updates.
+
+    Returns:
+        ``{"groupId", "scope", "filesUpdated"}`` where ``filesUpdated`` counts
+        the files whose update did not raise.
+
+    Raises:
+        HTTPException: 400 when ``scope`` is missing or not a known scope,
+            403 when a non-sysadmin requests ``global``, 500 on unexpected
+            errors.
+    """
+    scope = body.get("scope")
+    if not scope:
+        raise HTTPException(status_code=400, detail="scope is required")
+    # Same checks the per-folder scope endpoint enforced before it was
+    # replaced by groups. A tuple is used so that an unhashable body value
+    # yields a clean 400 instead of a TypeError.
+    validScopes = ("personal", "featureInstance", "mandate", "global")
+    if scope not in validScopes:
+        raise HTTPException(status_code=400, detail=f"Invalid scope: {scope}. Must be one of {set(validScopes)}")
+    # Fail closed if the context does not expose the sysadmin flag.
+    if scope == "global" and not getattr(context, "isSysAdmin", False):
+        raise HTTPException(status_code=403, detail=routeApiMsg("Only sysadmins can set global scope"))
+    try:
+        import modules.interfaces.interfaceDbApp as _appIface
+        managementInterface = interfaceDbManagement.getInterface(
+            currentUser,
+            mandateId=str(context.mandateId) if context.mandateId else None,
+            featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
+        )
+        appInterface = _appIface.getInterface(currentUser)
+        fileIds = _get_group_item_ids("files/list", groupId, appInterface)
+        updated = 0
+        for fid in fileIds:
+            try:
+                managementInterface.updateFile(fid, {"scope": scope})
+                updated += 1
+            except Exception as e:
+                logger.error(f"patch_group_scope: failed to update file {fid}: {e}")
+        return {"groupId": groupId, "scope": scope, "filesUpdated": updated}
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"patch_group_scope error: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.patch("/groups/{groupId}/neutralize")
+@limiter.limit("60/minute")
+def patch_group_neutralize(
+    request: Request,
+    groupId: str = Path(..., description="Group ID"),
+    body: dict = Body(...),
+    currentUser: User = Depends(getCurrentUser),
+    context: RequestContext = Depends(getRequestContext),
+):
+    """Toggle the neutralize flag for every file that belongs to a group.
+
+    The file IDs come from ``_get_group_item_ids`` for the ``files/list``
+    grouping. Each file is updated individually; when the flag is switched
+    off, ``purgeFileKnowledge`` is additionally called for the file. No
+    re-index is triggered here.
+
+    Returns:
+        ``{"groupId", "neutralize", "filesUpdated"}`` where ``filesUpdated``
+        counts the files whose flag update did not raise.
+
+    Raises:
+        HTTPException: 400 when ``neutralize`` is missing or is a string that
+            cannot be read as a boolean, 500 on unexpected errors.
+    """
+    neutralize = body.get("neutralize")
+    if neutralize is None:
+        raise HTTPException(status_code=400, detail="neutralize is required")
+    if isinstance(neutralize, str):
+        # A non-empty string such as "false" is truthy; parse it explicitly.
+        normalized = neutralize.strip().lower()
+        if normalized in ("true", "1", "yes", "on"):
+            neutralize = True
+        elif normalized in ("false", "0", "no", "off", ""):
+            neutralize = False
+        else:
+            raise HTTPException(status_code=400, detail="neutralize must be a boolean")
+    neutralize = bool(neutralize)
+    try:
+        import modules.interfaces.interfaceDbApp as _appIface
+        managementInterface = interfaceDbManagement.getInterface(
+            currentUser,
+            mandateId=str(context.mandateId) if context.mandateId else None,
+            featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
+        )
+        appInterface = _appIface.getInterface(currentUser)
+        fileIds = _get_group_item_ids("files/list", groupId, appInterface)
+        # NOTE(review): the purge runs when neutralize is turned OFF. The
+        # per-folder endpoint this replaces purged the index when it was
+        # turned ON - confirm the direction is intended.
+        kIface = None
+        if not neutralize:
+            # Resolve the knowledge interface once instead of once per file.
+            try:
+                from modules.interfaces import interfaceDbKnowledge
+                kIface = interfaceDbKnowledge.getInterface(currentUser)
+            except Exception as ke:
+                logger.warning(f"patch_group_neutralize: knowledge interface unavailable, purge skipped: {ke}")
+        updated = 0
+        for fid in fileIds:
+            try:
+                managementInterface.updateFile(fid, {"neutralize": neutralize})
+                if kIface is not None:
+                    try:
+                        kIface.purgeFileKnowledge(fid)
+                    except Exception as ke:
+                        logger.warning(f"patch_group_neutralize: knowledge purge failed for {fid}: {ke}")
+                updated += 1
+            except Exception as e:
+                logger.error(f"patch_group_neutralize: failed for file {fid}: {e}")
+        return {"groupId": groupId, "neutralize": neutralize, "filesUpdated": updated}
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"patch_group_neutralize error: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.get("/groups/{groupId}/download")
+@limiter.limit("20/minute")
+async def download_group_zip(
+    request: Request,
+    groupId: str = Path(..., description="Group ID"),
+    currentUser: User = Depends(getCurrentUser),
+    context: RequestContext = Depends(getRequestContext),
+):
+    """Download all files in a group as a flat ZIP archive.
+
+    The file IDs come from ``_get_group_item_ids`` for the ``files/list``
+    grouping. Files whose metadata or data is falsy, or whose retrieval
+    raises, are skipped with a warning. Entry names are reduced to their base
+    name and made unique within the archive.
+
+    Raises:
+        HTTPException: 404 when the group yields no file IDs, 500 on
+            unexpected errors.
+    """
+    # NOTE(review): this handler is ``async`` but only makes synchronous
+    # calls, unlike the sibling routes which are plain ``def`` - confirm.
+    import io, zipfile
+    import posixpath
+    import re
+    try:
+        import modules.interfaces.interfaceDbApp as _appIface
+        managementInterface = interfaceDbManagement.getInterface(
+            currentUser,
+            mandateId=str(context.mandateId) if context.mandateId else None,
+            featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
+        )
+        appInterface = _appIface.getInterface(currentUser)
+        fileIds = _get_group_item_ids("files/list", groupId, appInterface)
+        if not fileIds:
+            raise HTTPException(status_code=404, detail="Group not found or empty")
+        buf = io.BytesIO()
+        usedNames = set()
+        with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf:
+            for fid in fileIds:
+                try:
+                    fileMeta = managementInterface.getFile(fid)
+                    fileData = managementInterface.getFileData(fid)
+                    if not (fileMeta and fileData):
+                        continue
+                    rawName = (fileMeta.get("fileName") if isinstance(fileMeta, dict) else getattr(fileMeta, "fileName", fid)) or fid
+                    # Drop directory components so an entry can never escape
+                    # the extraction directory ("../x", "/abs/x", "a\\b").
+                    name = posixpath.basename(str(rawName).replace("\\", "/"))
+                    if name in ("", ".", ".."):
+                        name = str(fid)
+                    # Two files with the same name would otherwise produce
+                    # duplicate entries that overwrite each other on extract.
+                    if name in usedNames:
+                        stem, ext = posixpath.splitext(name)
+                        counter = 2
+                        while f"{stem} ({counter}){ext}" in usedNames:
+                            counter += 1
+                        name = f"{stem} ({counter}){ext}"
+                    usedNames.add(name)
+                    zf.writestr(name, fileData)
+                except Exception as fe:
+                    logger.warning(f"download_group_zip: skipping file {fid}: {fe}")
+        buf.seek(0)
+        from fastapi.responses import StreamingResponse
+        # groupId is caller-supplied; keep only characters that are safe
+        # inside a quoted Content-Disposition filename.
+        safeGroupId = re.sub(r"[^A-Za-z0-9._-]", "_", groupId)
+        return StreamingResponse(
+            buf,
+            media_type="application/zip",
+            headers={"Content-Disposition": f'attachment; filename="group-{safeGroupId}.zip"'},
+        )
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"download_group_zip error: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.delete("/groups/{groupId}")
+@limiter.limit("30/minute")
+def delete_group(
+    request: Request,
+    groupId: str = Path(..., description="Group ID"),
+    deleteItems: bool = Query(False, description="If true, also delete all files in the group"),
+    currentUser: User = Depends(getCurrentUser),
+    context: RequestContext = Depends(getRequestContext),
+):
+    """Drop a group from the ``files/list`` group tree.
+
+    The group's file IDs are read first, then the group is removed from the
+    stored tree. With ``deleteItems`` set, each of those files is deleted as
+    well; individual failures are logged and do not stop the loop.
+
+    Returns:
+        ``{"groupId", "deletedFiles"}`` with the number of files deleted.
+    """
+    try:
+        import modules.interfaces.interfaceDbApp as _appIface
+        appInterface = _appIface.getInterface(currentUser)
+        # Capture the members before the group disappears from the tree.
+        memberIds = _get_group_item_ids("files/list", groupId, appInterface)
+        grouping = appInterface.getTableGrouping("files/list")
+        if grouping:
+            from modules.routes.routeHelpers import _removeGroupFromTree
+            rootDicts = [n.model_dump() if hasattr(n, 'model_dump') else n for n in grouping.rootGroups]
+            remainingRoots = _removeGroupFromTree(rootDicts, groupId)
+            appInterface.upsertTableGrouping("files/list", remainingRoots)
+        if not deleteItems:
+            return {"groupId": groupId, "deletedFiles": 0}
+        managementInterface = interfaceDbManagement.getInterface(
+            currentUser,
+            mandateId=str(context.mandateId) if context.mandateId else None,
+            featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
+        )
+        deletedFiles = 0
+        for fid in memberIds:
+            try:
+                managementInterface.deleteFile(fid)
+            except Exception as e:
+                logger.error(f"delete_group: failed to delete file {fid}: {e}")
+            else:
+                deletedFiles += 1
+        return {"groupId": groupId, "deletedFiles": deletedFiles}
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"delete_group error: {e}")
         raise HTTPException(status_code=500, detail=str(e))
@@ -921,7 +1076,11 @@ def updateFileScope(
async def _runReindexAfterScopeChange():
try:
- await _autoIndexFile(fileId=fileId, fileName=fn, mimeType=mt, user=context.user)
+ await _autoIndexFile(
+ fileId=fileId, fileName=fn, mimeType=mt, user=context.user,
+ mandateId=str(context.mandateId) if context.mandateId else None,
+ featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
+ )
except Exception as ex:
logger.warning("Re-index after scope change failed for %s: %s", fileId, ex)
@@ -999,7 +1158,11 @@ def updateFileNeutralize(
async def _runReindexAfterNeutralizeToggle():
try:
- await _autoIndexFile(fileId=fileId, fileName=fn, mimeType=mt, user=context.user)
+ await _autoIndexFile(
+ fileId=fileId, fileName=fn, mimeType=mt, user=context.user,
+ mandateId=str(context.mandateId) if context.mandateId else None,
+ featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
+ )
except Exception as ex:
logger.error("Re-index after neutralize toggle failed for %s: %s (file has NO index until next re-index)", fileId, ex)
@@ -1071,7 +1234,7 @@ def update_file(
) -> FileItem:
"""Update file info"""
try:
- _EDITABLE_FIELDS = {"fileName", "scope", "tags", "description", "folderId", "neutralize"}
+ _EDITABLE_FIELDS = {"fileName", "folderId", "scope", "tags", "description", "neutralize"}
safeData = {k: v for k, v in file_info.items() if k in _EDITABLE_FIELDS}
if not safeData:
raise HTTPException(status_code=400, detail=routeApiMsg("No editable fields provided"))
@@ -1226,37 +1389,3 @@ def preview_file(
)
-@router.post("/{fileId}/move")
-@limiter.limit("10/minute")
-def move_file(
- request: Request,
- fileId: str = Path(...),
- body: Dict[str, Any] = Body(...),
- currentUser: User = Depends(getCurrentUser),
- context: RequestContext = Depends(getRequestContext)
-) -> Dict[str, Any]:
- """Move a file to a different folder."""
- targetFolderId = body.get("targetFolderId")
- try:
- mgmt = interfaceDbManagement.getInterface(
- currentUser,
- mandateId=str(context.mandateId) if context.mandateId else None,
- featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
- )
- mgmt.updateFile(fileId, {"folderId": targetFolderId})
-
- if targetFolderId:
- try:
- targetFolder = mgmt.getFolder(targetFolderId)
- folderNeut = (targetFolder.get("neutralize") if isinstance(targetFolder, dict)
- else getattr(targetFolder, "neutralize", False)) if targetFolder else False
- if folderNeut:
- mgmt.updateFile(fileId, {"neutralize": True})
- logger.info("File %s moved to neutralized folder %s — inherited neutralize=True", fileId, targetFolderId)
- except Exception as e:
- logger.warning("File move: folder neutralize inheritance check failed for %s: %s", fileId, e)
-
- return {"success": True, "fileId": fileId, "folderId": targetFolderId}
- except Exception as e:
- logger.error(f"Error moving file: {e}")
- raise HTTPException(status_code=500, detail=str(e))
diff --git a/modules/routes/routeDataMandates.py b/modules/routes/routeDataMandates.py
index ef058ed9..47eaee02 100644
--- a/modules/routes/routeDataMandates.py
+++ b/modules/routes/routeDataMandates.py
@@ -112,8 +112,8 @@ def get_mandates(
status_code=status.HTTP_403_FORBIDDEN,
detail=routeApiMsg("Admin role required")
)
-
- # Parse pagination parameter
+
+ # Parse pagination parameter early — needed for grouping in all modes
paginationParams = None
if pagination:
try:
@@ -126,14 +126,24 @@ def get_mandates(
status_code=400,
detail=f"Invalid pagination parameter: {str(e)}"
)
-
+
from modules.routes.routeHelpers import (
handleFilterValuesInMemory, handleIdsInMemory,
handleFilterValuesMode, handleIdsMode,
parseCrossFilterPagination,
+ handleGroupingInRequest, applyGroupScopeFilter,
)
appInterface = interfaceDbApp.getRootInterface()
+ groupCtx = handleGroupingInRequest(paginationParams, appInterface, "mandates")
+
+ def _mandateItemsForAdmin():
+ items = []
+ for mid in adminMandateIds:
+ m = appInterface.getMandate(mid)
+ if m and getattr(m, "enabled", True):
+ items.append(m.model_dump() if hasattr(m, 'model_dump') else m if isinstance(m, dict) else vars(m))
+ return items
if mode == "filterValues":
if not column:
@@ -144,54 +154,42 @@ def get_mandates(
values = appInterface.db.getDistinctColumnValues(Mandate, column, crossPagination)
return JSONResponse(content=sorted(values, key=lambda v: str(v).lower()))
else:
- mandateItems = []
- for mid in adminMandateIds:
- m = appInterface.getMandate(mid)
- if m and getattr(m, "enabled", True):
- mandateItems.append(m.model_dump() if hasattr(m, 'model_dump') else m if isinstance(m, dict) else vars(m))
+ mandateItems = applyGroupScopeFilter(_mandateItemsForAdmin(), groupCtx.itemIds)
return handleFilterValuesInMemory(mandateItems, column, pagination)
if mode == "ids":
if isPlatformAdmin:
return handleIdsMode(appInterface.db, Mandate, pagination)
else:
- mandateItems = []
- for mid in adminMandateIds:
- m = appInterface.getMandate(mid)
- if m and getattr(m, "enabled", True):
- mandateItems.append(m.model_dump() if hasattr(m, 'model_dump') else m if isinstance(m, dict) else vars(m))
+ mandateItems = applyGroupScopeFilter(_mandateItemsForAdmin(), groupCtx.itemIds)
return handleIdsInMemory(mandateItems, pagination)
if isPlatformAdmin:
result = appInterface.getAllMandates(pagination=paginationParams)
- else:
- allMandates = []
- for mandateId in adminMandateIds:
- mandate = appInterface.getMandate(mandateId)
- if mandate and getattr(mandate, "enabled", True):
- mandateDict = mandate if isinstance(mandate, dict) else mandate.model_dump() if hasattr(mandate, 'model_dump') else vars(mandate)
- allMandates.append(mandateDict)
- result = allMandates
- paginationParams = None
-
- if paginationParams and hasattr(result, 'items'):
- return PaginatedResponse(
- items=result.items,
- pagination=PaginationMetadata(
- currentPage=paginationParams.page,
- pageSize=paginationParams.pageSize,
- totalItems=result.totalItems,
- totalPages=result.totalPages,
- sort=paginationParams.sort,
- filters=paginationParams.filters
+ items = result.items if hasattr(result, 'items') else (result if isinstance(result, list) else [])
+ items = applyGroupScopeFilter(
+ [i.model_dump() if hasattr(i, 'model_dump') else (i if isinstance(i, dict) else vars(i)) for i in items],
+ groupCtx.itemIds,
+ )
+ if paginationParams and hasattr(result, 'items'):
+ return PaginatedResponse(
+ items=items,
+ pagination=PaginationMetadata(
+ currentPage=paginationParams.page,
+ pageSize=paginationParams.pageSize,
+ totalItems=result.totalItems,
+ totalPages=result.totalPages,
+ sort=paginationParams.sort,
+ filters=paginationParams.filters
+ ),
+ groupTree=groupCtx.groupTree,
)
- )
+ else:
+ return PaginatedResponse(items=items, pagination=None, groupTree=groupCtx.groupTree)
else:
- items = result if isinstance(result, list) else (result.items if hasattr(result, 'items') else result)
- return PaginatedResponse(
- items=items,
- pagination=None
- )
+ mandateItems = applyGroupScopeFilter(_mandateItemsForAdmin(), groupCtx.itemIds)
+ return PaginatedResponse(items=mandateItems, pagination=None, groupTree=groupCtx.groupTree)
+
except HTTPException:
raise
except Exception as e:
diff --git a/modules/routes/routeDataPrompts.py b/modules/routes/routeDataPrompts.py
index ee99b912..84559ebb 100644
--- a/modules/routes/routeDataPrompts.py
+++ b/modules/routes/routeDataPrompts.py
@@ -44,27 +44,15 @@ def get_prompts(
- filterValues: distinct values for a column (cross-filtered)
- ids: all IDs matching current filters
"""
- from modules.routes.routeHelpers import handleFilterValuesInMemory, handleIdsInMemory, enrichRowsWithFkLabels
+ from modules.routes.routeHelpers import (
+ handleFilterValuesInMemory, handleIdsInMemory, enrichRowsWithFkLabels,
+ handleGroupingInRequest, applyGroupScopeFilter,
+ )
+ from modules.interfaces.interfaceDbApp import getInterface as getAppInterface
- def _promptsToEnrichedDicts(promptItems):
- dicts = [r.model_dump() if hasattr(r, 'model_dump') else (dict(r) if not isinstance(r, dict) else r) for r in promptItems]
- enrichRowsWithFkLabels(dicts, Prompt)
- return dicts
-
- if mode == "filterValues":
- if not column:
- raise HTTPException(status_code=400, detail="column parameter required for mode=filterValues")
- managementInterface = interfaceDbManagement.getInterface(currentUser)
- result = managementInterface.getAllPrompts(pagination=None)
- items = _promptsToEnrichedDicts(result)
- return handleFilterValuesInMemory(items, column, pagination)
-
- if mode == "ids":
- managementInterface = interfaceDbManagement.getInterface(currentUser)
- result = managementInterface.getAllPrompts(pagination=None)
- items = _promptsToEnrichedDicts(result)
- return handleIdsInMemory(items, pagination)
+ CONTEXT_KEY = "prompts"
+ # Parse pagination params early — needed for grouping in all modes
paginationParams = None
if pagination:
try:
@@ -74,12 +62,35 @@ def get_prompts(
paginationParams = PaginationParams(**paginationDict)
except (json.JSONDecodeError, ValueError) as e:
raise HTTPException(status_code=400, detail=f"Invalid pagination parameter: {str(e)}")
-
+
+ appInterface = getAppInterface(currentUser)
+ groupCtx = handleGroupingInRequest(paginationParams, appInterface, CONTEXT_KEY)
+
+ def _promptsToEnrichedDicts(promptItems):
+ dicts = [r.model_dump() if hasattr(r, 'model_dump') else (dict(r) if not isinstance(r, dict) else r) for r in promptItems]
+ enrichRowsWithFkLabels(dicts, Prompt)
+ return dicts
+
managementInterface = interfaceDbManagement.getInterface(currentUser)
+
+ if mode == "filterValues":
+ if not column:
+ raise HTTPException(status_code=400, detail="column parameter required for mode=filterValues")
+ result = managementInterface.getAllPrompts(pagination=None)
+ items = _promptsToEnrichedDicts(result)
+ items = applyGroupScopeFilter(items, groupCtx.itemIds)
+ return handleFilterValuesInMemory(items, column, pagination)
+
+ if mode == "ids":
+ result = managementInterface.getAllPrompts(pagination=None)
+ items = _promptsToEnrichedDicts(result)
+ items = applyGroupScopeFilter(items, groupCtx.itemIds)
+ return handleIdsInMemory(items, pagination)
+
result = managementInterface.getAllPrompts(pagination=paginationParams)
-
+
if paginationParams:
- items = _promptsToEnrichedDicts(result.items)
+ items = applyGroupScopeFilter(_promptsToEnrichedDicts(result.items), groupCtx.itemIds)
return {
"items": items,
"pagination": PaginationMetadata(
@@ -90,12 +101,14 @@ def get_prompts(
sort=paginationParams.sort,
filters=paginationParams.filters
).model_dump(),
+ "groupTree": groupCtx.groupTree,
}
else:
- items = _promptsToEnrichedDicts(result)
+ items = applyGroupScopeFilter(_promptsToEnrichedDicts(result), groupCtx.itemIds)
return {
"items": items,
"pagination": None,
+ "groupTree": groupCtx.groupTree,
}
diff --git a/modules/routes/routeDataUsers.py b/modules/routes/routeDataUsers.py
index 6d72b763..25d20c39 100644
--- a/modules/routes/routeDataUsers.py
+++ b/modules/routes/routeDataUsers.py
@@ -208,6 +208,21 @@ def get_users(
- GET /api/users/ (no pagination - returns all users in mandate)
- GET /api/users/?pagination={"page":1,"pageSize":10,"sort":[]}
"""
+ # Parse pagination early — needed for grouping in all modes
+ _paginationParams = None
+ if pagination:
+ try:
+ _pd = json.loads(pagination)
+ if _pd:
+ _pd = normalize_pagination_dict(_pd)
+ _paginationParams = PaginationParams(**_pd)
+ except (json.JSONDecodeError, ValueError) as e:
+ raise HTTPException(status_code=400, detail=f"Invalid pagination parameter: {str(e)}")
+
+ from modules.routes.routeHelpers import handleGroupingInRequest as _handleGrouping, applyGroupScopeFilter as _applyGroupScope
+ _appInterfaceForGrouping = interfaceDbApp.getInterface(context.user, mandateId=context.mandateId)
+ _groupCtx = _handleGrouping(_paginationParams, _appInterfaceForGrouping, "users")
+
if mode == "filterValues":
if not column:
raise HTTPException(status_code=400, detail="column parameter required for mode=filterValues")
@@ -217,27 +232,15 @@ def get_users(
return _getUserFilterOrIds(context, pagination, idsMode=True)
try:
- paginationParams = None
- if pagination:
- try:
- paginationDict = json.loads(pagination)
- if paginationDict:
- paginationDict = normalize_pagination_dict(paginationDict)
- paginationParams = PaginationParams(**paginationDict)
- except (json.JSONDecodeError, ValueError) as e:
- raise HTTPException(
- status_code=400,
- detail=f"Invalid pagination parameter: {str(e)}"
- )
-
- appInterface = interfaceDbApp.getInterface(context.user, mandateId=context.mandateId)
+ paginationParams = _paginationParams
+ appInterface = _appInterfaceForGrouping
if context.mandateId:
# Get users for specific mandate using getUsersByMandate
result = appInterface.getUsersByMandate(str(context.mandateId), paginationParams)
-
+
if paginationParams and hasattr(result, 'items'):
- enriched = enrichRowsWithFkLabels(_usersToDicts(result.items), User)
+ enriched = _applyGroupScope(enrichRowsWithFkLabels(_usersToDicts(result.items), User), _groupCtx.itemIds)
return {
"items": enriched,
"pagination": PaginationMetadata(
@@ -248,17 +251,18 @@ def get_users(
sort=paginationParams.sort,
filters=paginationParams.filters
).model_dump(),
+ "groupTree": _groupCtx.groupTree,
}
else:
users = result if isinstance(result, list) else result.items if hasattr(result, 'items') else []
- enriched = enrichRowsWithFkLabels(_usersToDicts(users), User)
- return {"items": enriched, "pagination": None}
+ enriched = _applyGroupScope(enrichRowsWithFkLabels(_usersToDicts(users), User), _groupCtx.itemIds)
+ return {"items": enriched, "pagination": None, "groupTree": _groupCtx.groupTree}
elif context.isPlatformAdmin:
# PlatformAdmin without mandateId — DB-level pagination via interface
result = appInterface.getAllUsers(paginationParams)
-
+
if paginationParams and hasattr(result, 'items'):
- enriched = enrichRowsWithFkLabels(_usersToDicts(result.items), User)
+ enriched = _applyGroupScope(enrichRowsWithFkLabels(_usersToDicts(result.items), User), _groupCtx.itemIds)
return {
"items": enriched,
"pagination": PaginationMetadata(
@@ -269,11 +273,12 @@ def get_users(
sort=paginationParams.sort,
filters=paginationParams.filters
).model_dump(),
+ "groupTree": _groupCtx.groupTree,
}
else:
users = result if isinstance(result, list) else (result.items if hasattr(result, 'items') else [])
- enriched = enrichRowsWithFkLabels(_usersToDicts(users), User)
- return {"items": enriched, "pagination": None}
+ enriched = _applyGroupScope(enrichRowsWithFkLabels(_usersToDicts(users), User), _groupCtx.itemIds)
+ return {"items": enriched, "pagination": None, "groupTree": _groupCtx.groupTree}
else:
# Non-SysAdmin without mandateId: aggregate users across all admin mandates
rootInterface = getRootInterface()
@@ -313,16 +318,16 @@ def get_users(
]
from modules.routes.routeHelpers import applyFiltersAndSort as _applyFiltersAndSortHelper
- filteredUsers = _applyFiltersAndSortHelper(allUsers, paginationParams)
+ filteredUsers = _applyGroupScope(_applyFiltersAndSortHelper(allUsers, paginationParams), _groupCtx.itemIds)
enriched = enrichRowsWithFkLabels(filteredUsers, User)
-
+
if paginationParams:
import math
totalItems = len(enriched)
totalPages = math.ceil(totalItems / paginationParams.pageSize) if totalItems > 0 else 0
startIdx = (paginationParams.page - 1) * paginationParams.pageSize
endIdx = startIdx + paginationParams.pageSize
-
+
return {
"items": enriched[startIdx:endIdx],
"pagination": PaginationMetadata(
@@ -333,9 +338,10 @@ def get_users(
sort=paginationParams.sort,
filters=paginationParams.filters
).model_dump(),
+ "groupTree": _groupCtx.groupTree,
}
else:
- return {"items": enriched, "pagination": None}
+ return {"items": enriched, "pagination": None, "groupTree": _groupCtx.groupTree}
except HTTPException:
raise
except Exception as e:
diff --git a/modules/routes/routeHelpers.py b/modules/routes/routeHelpers.py
index 37bfa3b2..9e8644ca 100644
--- a/modules/routes/routeHelpers.py
+++ b/modules/routes/routeHelpers.py
@@ -701,3 +701,157 @@ def paginateInMemory(
offset = (paginationParams.page - 1) * paginationParams.pageSize
pageItems = items[offset:offset + paginationParams.pageSize]
return pageItems, totalItems
+
+
+# ---------------------------------------------------------------------------
+# Table Grouping helpers
+# ---------------------------------------------------------------------------
+
+from dataclasses import dataclass, field as dc_field
+
+
+@dataclass
+class GroupingContext:
+ """
+ Result of handleGroupingInRequest.
+ Carries the group tree for the response and the resolved item-ID set for
+ group-scope filtering (None = no active group scope).
+ """
+ groupTree: Optional[list] # List[TableGroupNode] serialised as dicts — for response
+ itemIds: Optional[set] # Set[str] when groupId was set, else None
+
+
+def _collectItemIds(nodes: list, groupId: str) -> Optional[set]:
+ """
+ Recursively search *nodes* for a node whose id == groupId and collect
+ all itemIds from it and all its descendant subGroups.
+ Returns None if the group is not found.
+ """
+ for node in nodes:
+ nodeId = node.get("id") if isinstance(node, dict) else getattr(node, "id", None)
+ if nodeId == groupId:
+ ids: set = set()
+ _collectAllIds(node, ids)
+ return ids
+ subGroups = node.get("subGroups", []) if isinstance(node, dict) else getattr(node, "subGroups", [])
+ result = _collectItemIds(subGroups, groupId)
+ if result is not None:
+ return result
+ return None
+
+
+def _collectAllIds(node, ids: set) -> None:
+ """Collect itemIds from a node and all its descendants into ids."""
+ nodeItemIds = node.get("itemIds", []) if isinstance(node, dict) else getattr(node, "itemIds", [])
+ for iid in nodeItemIds:
+ ids.add(str(iid))
+ subGroups = node.get("subGroups", []) if isinstance(node, dict) else getattr(node, "subGroups", [])
+ for child in subGroups:
+ _collectAllIds(child, ids)
+
+
+ def _removeGroupFromTree(nodes: list, groupId: str) -> list:
+ """Remove a group node (and all descendants) from the tree by id. Nested matches are pruned only beneath dict nodes; non-dict nodes keep their original subGroups."""
+ result = []
+ for node in nodes:
+ nodeId = node.get("id") if isinstance(node, dict) else getattr(node, "id", None)
+ if nodeId == groupId:
+ continue # skip this node (remove it)
+ subGroups = node.get("subGroups", []) if isinstance(node, dict) else getattr(node, "subGroups", [])
+ filtered_sub = _removeGroupFromTree(subGroups, groupId)
+ if isinstance(node, dict):
+ node = {**node, "subGroups": filtered_sub}
+ result.append(node)
+ return result
+
+
+def handleGroupingInRequest(
+ paginationParams: Optional[PaginationParams],
+ interface,
+ contextKey: str,
+) -> GroupingContext:
+ """
+ Central grouping handler — call at the start of every list route that
+ supports table grouping.
+
+ Steps (in order):
+ 1. If paginationParams.saveGroupTree is set:
+ persist the new tree via interface.upsertTableGrouping, then clear
+ saveGroupTree from paginationParams so it is not treated as a filter.
+ 2. Load the current group tree from the DB, unless step 1 already returned it (used in step 3 and response).
+ 3. If paginationParams.groupId is set:
+ resolve it to a Set[str] of itemIds (including all sub-groups),
+ then clear groupId from paginationParams so it is not treated as a
+ normal filter field.
+ 4. Return a GroupingContext with groupTree (for the response) and itemIds
+ (for applyGroupScopeFilter).
+
+ The caller does NOT need to handle any grouping logic itself — just call
+ applyGroupScopeFilter(items, groupCtx.itemIds) and embed groupCtx.groupTree
+ in the response dict.
+ """
+ from modules.datamodels.datamodelPagination import TableGroupNode
+
+ groupTree = None
+ itemIds = None
+
+ if paginationParams is None:
+ try:
+ existing = interface.getTableGrouping(contextKey)
+ if existing:
+ groupTree = [n.model_dump() if hasattr(n, "model_dump") else n for n in existing.rootGroups]
+ except Exception as e:
+ logger.warning(f"handleGroupingInRequest: getTableGrouping failed: {e}")
+ return GroupingContext(groupTree=groupTree, itemIds=None)
+
+ # Step 1: persist saveGroupTree if present
+ if paginationParams.saveGroupTree is not None:
+ try:
+ saved = interface.upsertTableGrouping(contextKey, paginationParams.saveGroupTree)
+ groupTree = [n.model_dump() if hasattr(n, "model_dump") else n for n in saved.rootGroups]
+ except Exception as e:
+ logger.error(f"handleGroupingInRequest: upsertTableGrouping failed: {e}")
+ paginationParams.saveGroupTree = None
+
+ # Step 2: load current tree (only if not already set from save above)
+ if groupTree is None:
+ try:
+ existing = interface.getTableGrouping(contextKey)
+ if existing:
+ groupTree = [n.model_dump() if hasattr(n, "model_dump") else n for n in existing.rootGroups]
+ except Exception as e:
+ logger.warning(f"handleGroupingInRequest: getTableGrouping failed: {e}")
+
+ # Step 3: resolve groupId to itemIds set
+ if paginationParams.groupId is not None:
+ targetGroupId = paginationParams.groupId
+ paginationParams.groupId = None # remove so it is not treated as a normal filter
+ if groupTree:
+ itemIds = _collectItemIds(groupTree, targetGroupId)
+ if itemIds is None:
+ logger.warning(
+ f"handleGroupingInRequest: groupId={targetGroupId!r} not found in tree "
+ f"for contextKey={contextKey!r} — returning empty set"
+ )
+ itemIds = set() # unknown group → show nothing rather than everything
+ else:
+ # groupId sent but no tree is available (none saved, empty, or load failed) → return empty (nothing can be resolved to a group)
+ logger.warning(
+ f"handleGroupingInRequest: groupId={targetGroupId!r} set but no tree exists "
+ f"for contextKey={contextKey!r} — returning empty set"
+ )
+ itemIds = set()
+
+ return GroupingContext(groupTree=groupTree, itemIds=itemIds)
+
+
+def applyGroupScopeFilter(items: List[Dict[str, Any]], itemIds: Optional[set]) -> List[Dict[str, Any]]:
+ """
+ Filter items to those whose "id" field is in itemIds.
+ Returns items unchanged when itemIds is None (no active group scope).
+ Works for both normal list items and for mode=ids / mode=filterValues flows
+ — call it before handleIdsInMemory / handleFilterValuesInMemory.
+ """
+ if itemIds is None:
+ return items
+ return [item for item in items if str(item.get("id", "")) in itemIds]
diff --git a/modules/routes/routeSecurityClickup.py b/modules/routes/routeSecurityClickup.py
index ca787391..d6f71d20 100644
--- a/modules/routes/routeSecurityClickup.py
+++ b/modules/routes/routeSecurityClickup.py
@@ -241,6 +241,29 @@ async def auth_connect_callback(
)
interface.saveConnectionToken(token)
+ try:
+ from modules.shared.callbackRegistry import callbackRegistry
+
+ if connection.knowledgeIngestionEnabled:
+ callbackRegistry.trigger(
+ "connection.established",
+ connectionId=connection.id,
+ authority=str(getattr(connection.authority, "value", connection.authority) or "clickup"),
+ userId=str(user.id),
+ )
+ else:
+ logger.info(
+ "ingestion.connection.bootstrap.skipped — knowledge ingestion disabled by user",
+ extra={
+ "event": "ingestion.connection.bootstrap.skipped",
+ "connectionId": connection.id,
+ "authority": "clickup",
+ "reason": "consent_disabled",
+ },
+ )
+ except Exception as _cbErr:
+ logger.warning("connection.established callback failed for %s: %s", connection.id, _cbErr)
+
return HTMLResponse(
content=f"""
diff --git a/modules/routes/routeSecurityGoogle.py b/modules/routes/routeSecurityGoogle.py
index 523523ee..7b6c1c64 100644
--- a/modules/routes/routeSecurityGoogle.py
+++ b/modules/routes/routeSecurityGoogle.py
@@ -479,6 +479,29 @@ async def auth_connect_callback(
)
interface.saveConnectionToken(token)
+ try:
+ from modules.shared.callbackRegistry import callbackRegistry
+
+ if connection.knowledgeIngestionEnabled:
+ callbackRegistry.trigger(
+ "connection.established",
+ connectionId=connection.id,
+ authority=str(getattr(connection.authority, "value", connection.authority) or "google"),
+ userId=str(user.id),
+ )
+ else:
+ logger.info(
+ "ingestion.connection.bootstrap.skipped — knowledge ingestion disabled by user",
+ extra={
+ "event": "ingestion.connection.bootstrap.skipped",
+ "connectionId": connection.id,
+ "authority": "google",
+ "reason": "consent_disabled",
+ },
+ )
+ except Exception as _cbErr:
+ logger.warning("connection.established callback failed for %s: %s", connection.id, _cbErr)
+
return HTMLResponse(
content=f"""
diff --git a/modules/routes/routeSecurityMsft.py b/modules/routes/routeSecurityMsft.py
index cc4cb87b..a2768a2b 100644
--- a/modules/routes/routeSecurityMsft.py
+++ b/modules/routes/routeSecurityMsft.py
@@ -420,6 +420,29 @@ async def auth_connect_callback(
)
interface.saveConnectionToken(token)
+ try:
+ from modules.shared.callbackRegistry import callbackRegistry
+
+ if connection.knowledgeIngestionEnabled:
+ callbackRegistry.trigger(
+ "connection.established",
+ connectionId=connection.id,
+ authority=str(getattr(connection.authority, "value", connection.authority) or "msft"),
+ userId=str(user.id),
+ )
+ else:
+ logger.info(
+ "ingestion.connection.bootstrap.skipped — knowledge ingestion disabled by user",
+ extra={
+ "event": "ingestion.connection.bootstrap.skipped",
+ "connectionId": connection.id,
+ "authority": "msft",
+ "reason": "consent_disabled",
+ },
+ )
+ except Exception as _cbErr:
+ logger.warning("connection.established callback failed for %s: %s", connection.id, _cbErr)
+
return HTMLResponse(
content=f"""
diff --git a/modules/routes/routeSharepoint.py b/modules/routes/routeSharepoint.py
index e42611ac..1ee21900 100644
--- a/modules/routes/routeSharepoint.py
+++ b/modules/routes/routeSharepoint.py
@@ -128,7 +128,7 @@ async def getSharepointFolderOptionsByReference(
# Set access token on SharePoint service
if not services.sharepoint.setAccessTokenFromConnection(connection):
raise HTTPException(
- status_code=status.HTTP_401_UNAUTHORIZED,
+ status_code=status.HTTP_502_BAD_GATEWAY,
detail=routeApiMsg("Failed to set SharePoint access token. Connection may be expired or invalid.")
)
diff --git a/modules/serviceCenter/services/serviceAgent/actionToolAdapter.py b/modules/serviceCenter/services/serviceAgent/actionToolAdapter.py
index cee81618..56ba791a 100644
--- a/modules/serviceCenter/services/serviceAgent/actionToolAdapter.py
+++ b/modules/serviceCenter/services/serviceAgent/actionToolAdapter.py
@@ -3,7 +3,7 @@
"""ActionToolAdapter: wraps existing workflow actions (dynamicMode=True) as agent tools."""
import logging
-from typing import Dict, Any, List
+from typing import Dict, Any, List, Optional
from modules.serviceCenter.services.serviceAgent.datamodelAgent import (
ToolDefinition, ToolResult
@@ -44,7 +44,7 @@ class ActionToolAdapter:
compoundName = f"{shortName}_{actionName}"
toolDef = _buildToolDefinition(compoundName, actionDef, actionInfo)
- handler = _createDispatchHandler(self._actionExecutor, shortName, actionName)
+ handler = _createDispatchHandler(self._actionExecutor, shortName, actionName, self._actionExecutor.services)
toolRegistry.registerFromDefinition(toolDef, handler)
self._registeredTools.append(compoundName)
registered += 1
@@ -186,7 +186,7 @@ def _catalogTypeToJsonSchema(typeStr: str, _depth: int = 0) -> Dict[str, Any]:
return {"type": "string", "description": f"unknown type '{typeStr}' (defaulted to string)"}
-def _createDispatchHandler(actionExecutor, methodName: str, actionName: str):
+def _createDispatchHandler(actionExecutor, methodName: str, actionName: str, services=None):
"""Create an async handler that dispatches to the ActionExecutor.
Parameter validation and Ref-payload normalization (collapsing
@@ -204,7 +204,7 @@ def _createDispatchHandler(actionExecutor, methodName: str, actionName: str):
if "mandateId" not in args and context.get("mandateId"):
args["mandateId"] = context["mandateId"]
result = await actionExecutor.executeAction(methodName, actionName, args)
- data = _formatActionResult(result)
+ data = _formatActionResult(result, services, context)
return ToolResult(
toolCallId="",
toolName=f"{methodName}_{actionName}",
@@ -223,9 +223,65 @@ def _createDispatchHandler(actionExecutor, methodName: str, actionName: str):
return _handler
-def _formatActionResult(result) -> str:
- """Format an ActionResult into a text representation for the agent."""
+_INLINE_CONTENT_LIMIT = 2000
+
+
+def _persistLargeDocument(doc, services, context: Dict[str, Any]) -> Optional[str]:
+ """Save an ActionDocument with large content as a workspace file.
+
+ Returns a formatted result line (with file id + docItem ref) or None
+ if persistence is not possible.
+ """
+ if not services:
+ return None
+ chatService = getattr(services, "chat", None)
+ if not chatService:
+ return None
+ docData = getattr(doc, "documentData", None)
+ if not docData or not isinstance(docData, str):
+ return None
+ docName = getattr(doc, "documentName", "unnamed")
+ docBytes = docData.encode("utf-8")
+ try:
+ fileItem, _ = chatService.interfaceDbComponent.saveUploadedFile(docBytes, docName)
+ fiId = context.get("featureInstanceId") or getattr(services, "featureInstanceId", "")
+ if fiId:
+ chatService.interfaceDbComponent.updateFile(fileItem.id, {"featureInstanceId": fiId})
+
+ from modules.serviceCenter.services.serviceAgent.coreTools._helpers import (
+ _attachFileAsChatDocument,
+ _formatToolFileResult,
+ _getOrCreateTempFolder,
+ )
+ tempFolderId = _getOrCreateTempFolder(chatService)
+ if tempFolderId:
+ chatService.interfaceDbComponent.updateFile(fileItem.id, {"folderId": tempFolderId})
+
+ chatDocId = _attachFileAsChatDocument(
+ services, fileItem,
+ label=f"action_doc:{docName}",
+ userMessage=f"Action document: {docName}",
+ )
+ return _formatToolFileResult(
+ fileItem=fileItem,
+ chatDocId=chatDocId,
+ actionLabel="Produced",
+ extraInfo="Use readFile to read the content.",
+ )
+ except Exception as e:
+ logger.warning(f"_persistLargeDocument failed for {docName}: {e}")
+ return None
+
+
+def _formatActionResult(result, services=None, context: Optional[Dict[str, Any]] = None) -> str:
+ """Format an ActionResult into a text representation for the agent.
+
+ Documents whose content reaches or exceeds the inline limit are persisted
+ as workspace files (when persistence succeeds) so the agent can access
+ them via readFile / ai_process / searchInFileContent.
+ """
parts = []
+ ctx = context or {}
if result.resultLabel:
parts.append(f"Result: {result.resultLabel}")
@@ -238,10 +294,19 @@ def _formatActionResult(result) -> str:
for doc in result.documents:
docName = getattr(doc, "documentName", "unnamed")
docType = getattr(doc, "mimeType", "unknown")
- parts.append(f" - {docName} ({docType})")
docData = getattr(doc, "documentData", None)
- if docData and isinstance(docData, str) and len(docData) < 2000:
- parts.append(f" Content: {docData[:2000]}")
+
+ isLarge = docData and isinstance(docData, str) and len(docData) >= _INLINE_CONTENT_LIMIT
+ if isLarge:
+ persistedLine = _persistLargeDocument(doc, services, ctx)
+ if persistedLine:
+ parts.append(f" - {docName} ({docType})")
+ parts.append(f" {persistedLine}")
+ continue
+
+ parts.append(f" - {docName} ({docType})")
+ if docData and isinstance(docData, str) and len(docData) < _INLINE_CONTENT_LIMIT:
+ parts.append(f" Content: {docData[:_INLINE_CONTENT_LIMIT]}")
if not parts:
parts.append("Action completed successfully." if result.success else "Action failed.")
diff --git a/modules/serviceCenter/services/serviceAgent/coreTools/_dataSourceTools.py b/modules/serviceCenter/services/serviceAgent/coreTools/_dataSourceTools.py
index 96ee31bb..c1191c1f 100644
--- a/modules/serviceCenter/services/serviceAgent/coreTools/_dataSourceTools.py
+++ b/modules/serviceCenter/services/serviceAgent/coreTools/_dataSourceTools.py
@@ -198,7 +198,10 @@ def _registerDataSourceTools(registry: ToolRegistry, services):
if isinstance(result, _DR):
fileBytes = result.data
- fileName = result.fileName or fileName
+ resolvedName = result.fileName or fileName
+ if resolvedName != fileName:
+ logger.debug(f"downloadFromDataSource: connector fileName={result.fileName!r} overrides arg fileName={fileName!r}")
+ fileName = resolvedName
else:
fileBytes = result
diff --git a/modules/serviceCenter/services/serviceAgent/coreTools/_documentTools.py b/modules/serviceCenter/services/serviceAgent/coreTools/_documentTools.py
index a48e53b3..62413103 100644
--- a/modules/serviceCenter/services/serviceAgent/coreTools/_documentTools.py
+++ b/modules/serviceCenter/services/serviceAgent/coreTools/_documentTools.py
@@ -11,8 +11,6 @@ from modules.serviceCenter.services.serviceAgent.toolRegistry import ToolRegistr
from modules.serviceCenter.services.serviceAgent.coreTools._helpers import (
_getOrCreateTempFolder,
- _looksLikeBinary,
- _resolveFileScope,
_MAX_TOOL_RESULT_CHARS,
)
@@ -392,65 +390,7 @@ def _registerDocumentTools(registry: ToolRegistry, services):
if chunkMime:
mimeType = chunkMime
- # 2) File not yet indexed -> trigger extraction via ExtractionService, then retry
- if not imageData and knowledgeService and not knowledgeService.isFileIndexed(fileId):
- try:
- chatService = services.chat
- fileInfo = chatService.getFileInfo(fileId)
- fileContent = chatService.getFileContent(fileId)
- if fileContent and fileInfo:
- rawData = fileContent.get("data", "")
- if isinstance(rawData, str) and len(rawData) > 100:
- rawBytes = _b64.b64decode(rawData)
- elif isinstance(rawData, bytes):
- rawBytes = rawData
- else:
- rawBytes = None
-
- if rawBytes:
- from modules.serviceCenter.services.serviceExtraction.subRegistry import ExtractorRegistry
- from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
- from modules.datamodels.datamodelExtraction import ExtractionOptions
-
- fileMime = fileInfo.get("mimeType", "application/octet-stream")
- fileName = fileInfo.get("fileName", fileId)
- extracted = runExtraction(
- ExtractorRegistry(), None,
- rawBytes, fileName, fileMime, ExtractionOptions(),
- )
-
- contentObjects = []
- for part in extracted.parts:
- tg = (part.typeGroup or "").lower()
- ct = "image" if tg == "image" else "text"
- if not part.data or not part.data.strip():
- continue
- contentObjects.append({
- "contentObjectId": part.id,
- "contentType": ct,
- "data": part.data,
- "contextRef": {"containerPath": fileName, "location": part.label, **(part.metadata or {})},
- })
-
- if contentObjects:
- _diFiId, _diMId = _resolveFileScope(fileId, context)
- await knowledgeService.indexFile(
- fileId=fileId, fileName=fileName, mimeType=fileMime,
- userId=context.get("userId", ""), contentObjects=contentObjects,
- featureInstanceId=_diFiId,
- mandateId=_diMId,
- )
-
- chunks = knowledgeService._knowledgeDb.getContentChunks(fileId)
- imageChunks = [c for c in (chunks or []) if c.get("contentType") == "image"]
- if pageIndex is not None:
- imageChunks = [c for c in imageChunks if c.get("contextRef", {}).get("pageIndex") == pageIndex]
- if imageChunks:
- imageData = imageChunks[0].get("data", "")
- except Exception as extractErr:
- logger.warning(f"describeImage: on-demand extraction failed: {extractErr}")
-
- # 3) Direct image file (not a container) - use raw file data
+ # 2) Direct image file (not a container) - use raw file data
if not imageData:
chatService = services.chat
fileContent = chatService.getFileContent(fileId)
@@ -460,7 +400,7 @@ def _registerDocumentTools(registry: ToolRegistry, services):
imageData = fileContent.get("data", "")
mimeType = fileMimeType
- # 4) PDF page rendering: render the requested page as an image via PyMuPDF
+ # 3) PDF page rendering: render the requested page as an image via PyMuPDF
if not imageData:
chatService = services.chat
fileInfo = chatService.getFileInfo(fileId) if hasattr(chatService, "getFileInfo") else None
diff --git a/modules/serviceCenter/services/serviceAgent/coreTools/_helpers.py b/modules/serviceCenter/services/serviceAgent/coreTools/_helpers.py
index 129de517..37116ee5 100644
--- a/modules/serviceCenter/services/serviceAgent/coreTools/_helpers.py
+++ b/modules/serviceCenter/services/serviceAgent/coreTools/_helpers.py
@@ -1,6 +1,6 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
-"""Shared helpers for core agent tools (file scope, binary detection, temp folder)."""
+"""Shared helpers for core agent tools (file scope, binary detection, group helpers)."""
import logging
import uuid
@@ -46,39 +46,60 @@ def _looksLikeBinary(data: bytes, sampleSize: int = 1024) -> bool:
return nonPrintable / len(sample) > 0.10
-def _getOrCreateInstanceFolder(chatService, featureInstanceId: str, mandateId: str = "") -> Optional[str]:
- """Return the folder ID for a feature instance, creating it on first use.
-
- Delegates to interfaceDbManagement._ensureFeatureInstanceFolder.
- AI tools call this when saving a file without an explicit folderId
- so that instance-produced files land in a named folder automatically.
- """
- try:
- dbMgmt = chatService.interfaceDbComponent
- return dbMgmt._ensureFeatureInstanceFolder(featureInstanceId, mandateId)
- except Exception as e:
- logger.warning(f"Could not get/create instance folder for {featureInstanceId}: {e}")
- return None
-
-
def _getOrCreateTempFolder(chatService) -> Optional[str]:
- """Return the ID of the root-level 'Temp' folder, creating it if it doesn't exist."""
+ """Deprecated stub: folder-based organisation has been replaced by grouping.
+
+ Returns None unconditionally so callers skip the (now removed) folderId
+ assignment. Remove callers incrementally and delete this stub afterwards.
+ """
+ logger.debug("_getOrCreateTempFolder called – folder support removed, returning None")
+ return None
+
+
+async def _getOrCreateInstanceGroup(
+ appInterface,
+ featureInstanceId: str,
+ contextKey: str = "files/list",
+) -> Optional[str]:
+ """Return the id of the group whose meta.featureInstanceId matches (searched recursively under contextKey); create a new root group if none exists. Returns None on any error."""
try:
- allFolders = chatService.interfaceDbComponent.listFolders()
- tempFolder = next(
- (f for f in allFolders
- if f.get("name") == "Temp" and not f.get("parentId")),
- None,
- )
- if tempFolder:
- return tempFolder.get("id")
- newFolder = chatService.interfaceDbComponent.createFolder("Temp", parentId=None)
- return newFolder.get("id") if newFolder else None
+ existing = appInterface.getTableGrouping(contextKey)
+ nodes = [
+ n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n))
+ for n in (existing.rootGroups if existing else [])
+ ]
+
+ def _find(nds):
+ for nd in nds:
+ meta = nd.get("meta", {}) if isinstance(nd, dict) else getattr(nd, "meta", {})
+ if (meta or {}).get("featureInstanceId") == featureInstanceId:
+ return nd.get("id") if isinstance(nd, dict) else getattr(nd, "id", None)
+ found = _find(nd.get("subGroups", []) if isinstance(nd, dict) else getattr(nd, "subGroups", []))
+ if found:
+ return found
+ return None
+
+ found = _find(nodes)
+ if found:
+ return found
+ newId = str(uuid.uuid4())
+ nodes.append({"id": newId, "name": featureInstanceId, "itemIds": [], "subGroups": [], "meta": {"featureInstanceId": featureInstanceId}})
+ appInterface.upsertTableGrouping(contextKey, nodes)
+ return newId
except Exception as e:
- logger.warning(f"Could not get/create Temp folder: {e}")
+ logger.error(f"_getOrCreateInstanceGroup: {e}")
return None
+async def _getOrCreateTempGroup(
+ appInterface,
+ sessionId: str,
+ contextKey: str = "files/list",
+) -> Optional[str]:
+ """Return groupId of the session's temporary group (an instance group keyed '_temp_<sessionId>'); create if needed."""
+ return await _getOrCreateInstanceGroup(appInterface, f"_temp_{sessionId}", contextKey)
+
+
def _attachFileAsChatDocument(
services: Any,
fileItem: Any,
diff --git a/modules/serviceCenter/services/serviceAgent/coreTools/_mediaTools.py b/modules/serviceCenter/services/serviceAgent/coreTools/_mediaTools.py
index c2a4842b..adb79ecf 100644
--- a/modules/serviceCenter/services/serviceAgent/coreTools/_mediaTools.py
+++ b/modules/serviceCenter/services/serviceAgent/coreTools/_mediaTools.py
@@ -25,142 +25,11 @@ def _registerMediaTools(registry: ToolRegistry, services):
# ---- Document rendering tool ----
def _markdownToDocumentJson(markdown: str, title: str, language: str = "de") -> Dict[str, Any]:
- """Convert markdown content to the standard document JSON format expected by renderers."""
- import re as _re
-
- sections = []
- order = 0
- lines = markdown.split("\n")
- i = 0
-
- def _nextId():
- nonlocal order
- order += 1
- return f"s_{order}"
-
- while i < len(lines):
- line = lines[i]
-
- # --- Headings ---
- headingMatch = _re.match(r'^(#{1,6})\s+(.+)', line)
- if headingMatch:
- level = len(headingMatch.group(1))
- text = headingMatch.group(2).strip()
- sections.append({
- "id": _nextId(), "content_type": "heading", "order": order,
- "elements": [{"content": {"text": text, "level": level}}],
- })
- i += 1
- continue
-
- # --- Fenced code blocks ---
- codeMatch = _re.match(r'^```(\w*)', line)
- if codeMatch:
- lang = codeMatch.group(1) or "text"
- codeLines = []
- i += 1
- while i < len(lines) and not lines[i].startswith("```"):
- codeLines.append(lines[i])
- i += 1
- i += 1
- sections.append({
- "id": _nextId(), "content_type": "code_block", "order": order,
- "elements": [{"content": {"code": "\n".join(codeLines), "language": lang}}],
- })
- continue
-
- # --- Tables ---
- tableMatch = _re.match(r'^\|(.+)\|$', line)
- if tableMatch and (i + 1) < len(lines) and _re.match(r'^\|[\s\-:|]+\|$', lines[i + 1]):
- headerCells = [c.strip() for c in tableMatch.group(1).split("|")]
- i += 2
- rows = []
- while i < len(lines) and _re.match(r'^\|(.+)\|$', lines[i]):
- rowCells = [c.strip() for c in lines[i][1:-1].split("|")]
- rows.append(rowCells)
- i += 1
- sections.append({
- "id": _nextId(), "content_type": "table", "order": order,
- "elements": [{"content": {"headers": headerCells, "rows": rows}}],
- })
- continue
-
- # --- Bullet / numbered lists ---
- listMatch = _re.match(r'^(\s*)([-*+]|\d+[.)]) (.+)', line)
- if listMatch:
- isNumbered = bool(_re.match(r'\d+[.)]', listMatch.group(2)))
- items = []
- while i < len(lines) and _re.match(r'^(\s*)([-*+]|\d+[.)]) (.+)', lines[i]):
- m = _re.match(r'^(\s*)([-*+]|\d+[.)]) (.+)', lines[i])
- items.append({"text": m.group(3).strip()})
- i += 1
- sections.append({
- "id": _nextId(), "content_type": "bullet_list", "order": order,
- "elements": [{"content": {"items": items, "list_type": "numbered" if isNumbered else "bullet"}}],
- })
- continue
-
- # --- Empty lines (skip) ---
- if not line.strip():
- i += 1
- continue
-
- # --- Images:  or  ---
- imgMatch = _re.match(r'^!\[([^\]]*)\]\(([^)]+)\)', line)
- if imgMatch:
- altText = imgMatch.group(1).strip() or "Image"
- src = imgMatch.group(2).strip()
- fileId = ""
- if src.startswith("file:"):
- fileId = src[5:]
- sections.append({
- "id": _nextId(), "content_type": "image", "order": order,
- "elements": [{
- "content": {
- "altText": altText,
- "base64Data": "",
- "_fileRef": fileId,
- "_srcUrl": src if not fileId else "",
- }
- }],
- })
- i += 1
- continue
-
- # --- Paragraph (collect consecutive non-empty lines) ---
- paraLines = []
- while i < len(lines) and lines[i].strip() and not _re.match(r'^(#{1,6}\s|```|\|.+\||!\[|(\s*)([-*+]|\d+[.)]) )', lines[i]):
- paraLines.append(lines[i])
- i += 1
- if paraLines:
- sections.append({
- "id": _nextId(), "content_type": "paragraph", "order": order,
- "elements": [{"content": {"text": " ".join(paraLines)}}],
- })
- continue
-
- i += 1
-
- if not sections:
- sections.append({
- "id": _nextId(), "content_type": "paragraph", "order": order,
- "elements": [{"content": {"text": markdown.strip() or "(empty)"}}],
- })
-
- return {
- "metadata": {
- "split_strategy": "single_document",
- "source_documents": [],
- "extraction_method": "agent_rendering",
- "title": title,
- "language": language,
- },
- "documents": [{
- "id": "doc_1",
- "title": title,
- "sections": sections,
- }],
- }
+ """Delegate to markdownToDocumentJson in subDocumentUtility, then set metadata.extraction_method to 'agent_rendering'."""
+ from modules.serviceCenter.services.serviceGeneration.subDocumentUtility import markdownToDocumentJson
+ result = markdownToDocumentJson(markdown, title, language)
+ result["metadata"]["extraction_method"] = "agent_rendering"
+ return result
async def _renderDocument(args: Dict[str, Any], context: Dict[str, Any]):
"""Render agent-produced markdown content into any document format via the RendererRegistry."""
@@ -245,35 +114,75 @@ def _registerMediaTools(registry: ToolRegistry, services):
except Exception as e:
logger.warning(f"renderDocument: knowledge service unavailable: {e}")
resolvedImages = 0
+
+ def _resolveImageRef(targetObj, fileRefKey="_fileRef", fileIdKey="fileId"):
+ """Resolve one image reference dict in-place: fill base64Data/mimeType from the first image chunk in the knowledge store, else from the raw file bytes."""
+ nonlocal resolvedImages
+ fileRef = targetObj.get(fileRefKey, "") or targetObj.get(fileIdKey, "")
+ if not fileRef or targetObj.get("base64Data"):
+ return
+ if knowledgeService:
+ chunks = knowledgeService._knowledgeDb.getContentChunks(fileRef)
+ imageChunks = [c for c in (chunks or []) if c.get("contentType") == "image"]
+ if imageChunks:
+ targetObj["base64Data"] = imageChunks[0].get("data", "")
+ chunkMime = imageChunks[0].get("contextRef", {}).get("mimeType", "image/png")
+ targetObj["mimeType"] = chunkMime
+ resolvedImages += 1
+ if not targetObj.get("base64Data"):
+ try:
+ rawBytes = services.chat.getFileData(fileRef)
+ if rawBytes:
+ import base64 as _b64
+ targetObj["base64Data"] = _b64.b64encode(rawBytes).decode("ascii")
+ targetObj["mimeType"] = "image/png"
+ resolvedImages += 1
+ except Exception as e:
+ logger.warning(f"renderDocument: image resolve failed for fileRef={fileRef}: {e}")
+ targetObj.pop("_fileRef", None)
+ targetObj.pop("_srcUrl", None)
+
+ def _resolveInlineRuns(runsList):
+ """Scan a list of inline runs and resolve any image runs with fileId."""
+ for run in runsList:
+ if run.get("type") == "image" and run.get("fileId") and not run.get("base64Data"):
+ _resolveImageRef(run, fileRefKey="fileId", fileIdKey="fileId")
+
for doc in structuredContent.get("documents", []):
for section in doc.get("sections", []):
- if section.get("content_type") != "image":
+ cType = section.get("content_type")
+ # Block-level image sections
+ if cType == "image":
+ for element in section.get("elements", []):
+ contentObj = element.get("content", {})
+ _resolveImageRef(contentObj)
continue
- for element in section.get("elements", []):
- contentObj = element.get("content", {})
- fileRef = contentObj.get("_fileRef", "")
- if not fileRef or contentObj.get("base64Data"):
- continue
- if knowledgeService:
- chunks = knowledgeService._knowledgeDb.getContentChunks(fileRef)
- imageChunks = [c for c in (chunks or []) if c.get("contentType") == "image"]
- if imageChunks:
- contentObj["base64Data"] = imageChunks[0].get("data", "")
- chunkMime = imageChunks[0].get("contextRef", {}).get("mimeType", "image/png")
- contentObj["mimeType"] = chunkMime
- resolvedImages += 1
- if not contentObj.get("base64Data"):
- try:
- rawBytes = services.chat.getFileData(fileRef)
- if rawBytes:
- import base64 as _b64
- contentObj["base64Data"] = _b64.b64encode(rawBytes).decode("ascii")
- contentObj["mimeType"] = "image/png"
- resolvedImages += 1
- except Exception as e:
- logger.warning(f"renderDocument: image resolve failed for fileRef={fileRef}: {e}")
- contentObj.pop("_fileRef", None)
- contentObj.pop("_srcUrl", None)
+ # Paragraphs with inlineRuns
+ if cType == "paragraph":
+ for element in section.get("elements", []):
+ runs = element.get("content", {}).get("inlineRuns")
+ if runs:
+ _resolveInlineRuns(runs)
+ continue
+ # Bullet lists - items are List[List[InlineRun]]
+ if cType == "bullet_list":
+ for element in section.get("elements", []):
+ items = element.get("content", {}).get("items", [])
+ for item in items:
+ if isinstance(item, list):
+ _resolveInlineRuns(item)
+ continue
+ # Tables - headers and row cells are List[InlineRun]
+ if cType == "table":
+ for element in section.get("elements", []):
+ contentObj = element.get("content", {})
+ for cell in contentObj.get("headers", []):
+ if isinstance(cell, list):
+ _resolveInlineRuns(cell)
+ for row in contentObj.get("rows", []):
+ for cell in row:
+ if isinstance(cell, list):
+ _resolveInlineRuns(cell)
sectionCount = len(structuredContent.get("documents", [{}])[0].get("sections", []))
logger.info(f"renderDocument: parsed {sectionCount} sections from markdown ({len(content)} chars), resolved {resolvedImages} image(s), format={outputFormat}")
@@ -285,6 +194,7 @@ def _registerMediaTools(registry: ToolRegistry, services):
language=language,
title=title,
userPrompt=content,
+ style=args.get("style"),
)
if not documents:
@@ -367,6 +277,20 @@ def _registerMediaTools(registry: ToolRegistry, services):
"outputFormat": {"type": "string", "description": "Target format: pdf, docx, xlsx, pptx, csv, html, md, json, txt", "default": "pdf"},
"title": {"type": "string", "description": "Document title", "default": "Document"},
"language": {"type": "string", "description": "Document language (ISO 639-1)", "default": "de"},
+ "style": {
+ "type": "object",
+ "description": (
+ "Optional style overrides for the rendered document. Supports nested keys: "
+ "fonts (primary, monospace), colors (primary, secondary, accent, background), "
+ "headings (h1-h4 with sizePt, weight, color, spaceBeforePt, spaceAfterPt), "
+ "paragraph (sizePt, lineSpacing, color), table (headerBg, headerFg, headerSizePt, "
+ "bodySizePt, rowBandingEven, rowBandingOdd, borderColor, borderWidthPt), "
+ "list (bulletChar, indentPt, sizePt), image (defaultWidthPt, maxWidthPt, alignment), "
+ "codeBlock (fontSizePt, background, borderColor), "
+ "page (format, marginsPt, showPageNumbers, headerHeight, footerHeight, headerLogo, headerText, footerText). "
+ "Only provided keys override defaults; omitted keys keep their default values."
+ ),
+ },
},
},
readOnly=False,
@@ -912,7 +836,7 @@ def _registerMediaTools(registry: ToolRegistry, services):
return ToolResult(toolCallId="", toolName="executeCode", success=False, error=f"Language '{language}' not supported. Only 'python' is available.")
try:
from modules.serviceCenter.services.serviceAgent.sandboxExecutor import executePython
- result = await executePython(code)
+ result = await executePython(code, services=services)
if result.get("success"):
output = result.get("output", "(no output)")
return ToolResult(toolCallId="", toolName="executeCode", success=True, data=output)
@@ -962,12 +886,17 @@ def _registerMediaTools(registry: ToolRegistry, services):
readOnly=True
)
+ from modules.serviceCenter.services.serviceAgent.sandboxExecutor import SANDBOX_ALLOWED_MODULES
+ moduleList = ", ".join(sorted(SANDBOX_ALLOWED_MODULES | {"io"}))
registry.register(
"executeCode", _executeCode,
description=(
- "Execute Python code in a sandboxed environment for calculations and data analysis. "
- "Available modules: math, statistics, json, csv, re, datetime, collections, itertools, functools, decimal, fractions, random. "
- "No file system, network, or OS access. Max 30s execution time. "
+ f"Execute Python code in a sandboxed environment for calculations and data analysis. "
+ f"Available modules: {moduleList}. "
+ "io is restricted to StringIO and BytesIO only (no file access). "
+ "Built-in readFile(fileId) returns UTF-8 content of a workspace file by its file ID "
+ "(use the 'file id' from tool outputs, e.g. data = readFile('019af...')). "
+ "No other file system, network, or OS access. Max 30s execution time. "
"Use print() to produce output."
),
parameters={
diff --git a/modules/serviceCenter/services/serviceAgent/coreTools/_workspaceTools.py b/modules/serviceCenter/services/serviceAgent/coreTools/_workspaceTools.py
index 9a6af658..3b9f5945 100644
--- a/modules/serviceCenter/services/serviceAgent/coreTools/_workspaceTools.py
+++ b/modules/serviceCenter/services/serviceAgent/coreTools/_workspaceTools.py
@@ -11,10 +11,9 @@ from modules.serviceCenter.services.serviceAgent.toolRegistry import ToolRegistr
from modules.serviceCenter.services.serviceAgent.coreTools._helpers import (
_attachFileAsChatDocument,
_formatToolFileResult,
- _getOrCreateInstanceFolder,
- _getOrCreateTempFolder,
+ _getOrCreateInstanceGroup,
+ _getOrCreateTempGroup,
_looksLikeBinary,
- _resolveFileScope,
_MAX_TOOL_RESULT_CHARS,
)
@@ -50,6 +49,7 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
return ToolResult(toolCallId="", toolName="readFile", success=False, error="fileId is required")
try:
knowledgeService = services.getService("knowledge") if hasattr(services, "getService") else None
+ fileStatus = None
# 1) Knowledge Store: return already-extracted text chunks
if knowledgeService:
@@ -77,7 +77,8 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
data=f"[File {fileId} is currently being processed (status: {fileStatus}). Try again shortly.]",
)
- # 2) Not indexed yet: try on-demand extraction
+ # 2) Not indexed yet: inspect file type to decide how to serve the agent
+ # (binary -> instruct agent to wait / re-upload; text -> decode raw bytes inline)
chatService = services.chat
fileInfo = chatService.getFileInfo(fileId)
if not fileInfo:
@@ -100,83 +101,14 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
isBinary = _looksLikeBinary(rawBytes)
if isBinary:
- try:
- from modules.serviceCenter.services.serviceExtraction.subRegistry import ExtractorRegistry, ChunkerRegistry
- from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
- from modules.datamodels.datamodelExtraction import ExtractionOptions
-
- extracted = runExtraction(
- ExtractorRegistry(), ChunkerRegistry(),
- rawBytes, fileName, mimeType, ExtractionOptions(),
- )
-
- contentObjects = []
- for part in extracted.parts:
- tg = (part.typeGroup or "").lower()
- ct = "image" if tg == "image" else "text"
- if not part.data or not part.data.strip():
- continue
- contentObjects.append({
- "contentObjectId": part.id,
- "contentType": ct,
- "data": part.data,
- "contextRef": {
- "containerPath": fileName,
- "location": part.label or "file",
- **(part.metadata or {}),
- },
- })
-
- if contentObjects:
- if knowledgeService:
- try:
- userId = context.get("userId", "")
- _fiId, _mId = _resolveFileScope(fileId, context)
- await knowledgeService.indexFile(
- fileId=fileId, fileName=fileName, mimeType=mimeType,
- userId=userId, contentObjects=contentObjects,
- featureInstanceId=_fiId,
- mandateId=_mId,
- )
- except Exception as e:
- logger.warning(f"readFile: knowledge indexing failed for {fileId}: {e}")
-
- joined = ""
- if knowledgeService:
- _chunks = knowledgeService._knowledgeDb.getContentChunks(fileId)
- _textChunks = [
- c for c in (_chunks or [])
- if c.get("contentType") != "image" and c.get("data")
- ]
- if _textChunks:
- joined = "\n\n".join(c["data"] for c in _textChunks)
- if not joined:
- textParts = [o["data"] for o in contentObjects if o["contentType"] != "image"]
- joined = "\n\n".join(textParts) if textParts else ""
- if joined:
- chunked = _applyOffsetLimit(joined, offset, limit)
- if chunked is not None:
- return ToolResult(toolCallId="", toolName="readFile", success=True, data=chunked)
- if len(joined) > _MAX_TOOL_RESULT_CHARS:
- joined = joined[:_MAX_TOOL_RESULT_CHARS] + f"\n\n[Truncated – showing first {_MAX_TOOL_RESULT_CHARS} chars of {len(joined)}. Use offset/limit to read specific sections.]"
- return ToolResult(
- toolCallId="", toolName="readFile", success=True,
- data=joined,
- )
- imgCount = sum(1 for o in contentObjects if o["contentType"] == "image")
- return ToolResult(
- toolCallId="", toolName="readFile", success=True,
- data=f"[Extracted {len(contentObjects)} content objects from '{fileName}' "
- f"({imgCount} images, no readable text). "
- f"Use describeImage(fileId='{fileId}') to analyze visual content.]",
- )
- except Exception as extractErr:
- logger.warning(f"readFile extraction failed for {fileId} ({fileName}): {extractErr}")
-
return ToolResult(
toolCallId="", toolName="readFile", success=True,
- data=f"[Binary file: '{fileName}', type={mimeType}, size={len(rawBytes)} bytes. "
- f"Text extraction not available. Use describeImage for images.]",
+ data=(
+ f"[File '{fileName}' ({mimeType}) is not yet indexed "
+ f"(status: {fileStatus or 'unknown'}). Indexing runs automatically "
+ f"on upload. Please wait a few seconds and retry, or re-upload the file. "
+ f"For visual content use describeImage(fileId='{fileId}').]"
+ ),
)
# 3) Text file: decode raw bytes
@@ -237,7 +169,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
try:
chatService = services.chat
files = chatService.listFiles(
- folderId=args.get("folderId"),
tags=args.get("tags"),
search=args.get("search"),
)
@@ -290,18 +221,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
except Exception as e:
return ToolResult(toolCallId="", toolName="searchInFileContent", success=False, error=str(e))
- async def _listFolders(args: Dict[str, Any], context: Dict[str, Any]):
- try:
- chatService = services.chat
- folders = chatService.listFolders(parentId=args.get("parentId"))
- folderList = "\n".join(
- f"- {f.get('name', 'unnamed')} (id: {f.get('id', '?')})"
- for f in folders
- ) if folders else "No folders found."
- return ToolResult(toolCallId="", toolName="listFolders", success=True, data=folderList)
- except Exception as e:
- return ToolResult(toolCallId="", toolName="listFolders", success=False, error=str(e))
-
async def _webSearch(args: Dict[str, Any], context: Dict[str, Any]):
query = args.get("query", "")
if not query:
@@ -339,35 +258,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
except Exception as e:
return ToolResult(toolCallId="", toolName="tagFile", success=False, error=str(e))
- async def _moveFile(args: Dict[str, Any], context: Dict[str, Any]):
- fileId = args.get("fileId", "")
- targetFolderId = args.get("targetFolderId")
- if not fileId:
- return ToolResult(toolCallId="", toolName="moveFile", success=False, error="fileId is required")
- try:
- chatService = services.chat
- chatService.interfaceDbComponent.updateFile(fileId, {"folderId": targetFolderId})
- return ToolResult(
- toolCallId="", toolName="moveFile", success=True,
- data=f"File {fileId} moved to folder {targetFolderId or 'root'}"
- )
- except Exception as e:
- return ToolResult(toolCallId="", toolName="moveFile", success=False, error=str(e))
-
- async def _createFolder(args: Dict[str, Any], context: Dict[str, Any]):
- name = args.get("name", "")
- if not name:
- return ToolResult(toolCallId="", toolName="createFolder", success=False, error="name is required")
- try:
- chatService = services.chat
- folder = chatService.createFolder(name=name, parentId=args.get("parentId"))
- return ToolResult(
- toolCallId="", toolName="createFolder", success=True,
- data=f"Folder '{name}' created (id: {folder.get('id', '?')})"
- )
- except Exception as e:
- return ToolResult(toolCallId="", toolName="createFolder", success=False, error=str(e))
-
async def _writeFile(args: Dict[str, Any], context: Dict[str, Any]):
content = args.get("content", "")
mode = args.get("mode", "create")
@@ -422,12 +312,52 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
fiId = context.get("featureInstanceId") or (services.featureInstanceId if services else "")
if fiId:
dbMgmt.updateFile(fileItem.id, {"featureInstanceId": fiId})
- if args.get("folderId"):
- dbMgmt.updateFile(fileItem.id, {"folderId": args["folderId"]})
+ if args.get("groupId"):
+ try:
+ appIface = chatService.interfaceDbApp
+ existing = appIface.getTableGrouping("files/list")
+ nodes = [n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n)) for n in (existing.rootGroups if existing else [])]
+ def _addToGroup(nds, gid, fid):
+ for nd in nds:
+ nid = nd.get("id") if isinstance(nd, dict) else getattr(nd, "id", None)
+ if nid == gid:
+ ids = list(nd.get("itemIds", []) if isinstance(nd, dict) else getattr(nd, "itemIds", []))
+ if fid not in ids:
+ ids.append(fid)
+ if isinstance(nd, dict):
+ nd["itemIds"] = ids
+ return True
+ if _addToGroup(nd.get("subGroups", []) if isinstance(nd, dict) else getattr(nd, "subGroups", []), gid, fid):
+ return True
+ return False
+ _addToGroup(nodes, args["groupId"], fileItem.id)
+ appIface.upsertTableGrouping("files/list", nodes)
+ except Exception as _ge:
+ logger.warning(f"writeFile: failed to add file to group {args['groupId']}: {_ge}")
elif fiId:
- instanceFolderId = _getOrCreateInstanceFolder(chatService, fiId, context.get("mandateId", ""))
- if instanceFolderId:
- dbMgmt.updateFile(fileItem.id, {"folderId": instanceFolderId})
+ try:
+ appIface = chatService.interfaceDbApp
+ instanceGroupId = await _getOrCreateInstanceGroup(appIface, fiId)
+ if instanceGroupId:
+ existing = appIface.getTableGrouping("files/list")
+ nodes = [n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n)) for n in (existing.rootGroups if existing else [])]
+ def _addToGroup2(nds, gid, fid):
+ for nd in nds:
+ nid = nd.get("id") if isinstance(nd, dict) else getattr(nd, "id", None)
+ if nid == gid:
+ ids = list(nd.get("itemIds", []) if isinstance(nd, dict) else getattr(nd, "itemIds", []))
+ if fid not in ids:
+ ids.append(fid)
+ if isinstance(nd, dict):
+ nd["itemIds"] = ids
+ return True
+ if _addToGroup2(nd.get("subGroups", []) if isinstance(nd, dict) else getattr(nd, "subGroups", []), gid, fid):
+ return True
+ return False
+ _addToGroup2(nodes, instanceGroupId, fileItem.id)
+ appIface.upsertTableGrouping("files/list", nodes)
+ except Exception as _ge:
+ logger.warning(f"writeFile: failed to add file to instance group for {fiId}: {_ge}")
if args.get("tags"):
dbMgmt.updateFile(fileItem.id, {"tags": args["tags"]})
@@ -480,13 +410,13 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
registry.register(
"listFiles", _listFiles,
description=(
- "List files in the local workspace. Filter by folder, tags, or search term. "
+ "List files in the local workspace. Filter by tags or search term. "
+ "To filter by group, use listItemsInGroup. "
"For external data sources, use browseDataSource instead."
),
parameters={
"type": "object",
"properties": {
- "folderId": {"type": "string", "description": "Filter by folder ID"},
"tags": {"type": "array", "items": {"type": "string"}, "description": "Filter by tags (any match)"},
"search": {"type": "string", "description": "Search in file names and descriptions"},
}
@@ -513,18 +443,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
readOnly=True
)
- registry.register(
- "listFolders", _listFolders,
- description="List folders in the local workspace. For external data sources, use browseDataSource instead.",
- parameters={
- "type": "object",
- "properties": {
- "parentId": {"type": "string", "description": "Parent folder ID (omit for root)"},
- }
- },
- readOnly=True
- )
-
registry.register(
"webSearch", _webSearch,
description="Search the web for general information. Use readUrl to fetch content from a known URL instead.",
@@ -550,34 +468,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
readOnly=False
)
- registry.register(
- "moveFile", _moveFile,
- description="Move a file to a different folder in the local workspace.",
- parameters={
- "type": "object",
- "properties": {
- "fileId": {"type": "string", "description": "The file ID to move"},
- "targetFolderId": {"type": "string", "description": "Target folder ID (null for root)"},
- },
- "required": ["fileId"]
- },
- readOnly=False
- )
-
- registry.register(
- "createFolder", _createFolder,
- description="Create a new folder in the local workspace.",
- parameters={
- "type": "object",
- "properties": {
- "name": {"type": "string", "description": "Folder name"},
- "parentId": {"type": "string", "description": "Parent folder ID (omit for root)"},
- },
- "required": ["name"]
- },
- readOnly=False
- )
-
registry.register(
"writeFile", _writeFile,
description=(
@@ -598,7 +488,7 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
"content": {"type": "string", "description": "Content to write/append"},
"mode": {"type": "string", "enum": ["create", "append", "overwrite"], "description": "Write mode (default: create)"},
"fileId": {"type": "string", "description": "File ID (required for mode=append/overwrite)"},
- "folderId": {"type": "string", "description": "Target folder ID (mode=create only)"},
+ "groupId": {"type": "string", "description": "Group ID to place the file in (mode=create only). Omit to use the instance default group."},
"tags": {"type": "array", "items": {"type": "string"}, "description": "Tags (mode=create only)"},
},
"required": ["content"]
@@ -758,55 +648,7 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
readOnly=True
)
- # ---- Phase 2: deleteFolder, renameFolder, moveFolder, copyFile, editFile ----
-
- async def _deleteFolder(args: Dict[str, Any], context: Dict[str, Any]):
- folderId = args.get("folderId", "")
- recursive = args.get("recursive", False)
- if not folderId:
- return ToolResult(toolCallId="", toolName="deleteFolder", success=False, error="folderId is required")
- try:
- chatService = services.chat
- result = chatService.interfaceDbComponent.deleteFolder(folderId, recursive=recursive)
- summary = f"Deleted {result.get('deletedFolders', 1)} folder(s) and {result.get('deletedFiles', 0)} file(s)"
- return ToolResult(
- toolCallId="", toolName="deleteFolder", success=True, data=summary,
- sideEvents=[{"type": "folderDeleted", "data": {"folderId": folderId, **result}}],
- )
- except Exception as e:
- return ToolResult(toolCallId="", toolName="deleteFolder", success=False, error=str(e))
-
- async def _renameFolder(args: Dict[str, Any], context: Dict[str, Any]):
- folderId = args.get("folderId", "")
- newName = args.get("newName", "")
- if not folderId or not newName:
- return ToolResult(toolCallId="", toolName="renameFolder", success=False, error="folderId and newName are required")
- try:
- chatService = services.chat
- chatService.interfaceDbComponent.renameFolder(folderId, newName)
- return ToolResult(
- toolCallId="", toolName="renameFolder", success=True,
- data=f"Folder {folderId} renamed to '{newName}'",
- sideEvents=[{"type": "folderUpdated", "data": {"folderId": folderId, "name": newName}}],
- )
- except Exception as e:
- return ToolResult(toolCallId="", toolName="renameFolder", success=False, error=str(e))
-
- async def _moveFolder(args: Dict[str, Any], context: Dict[str, Any]):
- folderId = args.get("folderId", "")
- targetParentId = args.get("targetParentId")
- if not folderId:
- return ToolResult(toolCallId="", toolName="moveFolder", success=False, error="folderId is required")
- try:
- chatService = services.chat
- chatService.interfaceDbComponent.moveFolder(folderId, targetParentId)
- return ToolResult(
- toolCallId="", toolName="moveFolder", success=True,
- data=f"Folder {folderId} moved to {targetParentId or 'root'}",
- sideEvents=[{"type": "folderUpdated", "data": {"folderId": folderId, "parentId": targetParentId}}],
- )
- except Exception as e:
- return ToolResult(toolCallId="", toolName="moveFolder", success=False, error=str(e))
+ # ---- Phase 2: copyFile, editFile ----
async def _copyFile(args: Dict[str, Any], context: Dict[str, Any]):
fileId = args.get("fileId", "")
@@ -816,7 +658,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
chatService = services.chat
copiedFile = chatService.interfaceDbComponent.copyFile(
fileId,
- targetFolderId=args.get("targetFolderId"),
newFileName=args.get("newFileName"),
)
return ToolResult(
@@ -891,48 +732,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
except Exception as e:
return ToolResult(toolCallId="", toolName="replaceInFile", success=False, error=str(e))
- registry.register(
- "deleteFolder", _deleteFolder,
- description="Delete a folder from the local workspace. Set recursive=true to delete all contents.",
- parameters={
- "type": "object",
- "properties": {
- "folderId": {"type": "string", "description": "The folder ID to delete"},
- "recursive": {"type": "boolean", "description": "If true, delete folder and all contents (files and subfolders). Default: false"},
- },
- "required": ["folderId"]
- },
- readOnly=False
- )
-
- registry.register(
- "renameFolder", _renameFolder,
- description="Rename a folder in the local workspace.",
- parameters={
- "type": "object",
- "properties": {
- "folderId": {"type": "string", "description": "The folder ID to rename"},
- "newName": {"type": "string", "description": "New folder name"},
- },
- "required": ["folderId", "newName"]
- },
- readOnly=False
- )
-
- registry.register(
- "moveFolder", _moveFolder,
- description="Move a folder to a different parent in the local workspace.",
- parameters={
- "type": "object",
- "properties": {
- "folderId": {"type": "string", "description": "The folder ID to move"},
- "targetParentId": {"type": "string", "description": "Target parent folder ID (null/omit for root)"},
- },
- "required": ["folderId"]
- },
- readOnly=False
- )
-
registry.register(
"copyFile", _copyFile,
description="Create an independent copy of a file in the local workspace.",
@@ -940,7 +739,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
"type": "object",
"properties": {
"fileId": {"type": "string", "description": "The file ID to copy"},
- "targetFolderId": {"type": "string", "description": "Target folder for the copy (default: same folder)"},
"newFileName": {"type": "string", "description": "New file name (default: same name, auto-numbered if duplicate)"},
},
"required": ["fileId"]
@@ -948,6 +746,137 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
readOnly=False
)
+ # ---- Group tools (replaces folder-based tools) ----
+
+ async def _listGroups(args: Dict[str, Any], context: Dict[str, Any]):
+ contextKey = args.get("contextKey", "files/list")
+ try:
+ chatService = services.chat
+ appInterface = chatService.interfaceDbApp
+ existing = appInterface.getTableGrouping(contextKey)
+ if not existing:
+ return ToolResult(toolCallId="", toolName="listGroups", success=True, data="No groups found.")
+
+ def _flatten(nodes, depth=0):
+ result = []
+ for n in nodes:
+ nd = n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n))
+ result.append({"id": nd.get("id"), "name": nd.get("name"), "depth": depth, "itemCount": len(nd.get("itemIds", []))})
+ result.extend(_flatten(nd.get("subGroups", []), depth + 1))
+ return result
+
+ groups = _flatten(existing.rootGroups)
+ lines = "\n".join(
+ f"{' ' * g['depth']}- {g['name']} (id: {g['id']}, items: {g['itemCount']})"
+ for g in groups
+ ) if groups else "No groups found."
+ return ToolResult(toolCallId="", toolName="listGroups", success=True, data=lines)
+ except Exception as e:
+ return ToolResult(toolCallId="", toolName="listGroups", success=False, error=str(e))
+
+ async def _listItemsInGroup(args: Dict[str, Any], context: Dict[str, Any]):
+ groupId = args.get("groupId", "")
+ contextKey = args.get("contextKey", "files/list")
+ if not groupId:
+ return ToolResult(toolCallId="", toolName="listItemsInGroup", success=False, error="groupId is required")
+ try:
+ from modules.routes.routeHelpers import _collectItemIds
+ chatService = services.chat
+ appInterface = chatService.interfaceDbApp
+ existing = appInterface.getTableGrouping(contextKey)
+ if not existing:
+ return ToolResult(toolCallId="", toolName="listItemsInGroup", success=True, data="No groups found.")
+ nodes = [n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n)) for n in existing.rootGroups]
+ ids = _collectItemIds(nodes, groupId)
+ itemList = list(ids) if ids else []
+ return ToolResult(
+ toolCallId="", toolName="listItemsInGroup", success=True,
+ data="\n".join(f"- {fid}" for fid in itemList) if itemList else "No items in group.",
+ )
+ except Exception as e:
+ return ToolResult(toolCallId="", toolName="listItemsInGroup", success=False, error=str(e))
+
+ async def _addItemsToGroup(args: Dict[str, Any], context: Dict[str, Any]):
+ groupId = args.get("groupId", "")
+ itemIds = args.get("itemIds", [])
+ contextKey = args.get("contextKey", "files/list")
+ if not groupId:
+ return ToolResult(toolCallId="", toolName="addItemsToGroup", success=False, error="groupId is required")
+ if not itemIds:
+ return ToolResult(toolCallId="", toolName="addItemsToGroup", success=False, error="itemIds is required")
+ try:
+ chatService = services.chat
+ appInterface = chatService.interfaceDbApp
+ existing = appInterface.getTableGrouping(contextKey)
+ nodes = [n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n)) for n in (existing.rootGroups if existing else [])]
+
+ def _add(nds):
+ for nd in nds:
+ nid = nd.get("id") if isinstance(nd, dict) else getattr(nd, "id", None)
+ if nid == groupId:
+ existing_ids = list(nd.get("itemIds", []) if isinstance(nd, dict) else getattr(nd, "itemIds", []))
+ for fid in itemIds:
+ if fid not in existing_ids:
+ existing_ids.append(fid)
+ if isinstance(nd, dict):
+ nd["itemIds"] = existing_ids
+ return True
+ if _add(nd.get("subGroups", []) if isinstance(nd, dict) else getattr(nd, "subGroups", [])):
+ return True
+ return False
+
+ found = _add(nodes)
+ if not found:
+ return ToolResult(toolCallId="", toolName="addItemsToGroup", success=False, error=f"Group {groupId} not found")
+ appInterface.upsertTableGrouping(contextKey, nodes)
+ return ToolResult(
+ toolCallId="", toolName="addItemsToGroup", success=True,
+ data=f"Added {len(itemIds)} item(s) to group {groupId}",
+ )
+ except Exception as e:
+ return ToolResult(toolCallId="", toolName="addItemsToGroup", success=False, error=str(e))
+
+ registry.register(
+ "listGroups", _listGroups,
+ description="List all groups in the file grouping tree. Groups replace folders for organising files.",
+ parameters={
+ "type": "object",
+ "properties": {
+ "contextKey": {"type": "string", "description": "Grouping context key (default: 'files/list')"},
+ }
+ },
+ readOnly=True
+ )
+
+ registry.register(
+ "listItemsInGroup", _listItemsInGroup,
+ description="List all file IDs assigned to a specific group (includes sub-groups recursively).",
+ parameters={
+ "type": "object",
+ "properties": {
+ "groupId": {"type": "string", "description": "The group ID to inspect"},
+ "contextKey": {"type": "string", "description": "Grouping context key (default: 'files/list')"},
+ },
+ "required": ["groupId"]
+ },
+ readOnly=True
+ )
+
+ registry.register(
+ "addItemsToGroup", _addItemsToGroup,
+ description="Add one or more file IDs to an existing group.",
+ parameters={
+ "type": "object",
+ "properties": {
+ "groupId": {"type": "string", "description": "The group ID to add files to"},
+ "itemIds": {"type": "array", "items": {"type": "string"}, "description": "List of file IDs to add"},
+ "contextKey": {"type": "string", "description": "Grouping context key (default: 'files/list')"},
+ },
+ "required": ["groupId", "itemIds"]
+ },
+ readOnly=False
+ )
+
registry.register(
"replaceInFile", _replaceInFile,
description=(
diff --git a/modules/serviceCenter/services/serviceAgent/mainServiceAgent.py b/modules/serviceCenter/services/serviceAgent/mainServiceAgent.py
index fdf172aa..17eb83e4 100644
--- a/modules/serviceCenter/services/serviceAgent/mainServiceAgent.py
+++ b/modules/serviceCenter/services/serviceAgent/mainServiceAgent.py
@@ -69,7 +69,15 @@ class _ServicesAdapter:
@property
def workflow(self):
- return self._context.workflow
+ return getattr(self, "_workflow_override", None) or self._context.workflow
+
+ @workflow.setter
+ def workflow(self, value):
+ self._workflow_override = value
+ try:
+ self._context.workflow = value
+ except (AttributeError, TypeError):
+ pass
@property
def ai(self):
@@ -95,6 +103,13 @@ class _ServicesAdapter:
def extraction(self):
return self._getService("extraction")
+ @property
+ def interfaceDbComponent(self):
+ try:
+ return self.chat.interfaceDbComponent
+ except Exception:
+ return None
+
@property
def rbac(self):
"""Same RbacClass as workflow hub (MethodBase permission checks during discoverMethods)."""
@@ -268,24 +283,19 @@ class AgentService:
info = chatService.getFileInfo(fid)
if not info:
- folderInfo = chatService.interfaceDbComponent.getFolder(fid)
- if folderInfo:
- folderName = folderInfo.get("name", fid)
- folderFiles = chatService.listFiles(folderId=fid)
- desc = f"### Folder: {folderName}\n - id: {fid}\n - type: folder\n - contains: {len(folderFiles)} file(s)"
- if folderFiles:
- desc += "\n - files:"
- for ff in folderFiles[:30]:
- ffName = ff.get("fileName", "?")
- ffId = ff.get("id", "?")
- ffMime = ff.get("mimeType", "?")
- ffSize = ff.get("fileSize", ff.get("size", "?"))
- desc += f"\n * {ffName} (id: {ffId}, type: {ffMime}, size: {ffSize} bytes)"
- if len(folderFiles) > 30:
- desc += f"\n ... and {len(folderFiles) - 30} more files"
- desc += f'\nUse `listFiles(folderId="{fid}")` to get the full file list, then `readFile(fileId)` to read individual files.'
- fileDescriptions.append(desc)
- continue
+ # Check if fid is a group ID
+ try:
+ groupFileIds = chatService.listFilesInGroup(fid)
+ if groupFileIds:
+ allGroups = chatService.listGroups()
+ groupInfo = next((g for g in allGroups if g.get("id") == fid), None)
+ groupName = groupInfo.get("name", fid) if groupInfo else fid
+ desc = f"### Group: {groupName}\n - id: {fid}\n - type: group\n - contains: {len(groupFileIds)} file(s)"
+ desc += f'\nUse `listItemsInGroup(groupId="{fid}")` to get file IDs, then `readFile(fileId)` to read each.'
+ fileDescriptions.append(desc)
+ continue
+ except Exception:
+ pass
fileDescriptions.append(f"### File id: {fid}")
continue
@@ -333,7 +343,7 @@ class AgentService:
"These files/folders have been uploaded and processed through the extraction pipeline.\n"
"Use `readFile(fileId)` to read text content, `readContentObjects(fileId)` for structured access, "
"or `describeImage(fileId)` for image analysis.\n"
- "For folders, use `listFiles(folderId)` to get the files inside, then `readFile(fileId)` for each.\n"
+ "For groups, use `listItemsInGroup(groupId)` to get the file IDs inside, then `readFile(fileId)` for each.\n"
"For large PDFs/DOCX, avoid huge `renderDocument` tool JSON: build markdown with "
"`writeFile` (create + append), then `renderDocument(sourceFileId=that file id, outputFormat=...)`.\n"
"For small docs you may pass `content` inline. Embed images with `` in markdown.\n\n"
diff --git a/modules/serviceCenter/services/serviceAgent/sandboxExecutor.py b/modules/serviceCenter/services/serviceAgent/sandboxExecutor.py
index 15362e65..e4671a70 100644
--- a/modules/serviceCenter/services/serviceAgent/sandboxExecutor.py
+++ b/modules/serviceCenter/services/serviceAgent/sandboxExecutor.py
@@ -10,8 +10,8 @@ from typing import Dict, Any
logger = logging.getLogger(__name__)
-_PYTHON_ALLOWED_MODULES = {
- "math", "statistics", "json", "csv", "re", "datetime",
+SANDBOX_ALLOWED_MODULES = {
+ "math", "statistics", "json", "csv", "re", "datetime", "time",
"collections", "itertools", "functools", "decimal", "fractions",
"random", "string", "textwrap", "operator", "copy",
}
@@ -19,17 +19,33 @@ _PYTHON_ALLOWED_MODULES = {
_PYTHON_BLOCKED_BUILTINS = {
"open", "exec", "eval", "compile", "__import__", "globals", "locals",
"getattr", "setattr", "delattr", "breakpoint", "exit", "quit",
- "input", "memoryview", "type",
+ "input", "memoryview",
}
_MAX_EXECUTION_TIME_S = 30
_MAX_OUTPUT_CHARS = 50000
+_RESTRICTED_IO = None
+
+def _getRestrictedIo():
+ """Return a restricted ``io`` module exposing only StringIO/BytesIO."""
+ global _RESTRICTED_IO
+ if _RESTRICTED_IO is None:
+ import types
+ m = types.ModuleType("io")
+ m.StringIO = io.StringIO
+ m.BytesIO = io.BytesIO
+ _RESTRICTED_IO = m
+ return _RESTRICTED_IO
+
+
def _safeImport(name, *args, **kwargs):
"""Restricted import that only allows whitelisted modules."""
- if name not in _PYTHON_ALLOWED_MODULES:
- raise ImportError(f"Module '{name}' is not allowed. Permitted: {', '.join(sorted(_PYTHON_ALLOWED_MODULES))}")
+ if name == "io":
+ return _getRestrictedIo()
+ if name not in SANDBOX_ALLOWED_MODULES:
+ raise ImportError(f"Module '{name}' is not allowed. Permitted: io (StringIO/BytesIO only), {', '.join(sorted(SANDBOX_ALLOWED_MODULES))}")
return __builtins__["__import__"](name, *args, **kwargs) if isinstance(__builtins__, dict) else __import__(name, *args, **kwargs)
@@ -48,7 +64,7 @@ def _buildRestrictedGlobals() -> Dict[str, Any]:
safeBuiltins["__name__"] = "__sandbox__"
safeBuiltins["__builtins__"] = safeBuiltins
- for modName in _PYTHON_ALLOWED_MODULES:
+ for modName in SANDBOX_ALLOWED_MODULES:
try:
safeBuiltins[modName] = __import__(modName)
except ImportError:
@@ -57,12 +73,27 @@ def _buildRestrictedGlobals() -> Dict[str, Any]:
return {"__builtins__": safeBuiltins}
-async def executePython(code: str) -> Dict[str, Any]:
+def _makeReadFile(services):
+ """Create a readFile(fileId) closure bound to the current services context."""
+ def readFile(fileId: str) -> str:
+ mgmt = getattr(services, 'interfaceDbComponent', None) if services else None
+ if not mgmt:
+ raise RuntimeError("readFile: no file store available in this session")
+ data = mgmt.getFileData(str(fileId))
+ if data is None:
+ raise FileNotFoundError(f"File '{fileId}' not found in workspace")
+ return data.decode("utf-8")
+ return readFile
+
+
+async def executePython(code: str, *, services=None) -> Dict[str, Any]:
"""Execute Python code in a restricted sandbox. Returns {success, output, error}."""
import asyncio
def _run():
restrictedGlobals = _buildRestrictedGlobals()
+ if services:
+ restrictedGlobals["__builtins__"]["readFile"] = _makeReadFile(services)
capturedOutput = io.StringIO()
oldStdout = sys.stdout
oldStderr = sys.stderr
diff --git a/modules/serviceCenter/services/serviceAi/mainServiceAi.py b/modules/serviceCenter/services/serviceAi/mainServiceAi.py
index 6428bed3..bcdb9552 100644
--- a/modules/serviceCenter/services/serviceAi/mainServiceAi.py
+++ b/modules/serviceCenter/services/serviceAi/mainServiceAi.py
@@ -51,6 +51,10 @@ class _ServicesAdapter:
def workflow(self):
return self._context.workflow
+ @workflow.setter
+ def workflow(self, value):
+ self._context.workflow = value
+
@property
def chat(self):
return self._get_service("chat")
@@ -86,7 +90,7 @@ class _ServicesAdapter:
return getattr(w, "featureCode", None) if w else None
def __getattr__(self, name: str):
- if name in ("allowedProviders", "preferredProviders", "currentUserLanguage"):
+ if name in ("allowedProviders", "allowedModels", "preferredProviders", "currentUserLanguage"):
return getattr(self.workflow, name, None) if self.workflow else None
raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
@@ -164,12 +168,29 @@ class AiService:
# SPEECH_TEAMS: Dedicated pipeline, bypasses standard model selection
if request.options and request.options.operationType == OperationTypeEnum.SPEECH_TEAMS:
return await self._handleSpeechTeams(request)
-
- # FAIL-SAFE: Pre-flight billing validation (like 0 CHF credit card check)
- self._preflightBillingCheck()
-
- # Balance & provider permission checks
- await self._checkBillingBeforeAiCall()
+
+ _opType = request.options.operationType if request.options else None
+ _isNeutralizationCall = _opType in (
+ OperationTypeEnum.NEUTRALIZATION_TEXT,
+ OperationTypeEnum.NEUTRALIZATION_IMAGE,
+ )
+
+ if not _isNeutralizationCall:
+ # FAIL-SAFE: Pre-flight billing validation (like 0 CHF credit card check)
+ self._preflightBillingCheck()
+ # Balance & provider permission checks
+ await self._checkBillingBeforeAiCall()
+ else:
+ # Neutralization calls are system-level operations (connector anonymization).
+ # They run without a mandate context (e.g. personal-scope connections) and
+ # are billed the same way as embedding calls: best-effort, skipped when no
+ # billing settings exist for an empty mandate.
+ logger.debug(
+ "callAi: skipping billing preflight for neutralization call "
+ "(operationType=%s, user=%s)",
+ _opType,
+ getattr(getattr(self.services, 'user', None), 'id', 'unknown'),
+ )
# Calculate effective allowedProviders: RBAC ∩ Workflow
effectiveProviders = self._calculateEffectiveProviders()
@@ -177,6 +198,11 @@ class AiService:
request.options = request.options.model_copy(update={'allowedProviders': effectiveProviders})
logger.debug(f"Effective allowedProviders for AI request: {effectiveProviders}")
+ # Calculate effective allowedModels: Workflow ∩ Request (node-level)
+ effectiveModels = self._calculateEffectiveModels(request)
+ if effectiveModels and request.options:
+ request.options = request.options.model_copy(update={'allowedModels': effectiveModels})
+
# Neutralize prompt if enabled (before AI call)
_wasNeutralized = False
_excludedDocs: List[str] = []
@@ -218,13 +244,25 @@ class AiService:
Rehydration happens on the final AiCallResponse (not on individual str deltas).
"""
await self.ensureAiObjectsInitialized()
- self._preflightBillingCheck()
- await self._checkBillingBeforeAiCall()
+
+ _streamOpType = request.options.operationType if request.options else None
+ _isNeutralizationStream = _streamOpType in (
+ OperationTypeEnum.NEUTRALIZATION_TEXT,
+ OperationTypeEnum.NEUTRALIZATION_IMAGE,
+ )
+ if not _isNeutralizationStream:
+ self._preflightBillingCheck()
+ await self._checkBillingBeforeAiCall()
effectiveProviders = self._calculateEffectiveProviders()
if effectiveProviders and request.options:
request.options = request.options.model_copy(update={'allowedProviders': effectiveProviders})
+ # Calculate effective allowedModels: Workflow ∩ Request (node-level)
+ effectiveModels = self._calculateEffectiveModels(request)
+ if effectiveModels and request.options:
+ request.options = request.options.model_copy(update={'allowedModels': effectiveModels})
+
# Neutralize prompt if enabled (before streaming)
_wasNeutralized = False
_excludedDocs: List[str] = []
@@ -1240,6 +1278,43 @@ detectedIntent-Werte:
logger.warning(f"Error calculating effective providers: {e}")
return None
+ def _calculateEffectiveModels(self, request: AiCallRequest = None) -> Optional[List[str]]:
+ """
+ Calculate effective allowed models: Workflow.allowedModels ∩ request.options.allowedModels.
+
+ AND-logic intersection:
+ - If workflow specifies allowedModels, start with those.
+ - If request (node-level) also specifies allowedModels, intersect.
+ - Returns None if no model filtering is needed, if the intersection is empty, or if an error occurs.
+ """
+ try:
+ effectiveModels = None
+
+ # Workflow-level allowedModels (from automation config)
+ workflowModels = getattr(self.services, 'allowedModels', None)
+ if workflowModels:
+ effectiveModels = list(workflowModels)
+
+ # Request-level (node-level) allowedModels
+ requestModels = None
+ if request and request.options and request.options.allowedModels:
+ requestModels = request.options.allowedModels
+
+ if requestModels:
+ if effectiveModels:
+ effectiveModels = [m for m in effectiveModels if m in requestModels]
+ else:
+ effectiveModels = list(requestModels)
+
+ if effectiveModels:
+ logger.debug(f"Model filter: Workflow={workflowModels}, Request={requestModels}, Effective={effectiveModels}")
+
+ return effectiveModels if effectiveModels else None
+
+ except Exception as e:
+ logger.warning(f"Error calculating effective models: {e}")
+ return None
+
async def ensureAiObjectsInitialized(self):
"""Ensure aiObjects is initialized and submodules are ready."""
if self.aiObjects is None:
diff --git a/modules/serviceCenter/services/serviceChat/mainServiceChat.py b/modules/serviceCenter/services/serviceChat/mainServiceChat.py
index 0630c83b..0e69344a 100644
--- a/modules/serviceCenter/services/serviceChat/mainServiceChat.py
+++ b/modules/serviceCenter/services/serviceChat/mainServiceChat.py
@@ -199,13 +199,8 @@ class ChatService:
label = parts[1]
messageFound = None
for message in workflow.messages:
- # Validate message belongs to this workflow
msgWorkflowId = getattr(message, 'workflowId', None)
if not msgWorkflowId or msgWorkflowId != workflowId:
- if msgWorkflowId:
- logger.warning(f"Message {message.id} has workflowId {msgWorkflowId} but belongs to workflow {workflowId}. Skipping.")
- else:
- logger.warning(f"Message {message.id} has no workflowId. Skipping.")
continue
msgLabel = getattr(message, 'documentsLabel', None)
@@ -213,7 +208,6 @@ class ChatService:
messageFound = message
break
- # If found, add documents
if messageFound and messageFound.documents:
allDocuments.extend(messageFound.documents)
else:
@@ -419,7 +413,7 @@ class ChatService:
return None
def getFileInfo(self, fileId: str) -> Dict[str, Any]:
- """Get file information including new fields (tags, folderId, description, status)."""
+ """Get file information including new fields (tags, description, status)."""
fileItem = self.interfaceDbComponent.getFile(fileId)
if fileItem:
return {
@@ -430,7 +424,6 @@ class ChatService:
"fileHash": fileItem.fileHash,
"creationDate": fileItem.sysCreatedAt,
"tags": getattr(fileItem, "tags", None),
- "folderId": getattr(fileItem, "folderId", None),
"description": getattr(fileItem, "description", None),
"status": getattr(fileItem, "status", None),
}
@@ -449,14 +442,12 @@ class ChatService:
def listFiles(
self,
- folderId: str = None,
tags: List[str] = None,
search: str = None,
) -> List[Dict[str, Any]]:
"""List files for the current user with optional filters.
Args:
- folderId: Filter by folder (None = root / all).
tags: Filter by tags (any match).
search: Search in fileName and description.
@@ -469,10 +460,6 @@ class ChatService:
allFiles = self.interfaceDbComponent.getAllFiles()
results = []
for fileItem in allFiles:
- if folderId is not None:
- if fileItem.get("folderId") != folderId:
- continue
-
if tags:
itemTags = fileItem.get("tags") or []
if not any(t in itemTags for t in tags):
@@ -492,27 +479,40 @@ class ChatService:
"fileSize": fileItem.get("fileSize"),
"creationDate": fileItem.get("sysCreatedAt"),
"tags": fileItem.get("tags"),
- "folderId": fileItem.get("folderId"),
"description": fileItem.get("description"),
"status": fileItem.get("status"),
})
return results
- def listFolders(self, parentId: str = None) -> List[Dict[str, Any]]:
- """List file folders for the current user.
+ def listGroups(self, contextKey: str = "files/list") -> list:
+ """List all groups in the groupTree for the current context."""
+ try:
+ existing = self.interfaceDbApp.getTableGrouping(contextKey)
+ if not existing:
+ return []
+ def _flatten(nodes, depth=0):
+ result = []
+ for n in nodes:
+ nd = n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n))
+ result.append({"id": nd.get("id"), "name": nd.get("name"), "depth": depth, "itemCount": len(nd.get("itemIds", []))})
+ result.extend(_flatten(nd.get("subGroups", []), depth + 1))
+ return result
+ return _flatten(existing.rootGroups)
+ except Exception as e:
+ return []
- Args:
- parentId: Optional parent folder ID to filter by.
- None = return ALL folders (for tree building).
-
- Returns:
- List of folder dicts.
- """
- return self.interfaceDbComponent.listFolders(parentId=parentId)
-
- def createFolder(self, name: str, parentId: str = None) -> Dict[str, Any]:
- """Create a new file folder with unique name validation."""
- return self.interfaceDbComponent.createFolder(name=name, parentId=parentId)
+ def listFilesInGroup(self, groupId: str, contextKey: str = "files/list") -> list:
+ """List file IDs in a specific group (recursive)."""
+ try:
+ from modules.routes.routeHelpers import _collectItemIds
+ existing = self.interfaceDbApp.getTableGrouping(contextKey)
+ if not existing:
+ return []
+ nodes = [n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n)) for n in existing.rootGroups]
+ ids = _collectItemIds(nodes, groupId)
+ return list(ids) if ids else []
+ except Exception:
+ return []
# ---- DataSource CRUD ----
diff --git a/modules/serviceCenter/services/serviceClickup/mainServiceClickup.py b/modules/serviceCenter/services/serviceClickup/mainServiceClickup.py
index 6093e1bd..5bcd1d52 100644
--- a/modules/serviceCenter/services/serviceClickup/mainServiceClickup.py
+++ b/modules/serviceCenter/services/serviceClickup/mainServiceClickup.py
@@ -166,12 +166,28 @@ class ClickupService:
page: int = 0,
include_closed: bool = False,
subtasks: bool = True,
+ dateCreatedGt: Optional[int] = None,
+ dateCreatedLt: Optional[int] = None,
+ dateUpdatedGt: Optional[int] = None,
+ dateUpdatedLt: Optional[int] = None,
+ customFields: Optional[List[Dict[str, Any]]] = None,
) -> Dict[str, Any]:
params: Dict[str, Any] = {
"page": page,
"subtasks": str(subtasks).lower(),
"include_closed": str(include_closed).lower(),
}
+ if dateCreatedGt is not None:
+ params["date_created_gt"] = dateCreatedGt
+ if dateCreatedLt is not None:
+ params["date_created_lt"] = dateCreatedLt
+ if dateUpdatedGt is not None:
+ params["date_updated_gt"] = dateUpdatedGt
+ if dateUpdatedLt is not None:
+ params["date_updated_lt"] = dateUpdatedLt
+ if customFields:
+ import json as _json
+ params["custom_fields"] = _json.dumps(customFields)
return await self._request("GET", f"/list/{list_id}/task", params=params)
async def getTask(self, task_id: str, *, include_subtasks: bool = True) -> Dict[str, Any]:
diff --git a/modules/serviceCenter/services/serviceGeneration/mainServiceGeneration.py b/modules/serviceCenter/services/serviceGeneration/mainServiceGeneration.py
index b9377404..6afcc0a8 100644
--- a/modules/serviceCenter/services/serviceGeneration/mainServiceGeneration.py
+++ b/modules/serviceCenter/services/serviceGeneration/mainServiceGeneration.py
@@ -14,6 +14,7 @@ from .subDocumentUtility import (
detectMimeTypeFromData,
convertDocumentDataToString
)
+from .styleDefaults import resolveStyle
logger = logging.getLogger(__name__)
@@ -382,7 +383,7 @@ class GenerationService:
'workflowId': 'unknown'
}
- async def renderReport(self, extractedContent: Dict[str, Any], outputFormat: str, language: str, title: str, userPrompt: str = None, aiService=None, parentOperationId: Optional[str] = None) -> List[RenderedDocument]:
+ async def renderReport(self, extractedContent: Dict[str, Any], outputFormat: str, language: str, title: str, userPrompt: str = None, aiService=None, parentOperationId: Optional[str] = None, style: Optional[Dict[str, Any]] = None) -> List[RenderedDocument]:
"""
Render extracted JSON content to the specified output format.
Processes EACH document separately and calls renderer for each.
@@ -399,12 +400,14 @@ class GenerationService:
userPrompt: User's original prompt for report generation
aiService: AI service instance for generation prompt creation
parentOperationId: Optional parent operation ID for hierarchical logging
+ style: Optional style overrides (deep-merged with DEFAULT_STYLE)
Returns:
List of RenderedDocument objects.
Each RenderedDocument represents one rendered file (main document or supporting file)
"""
try:
+ resolvedStyle = resolveStyle(style)
# Validate JSON input
if not isinstance(extractedContent, dict):
raise ValueError("extractedContent must be a JSON dictionary")
@@ -469,7 +472,7 @@ class GenerationService:
docTitle = doc.get("title", title)
# Render this document (can return multiple files, e.g., HTML + images)
- renderedDocs = await renderer.render(singleDocContent, docTitle, userPrompt, aiService)
+ renderedDocs = await renderer.render(singleDocContent, docTitle, userPrompt, aiService, style=resolvedStyle)
allRenderedDocuments.extend(renderedDocs)
logger.info(f"Rendered {len(documents)} document(s) into {len(allRenderedDocuments)} file(s)")
diff --git a/modules/serviceCenter/services/serviceGeneration/renderers/documentRendererBaseTemplate.py b/modules/serviceCenter/services/serviceGeneration/renderers/documentRendererBaseTemplate.py
index b080ce88..583c423c 100644
--- a/modules/serviceCenter/services/serviceGeneration/renderers/documentRendererBaseTemplate.py
+++ b/modules/serviceCenter/services/serviceGeneration/renderers/documentRendererBaseTemplate.py
@@ -84,7 +84,7 @@ class BaseRenderer(ABC):
return list(supportedSectionTypes)
@abstractmethod
- async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
+ async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]:
"""
Render extracted JSON content to multiple documents.
Each renderer must implement this method.
@@ -95,6 +95,9 @@ class BaseRenderer(ABC):
title: Report title
userPrompt: Original user prompt for context
aiService: AI service instance for additional processing
+ style: Fully-resolved unified style dict from styleDefaults.resolveStyle().
+ When provided, renderers use these values instead of their
+ own defaults / AI-generated styles.
Returns:
List of RenderedDocument objects.
@@ -102,6 +105,112 @@ class BaseRenderer(ABC):
Even if only one document is returned, it must be wrapped in a list.
"""
pass
+
+ def _convertUnifiedStyleToInternal(self, style: Dict[str, Any]) -> Dict[str, Any]:
+ """Convert the unified resolvedStyle dict (from styleDefaults) into
+ the renderer-internal style-set format that all rendering methods already
+ consume. Override in subclasses for format-specific tweaks."""
+ h1 = style["headings"]["h1"]
+ h2 = style["headings"]["h2"]
+ h3 = style["headings"].get("h3", h2)
+ h4 = style["headings"].get("h4", h3)
+ tbl = style["table"]
+ para = style["paragraph"]
+ lst = style["list"]
+ cb = style["codeBlock"]
+ return {
+ "title": {
+ "font_size": h1["sizePt"], "color": h1["color"],
+ "bold": h1.get("weight") == "bold", "align": "left",
+ },
+ "heading1": {
+ "font_size": h1["sizePt"], "color": h1["color"],
+ "bold": h1.get("weight") == "bold", "align": "left",
+ },
+ "heading2": {
+ "font_size": h2["sizePt"], "color": h2["color"],
+ "bold": h2.get("weight") == "bold", "align": "left",
+ },
+ "heading3": {
+ "font_size": h3["sizePt"], "color": h3["color"],
+ "bold": h3.get("weight") == "bold", "align": "left",
+ },
+ "heading4": {
+ "font_size": h4["sizePt"], "color": h4["color"],
+ "bold": h4.get("weight") == "bold", "align": "left",
+ },
+ "paragraph": {
+ "font_size": para["sizePt"], "color": para["color"],
+ "bold": False, "align": "left",
+ },
+ "table_header": {
+ "background": tbl["headerBg"], "text_color": tbl["headerFg"],
+ "bold": True, "align": "center",
+ },
+ "table_cell": {
+ "background": tbl["rowBandingOdd"], "text_color": para["color"],
+ "bold": False, "align": "left",
+ },
+ "table_border": {
+ "style": "grid", "color": tbl["borderColor"],
+ },
+ "bullet_list": {
+ "font_size": lst["sizePt"], "color": para["color"],
+ "indent": lst["indentPt"],
+ },
+ "code_block": {
+ "font": style["fonts"]["monospace"],
+ "font_size": cb["fontSizePt"], "color": para["color"],
+ "background": cb["background"],
+ },
+ }
+
+ @staticmethod
+ def _inlineRunsFromContent(content: Dict[str, Any], *, itemsKey: str = None) -> Any:
+ """Extract inline runs from new-format content, falling back to old format.
+
+ For paragraphs (itemsKey=None):
+ new: content["inlineRuns"] -> List[InlineRun]
+ old: content["text"] -> wrapped in [{"type":"text","value":text}]
+
+ For list items (itemsKey="items"):
+ new: content["items"] is List[List[InlineRun]]
+ old: content["items"] is List[str] or List[{"text":…}]
+ Returns the items list (caller decides per-item conversion).
+
+ For table headers/cells:
+ new: each header/cell is List[InlineRun]
+ old: each header/cell is a plain str
+ Caller handles per-cell.
+ """
+ if itemsKey:
+ return content.get(itemsKey, [])
+ inlineRuns = content.get("inlineRuns")
+ if inlineRuns:
+ return inlineRuns
+ text = content.get("text", "")
+ if text:
+ return [{"type": "text", "value": text}]
+ return []
+
+ @staticmethod
+ def _inlineRunsForCell(cell) -> list:
+ """Normalize a single table header or cell value to List[InlineRun].
+ Accepts either a plain string or an already-correct list of run dicts."""
+ if isinstance(cell, list):
+ return cell
+ return [{"type": "text", "value": str(cell) if cell is not None else ""}]
+
+ @staticmethod
+ def _inlineRunsForListItem(item) -> list:
+ """Normalize a single list item to List[InlineRun].
+ Accepts a plain string, a dict with 'text', or an already-correct list of run dicts."""
+ if isinstance(item, list):
+ return item
+ if isinstance(item, dict):
+ text = item.get("text", "")
+ return [{"type": "text", "value": text}]
+ return [{"type": "text", "value": str(item)}]
def _determineFilename(self, title: str, mimeType: str) -> str:
"""Determine filename from title and mimeType."""
diff --git a/modules/serviceCenter/services/serviceGeneration/renderers/rendererCodeCsv.py b/modules/serviceCenter/services/serviceGeneration/renderers/rendererCodeCsv.py
index 962b8f04..cb6d77ca 100644
--- a/modules/serviceCenter/services/serviceGeneration/renderers/rendererCodeCsv.py
+++ b/modules/serviceCenter/services/serviceGeneration/renderers/rendererCodeCsv.py
@@ -79,7 +79,7 @@ class RendererCodeCsv(BaseCodeRenderer):
return renderedDocs
- async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
+ async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]:
"""
Render method for document generation compatibility.
Delegates to document renderer if needed, or handles code files directly.
diff --git a/modules/serviceCenter/services/serviceGeneration/renderers/rendererCodeJson.py b/modules/serviceCenter/services/serviceGeneration/renderers/rendererCodeJson.py
index 924ba861..dff849ef 100644
--- a/modules/serviceCenter/services/serviceGeneration/renderers/rendererCodeJson.py
+++ b/modules/serviceCenter/services/serviceGeneration/renderers/rendererCodeJson.py
@@ -91,7 +91,7 @@ class RendererCodeJson(BaseCodeRenderer):
return renderedDocs
- async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
+ async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]:
"""
Render method for document generation compatibility.
Delegates to document renderer if needed, or handles code files directly.
diff --git a/modules/serviceCenter/services/serviceGeneration/renderers/rendererCodeXml.py b/modules/serviceCenter/services/serviceGeneration/renderers/rendererCodeXml.py
index edab8f8e..6967f746 100644
--- a/modules/serviceCenter/services/serviceGeneration/renderers/rendererCodeXml.py
+++ b/modules/serviceCenter/services/serviceGeneration/renderers/rendererCodeXml.py
@@ -78,7 +78,7 @@ class RendererCodeXml(BaseCodeRenderer):
return renderedDocs
- async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
+ async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]:
"""
Render method for document generation compatibility.
For XML, we only support code generation (no document renderer exists yet).
diff --git a/modules/serviceCenter/services/serviceGeneration/renderers/rendererCsv.py b/modules/serviceCenter/services/serviceGeneration/renderers/rendererCsv.py
index 91312299..f5ee252b 100644
--- a/modules/serviceCenter/services/serviceGeneration/renderers/rendererCsv.py
+++ b/modules/serviceCenter/services/serviceGeneration/renderers/rendererCsv.py
@@ -39,7 +39,7 @@ class RendererCsv(BaseRenderer):
"""
return ["table", "code_block"]
- async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
+ async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]:
"""Render extracted JSON content to CSV format. Produces one CSV file per table section."""
try:
# Validate JSON structure
diff --git a/modules/serviceCenter/services/serviceGeneration/renderers/rendererDocx.py b/modules/serviceCenter/services/serviceGeneration/renderers/rendererDocx.py
index 7a1277ca..ab37f756 100644
--- a/modules/serviceCenter/services/serviceGeneration/renderers/rendererDocx.py
+++ b/modules/serviceCenter/services/serviceGeneration/renderers/rendererDocx.py
@@ -53,18 +53,17 @@ class RendererDocx(BaseRenderer):
from modules.datamodels.datamodelJson import supportedSectionTypes
return list(supportedSectionTypes)
- async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
+ async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]:
"""Render extracted JSON content to DOCX format using AI-analyzed styling."""
self.services.utils.debugLogToFile(f"DOCX RENDER CALLED: title={title}, user_prompt={userPrompt[:50] if userPrompt else 'None'}...", "DOCX_RENDERER")
try:
if not DOCX_AVAILABLE:
- # Fallback to HTML if python-docx not available
from .rendererHtml import RendererHtml
htmlRenderer = RendererHtml()
- return await htmlRenderer.render(extractedContent, title, userPrompt, aiService)
+ return await htmlRenderer.render(extractedContent, title, userPrompt, aiService, style=style)
# Generate DOCX using AI-analyzed styling
- docx_content = await self._generateDocxFromJson(extractedContent, title, userPrompt, aiService)
+ docx_content = await self._generateDocxFromJson(extractedContent, title, userPrompt, aiService, unifiedStyle=style)
# Extract metadata for document type and other info
metadata = extractedContent.get("metadata", {}) if extractedContent else {}
@@ -114,23 +113,27 @@ class RendererDocx(BaseRenderer):
)
]
- async def _generateDocxFromJson(self, json_content: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> str:
+ async def _generateDocxFromJson(self, json_content: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, unifiedStyle: Dict[str, Any] = None) -> str:
"""Generate DOCX content from structured JSON document."""
import time
start_time = time.time()
try:
self.logger.debug("_generateDocxFromJson: Starting document generation")
- # Create new document
doc = Document()
self.logger.debug(f"_generateDocxFromJson: Document created in {time.time() - start_time:.2f}s")
- # Get style set: use styles from metadata if available, otherwise enhance with AI
- template_from_metadata = None
- if json_content and isinstance(json_content.get("metadata"), dict):
- template_from_metadata = json_content["metadata"].get("templateName")
+ # Phase 3: prefer unified style when provided
style_start = time.time()
self.logger.debug("_generateDocxFromJson: About to get style set")
- styleSet = await self._getStyleSet(json_content, userPrompt, aiService, templateName=template_from_metadata)
+ if unifiedStyle:
+ styleSet = self._convertUnifiedStyleToInternal(unifiedStyle)
+ self._unifiedStyle = unifiedStyle
+ else:
+ template_from_metadata = None
+ if json_content and isinstance(json_content.get("metadata"), dict):
+ template_from_metadata = json_content["metadata"].get("templateName")
+ styleSet = await self._getStyleSet(json_content, userPrompt, aiService, templateName=template_from_metadata)
+ self._unifiedStyle = None
self.logger.debug(f"_generateDocxFromJson: Style set retrieved in {time.time() - style_start:.2f}s")
# Setup basic document styles and create all styles from style set
@@ -298,11 +301,11 @@ class RendererDocx(BaseRenderer):
def _setupBasicDocumentStyles(self, doc: Document) -> None:
"""Set up basic document styles."""
try:
- # Set default font
style = doc.styles['Normal']
font = style.font
- font.name = 'Calibri'
- font.size = Pt(11)
+ us = getattr(self, '_unifiedStyle', None)
+ font.name = us["fonts"]["primary"] if us else 'Calibri'
+ font.size = Pt(us["paragraph"]["sizePt"] if us else 11)
except Exception as e:
self.logger.warning(f"Could not set up basic document styles: {str(e)}")
@@ -421,6 +424,8 @@ class RendererDocx(BaseRenderer):
def _addMarkdownInlineRuns(self, paragraph, text: str) -> None:
"""Parse markdown inline formatting and add corresponding Runs to a python-docx paragraph."""
pos = 0
+ us = getattr(self, '_unifiedStyle', None)
+ monoFont = us["fonts"]["monospace"] if us else "Courier New"
for m in self._MD_INLINE_RE.finditer(text):
if m.start() > pos:
paragraph.add_run(text[pos:m.start()])
@@ -434,12 +439,45 @@ class RendererDocx(BaseRenderer):
paragraph.add_run(m.group(6)).italic = True
elif m.group(7):
run = paragraph.add_run(m.group(7))
- run.font.name = "Courier New"
+ run.font.name = monoFont
run.font.size = Pt(9)
pos = m.end()
if pos < len(text):
paragraph.add_run(text[pos:])
+ def _renderInlineRuns(self, runs: list, paragraph, styleSet: Dict[str, Any]) -> None:
+ """Process a list of InlineRun dicts into python-docx Runs on a paragraph."""
+ us = getattr(self, '_unifiedStyle', None)
+ monoFont = us["fonts"]["monospace"] if us else "Courier New"
+ for run in runs:
+ runType = run.get("type", "text")
+ value = run.get("value", "")
+ if runType == "text":
+ paragraph.add_run(value)
+ elif runType == "bold":
+ paragraph.add_run(value).bold = True
+ elif runType == "italic":
+ paragraph.add_run(value).italic = True
+ elif runType == "code":
+ r = paragraph.add_run(value)
+ r.font.name = monoFont
+ r.font.size = Pt(9)
+ elif runType == "link":
+ r = paragraph.add_run(value)
+ r.font.underline = True
+ r.font.color.rgb = RGBColor(0x29, 0x80, 0xB9)
+ elif runType == "image":
+ b64 = run.get("base64Data", "")
+ if b64:
+ try:
+ imgBytes = base64.b64decode(b64)
+ imgStream = io.BytesIO(imgBytes)
+ paragraph.add_run().add_picture(imgStream, width=Inches(2))
+ except Exception:
+ paragraph.add_run(f"[Image: {run.get('altText', '')}]")
+ else:
+ paragraph.add_run(value)
+
def _renderJsonTable(self, doc: Document, table_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
"""
Render a JSON table to DOCX using AI-generated styles.
@@ -485,7 +523,7 @@ class RendererDocx(BaseRenderer):
except Exception as e:
self.logger.error(f"Error rendering table: {str(e)}", exc_info=True)
- def _renderTableFastXml(self, doc: Document, headers: List[str], rows: List[List[Any]], styles: Dict[str, Any]) -> None:
+ def _renderTableFastXml(self, doc: Document, headers: list, rows: list, styles: Dict[str, Any]) -> None:
"""
High-performance table rendering using direct XML manipulation.
@@ -546,24 +584,34 @@ class RendererDocx(BaseRenderer):
# Build all rows using fast XML
rows_start = time.time()
- # Header row
- headerRow = self._createTableRowXml(headers, isHeader=True)
+ # Resolve header style colors
+ tableStyle = styles.get("table_header", {})
+ headerBg = tableStyle.get("background", "")
+ headerFg = tableStyle.get("text_color", "")
+
+ # Flatten inline-run headers to plain strings for fast XML path
+ flatHeaders = []
+ for h in headers:
+ runs = self._inlineRunsForCell(h)
+ flatHeaders.append("".join(r.get("value", "") for r in runs))
+
+ headerRow = self._createTableRowXml(flatHeaders, isHeader=True, headerBgHex=headerBg or None, headerFgHex=headerFg or None)
tbl.append(headerRow)
-
+
header_time = time.time() - rows_start
self.logger.debug(f"_renderTableFastXml: Header row created in {header_time:.3f}s")
-
- # Data rows - batch process for performance
+
data_start = time.time()
rowCount = len(rows)
-
+
for idx, rowData in enumerate(rows):
- # Convert all cells to strings
- cellTexts = [str(cell) if cell is not None else '' for cell in rowData]
- # Pad if needed
- while len(cellTexts) < len(headers):
+ cellTexts = []
+ for cell in rowData:
+ runs = self._inlineRunsForCell(cell)
+ cellTexts.append("".join(r.get("value", "") for r in runs))
+ while len(cellTexts) < len(flatHeaders):
cellTexts.append('')
-
+
row = self._createTableRowXml(cellTexts, isHeader=False)
tbl.append(row)
@@ -641,74 +689,64 @@ class RendererDocx(BaseRenderer):
return tblBorders
- def _createTableRowXml(self, cells: List[str], isHeader: bool = False) -> Any:
- """
- Create a table row XML element with cells.
-
- This is the core fast-path: builds the row XML directly without
- going through python-docx's slow cell.text assignment.
- """
+ def _createTableRowXml(self, cells: list, isHeader: bool = False, headerBgHex: str = None, headerFgHex: str = None) -> Any:
+ """Create a table row XML element with cells.
+ Fast-path: builds row XML directly via lxml."""
from docx.oxml.shared import OxmlElement, qn
-
+
+ if headerBgHex is None:
+ us = getattr(self, '_unifiedStyle', None)
+ headerBgHex = us["table"]["headerBg"].lstrip('#') if us else '1F3864'
+ else:
+ headerBgHex = headerBgHex.lstrip('#')
+ if headerFgHex is None:
+ us = getattr(self, '_unifiedStyle', None)
+ headerFgHex = us["table"]["headerFg"].lstrip('#') if us else 'FFFFFF'
+ else:
+ headerFgHex = headerFgHex.lstrip('#')
+
tr = OxmlElement('w:tr')
-
- # Row properties for header
if isHeader:
trPr = OxmlElement('w:trPr')
- tblHeader = OxmlElement('w:tblHeader')
- trPr.append(tblHeader)
+ trPr.append(OxmlElement('w:tblHeader'))
tr.append(trPr)
-
+
for cellText in cells:
- # Create cell
tc = OxmlElement('w:tc')
-
- # Cell properties
tcPr = OxmlElement('w:tcPr')
tcW = OxmlElement('w:tcW')
tcW.set(qn('w:type'), 'auto')
tcW.set(qn('w:w'), '0')
tcPr.append(tcW)
-
- # Header cell styling - light blue background
+
if isHeader:
shd = OxmlElement('w:shd')
shd.set(qn('w:val'), 'clear')
shd.set(qn('w:color'), 'auto')
- shd.set(qn('w:fill'), '4472C4') # Professional blue
+ shd.set(qn('w:fill'), headerBgHex)
tcPr.append(shd)
-
+
tc.append(tcPr)
-
- # Paragraph with text
p = OxmlElement('w:p')
-
- # Add run with text
r = OxmlElement('w:r')
-
- # Header text styling - bold and white
+
if isHeader:
rPr = OxmlElement('w:rPr')
- b = OxmlElement('w:b')
- rPr.append(b)
- # White text color
+ rPr.append(OxmlElement('w:b'))
color = OxmlElement('w:color')
- color.set(qn('w:val'), 'FFFFFF')
+ color.set(qn('w:val'), headerFgHex)
rPr.append(color)
r.append(rPr)
-
- # Text element
+
t = OxmlElement('w:t')
- # Preserve spaces if text starts/ends with whitespace
if cellText and (cellText[0] == ' ' or cellText[-1] == ' '):
t.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
t.text = cellText
r.append(t)
-
p.append(r)
tc.append(p)
tr.append(tc)
-
+
return tr
def _applyHorizontalBordersOnly(self, table) -> None:
@@ -836,47 +874,37 @@ class RendererDocx(BaseRenderer):
def _renderJsonBulletList(self, doc: Document, list_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
"""Render a JSON bullet list to DOCX using AI-generated styles - OPTIMIZED for performance."""
try:
- # Extract from nested content structure
content = list_data.get("content", {})
if not isinstance(content, dict):
return
items = content.get("items", [])
bullet_style = styles.get("bullet_list", {})
-
- # Pre-calculate and cache style objects to avoid repeated parsing
- font_size_pt = None
+
+ font_size_pt = Pt(bullet_style["font_size"]) if bullet_style.get("font_size") else None
text_color_rgb = None
- if bullet_style:
- if "font_size" in bullet_style:
- font_size_pt = Pt(bullet_style["font_size"])
- if "color" in bullet_style:
- color_hex = bullet_style["color"].lstrip('#')
- text_color_rgb = RGBColor(int(color_hex[0:2], 16), int(color_hex[2:4], 16), int(color_hex[4:6], 16))
-
+ if bullet_style.get("color"):
+ color_hex = bullet_style["color"].lstrip('#')
+ text_color_rgb = RGBColor(int(color_hex[0:2], 16), int(color_hex[2:4], 16), int(color_hex[4:6], 16))
+
for item in items:
- itemText = item if isinstance(item, str) else (item.get("text", "") if isinstance(item, dict) else "")
- if not itemText:
+ itemRuns = self._inlineRunsForListItem(item)
+ if not itemRuns or not any(r.get("value") for r in itemRuns):
continue
para = doc.add_paragraph(style='List Bullet')
- self._addMarkdownInlineRuns(para, itemText)
-
- # Apply bullet list styling from style set - use cached objects
- if bullet_style and para.runs:
- # Use direct access instead of iterating
- if len(para.runs) > 0:
- run = para.runs[0]
- if font_size_pt:
- run.font.size = font_size_pt
- if text_color_rgb:
- run.font.color.rgb = text_color_rgb
- else:
- # Create run if none exists
- run = para.add_run()
- if font_size_pt:
- run.font.size = font_size_pt
- if text_color_rgb:
- run.font.color.rgb = text_color_rgb
-
+ isNewRunFormat = isinstance(item, list)
+ if isNewRunFormat:
+ self._renderInlineRuns(itemRuns, para, styles)
+ else:
+ itemText = "".join(r.get("value", "") for r in itemRuns)
+ self._addMarkdownInlineRuns(para, itemText)
+
+ if bullet_style and para.runs and len(para.runs) > 0:
+ run = para.runs[0]
+ if font_size_pt:
+ run.font.size = font_size_pt
+ if text_color_rgb:
+ run.font.color.rgb = text_color_rgb
+
except Exception as e:
self.logger.warning(f"Error rendering bullet list: {str(e)}")
@@ -905,90 +933,79 @@ class RendererDocx(BaseRenderer):
def _renderJsonParagraph(self, doc: Document, paragraph_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
"""Render a JSON paragraph to DOCX using AI-generated styles."""
try:
- # Extract from nested content structure
content = paragraph_data.get("content", {})
if isinstance(content, dict):
- text = content.get("text", "")
+ inlineRuns = self._inlineRunsFromContent(content)
elif isinstance(content, str):
- text = content
+ inlineRuns = [{"type": "text", "value": content}]
else:
- text = ""
-
- # CRITICAL: Prevent rendering base64 image data as text
- # Base64 image data typically starts with /9j/ (JPEG) or iVBORw0KGgo (PNG)
- if text and (text.startswith("/9j/") or text.startswith("iVBORw0KGgo") or
- (len(text) > 100 and all(c in "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=" for c in text[:100]))):
- # This looks like base64 data - don't render as text
- self.logger.warning(f"Skipping rendering of what appears to be base64 data in paragraph (length: {len(text)})")
+ inlineRuns = []
+
+ if not inlineRuns:
+ return
+
+ plainText = "".join(r.get("value", "") for r in inlineRuns)
+ if plainText and (plainText.startswith("/9j/") or plainText.startswith("iVBORw0KGgo") or
+ (len(plainText) > 100 and all(c in "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=" for c in plainText[:100]))):
+ self.logger.warning(f"Skipping rendering of what appears to be base64 data in paragraph (length: {len(plainText)})")
para = doc.add_paragraph("[Error: Image data found in text content - image embedding may have failed]")
if para.runs:
- para.runs[0].font.color.rgb = RGBColor(255, 0, 0) # Red color for error
+ para.runs[0].font.color.rgb = RGBColor(255, 0, 0)
return
-
- if text:
- para = doc.add_paragraph()
- self._addMarkdownInlineRuns(para, text)
- paragraph_style = styles.get("paragraph", {})
- if paragraph_style:
- # Pre-calculate and cache style objects
- font_size_pt = None
- text_color_rgb = None
- if "font_size" in paragraph_style:
- font_size_pt = Pt(paragraph_style["font_size"])
- if "color" in paragraph_style:
- color_hex = paragraph_style["color"].lstrip('#')
- text_color_rgb = RGBColor(int(color_hex[0:2], 16), int(color_hex[2:4], 16), int(color_hex[4:6], 16))
- bold = paragraph_style.get("bold", False)
-
- # Use direct access instead of iterating
- if len(para.runs) > 0:
- run = para.runs[0]
- if font_size_pt:
- run.font.size = font_size_pt
- run.font.bold = bold
- if text_color_rgb:
- run.font.color.rgb = text_color_rgb
+
+ para = doc.add_paragraph()
+ hasNewRuns = content.get("inlineRuns") if isinstance(content, dict) else None
+ if hasNewRuns:
+ self._renderInlineRuns(inlineRuns, para, styles)
+ else:
+ self._addMarkdownInlineRuns(para, plainText)
+
+ paragraph_style = styles.get("paragraph", {})
+ if paragraph_style:
+ font_size_pt = Pt(paragraph_style["font_size"]) if "font_size" in paragraph_style else None
+ text_color_rgb = None
+ if "color" in paragraph_style:
+ color_hex = paragraph_style["color"].lstrip('#')
+ text_color_rgb = RGBColor(int(color_hex[0:2], 16), int(color_hex[2:4], 16), int(color_hex[4:6], 16))
+ bold = paragraph_style.get("bold", False)
+ if len(para.runs) > 0:
+ run = para.runs[0]
+ if font_size_pt:
+ run.font.size = font_size_pt
+ run.font.bold = bold
+ if text_color_rgb:
+ run.font.color.rgb = text_color_rgb
+ if "align" in paragraph_style:
+ align = paragraph_style["align"]
+ if align == "center":
+ para.alignment = WD_ALIGN_PARAGRAPH.CENTER
+ elif align == "right":
+ para.alignment = WD_ALIGN_PARAGRAPH.RIGHT
else:
- # Create run if none exists
- run = para.add_run()
- if font_size_pt:
- run.font.size = font_size_pt
- run.font.bold = bold
- if text_color_rgb:
- run.font.color.rgb = text_color_rgb
-
- if "align" in paragraph_style:
- align = paragraph_style["align"]
- if align == "center":
- para.alignment = WD_ALIGN_PARAGRAPH.CENTER
- elif align == "right":
- para.alignment = WD_ALIGN_PARAGRAPH.RIGHT
- else:
- para.alignment = WD_ALIGN_PARAGRAPH.LEFT
-
+ para.alignment = WD_ALIGN_PARAGRAPH.LEFT
+
except Exception as e:
self.logger.warning(f"Error rendering paragraph: {str(e)}")
def _renderJsonCodeBlock(self, doc: Document, code_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
"""Render a JSON code block to DOCX using AI-generated styles."""
try:
- # Extract from nested content structure
content = code_data.get("content", {})
if not isinstance(content, dict):
return
code = content.get("code", "")
language = content.get("language", "")
code_style = styles.get("code_block", {})
-
+ us = getattr(self, '_unifiedStyle', None)
+
if code:
if language:
lang_para = doc.add_paragraph(f"Code ({language}):")
if len(lang_para.runs) > 0:
lang_para.runs[0].bold = True
-
- # Pre-calculate and cache style objects
- code_font_name = code_style.get("font", "Courier New")
- code_font_size_pt = Pt(code_style.get("font_size", 9))
+
+ code_font_name = code_style.get("font", us["fonts"]["monospace"] if us else "Courier New")
+ code_font_size_pt = Pt(code_style.get("font_size", us["codeBlock"]["fontSizePt"] if us else 9))
code_text_color_rgb = None
if "color" in code_style:
color_hex = code_style["color"].lstrip('#')
diff --git a/modules/serviceCenter/services/serviceGeneration/renderers/rendererHtml.py b/modules/serviceCenter/services/serviceGeneration/renderers/rendererHtml.py
index 58143ac2..b39efd50 100644
--- a/modules/serviceCenter/services/serviceGeneration/renderers/rendererHtml.py
+++ b/modules/serviceCenter/services/serviceGeneration/renderers/rendererHtml.py
@@ -40,7 +40,7 @@ class RendererHtml(BaseRenderer):
from modules.datamodels.datamodelJson import supportedSectionTypes
return list(supportedSectionTypes)
- async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
+ async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]:
"""
Render HTML document with images as separate files.
Returns list of documents: [HTML document, image1, image2, ...]
@@ -54,7 +54,7 @@ class RendererHtml(BaseRenderer):
self._renderedImages = images
# Generate HTML using AI-analyzed styling
- htmlContent = await self._generateHtmlFromJson(extractedContent, title, userPrompt, aiService)
+ htmlContent = await self._generateHtmlFromJson(extractedContent, title, userPrompt, aiService, style=style)
# Replace base64 data URIs with relative file paths if images exist
if images:
@@ -107,11 +107,16 @@ class RendererHtml(BaseRenderer):
return resultDocuments
- async def _generateHtmlFromJson(self, jsonContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> str:
+ async def _generateHtmlFromJson(self, jsonContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> str:
"""Generate HTML content from structured JSON document using AI-generated styling."""
try:
- # Get style set: use styles from metadata if available, otherwise enhance with AI
- styles = await self._getStyleSet(jsonContent, userPrompt, aiService)
+ # Use unified style when provided, otherwise fall back to existing flow
+ if style:
+ styles = self._convertUnifiedStyleToInternal(style)
+ self._unifiedStyle = style
+ else:
+ styles = await self._getStyleSet(jsonContent, userPrompt, aiService)
+ self._unifiedStyle = None
# Validate JSON structure
if not self._validateJsonStructure(jsonContent):
@@ -272,6 +277,10 @@ class RendererHtml(BaseRenderer):
def _generateCssStyles(self, styles: Dict[str, Any]) -> str:
"""Generate CSS from style definitions."""
+ # When unified style is available, generate CSS directly from it
+ if getattr(self, "_unifiedStyle", None):
+ return self._generateCssFromUnifiedStyle(self._unifiedStyle)
+
css_parts = []
# Body styles
@@ -368,6 +377,164 @@ class RendererHtml(BaseRenderer):
return '\n'.join(css_parts)
+ def _generateCssFromUnifiedStyle(self, style: Dict[str, Any]) -> str:
+ """Generate CSS directly from unified style dict."""
+ fonts = style.get("fonts", {})
+ colors = style.get("colors", {})
+ headings = style.get("headings", {})
+ para = style.get("paragraph", {})
+ tbl = style.get("table", {})
+ lst = style.get("list", {})
+ cb = style.get("codeBlock", {})
+ page = style.get("page", {})
+
+ primaryFont = fonts.get("primary", "Arial, sans-serif")
+ monoFont = fonts.get("monospace", "Courier New, monospace")
+ bgColor = colors.get("background", "#FFFFFF")
+ primaryColor = colors.get("primary", "#1F3864")
+ paraColor = para.get("color", "#333333")
+ paraSizePt = para.get("sizePt", 11)
+ lineSpacing = para.get("lineSpacing", 1.15)
+
+ css_parts = []
+
+ # Body
+ css_parts.append("body {")
+ css_parts.append(f" font-family: {primaryFont};")
+ css_parts.append(f" background: {bgColor};")
+ css_parts.append(f" color: {paraColor};")
+ css_parts.append(f" font-size: {paraSizePt}pt;")
+ css_parts.append(f" line-height: {lineSpacing};")
+ margins = page.get("marginsPt", {})
+ if margins:
+ css_parts.append(f" margin: {margins.get('top', 60)}pt {margins.get('right', 60)}pt {margins.get('bottom', 60)}pt {margins.get('left', 60)}pt;")
+ else:
+ css_parts.append(" margin: 0; padding: 20px;")
+ css_parts.append("}")
+
+ # Document title (uses h1 style)
+ h1 = headings.get("h1", {})
+ css_parts.append(".document-title {")
+ css_parts.append(f" font-size: {h1.get('sizePt', 24)}pt;")
+ css_parts.append(f" color: {h1.get('color', primaryColor)};")
+ css_parts.append(f" font-weight: {h1.get('weight', 'bold')};")
+ css_parts.append(" margin: 0 0 1em 0;")
+ css_parts.append("}")
+
+ # Headings h1-h4
+ for level in range(1, 5):
+ key = f"h{level}"
+ h = headings.get(key, h1 if level == 1 else headings.get(f"h{level-1}", {}))
+ css_parts.append(f"h{level} {{")
+ css_parts.append(f" font-size: {h.get('sizePt', max(24 - (level-1)*4, 12))}pt;")
+ css_parts.append(f" color: {h.get('color', primaryColor)};")
+ css_parts.append(f" font-weight: {h.get('weight', 'bold')};")
+ css_parts.append(f" margin: 1.2em 0 0.4em 0;")
+ css_parts.append("}")
+
+ # Paragraphs
+ css_parts.append("p {")
+ css_parts.append(f" font-size: {paraSizePt}pt;")
+ css_parts.append(f" color: {paraColor};")
+ css_parts.append(f" line-height: {lineSpacing};")
+ css_parts.append(" margin: 0 0 1em 0;")
+ css_parts.append("}")
+
+ # Tables
+ borderColor = tbl.get("borderColor", "#DEE2E6")
+ css_parts.append("table {")
+ css_parts.append(f" border-collapse: collapse;")
+ css_parts.append(f" width: 100%;")
+ css_parts.append(f" margin: 1em 0;")
+ css_parts.append(f" border: 1px solid {borderColor};")
+ css_parts.append("}")
+
+ # Table headers
+ css_parts.append("th {")
+ css_parts.append(f" background: {tbl.get('headerBg', '#1F3864')};")
+ css_parts.append(f" color: {tbl.get('headerFg', '#FFFFFF')};")
+ css_parts.append(" font-weight: bold;")
+ css_parts.append(" text-align: center;")
+ css_parts.append(f" padding: 10px;")
+ css_parts.append(f" border: 1px solid {borderColor};")
+ css_parts.append("}")
+
+ # Table cells
+ css_parts.append("td {")
+ css_parts.append(f" color: {paraColor};")
+ css_parts.append(" padding: 8px;")
+ css_parts.append(f" border: 1px solid {borderColor};")
+ css_parts.append("}")
+
+ # Lists
+ css_parts.append("ul {")
+ css_parts.append(f" font-size: {lst.get('sizePt', paraSizePt)}pt;")
+ css_parts.append(f" color: {paraColor};")
+ css_parts.append(f" padding-left: {lst.get('indentPt', 18)}pt;")
+ css_parts.append(" margin: 0 0 1em 0;")
+ css_parts.append("}")
+
+ # Code blocks
+ css_parts.append("pre {")
+ css_parts.append(f" font-family: {monoFont};")
+ css_parts.append(f" font-size: {cb.get('fontSizePt', 9)}pt;")
+ css_parts.append(f" color: {paraColor};")
+ css_parts.append(f" background: {cb.get('background', '#F8F9FA')};")
+ css_parts.append(f" border: 1px solid {cb.get('borderColor', '#E2E8F0')};")
+ css_parts.append(" border-radius: 4px;")
+ css_parts.append(" padding: 1em;")
+ css_parts.append(" margin: 1em 0;")
+ css_parts.append(" overflow-x: auto;")
+ css_parts.append("}")
+
+ # Images
+ css_parts.append("img {")
+ css_parts.append(" max-width: 100%;")
+ css_parts.append(" height: auto;")
+ css_parts.append(" margin: 1em 0;")
+ css_parts.append(" border-radius: 4px;")
+ css_parts.append("}")
+
+ # Generated info
+ css_parts.append(".generated-info {")
+ css_parts.append(" font-size: 0.9em;")
+ css_parts.append(" color: #666;")
+ css_parts.append(" text-align: center;")
+ css_parts.append(" margin-top: 2em;")
+ css_parts.append(" padding-top: 1em;")
+ css_parts.append(" border-top: 1px solid #ddd;")
+ css_parts.append("}")
+
+ return '\n'.join(css_parts)
+
+ def _renderInlineRuns(self, runs: list) -> str:
+ """Convert inline runs to HTML markup."""
+ import html as htmlLib
+ parts = []
+ for run in runs:
+ runType = run.get("type", "text")
+ value = htmlLib.escape(run.get("value", ""))
+ if runType == "text":
+ parts.append(value)
+ elif runType == "bold":
+ parts.append(f"{value}")
+ elif runType == "italic":
+ parts.append(f"{value}")
+ elif runType == "code":
+ parts.append(f"{value}")
+ elif runType == "link":
+ href = htmlLib.escape(run.get("href", ""))
+ parts.append(f'{value}')
+ elif runType == "image":
+ b64 = run.get("base64Data", "")
+ mime = run.get("mimeType", "image/png")
+ alt = value
+ if b64:
+ parts.append(f'')
+ else:
+ parts.append(value)
+ return "".join(parts)
+
def _renderJsonSection(self, section: Dict[str, Any], styles: Dict[str, Any]) -> str:
"""Render a single JSON section to HTML using AI-generated styles.
Supports three content formats: reference, object (base64), extracted_text.
@@ -419,6 +586,11 @@ class RendererHtml(BaseRenderer):
# Regular paragraph element - extract from nested content structure (standard JSON format)
content = element.get("content", {})
if isinstance(content, dict):
+ # New format: inlineRuns
+ inlineRuns = content.get("inlineRuns")
+ if inlineRuns and isinstance(inlineRuns, list):
+ htmlParts.append(f'
{self._renderInlineRuns(inlineRuns)}
') + continue text = content.get("text", "") elif isinstance(content, str): text = content @@ -495,7 +667,8 @@ class RendererHtml(BaseRenderer): # Table header htmlParts.append('tags return '\n'.join(f'
{text}
' for text in texts) return "" elif isinstance(paragraphData, str): return f'{paragraphData}
' elif isinstance(paragraphData, dict): - # Handle nested content structure: element.content vs element.text # Extract from nested content structure content = paragraphData.get("content", {}) if isinstance(content, dict): + # New format: inlineRuns + inlineRuns = content.get("inlineRuns") + if inlineRuns and isinstance(inlineRuns, list): + return f'{self._renderInlineRuns(inlineRuns)}
' text = content.get("text", "") elif isinstance(content, str): text = content diff --git a/modules/serviceCenter/services/serviceGeneration/renderers/rendererImage.py b/modules/serviceCenter/services/serviceGeneration/renderers/rendererImage.py index 2aff559f..8141b798 100644 --- a/modules/serviceCenter/services/serviceGeneration/renderers/rendererImage.py +++ b/modules/serviceCenter/services/serviceGeneration/renderers/rendererImage.py @@ -43,7 +43,7 @@ class RendererImage(BaseRenderer): """ return ["image"] - async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]: + async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]: """Render extracted JSON content to image format using AI image generation.""" try: # Generate AI image from content diff --git a/modules/serviceCenter/services/serviceGeneration/renderers/rendererJson.py b/modules/serviceCenter/services/serviceGeneration/renderers/rendererJson.py index 076210bc..470d4543 100644 --- a/modules/serviceCenter/services/serviceGeneration/renderers/rendererJson.py +++ b/modules/serviceCenter/services/serviceGeneration/renderers/rendererJson.py @@ -42,7 +42,7 @@ class RendererJson(BaseRenderer): # Return all types except image return [st for st in supportedSectionTypes if st != "image"] - async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]: + async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]: """Render extracted JSON content to JSON format.""" try: # The extracted content should already be JSON from the AI diff --git a/modules/serviceCenter/services/serviceGeneration/renderers/rendererMarkdown.py 
b/modules/serviceCenter/services/serviceGeneration/renderers/rendererMarkdown.py index a3b8b5b3..552266e9 100644 --- a/modules/serviceCenter/services/serviceGeneration/renderers/rendererMarkdown.py +++ b/modules/serviceCenter/services/serviceGeneration/renderers/rendererMarkdown.py @@ -40,7 +40,7 @@ class RendererMarkdown(BaseRenderer): from modules.datamodels.datamodelJson import supportedSectionTypes return [st for st in supportedSectionTypes if st != "image"] - async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]: + async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]: """Render extracted JSON content to Markdown format.""" try: # Generate markdown from JSON structure diff --git a/modules/serviceCenter/services/serviceGeneration/renderers/rendererPdf.py b/modules/serviceCenter/services/serviceGeneration/renderers/rendererPdf.py index df2aff10..7913a246 100644 --- a/modules/serviceCenter/services/serviceGeneration/renderers/rendererPdf.py +++ b/modules/serviceCenter/services/serviceGeneration/renderers/rendererPdf.py @@ -106,17 +106,17 @@ class RendererPdf(BaseRenderer): from modules.datamodels.datamodelJson import supportedSectionTypes return list(supportedSectionTypes) - async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]: + async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]: """Render extracted JSON content to PDF format using AI-analyzed styling.""" try: if not REPORTLAB_AVAILABLE: # Fallback to HTML if reportlab not available from .rendererHtml import RendererHtml html_renderer = RendererHtml() - return await html_renderer.render(extractedContent, title, userPrompt, 
aiService) + return await html_renderer.render(extractedContent, title, userPrompt, aiService, style=style) # Generate PDF using AI-analyzed styling - pdf_content = await self._generatePdfFromJson(extractedContent, title, userPrompt, aiService) + pdf_content = await self._generatePdfFromJson(extractedContent, title, userPrompt, aiService, unifiedStyle=style) # Extract metadata for document type and other info metadata = extractedContent.get("metadata", {}) if extractedContent else {} @@ -163,11 +163,28 @@ class RendererPdf(BaseRenderer): ) ] - async def _generatePdfFromJson(self, json_content: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> str: + async def _generatePdfFromJson(self, json_content: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, unifiedStyle: Dict[str, Any] = None) -> str: """Generate PDF content from structured JSON document using AI-generated styling.""" try: - # Get style set: use styles from metadata if available, otherwise enhance with AI - styles = await self._getStyleSet(json_content, userPrompt, aiService) + # Get style set from unified style or legacy approach + if unifiedStyle: + styles = self._convertUnifiedStyleToInternal(unifiedStyle) + self._unifiedStyle = unifiedStyle + for level in range(1, 7): + hKey = f"heading{level}" + if hKey not in styles: + styles[hKey] = self._defaultHeadingStyleDef(level) + else: + styles[hKey].setdefault("space_after", 12) + styles[hKey].setdefault("space_before", 12) + styles["paragraph"].setdefault("space_after", 6) + styles["paragraph"].setdefault("line_height", unifiedStyle["paragraph"].get("lineSpacing", 1.2)) + styles["bullet_list"].setdefault("space_after", 3) + styles["code_block"].setdefault("space_after", 6) + styles["code_block"].setdefault("align", "left") + else: + styles = await self._getStyleSet(json_content, userPrompt, aiService) + self._unifiedStyle = None # Validate JSON structure if not self._validateJsonStructure(json_content): @@ -179,15 
+196,13 @@ class RendererPdf(BaseRenderer): # Create a buffer to hold the PDF buffer = io.BytesIO() - # Create PDF document - doc = SimpleDocTemplate( - buffer, - pagesize=A4, - rightMargin=72, - leftMargin=72, - topMargin=72, - bottomMargin=18 - ) + # Create PDF document with unified page margins or defaults + pageCfg = unifiedStyle["page"] if unifiedStyle else None + if pageCfg: + m = pageCfg["marginsPt"] + doc = SimpleDocTemplate(buffer, pagesize=A4, rightMargin=m["right"], leftMargin=m["left"], topMargin=m["top"], bottomMargin=m["bottom"]) + else: + doc = SimpleDocTemplate(buffer, pagesize=A4, rightMargin=72, leftMargin=72, topMargin=72, bottomMargin=18) # Build PDF content (no cover page — body starts on page 1; filename still uses `title`) story = [] @@ -232,13 +247,28 @@ class RendererPdf(BaseRenderer): removed = False for idx, flowable in enumerate(story): fRepr = repr(flowable) + if "Image" in fRepr and hasattr(flowable, 'drawWidth') and hasattr(flowable, 'drawHeight'): + from reportlab.platypus import Image as ReportLabImage + if isinstance(flowable, ReportLabImage): + frameH = 650.0 + frameW = 450.0 + if flowable.drawHeight > frameH or flowable.drawWidth > frameW: + scaleW = frameW / flowable.drawWidth if flowable.drawWidth > frameW else 1.0 + scaleH = frameH / flowable.drawHeight if flowable.drawHeight > frameH else 1.0 + s = min(scaleW, scaleH) * 0.9 + flowable.drawWidth = flowable.drawWidth * s + flowable.drawHeight = flowable.drawHeight * s + flowable._width = flowable.drawWidth + flowable._height = flowable.drawHeight + removed = True + break if "Table" in fRepr and hasattr(flowable, '_cellvalues'): try: nRows = len(flowable._cellvalues) nCols = len(flowable._cellvalues[0]) if flowable._cellvalues else 0 if nRows == 1 and nCols == 1: errPara = Paragraph( - "[Code block omitted — content too large for PDF page]", + "[Code block omitted - content too large for PDF page]", self._createNormalStyle({}), ) story[idx] = errPara @@ -609,6 +639,31 @@ class 
RendererPdf(BaseRenderer): .replace(">", ">") ) + def _renderInlineRunsToPdfXml(self, runs: list) -> str: + """Convert inline runs to ReportLab Paragraph XML.""" + parts = [] + us = getattr(self, '_unifiedStyle', None) + monoFont = us["fonts"]["monospace"] if us else "Courier" + for run in runs: + runType = run.get("type", "text") + value = self._escapeReportlabXml(run.get("value", "")) + if runType == "text": + parts.append(value) + elif runType == "bold": + parts.append(f"{value}") + elif runType == "italic": + parts.append(f"{value}") + elif runType == "code": + parts.append(f'{value}') + elif runType == "link": + href = self._escapeReportlabXml(run.get("href", "")) + parts.append(f'{value}') + elif runType == "image": + parts.append(f"[Image: {value}]") + else: + parts.append(value) + return "".join(parts) + def _applyInlineMarkdownToEscapedPlain(self, text: str) -> str: """Escape XML then apply bold/italic to a segment with no `code` spans (code is handled separately).""" if not text: @@ -744,10 +799,10 @@ class RendererPdf(BaseRenderer): return [] headers = content.get("headers", []) rows = content.get("rows", []) - + if not headers or not rows: return [] - + numCols = len(headers) colWidth = _PDF_CONTENT_WIDTH_PT / max(numCols, 1) colWidths = [colWidth] * numCols @@ -755,8 +810,12 @@ class RendererPdf(BaseRenderer): hdrPs = self._createTableCellParagraphStyle(styles, header=True, tableStyleKey="table_header") cellPs = self._createTableCellParagraphStyle(styles, header=False, tableStyleKey="table_cell") - def _cellPara(val, ps): - return self._paragraphFromInlineMarkdown(str(val) if val is not None else "", ps) + def _cellPara(cell, ps): + runs = self._inlineRunsForCell(cell) + if isinstance(cell, list): + xml = self._renderInlineRunsToPdfXml(runs) + return Paragraph(_wrapEmojiSpansInXml(xml), ps) + return self._paragraphFromInlineMarkdown(str(cell) if cell is not None else "", ps) headerRow = [_cellPara(h, hdrPs) for h in headers] bodyRows = [] @@ -786,7 
+845,7 @@ class RendererPdf(BaseRenderer): ] table.setStyle(TableStyle(table_style)) return [table, Spacer(1, 12)] - + except Exception as e: self.logger.warning(f"Error rendering table: {str(e)}") return [] @@ -794,32 +853,29 @@ class RendererPdf(BaseRenderer): def _renderJsonBulletList(self, list_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]: """Render a JSON bullet list to PDF elements using AI-generated styles.""" try: - # Extract from nested content structure content = list_data.get("content", {}) if not isinstance(content, dict): return [] items = content.get("items", []) - bullet_style_def = styles.get("bullet_list", {}) - + bulletStyleDef = styles.get("bullet_list", {}) + normalStyle = self._createNormalStyle(styles) + elements = [] for item in items: - if isinstance(item, str): - elements.append( - Paragraph(f"• {self._markdownInlineToReportlabXml(item)}", self._createNormalStyle(styles)) - ) + runs = self._inlineRunsForListItem(item) + if isinstance(item, list): + xml = self._renderInlineRunsToPdfXml(runs) + elements.append(Paragraph(f"\u2022 {_wrapEmojiSpansInXml(xml)}", normalStyle)) + elif isinstance(item, str): + elements.append(Paragraph(f"\u2022 {self._markdownInlineToReportlabXml(item)}", normalStyle)) elif isinstance(item, dict) and "text" in item: - elements.append( - Paragraph( - f"• {self._markdownInlineToReportlabXml(item['text'])}", - self._createNormalStyle(styles), - ) - ) - + elements.append(Paragraph(f"\u2022 {self._markdownInlineToReportlabXml(item['text'])}", normalStyle)) + if elements: - elements.append(Spacer(1, bullet_style_def.get("space_after", 3))) - + elements.append(Spacer(1, bulletStyleDef.get("space_after", 3))) + return elements - + except Exception as e: self.logger.warning(f"Error rendering bullet list: {str(e)}") return [] @@ -848,20 +904,27 @@ class RendererPdf(BaseRenderer): def _renderJsonParagraph(self, paragraph_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]: """Render a JSON paragraph to PDF 
elements using AI-generated styles.""" try: - # Extract from nested content structure content = paragraph_data.get("content", {}) - if isinstance(content, dict): - text = content.get("text", "") - elif isinstance(content, str): - text = content - else: - text = "" - + if isinstance(content, str): + content = {"text": content} + if not isinstance(content, dict): + return [] + + normalStyle = self._createNormalStyle(styles) + + if "inlineRuns" in content: + runs = self._inlineRunsFromContent(content) + xml = self._renderInlineRunsToPdfXml(runs) + if xml: + return [Paragraph(_wrapEmojiSpansInXml(xml), normalStyle)] + return [] + + text = content.get("text", "") if text: - return [self._paragraphFromInlineMarkdown(text, self._createNormalStyle(styles))] - + return [self._paragraphFromInlineMarkdown(text, normalStyle)] + return [] - + except Exception as e: self.logger.warning(f"Error rendering paragraph: {str(e)}") return [] @@ -1030,20 +1093,18 @@ class RendererPdf(BaseRenderer): pilImage = PILImage.open(imageStream) originalWidth, originalHeight = pilImage.size - # Calculate available page dimensions (A4 with margins: 72pt left/right, 72pt top, 18pt bottom) pageWidth = A4[0] # 595.27 points pageHeight = A4[1] # 841.89 points - leftMargin = 72 - rightMargin = 72 - topMargin = 72 - bottomMargin = 18 - - # Use actual frame dimensions from SimpleDocTemplate - # Frame is smaller than page minus margins due to internal spacing - # From error message: frame is 439.27559055118115 x 739.8897637795277 - # Use conservative values with safety margin - availableWidth = 430.0 # Slightly smaller than frame width for safety - availableHeight = 730.0 # Slightly smaller than frame height for safety + # Use page dimensions minus margins with generous safety buffer + # A4 = 595.27 x 841.89 pt; frame = page - margins - internal padding + _us = getattr(self, '_unifiedStyle', None) or {} + _pageMgn = (_us.get('page') or {}).get('marginsPt') or {} + marginTop = _pageMgn.get('top', 60) + 
marginBottom = _pageMgn.get('bottom', 60) + marginLeft = _pageMgn.get('left', 60) + marginRight = _pageMgn.get('right', 60) + availableWidth = pageWidth - marginLeft - marginRight - 20 # 20pt safety + availableHeight = pageHeight - marginTop - marginBottom - 80 # 80pt safety for header/footer # Convert original image size from pixels to points # PIL provides size in pixels, need to convert to points diff --git a/modules/serviceCenter/services/serviceGeneration/renderers/rendererPptx.py b/modules/serviceCenter/services/serviceGeneration/renderers/rendererPptx.py index 3bdff7f1..49ee8048 100644 --- a/modules/serviceCenter/services/serviceGeneration/renderers/rendererPptx.py +++ b/modules/serviceCenter/services/serviceGeneration/renderers/rendererPptx.py @@ -59,7 +59,7 @@ class RendererPptx(BaseRenderer): from modules.datamodels.datamodelJson import supportedSectionTypes return list(supportedSectionTypes) - async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]: + async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]: """ Render content as PowerPoint presentation from JSON data. 
@@ -68,7 +68,7 @@ class RendererPptx(BaseRenderer): title: Title for the presentation userPrompt: User prompt for AI styling aiService: AI service for styling - **kwargs: Additional rendering options + style: Unified style dict from pipeline (preferred over AI-generated styles) Returns: Base64-encoded PowerPoint presentation as string @@ -81,8 +81,19 @@ class RendererPptx(BaseRenderer): from pptx.dml.color import RGBColor import re - # Get style set: use styles from metadata if available, otherwise enhance with AI - styles = await self._getStyleSet(extractedContent, userPrompt, aiService) + # Get style set: prefer unified style, then metadata, then AI-enhanced + if style: + internalStyle = self._convertUnifiedStyleToInternal(style) + defaultPptx = self._getDefaultStyleSet() + for key in ("slide_size", "content_per_slide", "design_theme", "color_scheme", "background_style", "accent_colors", "professional_grade", "executive_ready"): + internalStyle[key] = defaultPptx.get(key) + internalStyle["heading"] = internalStyle["heading1"] + internalStyle["subheading"] = internalStyle["heading2"] + styles = internalStyle + self._unifiedStyle = style + else: + styles = await self._getStyleSet(extractedContent, userPrompt, aiService) + self._unifiedStyle = None # Create new presentation prs = Presentation() @@ -910,15 +921,17 @@ JSON ONLY. NO OTHER TEXT.""" # Extract from nested content structure content = paragraph_data.get("content", {}) if isinstance(content, dict): - text = content.get("text", "") + if content.get("inlineRuns"): + text = "".join(r.get("value", "") for r in content["inlineRuns"]) + else: + text = content.get("text", "") elif isinstance(content, str): text = content else: text = "" if text: - # Limit paragraph length based on content density - max_length = 200 # Default limit + max_length = 200 if len(text) > max_length: text = text[:max_length] + "..." @@ -1303,6 +1316,32 @@ JSON ONLY. 
NO OTHER TEXT.""" r.text = text[pos:] _applyBase(r) + def _renderInlineRunsPptx(self, runs, paragraph, fontSize=None, fontColor=None): + """Process InlineRun dicts into pptx text runs.""" + from pptx.util import Pt + paragraph.text = "" + us = getattr(self, '_unifiedStyle', None) + monoFont = us["fonts"]["monospace"] if us else "Courier New" + for run in runs: + runType = run.get("type", "text") + value = run.get("value", "") + r = paragraph.add_run() + r.text = value + if fontSize: + r.font.size = fontSize + if fontColor: + r.font.color.rgb = fontColor + if runType == "bold": + r.font.bold = True + elif runType == "italic": + r.font.italic = True + elif runType == "code": + r.font.name = monoFont + if fontSize and hasattr(fontSize, 'pt'): + r.font.size = Pt(max(8, int(fontSize.pt * 0.85))) + elif runType == "link": + r.font.underline = True + def _addTableToSlide(self, slide, element: Dict[str, Any], styles: Dict[str, Any], top: float = None, max_width: float = None) -> None: """Add a PowerPoint table to slide.""" try: @@ -1374,7 +1413,8 @@ JSON ONLY. NO OTHER TEXT.""" cell = table.cell(0, col_idx) # Clear existing text and set new text cell.text_frame.clear() - header_text = str(header) if header else "" + cellRuns = self._inlineRunsForCell(header) + header_text = "".join(r.get("value", "") for r in cellRuns) cell.text = header_text # Ensure paragraph exists @@ -1420,7 +1460,8 @@ JSON ONLY. NO OTHER TEXT.""" cell = table.cell(row_idx, col_idx) # Clear existing text and set new text cell.text_frame.clear() - cell_text = str(cell_data) if cell_data is not None else "" + cellRuns = self._inlineRunsForCell(cell_data) + cell_text = "".join(r.get("value", "") for r in cellRuns) cell.text = cell_text # Ensure paragraph exists @@ -1462,9 +1503,8 @@ JSON ONLY. 
NO OTHER TEXT.""" fontColor = RGBColor(*self._getSafeColor(listStyle.get("color", (47, 47, 47)))) for item in items: - itemText = item.get("text", "") if isinstance(item, dict) else str(item) - if not itemText or not itemText.strip(): - continue + runs = self._inlineRunsForListItem(item) + isNewFormat = isinstance(item, list) p = text_frame.add_paragraph() p.level = 0 @@ -1472,21 +1512,33 @@ JSON ONLY. NO OTHER TEXT.""" p.space_before = Pt(2) p.space_after = Pt(2) - # Consistent bullet prefix - self._addMarkdownInlineRuns(p, f" • {itemText}", fontSize=fontSize, fontColor=fontColor, fontBold=False) + if isNewFormat: + bulletRuns = [{"type": "text", "value": " \u2022 "}] + runs + self._renderInlineRunsPptx(bulletRuns, p, fontSize=fontSize, fontColor=fontColor) + else: + itemText = item.get("text", "") if isinstance(item, dict) else str(item) + if not itemText or not itemText.strip(): + continue + self._addMarkdownInlineRuns(p, f" \u2022 {itemText}", fontSize=fontSize, fontColor=fontColor, fontBold=False) - # Subitems + # Subitems (only for dict-style items) if isinstance(item, dict): for sub in item.get("subitems", []): - subText = sub.get("text", "") if isinstance(sub, dict) else str(sub) - if not subText: - continue + subRuns = self._inlineRunsForListItem(sub) + isSubNew = isinstance(sub, list) sp = text_frame.add_paragraph() sp.level = 0 sp.alignment = PP_ALIGN.LEFT sp.space_before = Pt(1) sp.space_after = Pt(1) - self._addMarkdownInlineRuns(sp, f" – {subText}", fontSize=fontSize, fontColor=fontColor, fontBold=False) + if isSubNew: + subBulletRuns = [{"type": "text", "value": " \u2013 "}] + subRuns + self._renderInlineRunsPptx(subBulletRuns, sp, fontSize=fontSize, fontColor=fontColor) + else: + subText = sub.get("text", "") if isinstance(sub, dict) else str(sub) + if not subText: + continue + self._addMarkdownInlineRuns(sp, f" \u2013 {subText}", fontSize=fontSize, fontColor=fontColor, fontBold=False) except Exception as e: logger.warning(f"Error adding bullet list 
to slide: {str(e)}") @@ -1540,42 +1592,53 @@ JSON ONLY. NO OTHER TEXT.""" # Extract from nested content structure content = element.get("content", {}) if isinstance(content, dict): + inlineRuns = self._inlineRunsFromContent(content) + hasInlineRuns = content.get("inlineRuns") is not None text = content.get("text", "") elif isinstance(content, str): text = content + inlineRuns = [{"type": "text", "value": text}] if text else [] + hasInlineRuns = False else: text = "" + inlineRuns = [] + hasInlineRuns = False - if text: - p = text_frame.add_paragraph() - p.level = 0 - - try: - if hasattr(p, 'paragraph_format'): - p.paragraph_format.bullet.type = None - except (AttributeError, TypeError): - pass - - paragraph_style = styles.get("paragraph", {}) - base_font_size = paragraph_style.get("font_size", 14) - calculated_size = max(10, int(base_font_size * font_size_multiplier)) - fSize = Pt(calculated_size) - fColor = RGBColor(*self._getSafeColor(paragraph_style.get("color", (47, 47, 47)))) - fBold = paragraph_style.get("bold", False) + if not inlineRuns and not text: + return + + p = text_frame.add_paragraph() + p.level = 0 + + try: + if hasattr(p, 'paragraph_format'): + p.paragraph_format.bullet.type = None + except (AttributeError, TypeError): + pass + + paragraph_style = styles.get("paragraph", {}) + base_font_size = paragraph_style.get("font_size", 14) + calculated_size = max(10, int(base_font_size * font_size_multiplier)) + fSize = Pt(calculated_size) + fColor = RGBColor(*self._getSafeColor(paragraph_style.get("color", (47, 47, 47)))) + fBold = paragraph_style.get("bold", False) + + if hasInlineRuns: + self._renderInlineRunsPptx(inlineRuns, p, fontSize=fSize, fontColor=fColor) + else: self._addMarkdownInlineRuns(p, text, fontSize=fSize, fontColor=fColor, fontBold=fBold) - - # Add proper spacing - p.space_before = Pt(6) # Space before paragraph - p.space_after = Pt(6) # Space after paragraph - p.line_spacing = 1.2 # Line spacing for readability - - align = 
paragraph_style.get("align", "left") - if align == "center": - p.alignment = PP_ALIGN.CENTER - elif align == "right": - p.alignment = PP_ALIGN.RIGHT - else: - p.alignment = PP_ALIGN.LEFT + + p.space_before = Pt(6) + p.space_after = Pt(6) + p.line_spacing = 1.2 + + align = paragraph_style.get("align", "left") + if align == "center": + p.alignment = PP_ALIGN.CENTER + elif align == "right": + p.alignment = PP_ALIGN.RIGHT + else: + p.alignment = PP_ALIGN.LEFT except Exception as e: logger.warning(f"Error adding paragraph to slide: {str(e)}") diff --git a/modules/serviceCenter/services/serviceGeneration/renderers/rendererText.py b/modules/serviceCenter/services/serviceGeneration/renderers/rendererText.py index 15a7161c..94400df9 100644 --- a/modules/serviceCenter/services/serviceGeneration/renderers/rendererText.py +++ b/modules/serviceCenter/services/serviceGeneration/renderers/rendererText.py @@ -76,7 +76,7 @@ class RendererText(BaseRenderer): # Text renderer accepts all types except images return [st for st in supportedSectionTypes if st != "image"] - async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]: + async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]: """Render extracted JSON content to plain text format.""" try: # Generate text from JSON structure diff --git a/modules/serviceCenter/services/serviceGeneration/renderers/rendererXlsx.py b/modules/serviceCenter/services/serviceGeneration/renderers/rendererXlsx.py index 79f5688c..3c6fdd5e 100644 --- a/modules/serviceCenter/services/serviceGeneration/renderers/rendererXlsx.py +++ b/modules/serviceCenter/services/serviceGeneration/renderers/rendererXlsx.py @@ -68,17 +68,17 @@ class RendererXlsx(BaseRenderer): from modules.datamodels.datamodelJson import supportedSectionTypes return list(supportedSectionTypes) - async def 
render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]: + async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]: """Render extracted JSON content to Excel format using AI-analyzed styling.""" try: if not OPENPYXL_AVAILABLE: # Fallback to CSV if openpyxl not available from .rendererCsv import RendererCsv csvRenderer = RendererCsv() - return await csvRenderer.render(extractedContent, title, userPrompt, aiService) + return await csvRenderer.render(extractedContent, title, userPrompt, aiService, style=style) # Generate Excel using AI-analyzed styling - excelContent = await self._generateExcelFromJson(extractedContent, title, userPrompt, aiService) + excelContent = await self._generateExcelFromJson(extractedContent, title, userPrompt, aiService, style=style) # Extract metadata for document type and other info metadata = extractedContent.get("metadata", {}) if extractedContent else {} @@ -298,15 +298,22 @@ class RendererXlsx(BaseRenderer): except Exception as e: self.logger.warning(f"Could not populate analysis sheet: {str(e)}") - async def _generateExcelFromJson(self, jsonContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> str: + async def _generateExcelFromJson(self, jsonContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> str: """Generate Excel content from structured JSON document using AI-generated styling.""" try: # Debug output self.services.utils.debugLogToFile(f"EXCEL JSON CONTENT TYPE: {type(jsonContent)}", "EXCEL_RENDERER") self.services.utils.debugLogToFile(f"EXCEL JSON CONTENT KEYS: {list(jsonContent.keys()) if isinstance(jsonContent, dict) else 'Not a dict'}", "EXCEL_RENDERER") - # Get style set: use styles from metadata if available, otherwise enhance with AI - styles = await 
self._getStyleSet(jsonContent, userPrompt, aiService) + # Store unified style for use by inline-run helpers + self._unifiedStyle = style + + # Get style set: prefer unified style, fall back to legacy approach + if style: + styles = self._convertUnifiedStyleToInternal(style) + styles = self._convertColorsFormat(styles) + else: + styles = await self._getStyleSet(jsonContent, userPrompt, aiService) # Validate JSON structure (standardized schema: {metadata: {...}, documents: [{sections: [...]}]}) if not self._validateJsonStructure(jsonContent): @@ -511,6 +518,10 @@ class RendererXlsx(BaseRenderer): "code_block": {"font": "Courier New", "font_size": 10, "color": "FF2F2F2F", "background": "FFF5F5F5"} } + def _renderInlineRuns(self, runs: list) -> str: + """Flatten inline runs to plain text for Excel cells.""" + return "".join(r.get("value", "") for r in runs) + async def _getAiStylesWithExcelColors(self, aiService, styleTemplate: str, defaultStyles: Dict[str, Any]) -> Dict[str, Any]: """Get AI styles with proper Excel color conversion.""" if not aiService: @@ -1206,7 +1217,9 @@ class RendererXlsx(BaseRenderer): # Add headers with formatting - OPTIMIZED: use cached style objects for col, header in enumerate(headers, 1): - sanitized_header = self._sanitizeCellValue(header) + runs = self._inlineRunsForCell(header) + headerText = self._renderInlineRuns(runs) + sanitized_header = self._sanitizeCellValue(headerText) cell = sheet.cell(row=headerRow, column=col, value=sanitized_header) # Apply styling with fallbacks - use pre-calculated objects @@ -1272,7 +1285,9 @@ class RendererXlsx(BaseRenderer): cell_values = cell_values[:header_count] for col, cell_value in enumerate(cell_values, 1): - sanitized_value = self._sanitizeCellValue(cell_value) + runs = self._inlineRunsForCell(cell_value) + cellText = self._renderInlineRuns(runs) + sanitized_value = self._sanitizeCellValue(cellText) cell = sheet.cell(row=startRow, column=col, value=sanitized_value) # Apply styling with fallbacks 
- use pre-calculated objects @@ -1311,20 +1326,20 @@ class RendererXlsx(BaseRenderer): def _addListToExcel(self, sheet, element: Dict[str, Any], styles: Dict[str, Any], startRow: int) -> int: """Add a list element to Excel sheet. Expects nested content structure.""" try: - # Extract from nested content structure content = element.get("content", {}) if not isinstance(content, dict): return startRow - list_items = content.get("items") or [] - # Ensure list_items is a list - if not isinstance(list_items, list): - list_items = [] + listItems = content.get("items") or [] + if not isinstance(listItems, list): + listItems = [] - list_style = styles.get("bullet_list", {}) - for item in list_items: - sheet.cell(row=startRow, column=1, value=f"• {item}") - if list_style.get("color"): - sheet.cell(row=startRow, column=1).font = Font(color=self._getSafeColor(list_style["color"])) + listStyle = styles.get("bullet_list", {}) + for item in listItems: + runs = self._inlineRunsForListItem(item) + text = self._renderInlineRuns(runs) + sheet.cell(row=startRow, column=1, value=f"\u2022 {text}") + if listStyle.get("color"): + sheet.cell(row=startRow, column=1).font = Font(color=self._getSafeColor(listStyle["color"])) startRow += 1 return startRow @@ -1336,10 +1351,10 @@ class RendererXlsx(BaseRenderer): def _addParagraphToExcel(self, sheet, element: Dict[str, Any], styles: Dict[str, Any], startRow: int) -> int: """Add a paragraph element to Excel sheet. 
Expects nested content structure.""" try: - # Extract from nested content structure content = element.get("content", {}) if isinstance(content, dict): - text = content.get("text", "") + runs = self._inlineRunsFromContent(content) + text = self._renderInlineRuns(runs) elif isinstance(content, str): text = content else: diff --git a/modules/serviceCenter/services/serviceGeneration/styleDefaults.py b/modules/serviceCenter/services/serviceGeneration/styleDefaults.py new file mode 100644 index 00000000..b5a92641 --- /dev/null +++ b/modules/serviceCenter/services/serviceGeneration/styleDefaults.py @@ -0,0 +1,75 @@ +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +"""Default style definitions and style resolution for document rendering.""" + +from typing import Any, Dict + + +DEFAULT_STYLE: Dict[str, Any] = { + "fonts": { + "primary": "Calibri", + "monospace": "Consolas", + }, + "colors": { + "primary": "#1F3864", + "secondary": "#2C3E50", + "accent": "#2980B9", + "background": "#FFFFFF", + }, + "headings": { + "h1": {"sizePt": 24, "weight": "bold", "color": "#1F3864", "spaceBeforePt": 12, "spaceAfterPt": 6}, + "h2": {"sizePt": 18, "weight": "bold", "color": "#1F3864", "spaceBeforePt": 10, "spaceAfterPt": 4}, + "h3": {"sizePt": 14, "weight": "bold", "color": "#2C3E50", "spaceBeforePt": 8, "spaceAfterPt": 3}, + "h4": {"sizePt": 12, "weight": "bold", "color": "#2C3E50", "spaceBeforePt": 6, "spaceAfterPt": 2}, + }, + "paragraph": {"sizePt": 11, "lineSpacing": 1.15, "color": "#333333"}, + "table": { + "headerBg": "#1F3864", + "headerFg": "#FFFFFF", + "headerSizePt": 10, + "bodySizePt": 10, + "rowBandingEven": "#F2F6FC", + "rowBandingOdd": "#FFFFFF", + "borderColor": "#CBD5E1", + "borderWidthPt": 0.5, + }, + "list": {"bulletChar": "\u2022", "indentPt": 18, "sizePt": 11}, + "image": {"defaultWidthPt": 480, "maxWidthPt": 800, "alignment": "center"}, + "codeBlock": {"fontSizePt": 9, "background": "#F8F9FA", "borderColor": "#E2E8F0"}, + "page": { + "format": "A4", + 
"marginsPt": {"top": 60, "bottom": 60, "left": 60, "right": 60}, + "showPageNumbers": True, + "headerHeight": 30, + "footerHeight": 30, + "headerLogo": None, + "headerText": "", + "footerText": "", + }, +} + + +def _deepMerge(base: Dict[str, Any], override: Dict[str, Any]) -> Dict[str, Any]: + """Recursively merge override into base. Both dicts left unchanged; returns new dict.""" + result = {} + for key in base: + if key in override: + baseVal = base[key] + overVal = override[key] + if isinstance(baseVal, dict) and isinstance(overVal, dict): + result[key] = _deepMerge(baseVal, overVal) + else: + result[key] = overVal + else: + result[key] = base[key] + for key in override: + if key not in base: + result[key] = override[key] + return result + + +def resolveStyle(agentStyle: dict | None) -> Dict[str, Any]: + """Deep-merge DEFAULT_STYLE <- agentStyle. Returns fully resolved style dict.""" + if not agentStyle: + return dict(DEFAULT_STYLE) + return _deepMerge(DEFAULT_STYLE, agentStyle) diff --git a/modules/serviceCenter/services/serviceGeneration/subDocumentUtility.py b/modules/serviceCenter/services/serviceGeneration/subDocumentUtility.py index 8a3e7cea..594fbe02 100644 --- a/modules/serviceCenter/services/serviceGeneration/subDocumentUtility.py +++ b/modules/serviceCenter/services/serviceGeneration/subDocumentUtility.py @@ -9,11 +9,70 @@ from typing import Any, Dict logger = logging.getLogger(__name__) +def _parseInlineRuns(text: str) -> list: + """ + Parse inline markdown formatting into a list of InlineRun dicts. + Handles: images, links, bold, italic, inline code, plain text. + Uses a regex-based tokenizer that processes tokens left-to-right. + """ + if not text: + return [{"type": "text", "value": ""}] + + # Pattern order matters: images before links, bold before italic + _TOKEN_RE = re.compile( + r'!\[(?P[^`]+)`' # inline code
+ r'|\*\*(?P<bold>.+?)\*\*' # bold
+ r'|(?<!\w)\*(?P<italic1>.+?)\*(?!\w)' # italic *x*
+ r'|(?<!\w)_(?P<italic2>.+?)_(?!\w)' # italic _x_
+ )
+
+ runs = []
+ lastEnd = 0
+
+ for m in _TOKEN_RE.finditer(text):
+ # Plain text before this match
+ if m.start() > lastEnd:
+ runs.append({"type": "text", "value": text[lastEnd:m.start()]})
+
+ if m.group("imgAlt") is not None or m.group("imgSrc") is not None:
+ alt = (m.group("imgAlt") or "").strip() or "Image"
+ src = (m.group("imgSrc") or "").strip()
+ widthStr = m.group("imgWidth")
+ run = {"type": "image", "value": alt}
+ if src.startswith("file:"):
+ run["fileId"] = src[5:]
+ else:
+ run["href"] = src
+ if widthStr:
+ run["widthPt"] = int(widthStr)
+ runs.append(run)
+ elif m.group("linkText") is not None:
+ runs.append({"type": "link", "value": m.group("linkText"), "href": m.group("linkHref")})
+ elif m.group("code") is not None:
+ runs.append({"type": "code", "value": m.group("code")})
+ elif m.group("bold") is not None:
+ runs.append({"type": "bold", "value": m.group("bold")})
+ elif m.group("italic1") is not None:
+ runs.append({"type": "italic", "value": m.group("italic1")})
+ elif m.group("italic2") is not None:
+ runs.append({"type": "italic", "value": m.group("italic2")})
+
+ lastEnd = m.end()
+
+ # Trailing plain text
+ if lastEnd < len(text):
+ runs.append({"type": "text", "value": text[lastEnd:]})
+
+ return runs if runs else [{"type": "text", "value": text}]
+
+
def markdownToDocumentJson(markdown: str, title: str, language: str = "de") -> Dict[str, Any]:
"""
- Convert markdown content to the standard document JSON format expected by renderReport.
- Supports headings, code blocks, tables, lists, images (file: refs), paragraphs.
- For plain text: wraps entire content in a single paragraph section.
+ Convert markdown content to the standard document JSON format with Inline-Run model.
+ Sections use inlineRuns (list of run dicts) instead of plain text strings.
+ Supports headings, code blocks, tables, lists, images, paragraphs.
"""
if not isinstance(markdown, str):
markdown = str(markdown) if markdown else ""
@@ -31,7 +90,7 @@ def markdownToDocumentJson(markdown: str, title: str, language: str = "de") -> D
while i < len(lines):
line = lines[i]
- # Headings
+ # Headings (plain text, no inline formatting)
headingMatch = re.match(r"^(#{1,6})\s+(.+)", line)
if headingMatch:
level = len(headingMatch.group(1))
@@ -43,7 +102,7 @@ def markdownToDocumentJson(markdown: str, title: str, language: str = "de") -> D
i += 1
continue
- # Fenced code blocks
+ # Fenced code blocks (no inline formatting)
codeMatch = re.match(r"^```(\w*)", line)
if codeMatch:
lang = codeMatch.group(1) or "text"
@@ -59,14 +118,14 @@ def markdownToDocumentJson(markdown: str, title: str, language: str = "de") -> D
})
continue
- # Tables
+ # Tables - cells are List[InlineRun]
tableMatch = re.match(r"^\|(.+)\|$", line)
if tableMatch and (i + 1) < len(lines) and re.match(r"^\|[\s\-:|]+\|$", lines[i + 1]):
- headerCells = [c.strip() for c in tableMatch.group(1).split("|")]
+ headerCells = [_parseInlineRuns(c.strip()) for c in tableMatch.group(1).split("|")]
i += 2
rows = []
while i < len(lines) and re.match(r"^\|(.+)\|$", lines[i]):
- rowCells = [c.strip() for c in lines[i][1:-1].split("|")]
+ rowCells = [_parseInlineRuns(c.strip()) for c in lines[i][1:-1].split("|")]
rows.append(rowCells)
i += 1
sections.append({
@@ -75,14 +134,14 @@ def markdownToDocumentJson(markdown: str, title: str, language: str = "de") -> D
})
continue
- # Bullet / numbered lists
+ # Bullet / numbered lists - items are List[List[InlineRun]]
listMatch = re.match(r"^(\s*)([-*+]|\d+[.)]) (.+)", line)
if listMatch:
isNumbered = bool(re.match(r"\d+[.)]", listMatch.group(2)))
items = []
while i < len(lines) and re.match(r"^(\s*)([-*+]|\d+[.)]) (.+)", lines[i]):
m = re.match(r"^(\s*)([-*+]|\d+[.)]) (.+)", lines[i])
- items.append({"text": m.group(3).strip()})
+ items.append(_parseInlineRuns(m.group(3).strip()))
i += 1
sections.append({
"id": _nextId(), "content_type": "bullet_list", "order": order,
@@ -95,46 +154,50 @@ def markdownToDocumentJson(markdown: str, title: str, language: str = "de") -> D
i += 1
continue
- # Images (simplified: store as paragraph with ref for now - full resolution needs Knowledge Store)
- imgMatch = re.match(r"^!\[([^\]]*)\]\(([^)]+)\)", line)
+ # Standalone image on its own line -> block-level image section
+ imgMatch = re.match(r"^!\[([^\]]*)\]\(([^)\"]+)(?:\s+\"(\d+)pt\")?\)\s*$", line)
if imgMatch:
altText = imgMatch.group(1).strip() or "Image"
src = imgMatch.group(2).strip()
+ widthStr = imgMatch.group(3)
fileId = src[5:] if src.startswith("file:") else ""
+ content = {
+ "altText": altText,
+ "base64Data": "",
+ "_fileRef": fileId,
+ "_srcUrl": src if not fileId else "",
+ }
+ if widthStr:
+ content["widthPt"] = int(widthStr)
sections.append({
"id": _nextId(), "content_type": "image", "order": order,
- "elements": [{
- "content": {
- "altText": altText,
- "base64Data": "",
- "_fileRef": fileId,
- "_srcUrl": src if not fileId else "",
- }
- }],
+ "elements": [{"content": content}],
})
i += 1
continue
- # Paragraph
+ # Paragraph - produces inlineRuns
paraLines = []
while i < len(lines) and lines[i].strip() and not re.match(
- r"^(#{1,6}\s|```|\|.+\||!\[|(\s*)([-*+]|\d+[.)]) )", lines[i]
+ r"^(#{1,6}\s|```|\|.+\||!\[[^\]]*\]\([^)]+\)\s*$|(\s*)([-*+]|\d+[.)]) )", lines[i]
):
paraLines.append(lines[i])
i += 1
if paraLines:
+ combinedText = " ".join(paraLines)
sections.append({
"id": _nextId(), "content_type": "paragraph", "order": order,
- "elements": [{"content": {"text": " ".join(paraLines)}}],
+ "elements": [{"content": {"inlineRuns": _parseInlineRuns(combinedText)}}],
})
continue
i += 1
if not sections:
+ fallbackText = markdown.strip() or "(empty)"
sections.append({
"id": _nextId(), "content_type": "paragraph", "order": order,
- "elements": [{"content": {"text": markdown.strip() or "(empty)"}}],
+ "elements": [{"content": {"inlineRuns": _parseInlineRuns(fallbackText)}}],
})
return {
diff --git a/modules/serviceCenter/services/serviceKnowledge/mainServiceKnowledge.py b/modules/serviceCenter/services/serviceKnowledge/mainServiceKnowledge.py
index dab8cc25..6698e164 100644
--- a/modules/serviceCenter/services/serviceKnowledge/mainServiceKnowledge.py
+++ b/modules/serviceCenter/services/serviceKnowledge/mainServiceKnowledge.py
@@ -2,9 +2,13 @@
# All rights reserved.
"""Knowledge service: 3-tier RAG with indexing, semantic search, and context building."""
+import hashlib
+import json
import logging
import re
-from typing import Any, Callable, Dict, List, Optional
+import time
+from dataclasses import dataclass, field
+from typing import Any, Callable, Dict, List, Optional, Union
from modules.datamodels.datamodelKnowledge import (
FileContentIndex, ContentChunk, WorkflowMemory,
@@ -20,6 +24,68 @@ DEFAULT_CHUNK_TOKENS = 400
DEFAULT_CONTEXT_BUDGET = 12000
+# =============================================================================
+# Ingestion façade (P0 of unified-knowledge-indexing concept)
+# =============================================================================
+
+@dataclass
+class IngestionJob:
+ """One request to add or refresh content in the unified knowledge store.
+
+ Callers from any lane (routes, feature hooks, agent tools, connector sync)
+ describe the work they want done via this object; idempotency, scope
+ resolution, and embedding are handled by KnowledgeService.requestIngestion.
+ """
+ sourceKind: str
+ sourceId: str
+ fileName: str
+ mimeType: str
+ userId: str
+ contentObjects: List[Dict[str, Any]] = field(default_factory=list)
+ featureInstanceId: str = ""
+ mandateId: str = ""
+ structure: Optional[Dict[str, Any]] = None
+ containerPath: Optional[str] = None
+ contentVersion: Optional[str] = None
+ provenance: Optional[Dict[str, Any]] = None
+ # Connector-driven neutralization: True when the user opted in via §2.6 preferences.
+ # For sourceKind == "file", _indexFileInternal resolves this from FileItem.neutralize instead.
+ neutralize: bool = False
+
+
+@dataclass
+class IngestionHandle:
+ """Result of requestIngestion. Stable across in-process and future queue impls."""
+ jobId: str
+ status: str
+ contentHash: str
+ fileId: str
+ index: Optional[FileContentIndex] = None
+ error: Optional[str] = None
+
+
+def _computeIngestionHash(contentObjects: List[Dict[str, Any]]) -> str:
+ """Deterministic SHA256 over (contentType, data) tuples in extractor order.
+
+ `contentObjectId` is intentionally excluded because extractors generate
+ fresh UUIDs per run (`uuid.uuid4()`), which would make the hash unstable
+ across re-extractions of the same source — defeating idempotency.
+ Order is preserved (no sort) because two different documents can share the
+ same multiset of parts but differ in arrangement (e.g. swapped pages).
+ Text whitespace is preserved intentionally because chunk boundaries
+ depend on it.
+ """
+ normalized = [
+ (
+ str(o.get("contentType", "text") or "text"),
+ o.get("data", "") or "",
+ )
+ for o in (contentObjects or [])
+ ]
+ payload = json.dumps(normalized, ensure_ascii=False, separators=(",", ":"))
+ return hashlib.sha256(payload.encode("utf-8")).hexdigest()
+
+
class KnowledgeService:
"""Service for Knowledge Store operations: indexing, retrieval, and context building."""
@@ -46,6 +112,224 @@ class KnowledgeService:
results = await self._embed([text])
return results[0] if results else []
+ # =========================================================================
+ # Ingestion façade (single entry point for all lanes)
+ # =========================================================================
+
+ async def requestIngestion(self, job: IngestionJob) -> IngestionHandle:
+ """Unified entry point for filling the knowledge corpus.
+
+ Applies idempotency based on a content hash (or caller-supplied
+ `contentVersion`) persisted in `FileContentIndex.structure._ingestion`.
+ Re-runs indexing only when the hash differs or the previous run did
+ not reach `indexed` state. Runs embedding synchronously for now
+ (callers already schedule background tasks where needed).
+ """
+ jobId = f"{job.sourceKind}:{job.sourceId}"
+ startMs = time.time()
+ contentHash = job.contentVersion or _computeIngestionHash(job.contentObjects)
+
+ # 1. Check for duplicate via existing FileContentIndex row.
+ existing = None
+ try:
+ existing = self._knowledgeDb.getFileContentIndex(job.sourceId)
+ except Exception:
+ existing = None
+
+ if existing:
+ existingStructure = (
+ existing.get("structure") if isinstance(existing, dict)
+ else getattr(existing, "structure", {})
+ ) or {}
+ existingMeta = existingStructure.get("_ingestion", {}) or {}
+ existingStatus = (
+ existing.get("status") if isinstance(existing, dict)
+ else getattr(existing, "status", "")
+ ) or ""
+ if existingMeta.get("hash") == contentHash and existingStatus == "indexed":
+ logger.info(
+ "ingestion.skipped.duplicate sourceKind=%s sourceId=%s hash=%s",
+ job.sourceKind, job.sourceId, contentHash[:12],
+ extra={
+ "event": "ingestion.skipped.duplicate",
+ "jobId": jobId,
+ "sourceKind": job.sourceKind,
+ "sourceId": job.sourceId,
+ "hash": contentHash,
+ "durationMs": int((time.time() - startMs) * 1000),
+ },
+ )
+ return IngestionHandle(
+ jobId=jobId,
+ status="duplicate",
+ contentHash=contentHash,
+ fileId=job.sourceId,
+ index=None,
+ )
+
+ # 2. Prepare ingestion metadata; stays in structure._ingestion so
+ # later connector revoke/purge can filter chunks by sourceKind /
+ # provenance.connectionId without a schema migration.
+ ingestionMeta = {
+ "hash": contentHash,
+ "sourceKind": job.sourceKind,
+ "sourceId": job.sourceId,
+ "contentVersion": job.contentVersion,
+ "indexedAt": getUtcTimestamp(),
+ "provenance": dict(job.provenance or {}),
+ }
+ structure = dict(job.structure or {})
+ structure["_ingestion"] = ingestionMeta
+
+ logger.info(
+ "ingestion.queued sourceKind=%s sourceId=%s objects=%d hash=%s",
+ job.sourceKind, job.sourceId, len(job.contentObjects or []), contentHash[:12],
+ extra={
+ "event": "ingestion.queued",
+ "jobId": jobId,
+ "sourceKind": job.sourceKind,
+ "sourceId": job.sourceId,
+ "hash": contentHash,
+ "objectCount": len(job.contentObjects or []),
+ },
+ )
+
+ # 3. Run real indexing.
+ try:
+ index = await self._indexFileInternal(
+ fileId=job.sourceId,
+ fileName=job.fileName,
+ mimeType=job.mimeType,
+ userId=job.userId,
+ featureInstanceId=job.featureInstanceId,
+ mandateId=job.mandateId,
+ contentObjects=job.contentObjects or [],
+ structure=structure,
+ containerPath=job.containerPath,
+ sourceKind=job.sourceKind,
+ connectionId=(job.provenance or {}).get("connectionId"),
+ neutralize=job.neutralize,
+ )
+ except Exception as exc:
+ logger.error(
+ "ingestion.failed sourceKind=%s sourceId=%s error=%s",
+ job.sourceKind, job.sourceId, exc,
+ exc_info=True,
+ extra={
+ "event": "ingestion.failed",
+ "jobId": jobId,
+ "sourceKind": job.sourceKind,
+ "sourceId": job.sourceId,
+ "hash": contentHash,
+ "error": str(exc),
+ "durationMs": int((time.time() - startMs) * 1000),
+ },
+ )
+ try:
+ self._knowledgeDb.updateFileStatus(job.sourceId, "failed")
+ except Exception:
+ pass
+ return IngestionHandle(
+ jobId=jobId,
+ status="failed",
+ contentHash=contentHash,
+ fileId=job.sourceId,
+ index=None,
+ error=str(exc),
+ )
+
+ logger.info(
+ "ingestion.indexed sourceKind=%s sourceId=%s objects=%d durationMs=%d",
+ job.sourceKind, job.sourceId, len(job.contentObjects or []),
+ int((time.time() - startMs) * 1000),
+ extra={
+ "event": "ingestion.indexed",
+ "jobId": jobId,
+ "sourceKind": job.sourceKind,
+ "sourceId": job.sourceId,
+ "hash": contentHash,
+ "objectCount": len(job.contentObjects or []),
+ "durationMs": int((time.time() - startMs) * 1000),
+ },
+ )
+ return IngestionHandle(
+ jobId=jobId,
+ status="indexed",
+ contentHash=contentHash,
+ fileId=job.sourceId,
+ index=index,
+ )
+
+ def purgeConnection(self, connectionId: str) -> Dict[str, int]:
+ """Delete every FileContentIndex + ContentChunk linked to a UserConnection.
+
+ Called on `connection.revoked` events so the knowledge corpus never
+ holds chunks the user has withdrawn access to. Returns deletion counts
+ for observability.
+ """
+ if not connectionId:
+ return {"indexRows": 0, "chunks": 0}
+ startMs = time.time()
+ result = self._knowledgeDb.deleteFileContentIndexByConnectionId(connectionId)
+ logger.info(
+ "ingestion.connection.purged connectionId=%s rows=%d chunks=%d durationMs=%d",
+ connectionId, result["indexRows"], result["chunks"],
+ int((time.time() - startMs) * 1000),
+ extra={
+ "event": "ingestion.connection.purged",
+ "connectionId": connectionId,
+ "indexRows": result["indexRows"],
+ "chunks": result["chunks"],
+ "durationMs": int((time.time() - startMs) * 1000),
+ },
+ )
+ return result
+
+ def getIngestionStatus(
+ self, handleOrJobId: Union[IngestionHandle, str]
+ ) -> Dict[str, Any]:
+ """Map a handle or `sourceKind:sourceId` jobId to a status snapshot."""
+ if isinstance(handleOrJobId, IngestionHandle):
+ sourceId = handleOrJobId.fileId
+ jobId = handleOrJobId.jobId
+ elif isinstance(handleOrJobId, str) and ":" in handleOrJobId:
+ jobId = handleOrJobId
+ sourceId = handleOrJobId.split(":", 1)[1]
+ else:
+ jobId = str(handleOrJobId)
+ sourceId = str(handleOrJobId)
+
+ row = None
+ try:
+ row = self._knowledgeDb.getFileContentIndex(sourceId)
+ except Exception:
+ row = None
+ if not row:
+ return {
+ "jobId": jobId,
+ "sourceId": sourceId,
+ "status": "unknown",
+ "contentHash": None,
+ }
+
+ structure = (
+ row.get("structure") if isinstance(row, dict)
+ else getattr(row, "structure", {})
+ ) or {}
+ meta = structure.get("_ingestion", {}) or {}
+ status = (
+ row.get("status") if isinstance(row, dict)
+ else getattr(row, "status", "")
+ ) or "unknown"
+ return {
+ "jobId": jobId,
+ "sourceId": sourceId,
+ "status": status,
+ "contentHash": meta.get("hash"),
+ "sourceKind": meta.get("sourceKind"),
+ "indexedAt": meta.get("indexedAt"),
+ }
+
# =========================================================================
# File Indexing (called after extraction, before embedding)
# =========================================================================
@@ -61,6 +345,57 @@ class KnowledgeService:
contentObjects: List[Dict[str, Any]] = None,
structure: Dict[str, Any] = None,
containerPath: str = None,
+ ) -> Optional[FileContentIndex]:
+ """Backward-compatible wrapper delegating to requestIngestion.
+
+ Existing callers that still invoke `indexFile` directly automatically
+ participate in the idempotency/metrics layer. New callers should
+ prefer `requestIngestion` so they can pass `sourceKind` and
+ `provenance` for connector revoke/purge later.
+ """
+ job = IngestionJob(
+ sourceKind="file",
+ sourceId=fileId,
+ fileName=fileName,
+ mimeType=mimeType,
+ userId=userId,
+ featureInstanceId=featureInstanceId,
+ mandateId=mandateId,
+ contentObjects=list(contentObjects or []),
+ structure=structure,
+ containerPath=containerPath,
+ )
+ handle = await self.requestIngestion(job)
+ if handle.index is not None:
+ return handle.index
+ if handle.status == "duplicate":
+ row = None
+ try:
+ row = self._knowledgeDb.getFileContentIndex(fileId)
+ except Exception:
+ row = None
+ if isinstance(row, dict):
+ try:
+ return FileContentIndex(**row)
+ except Exception:
+ return None
+ return row
+ return None
+
+ async def _indexFileInternal(
+ self,
+ fileId: str,
+ fileName: str,
+ mimeType: str,
+ userId: str,
+ featureInstanceId: str = "",
+ mandateId: str = "",
+ contentObjects: List[Dict[str, Any]] = None,
+ structure: Dict[str, Any] = None,
+ containerPath: str = None,
+ sourceKind: str = "file",
+ connectionId: Optional[str] = None,
+ neutralize: bool = False,
) -> FileContentIndex:
"""Index a file's content objects and create embeddings for text chunks.
@@ -83,39 +418,41 @@ class KnowledgeService:
"""
contentObjects = contentObjects or []
- # 1. Resolve scope fields from FileItem (Single Source of Truth)
- # FileItem lives in poweron_management; its scope/mandateId/featureInstanceId
- # are authoritative and must be mirrored onto the FileContentIndex.
+ # 1. Resolve scope fields from FileItem (Single Source of Truth) for
+ # uploaded files. Connector-sourced ingestion (sharepoint_item,
+ # outlook_message, ...) has no FileItem row — trust the caller's
+ # scope + ids directly.
resolvedScope = "personal"
resolvedMandateId = mandateId
resolvedFeatureInstanceId = featureInstanceId
resolvedUserId = userId
- _shouldNeutralize = False
- try:
- from modules.datamodels.datamodelFiles import FileItem as _FileItem
- _dbComponent = getattr(self._context, "interfaceDbComponent", None)
- _fileRecords = _dbComponent.getRecordset(_FileItem, recordFilter={"id": fileId}) if _dbComponent else []
- if not _fileRecords:
- from modules.interfaces.interfaceDbManagement import ComponentObjects
- _row = ComponentObjects().db._loadRecord(_FileItem, fileId)
- if _row:
- _fileRecords = [_row]
- if _fileRecords:
- _fileRecord = _fileRecords[0]
- _get = (lambda k, d=None: _fileRecord.get(k, d)) if isinstance(_fileRecord, dict) else (lambda k, d=None: getattr(_fileRecord, k, d))
- _shouldNeutralize = bool(_get("neutralize", False))
- _fileScope = _get("scope")
- if _fileScope:
- resolvedScope = _fileScope
- if not resolvedMandateId:
- resolvedMandateId = str(_get("mandateId", "") or "")
- if not resolvedFeatureInstanceId:
- resolvedFeatureInstanceId = str(_get("featureInstanceId", "") or "")
- _fileCreatedBy = _get("sysCreatedBy")
- if _fileCreatedBy:
- resolvedUserId = str(_fileCreatedBy)
- except Exception:
- pass
+ _shouldNeutralize = neutralize # caller-supplied flag (connector prefs / IngestionJob)
+ if sourceKind == "file":
+ try:
+ from modules.datamodels.datamodelFiles import FileItem as _FileItem
+ _dbComponent = getattr(self._context, "interfaceDbComponent", None)
+ _fileRecords = _dbComponent.getRecordset(_FileItem, recordFilter={"id": fileId}) if _dbComponent else []
+ if not _fileRecords:
+ from modules.interfaces.interfaceDbManagement import ComponentObjects
+ _row = ComponentObjects().db._loadRecord(_FileItem, fileId)
+ if _row:
+ _fileRecords = [_row]
+ if _fileRecords:
+ _fileRecord = _fileRecords[0]
+ _get = (lambda k, d=None: _fileRecord.get(k, d)) if isinstance(_fileRecord, dict) else (lambda k, d=None: getattr(_fileRecord, k, d))
+ _shouldNeutralize = bool(_get("neutralize", False)) # FileItem is authoritative for uploads
+ _fileScope = _get("scope")
+ if _fileScope:
+ resolvedScope = _fileScope
+ if not resolvedMandateId:
+ resolvedMandateId = str(_get("mandateId", "") or "")
+ if not resolvedFeatureInstanceId:
+ resolvedFeatureInstanceId = str(_get("featureInstanceId", "") or "")
+ _fileCreatedBy = _get("sysCreatedBy")
+ if _fileCreatedBy:
+ resolvedUserId = str(_fileCreatedBy)
+ except Exception:
+ pass
# 2. Create FileContentIndex with correct scope from the start
index = FileContentIndex(
@@ -124,6 +461,8 @@ class KnowledgeService:
featureInstanceId=resolvedFeatureInstanceId,
mandateId=resolvedMandateId,
scope=resolvedScope,
+ sourceKind=sourceKind,
+ connectionId=connectionId,
fileName=fileName,
mimeType=mimeType,
containerPath=containerPath,
@@ -300,7 +639,12 @@ class KnowledgeService:
Formatted context string for injection into the agent's system prompt.
"""
queryVector = await self._embedSingle(currentPrompt)
+ logger.debug(
+ "buildAgentContext.start userId=%s featureInstanceId=%s mandateId=%s isSysAdmin=%s prompt=%r",
+ userId, featureInstanceId, mandateId, isSysAdmin, (currentPrompt or "")[:120],
+ )
if not queryVector:
+ logger.debug("buildAgentContext.abort reason=no_query_vector")
return ""
builder = _ContextBuilder(budget=contextBudget)
@@ -327,9 +671,14 @@ class KnowledgeService:
featureInstanceId=featureInstanceId,
mandateId=mandateId,
limit=15,
- minScore=0.65,
+ minScore=0.35,
isSysAdmin=isSysAdmin,
)
+ logger.debug(
+ "buildAgentContext.layer1 instanceChunks=%d top_scores=%s",
+ len(instanceChunks),
+ [round(float(c.get("_score", 0) or 0), 3) for c in (instanceChunks or [])[:3]],
+ )
if instanceChunks:
builder.add(priority=1, label="Relevant Documents", items=instanceChunks, maxChars=4000)
@@ -338,7 +687,7 @@ class KnowledgeService:
queryVector=queryVector,
workflowId=workflowId,
limit=10,
- minScore=0.55,
+ minScore=0.35,
)
if roundMemories:
memItems = []
@@ -376,7 +725,7 @@ class KnowledgeService:
scope="mandate",
mandateId=mandateId,
limit=10,
- minScore=0.7,
+ minScore=0.35,
isSysAdmin=isSysAdmin,
)
if mandateChunks:
@@ -392,7 +741,12 @@ class KnowledgeService:
maxChars=500,
)
- return builder.build()
+ _result = builder.build()
+ logger.debug(
+ "buildAgentContext.done totalChars=%d userId=%s",
+ len(_result), userId,
+ )
+ return _result
# =========================================================================
# Workflow Memory
diff --git a/modules/serviceCenter/services/serviceKnowledge/subConnectorIngestConsumer.py b/modules/serviceCenter/services/serviceKnowledge/subConnectorIngestConsumer.py
new file mode 100644
index 00000000..97ac61d5
--- /dev/null
+++ b/modules/serviceCenter/services/serviceKnowledge/subConnectorIngestConsumer.py
@@ -0,0 +1,334 @@
+# Copyright (c) 2025 Patrick Motsch
+# All rights reserved.
+"""Connection-lifecycle consumer bridging OAuth events to ingestion jobs.
+
+Subscribes to `connection.established` and `connection.revoked` callbacks
+emitted by the OAuth callbacks / connection management routes and dispatches:
+
+- `connection.established` -> enqueue a `connection.bootstrap` BackgroundJob
+ that walks the connector and ingests all reachable items via
+ KnowledgeService.requestIngestion (file-like or virtual documents).
+- `connection.revoked` -> run `KnowledgeService.purgeConnection` synchronously
+ so the knowledge corpus releases the data before the UI confirms the revoke.
+
+The consumer is registered once at process boot (see `app.py` lifespan).
+It intentionally does NOT hold a per-user service context; each callback
+creates whatever context it needs from the UserConnection row itself.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+from typing import Any, Dict, Optional
+
+from modules.interfaces.interfaceDbKnowledge import getInterface as getKnowledgeInterface
+from modules.shared.callbackRegistry import callbackRegistry
+from modules.serviceCenter.services.serviceBackgroundJobs import (
+ registerJobHandler,
+ startJob,
+)
+
+logger = logging.getLogger(__name__)
+
+BOOTSTRAP_JOB_TYPE = "connection.bootstrap"
+
+_registered = False
+
+
+ def _onConnectionEstablished(
+     *,
+     connectionId: str,
+     authority: str,
+     userId: Optional[str] = None,
+     **kwargs: Any,
+ ) -> None:
+     """Fire-and-forget bootstrap enqueue for a freshly connected UserConnection.
+
+     Args:
+         connectionId: Id of the connection; the event is ignored when empty.
+         authority: Provider name; stored lower-cased in the job payload.
+         userId: Forwarded in the payload and as the job's `triggeredBy`.
+         **kwargs: Extra event fields, accepted and ignored.
+
+     Enqueue failures are logged and never raised to the event emitter.
+     """
+     if not connectionId:
+         logger.warning("connection.established without connectionId; ignoring")
+         return
+     payload: Dict[str, Any] = {
+         "connectionId": connectionId,
+         "authority": (authority or "").lower(),
+         "userId": userId,
+     }
+     logger.info(
+         "ingestion.connection.bootstrap.queued connectionId=%s authority=%s",
+         connectionId, authority,
+         extra={
+             "event": "ingestion.connection.bootstrap.queued",
+             "connectionId": connectionId,
+             "authority": authority,
+         },
+     )
+
+     async def _enqueue() -> None:
+         try:
+             await startJob(
+                 BOOTSTRAP_JOB_TYPE,
+                 payload,
+                 triggeredBy=userId,
+             )
+         except Exception as exc:
+             logger.error(
+                 "ingestion.connection.bootstrap.enqueue_failed connectionId=%s error=%s",
+                 connectionId, exc, exc_info=True,
+             )
+
+     try:
+         # get_running_loop() is the supported way to detect an active loop;
+         # get_event_loop() is deprecated when no loop is running.
+         loop = asyncio.get_running_loop()
+     except RuntimeError:
+         # Synchronous caller without a loop in this thread: run to completion.
+         asyncio.run(_enqueue())
+         return
+
+     # The event loop only keeps weak references to tasks, so hold a strong
+     # one until the task finishes to keep it from being garbage-collected.
+     pending = _onConnectionEstablished.__dict__.setdefault("_pendingTasks", set())
+     task = loop.create_task(_enqueue())
+     pending.add(task)
+     task.add_done_callback(pending.discard)
+
+
+ def _onConnectionRevoked(
+     *,
+     connectionId: str,
+     authority: Optional[str] = None,
+     userId: Optional[str] = None,
+     reason: Optional[str] = None,
+     **kwargs: Any,
+ ) -> None:
+     """Purge all knowledge data of a revoked connection, synchronously.
+
+     The purge goes straight to the knowledge DB interface, so no
+     ServiceCenter or user context is needed here. A failing purge is logged
+     and swallowed; a successful one is logged with the deleted row and chunk
+     counts. Events without a `connectionId` are ignored.
+     """
+     if not connectionId:
+         logger.warning("connection.revoked without connectionId; ignoring")
+         return
+
+     try:
+         knowledgeDb = getKnowledgeInterface(None)
+         purged = knowledgeDb.deleteFileContentIndexByConnectionId(connectionId)
+     except Exception as exc:
+         logger.error(
+             "ingestion.connection.purged.failed connectionId=%s error=%s",
+             connectionId, exc, exc_info=True,
+         )
+         return
+
+     rowCount = purged.get("indexRows", 0)
+     chunkCount = purged.get("chunks", 0)
+     logFields = {
+         "event": "ingestion.connection.purged",
+         "connectionId": connectionId,
+         "authority": authority,
+         "reason": reason,
+         "indexRows": rowCount,
+         "chunks": chunkCount,
+     }
+     logger.info(
+         "ingestion.connection.purged connectionId=%s authority=%s reason=%s rows=%d chunks=%d",
+         connectionId, authority, reason, rowCount, chunkCount,
+         extra=logFields,
+     )
+
+
+ async def _bootstrapJobHandler(
+     job: Dict[str, Any],
+     progressCb,
+ ) -> Dict[str, Any]:
+     """Dispatch bootstrap by authority. Each authority runs its own sub-bootstraps.
+
+     Args:
+         job: Background job record; `job["payload"]` must carry
+             `connectionId` and should carry `authority`.
+         progressCb: Called as `progressCb(percent, message)` and handed on to
+             the per-connector bootstrap functions.
+
+     Returns:
+         A dict with `connectionId`, `authority` and one entry per walked
+         surface (`sharepoint`/`outlook`, `drive`/`gmail`, `clickup`), or
+         `skipped=True` with a `reason` when consent is disabled or the
+         authority is not supported.
+
+     Raises:
+         ValueError: If the payload has no `connectionId`.
+     """
+     payload = job.get("payload") or {}
+     connectionId = payload.get("connectionId")
+     authority = (payload.get("authority") or "").lower()
+     if not connectionId:
+         raise ValueError("connection.bootstrap requires payload.connectionId")
+
+     progressCb(5, f"resolving {authority} connection")
+
+     # Defensive consent check: if the connection has since disabled knowledge ingestion
+     # (e.g. user toggled setting after the job was enqueued), skip all walkers.
+     # NOTE(review): a failing lookup only logs at debug level and the
+     # bootstrap proceeds (fail-open) — confirm this is intended.
+     try:
+         from modules.interfaces.interfaceDbApp import getRootInterface
+         _root = getRootInterface()
+         _conn = _root.getUserConnectionById(connectionId)
+         if _conn and not getattr(_conn, "knowledgeIngestionEnabled", True):
+             logger.info(
+                 "ingestion.connection.bootstrap.skipped — consent disabled connectionId=%s",
+                 connectionId,
+                 extra={
+                     "event": "ingestion.connection.bootstrap.skipped",
+                     "connectionId": connectionId,
+                     "authority": authority,
+                     "reason": "consent_disabled",
+                 },
+             )
+             return {"connectionId": connectionId, "authority": authority, "skipped": True, "reason": "consent_disabled"}
+     except Exception as _guardErr:
+         logger.debug("Could not load connection for consent guard: %s", _guardErr)
+
+     def _normalize(res: Any, label: str) -> Dict[str, Any]:
+         # gather(return_exceptions=True) can also hand back CancelledError,
+         # which derives from BaseException, not Exception. Match on
+         # BaseException so no raw exception object leaks into the job result.
+         if isinstance(res, BaseException):
+             logger.error(
+                 "ingestion.connection.bootstrap.failed part=%s connectionId=%s error=%s",
+                 label, connectionId, res, exc_info=res,
+             )
+             return {"error": str(res) or type(res).__name__}
+         return res or {}
+
+     if authority == "msft":
+         from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncSharepoint import (
+             bootstrapSharepoint,
+         )
+         from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncOutlook import (
+             bootstrapOutlook,
+         )
+
+         progressCb(10, "sharepoint + outlook")
+         # Both surfaces run concurrently; a failure in one is reported in its
+         # own result entry and does not abort the other.
+         spResult, olResult = await asyncio.gather(
+             bootstrapSharepoint(connectionId=connectionId, progressCb=progressCb),
+             bootstrapOutlook(connectionId=connectionId, progressCb=progressCb),
+             return_exceptions=True,
+         )
+         return {
+             "connectionId": connectionId,
+             "authority": authority,
+             "sharepoint": _normalize(spResult, "sharepoint"),
+             "outlook": _normalize(olResult, "outlook"),
+         }
+
+     if authority == "google":
+         from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGdrive import (
+             bootstrapGdrive,
+         )
+         from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGmail import (
+             bootstrapGmail,
+         )
+
+         progressCb(10, "drive + gmail")
+         gdResult, gmResult = await asyncio.gather(
+             bootstrapGdrive(connectionId=connectionId, progressCb=progressCb),
+             bootstrapGmail(connectionId=connectionId, progressCb=progressCb),
+             return_exceptions=True,
+         )
+         return {
+             "connectionId": connectionId,
+             "authority": authority,
+             "drive": _normalize(gdResult, "gdrive"),
+             "gmail": _normalize(gmResult, "gmail"),
+         }
+
+     if authority == "clickup":
+         from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncClickup import (
+             bootstrapClickup,
+         )
+
+         progressCb(10, "clickup tasks")
+         # Unlike the gathered branches, an exception here propagates to the
+         # caller of this handler.
+         cuResult = await bootstrapClickup(connectionId=connectionId, progressCb=progressCb)
+         return {
+             "connectionId": connectionId,
+             "authority": authority,
+             "clickup": _normalize(cuResult, "clickup"),
+         }
+
+     logger.info(
+         "ingestion.connection.bootstrap.skipped reason=unsupported_authority authority=%s connectionId=%s",
+         authority, connectionId,
+         extra={
+             "event": "ingestion.connection.bootstrap.skipped",
+             "authority": authority,
+             "connectionId": connectionId,
+             "reason": "unsupported_authority",
+         },
+     )
+     return {
+         "connectionId": connectionId,
+         "authority": authority,
+         "skipped": True,
+         "reason": "unsupported_authority",
+     }
+
+
+ async def _scheduledDailyResync() -> None:
+     """Queue one `connection.bootstrap` job per active knowledge connection.
+
+     Intended as the target of the daily cron registered by
+     `registerDailyResyncScheduler`. Connections come from the root DB
+     interface; if they cannot be loaded, or there are none, the run ends
+     after logging. An enqueue failure for one connection is logged and
+     counted as skipped without stopping the rest.
+     """
+     try:
+         from modules.interfaces.interfaceDbApp import getRootInterface
+         activeConnections = getRootInterface().getActiveKnowledgeConnections()
+     except Exception as exc:
+         logger.error("knowledge.daily_resync: could not load connections: %s", exc, exc_info=True)
+         return
+
+     if not activeConnections:
+         logger.info("knowledge.daily_resync: no active knowledge connections — nothing to do")
+         return
+
+     total = len(activeConnections)
+     logger.info(
+         "knowledge.daily_resync: enqueuing bootstrap for %d connection(s)",
+         total,
+         extra={"event": "knowledge.daily_resync.started", "count": total},
+     )
+
+     enqueued = 0
+     skipped = 0
+     for conn in activeConnections:
+         connectionId = str(conn.id)
+         # The authority may be an enum member or already a plain value.
+         if hasattr(conn.authority, "value"):
+             authority = conn.authority.value
+         else:
+             authority = str(conn.authority)
+         userId = str(conn.userId)
+         jobPayload: Dict[str, Any] = {
+             "connectionId": connectionId,
+             "authority": authority.lower(),
+             "userId": userId,
+         }
+         try:
+             await startJob(
+                 BOOTSTRAP_JOB_TYPE,
+                 jobPayload,
+                 triggeredBy="scheduler.daily_resync",
+             )
+         except Exception as exc:
+             skipped += 1
+             logger.error(
+                 "knowledge.daily_resync: failed to enqueue connectionId=%s: %s",
+                 connectionId, exc,
+             )
+         else:
+             enqueued += 1
+             logger.debug(
+                 "knowledge.daily_resync: queued connectionId=%s authority=%s",
+                 connectionId, authority,
+             )
+
+     logger.info(
+         "knowledge.daily_resync: done — enqueued=%d skipped=%d",
+         enqueued, skipped,
+         extra={"event": "knowledge.daily_resync.done", "enqueued": enqueued, "skipped": skipped},
+     )
+
+
+ def registerDailyResyncScheduler(*, hour: int = 2, minute: int = 0) -> None:
+     """Register the daily knowledge re-sync cron job.
+
+     The job is registered under the fixed id `knowledge.daily_resync` and
+     runs `_scheduledDailyResync`. Any registration error is logged as a
+     warning and swallowed, so boot is never blocked by it.
+
+     Args:
+         hour: Hour of day passed to the cron trigger (default 2).
+         minute: Minute within the hour passed to the cron trigger (default 0).
+     """
+     schedule = {"hour": str(hour), "minute": str(minute)}
+     try:
+         from modules.shared.eventManagement import eventManager
+         eventManager.registerCron(
+             jobId="knowledge.daily_resync",
+             func=_scheduledDailyResync,
+             cronKwargs=schedule,
+         )
+     except Exception as exc:
+         logger.warning("knowledge.daily_resync scheduler registration failed (non-critical): %s", exc)
+         return
+     logger.info(
+         "knowledge.daily_resync scheduler registered (daily %02d:%02d Europe/Zurich)",
+         hour, minute,
+     )
+
+
+ def registerKnowledgeIngestionConsumer() -> None:
+     """Wire up lifecycle callbacks, the bootstrap job handler and the daily resync.
+
+     Guarded by the module-level `_registered` flag, so repeated calls after a
+     successful registration do nothing.
+     """
+     global _registered
+     if not _registered:
+         subscriptions = (
+             ("connection.established", _onConnectionEstablished),
+             ("connection.revoked", _onConnectionRevoked),
+         )
+         for eventName, callback in subscriptions:
+             callbackRegistry.register(eventName, callback)
+         registerJobHandler(BOOTSTRAP_JOB_TYPE, _bootstrapJobHandler)
+         registerDailyResyncScheduler()
+         _registered = True
+         logger.info("KnowledgeIngestionConsumer registered (established/revoked + %s handler + daily resync)", BOOTSTRAP_JOB_TYPE)
diff --git a/modules/serviceCenter/services/serviceKnowledge/subConnectorPrefs.py b/modules/serviceCenter/services/serviceKnowledge/subConnectorPrefs.py
new file mode 100644
index 00000000..950400ce
--- /dev/null
+++ b/modules/serviceCenter/services/serviceKnowledge/subConnectorPrefs.py
@@ -0,0 +1,101 @@
+"""Per-connection knowledge ingestion preference helpers.
+
+Walkers call `loadConnectionPrefs(connectionId)` once at bootstrap start and
+receive a `ConnectionIngestionPrefs` dataclass they can pass down into their
+inner loops. All fields have safe defaults so walkers stay backward-compatible
+with connections that predate the §2.6 preference schema (knowledgePreferences
+is None).
+"""
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional
+
+logger = logging.getLogger(__name__)
+
+_DEFAULT_MAX_AGE_DAYS = 90
+_DEFAULT_MAIL_DEPTH = "full"
+_DEFAULT_CLICKUP_SCOPE = "title_description"
+
+
+ @dataclass
+ class ConnectionIngestionPrefs:
+     """Parsed per-connection preferences for knowledge ingestion walkers.
+
+     Every field has a default, so `ConnectionIngestionPrefs()` is the fallback
+     used by `loadConnectionPrefs` when a connection has no stored preferences.
+     """
+
+     # PII: ask for content to be neutralized before it is embedded.
+     neutralizeBeforeEmbed: bool = False
+
+     # Mail (Outlook + Gmail)
+     mailContentDepth: str = _DEFAULT_MAIL_DEPTH  # "metadata" | "snippet" | "full"
+     mailIndexAttachments: bool = False
+
+     # Files (Drive / SharePoint / OneDrive)
+     filesIndexBinaries: bool = True
+     mimeAllowlist: List[str] = field(default_factory=list)  # empty = all allowed
+
+     # ClickUp
+     clickupScope: str = _DEFAULT_CLICKUP_SCOPE  # "titles" | "title_description" | "with_comments"
+     clickupIndexAttachments: bool = False
+
+     # Per-authority surface toggles (default everything on)
+     gmailEnabled: bool = True
+     driveEnabled: bool = True
+     sharepointEnabled: bool = True
+     outlookEnabled: bool = True
+
+     # Time window: only items newer than this many days are ingested.
+     maxAgeDays: int = _DEFAULT_MAX_AGE_DAYS  # 0 = no limit
+
+
+ def loadConnectionPrefs(connectionId: str) -> ConnectionIngestionPrefs:
+     """Load and parse per-connection preferences from the database.
+
+     Each value is validated on its own: a missing or malformed entry falls
+     back to that field's default without discarding the other preferences
+     (notably the PII flag `neutralizeBeforeEmbed`).
+
+     Args:
+         connectionId: Id of the UserConnection whose `knowledgePreferences`
+             should be read.
+
+     Returns:
+         The parsed preferences, or all defaults when the connection is
+         missing, has no preference dict, or loading fails.
+     """
+     try:
+         from modules.interfaces.interfaceDbApp import getRootInterface
+         root = getRootInterface()
+         conn = root.getUserConnectionById(connectionId)
+         if not conn:
+             logger.debug("loadConnectionPrefs: connection %s not found, using defaults", connectionId)
+             return ConnectionIngestionPrefs()
+
+         raw: Optional[Dict[str, Any]] = getattr(conn, "knowledgePreferences", None)
+         if not raw or not isinstance(raw, dict):
+             return ConnectionIngestionPrefs()
+
+         def _bool(key: str, default: bool) -> bool:
+             v = raw.get(key)
+             return v if isinstance(v, bool) else default
+
+         def _str(key: str, allowed: List[str], default: str) -> str:
+             v = raw.get(key)
+             return v if v in allowed else default
+
+         def _int(key: str, default: int) -> int:
+             v = raw.get(key)
+             # bool is a subclass of int; True/False must not become 1/0 days.
+             if isinstance(v, int) and not isinstance(v, bool):
+                 return v
+             return default
+
+         def _strList(key: str) -> List[str]:
+             v = raw.get(key)
+             # A bare string would be split into characters by list(); any
+             # other non-sequence would raise. Both are treated as "not set".
+             if not isinstance(v, (list, tuple)):
+                 return []
+             return [item for item in v if isinstance(item, str)]
+
+         def _dict(value: Any) -> Dict[str, Any]:
+             return value if isinstance(value, dict) else {}
+
+         surface = _dict(raw.get("surfaceToggles"))
+         google_surf = _dict(surface.get("google"))
+         msft_surf = _dict(surface.get("msft"))
+
+         return ConnectionIngestionPrefs(
+             neutralizeBeforeEmbed=_bool("neutralizeBeforeEmbed", False),
+             mailContentDepth=_str("mailContentDepth", ["metadata", "snippet", "full"], _DEFAULT_MAIL_DEPTH),
+             mailIndexAttachments=_bool("mailIndexAttachments", False),
+             filesIndexBinaries=_bool("filesIndexBinaries", True),
+             mimeAllowlist=_strList("mimeAllowlist"),
+             clickupScope=_str("clickupScope", ["titles", "title_description", "with_comments"], _DEFAULT_CLICKUP_SCOPE),
+             clickupIndexAttachments=_bool("clickupIndexAttachments", False),
+             gmailEnabled=bool(google_surf.get("gmail", True)),
+             driveEnabled=bool(google_surf.get("drive", True)),
+             sharepointEnabled=bool(msft_surf.get("sharepoint", True)),
+             outlookEnabled=bool(msft_surf.get("outlook", True)),
+             maxAgeDays=_int("maxAgeDays", _DEFAULT_MAX_AGE_DAYS),
+         )
+     except Exception as exc:
+         logger.warning("loadConnectionPrefs failed for %s, using defaults: %s", connectionId, exc)
+         return ConnectionIngestionPrefs()
diff --git a/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncClickup.py b/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncClickup.py
new file mode 100644
index 00000000..31ac9687
--- /dev/null
+++ b/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncClickup.py
@@ -0,0 +1,512 @@
+# Copyright (c) 2025 Patrick Motsch
+# All rights reserved.
+"""ClickUp bootstrap for the unified knowledge ingestion lane.
+
+ClickUp tasks are ingested as *virtual documents* — we never download file
+bytes. Each task becomes a `sourceKind="clickup_task"` IngestionJob whose
+`contentObjects` carry a summary header (name + status + metadata) and the
+task description / text content so retrieval finds them without a live API
+call.
+
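+ Hierarchy traversal: workspace (team) → spaces → folders / folderless lists →
+ tasks. We cap the fan-out with `maxWorkspaces` / `maxListsPerWorkspace` /
+ `maxTasks` and skip tasks older than `maxAgeDays`. The dataclass default is
+ 180 d, but `bootstrapClickup` replaces it with the connection's `maxAgeDays`
+ preference whenever the caller passes no explicit `limits`.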
+
+Idempotency: `date_updated` from the ClickUp task payload is a millisecond
+timestamp and strictly monotonic per revision — used as `contentVersion`.
+"""
+
+from __future__ import annotations
+
+import hashlib
+import logging
+import time
+from dataclasses import dataclass, field
+from datetime import datetime, timedelta, timezone
+from typing import Any, Callable, Dict, List, Optional
+
+logger = logging.getLogger(__name__)
+
+MAX_TASKS_DEFAULT = 500
+MAX_WORKSPACES_DEFAULT = 3
+MAX_LISTS_PER_WORKSPACE_DEFAULT = 20
+MAX_DESCRIPTION_CHARS_DEFAULT = 8000
+MAX_AGE_DAYS_DEFAULT = 180
+
+
+ @dataclass
+ class ClickupBootstrapLimits:
+     """Caps and content options for one ClickUp bootstrap run."""
+
+     # Upper bound on tasks handled in one run.
+     maxTasks: int = MAX_TASKS_DEFAULT
+     # Upper bound on workspaces (ClickUp "teams") walked in one run.
+     maxWorkspaces: int = MAX_WORKSPACES_DEFAULT
+     # Upper bound on lists collected per workspace.
+     maxListsPerWorkspace: int = MAX_LISTS_PER_WORKSPACE_DEFAULT
+     # Character limit applied to description / text_content before embedding.
+     maxDescriptionChars: int = MAX_DESCRIPTION_CHARS_DEFAULT
+     # Only ingest tasks updated within the last N days. None disables filter.
+     maxAgeDays: Optional[int] = MAX_AGE_DAYS_DEFAULT
+     # Include closed/archived tasks if they still meet the recency filter.
+     # ClickUp `closed` tasks often carry the most useful RAG context
+     # ("why was this shipped the way it was?").
+     includeClosed: bool = True
+     # Pass-through to IngestionJob.neutralize
+     neutralize: bool = False
+     # Content scope: "titles" | "title_description" | "with_comments"
+     clickupScope: str = "title_description"
+
+
+ @dataclass
+ class ClickupBootstrapResult:
+     """Mutable counters collected while walking one ClickUp connection."""
+
+     # Connection this result belongs to.
+     connectionId: str
+     # Per-task outcome counters. `indexed` and `skippedDuplicate` together
+     # are what the walkers compare against `maxTasks`.
+     indexed: int = 0
+     skippedDuplicate: int = 0
+     skippedPolicy: int = 0
+     failed: int = 0
+     # Number of workspaces (teams) and lists visited.
+     workspaces: int = 0
+     lists: int = 0
+     # Human-readable error notes gathered during the walk.
+     errors: List[str] = field(default_factory=list)
+
+
+ def _syntheticTaskId(connectionId: str, taskId: str) -> str:
+     """Build the stable id `cu:<connectionId[:8]>:<16 hex chars>` for a task.
+
+     The hex part is the start of the SHA-256 of `connectionId:taskId`.
+     """
+     digest = hashlib.sha256(
+         "{}:{}".format(connectionId, taskId).encode("utf-8")
+     ).hexdigest()
+     return "cu:{}:{}".format(connectionId[:8], digest[:16])
+
+
+ def _truncate(value: Any, limit: int) -> str:
+     """Return `value` as stripped text, cut to `limit` characters if longer.
+
+     Falsy values become "". Cut text has trailing whitespace removed and a
+     "[truncated]" marker appended on a new line.
+     """
+     cleaned = str(value or "").strip()
+     if cleaned and len(cleaned) > limit:
+         return cleaned[:limit].rstrip() + "\n[truncated]"
+     return cleaned
+
+
+ def _isRecent(dateUpdatedMs: Any, maxAgeDays: Optional[int]) -> bool:
+     """Tell whether a millisecond timestamp lies within the last `maxAgeDays`.
+
+     Returns True when no age limit is set, when the timestamp is missing, or
+     when it cannot be parsed, so such tasks are never filtered out.
+     """
+     if not (maxAgeDays and dateUpdatedMs):
+         return True
+     try:
+         updatedAt = datetime.fromtimestamp(int(dateUpdatedMs) / 1000.0, tz=timezone.utc)
+     except Exception:
+         return True
+     oldestAllowed = datetime.now(timezone.utc) - timedelta(days=maxAgeDays)
+     return updatedAt >= oldestAllowed
+
+
+ def _buildContentObjects(task: Dict[str, Any], limits: ClickupBootstrapLimits) -> List[Dict[str, Any]]:
+     """Turn a ClickUp task payload into text content objects.
+
+     The first object is always a header built from the task name and,
+     where present, status, list, folder, space, assignees, tags, due date
+     and url. `limits.clickupScope` decides what follows:
+     - "titles": header only
+     - "title_description" (default) / "with_comments": header, description,
+       and `text_content` when it differs from the description
+       (comments themselves are not fetched here)
+     """
+     def _textPart(partId: str, text: str) -> Dict[str, Any]:
+         return {
+             "contentObjectId": partId,
+             "contentType": "text",
+             "data": text,
+             "contextRef": {"part": partId},
+         }
+
+     taskName = task.get("name") or f"Task {task.get('id', '')}"
+     statusLabel = ((task.get("status") or {}).get("status")) or ""
+     assigneeNames = [
+         (person.get("username") or person.get("email") or "")
+         for person in (task.get("assignees") or [])
+     ]
+     assignees = ", ".join(entry for entry in assigneeNames if entry)
+     tagNames = [tag.get("name", "") for tag in (task.get("tags") or [])]
+     tags = ", ".join(entry for entry in tagNames if entry)
+     containers = (
+         ("List", task.get("list") or {}),
+         ("Folder", task.get("folder") or {}),
+         ("Space", task.get("space") or {}),
+     )
+
+     # due_date is a millisecond epoch; unparseable values are dropped.
+     dueIso = ""
+     dueMs = task.get("due_date")
+     if dueMs:
+         try:
+             dueIso = datetime.fromtimestamp(int(dueMs) / 1000.0, tz=timezone.utc).strftime("%Y-%m-%d")
+         except Exception:
+             dueIso = ""
+
+     headerLines = [f"Task: {taskName}"]
+     if statusLabel:
+         headerLines.append(f"Status: {statusLabel}")
+     for label, info in containers:
+         if info:
+             headerLines.append(f"{label}: {info.get('name', '')}")
+     if assignees:
+         headerLines.append(f"Assignees: {assignees}")
+     if tags:
+         headerLines.append(f"Tags: {tags}")
+     if dueIso:
+         headerLines.append(f"Due: {dueIso}")
+     if task.get("url"):
+         headerLines.append(f"Url: {task.get('url', '')}")
+
+     parts: List[Dict[str, Any]] = [_textPart("header", "\n".join(headerLines))]
+
+     scope = getattr(limits, "clickupScope", "title_description")
+     if scope not in ("title_description", "with_comments"):
+         return parts
+
+     description = _truncate(task.get("description"), limits.maxDescriptionChars)
+     if description:
+         parts.append(_textPart("description", description))
+     # text_content is only added when it differs from the description.
+     textContent = _truncate(task.get("text_content"), limits.maxDescriptionChars)
+     if textContent and textContent != description:
+         parts.append(_textPart("text_content", textContent))
+     return parts
+
+
+ async def bootstrapClickup(
+     connectionId: str,
+     *,
+     progressCb: Optional[Callable[[int, Optional[str]], None]] = None,
+     adapter: Any = None,
+     connection: Any = None,
+     knowledgeService: Any = None,
+     limits: Optional[ClickupBootstrapLimits] = None,
+ ) -> Dict[str, Any]:
+     """Walk workspaces → lists → tasks and ingest each task as a virtual doc.
+
+     Args:
+         connectionId: Id of the ClickUp UserConnection to bootstrap.
+         progressCb: Optional progress callback handed down to the walkers.
+         adapter: Optional pre-built adapter; it must expose the ClickUp
+             service as `_svc`.
+         connection: Optional pre-loaded connection row; supplies `mandateId`
+             and `userId`.
+         knowledgeService: Optional pre-built knowledge service.
+         limits: Optional explicit caps. When omitted they are derived from
+             the connection's stored preferences.
+
+     Returns:
+         Whatever `_finalizeResult` builds from the collected counters.
+
+     Raises:
+         Any exception raised by `_resolveDependencies` when the dependencies
+         have to be resolved here. Errors from team discovery and from
+         individual team walks are recorded in the result instead.
+     """
+     if limits is None:
+         # Preferences are only needed to derive default limits, so the DB
+         # lookup is skipped when the caller supplies its own.
+         from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs
+         prefs = loadConnectionPrefs(connectionId)
+         limits = ClickupBootstrapLimits(
+             maxAgeDays=prefs.maxAgeDays if prefs.maxAgeDays > 0 else None,
+             neutralize=prefs.neutralizeBeforeEmbed,
+             clickupScope=prefs.clickupScope,
+         )
+
+     startMs = time.time()
+     result = ClickupBootstrapResult(connectionId=connectionId)
+
+     logger.info(
+         "ingestion.connection.bootstrap.started part=clickup connectionId=%s",
+         connectionId,
+         extra={
+             "event": "ingestion.connection.bootstrap.started",
+             "part": "clickup",
+             "connectionId": connectionId,
+         },
+     )
+
+     # All three dependencies are resolved together: if any one is missing,
+     # the supplied ones are replaced as well.
+     if adapter is None or knowledgeService is None or connection is None:
+         adapter, connection, knowledgeService = await _resolveDependencies(connectionId)
+
+     mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else ""
+     userId = str(getattr(connection, "userId", "") or "") if connection is not None else ""
+
+     svc = getattr(adapter, "_svc", None)
+     if svc is None:
+         result.errors.append("adapter missing _svc instance")
+         return _finalizeResult(connectionId, result, startMs)
+
+     try:
+         teamsResp = await svc.getAuthorizedTeams()
+     except Exception as exc:
+         logger.error("clickup team discovery failed for %s: %s", connectionId, exc, exc_info=True)
+         result.errors.append(f"teams: {exc}")
+         return _finalizeResult(connectionId, result, startMs)
+
+     teams = (teamsResp or {}).get("teams") or []
+     for team in teams[: limits.maxWorkspaces]:
+         # Stop once the task budget is used up by indexed + duplicate tasks.
+         if result.indexed + result.skippedDuplicate >= limits.maxTasks:
+             break
+         teamId = str(team.get("id", "") or "")
+         if not teamId:
+             continue
+         result.workspaces += 1
+         try:
+             await _walkTeam(
+                 svc=svc,
+                 knowledgeService=knowledgeService,
+                 connectionId=connectionId,
+                 mandateId=mandateId,
+                 userId=userId,
+                 team=team,
+                 limits=limits,
+                 result=result,
+                 progressCb=progressCb,
+             )
+         except Exception as exc:
+             # A failing workspace is recorded; the remaining ones still run.
+             logger.error("clickup team %s walk failed: %s", teamId, exc, exc_info=True)
+             result.errors.append(f"team({teamId}): {exc}")
+
+     return _finalizeResult(connectionId, result, startMs)
+
+
+ async def _resolveDependencies(connectionId: str):
+     """Build the `(adapter, connection, knowledgeService)` triple for a connection.
+
+     Loads the UserConnection through the root interface, obtains a fresh
+     token, creates the ClickUp service adapter, and creates a knowledge
+     service in a root-user context bound to the connection's mandate.
+
+     Raises:
+         ValueError: If the connection does not exist or no usable access
+             token is available.
+     """
+     from modules.interfaces.interfaceDbApp import getRootInterface
+     from modules.auth import TokenManager
+     from modules.connectors.providerClickup.connectorClickup import ClickupConnector
+     from modules.serviceCenter import getService
+     from modules.serviceCenter.context import ServiceCenterContext
+     from modules.security.rootAccess import getRootUser
+
+     connection = getRootInterface().getUserConnectionById(connectionId)
+     if connection is None:
+         raise ValueError(f"UserConnection not found: {connectionId}")
+
+     token = TokenManager().getFreshToken(connectionId)
+     accessToken = token.tokenAccess if token else None
+     if not accessToken:
+         raise ValueError(f"No valid token for connection {connectionId}")
+
+     adapter = ClickupConnector(connection, accessToken).getServiceAdapter("clickup")
+
+     serviceContext = ServiceCenterContext(
+         user=getRootUser(),
+         mandate_id=str(getattr(connection, "mandateId", "") or ""),
+     )
+     return adapter, connection, getService("knowledge", serviceContext)
+
+
+ async def _walkTeam(
+     *,
+     svc,
+     knowledgeService,
+     connectionId: str,
+     mandateId: str,
+     userId: str,
+     team: Dict[str, Any],
+     limits: ClickupBootstrapLimits,
+     result: ClickupBootstrapResult,
+     progressCb: Optional[Callable[[int, Optional[str]], None]],
+ ) -> None:
+     """Collect the lists of one workspace and walk each of them for tasks.
+
+     First gathers up to `limits.maxListsPerWorkspace` lists across all
+     spaces (folderless lists first, then lists inside folders), tagging each
+     with its `_space` and, where applicable, `_folder`. Then calls
+     `_walkList` per collected list until the task budget is used up.
+     Counters on `result` are updated in place.
+     """
+     teamId = str(team.get("id", "") or "")
+     spacesResp = await svc.getSpaces(teamId)
+     # A missing or empty "spaces" key is treated as "no spaces".
+     spaces = (spacesResp or {}).get("spaces") or []
+
+     listsCollected: List[Dict[str, Any]] = []
+     for space in spaces:
+         if len(listsCollected) >= limits.maxListsPerWorkspace:
+             break
+         spaceId = str(space.get("id", "") or "")
+         if not spaceId:
+             continue
+
+         # Folderless lists directly under the space
+         folderless = await svc.getFolderlessLists(spaceId)
+         for lst in (folderless or {}).get("lists") or []:
+             if len(listsCollected) >= limits.maxListsPerWorkspace:
+                 break
+             listsCollected.append({**lst, "_space": space})
+
+         # Lists inside folders
+         # Folders are fetched even if the cap was just reached above; the
+         # loop below then exits on its first iteration.
+         foldersResp = await svc.getFolders(spaceId)
+         for folder in (foldersResp or {}).get("folders") or []:
+             if len(listsCollected) >= limits.maxListsPerWorkspace:
+                 break
+             folderId = str(folder.get("id", "") or "")
+             if not folderId:
+                 continue
+             folderLists = await svc.getListsInFolder(folderId)
+             for lst in (folderLists or {}).get("lists") or []:
+                 if len(listsCollected) >= limits.maxListsPerWorkspace:
+                     break
+                 listsCollected.append({**lst, "_space": space, "_folder": folder})
+
+     for lst in listsCollected:
+         # The task budget counts indexed plus duplicate-skipped tasks.
+         if result.indexed + result.skippedDuplicate >= limits.maxTasks:
+             return
+         result.lists += 1
+         await _walkList(
+             svc=svc,
+             knowledgeService=knowledgeService,
+             connectionId=connectionId,
+             mandateId=mandateId,
+             userId=userId,
+             teamId=teamId,
+             lst=lst,
+             limits=limits,
+             result=result,
+             progressCb=progressCb,
+         )
+
+
+async def _walkList(
+ *,
+ svc,
+ knowledgeService,
+ connectionId: str,
+ mandateId: str,
+ userId: str,
+ teamId: str,
+ lst: Dict[str, Any],
+ limits: ClickupBootstrapLimits,
+ result: ClickupBootstrapResult,
+ progressCb: Optional[Callable[[int, Optional[str]], None]],
+) -> None:
+ listId = str(lst.get("id", "") or "")
+ if not listId:
+ return
+ page = 0
+ while result.indexed + result.skippedDuplicate < limits.maxTasks:
+ resp = await svc.getTasksInList(
+ listId,
+ page=page,
+ include_closed=limits.includeClosed,
+ subtasks=True,
+ )
+ if isinstance(resp, dict) and resp.get("error"):
+ logger.warning("clickup tasks list=%s page=%d error: %s", listId, page, resp.get("error"))
+ result.errors.append(f"list({listId}): {resp.get('error')}")
+ return
+ tasks = (resp or {}).get("tasks") or []
+ if not tasks:
+ return
+
+ for task in tasks:
+ if result.indexed + result.skippedDuplicate >= limits.maxTasks:
+ return
+ if not _isRecent(task.get("date_updated"), limits.maxAgeDays):
+ result.skippedPolicy += 1
+ continue
+ # Inject the list/folder/space metadata we already loaded.
+ task["list"] = task.get("list") or {"id": listId, "name": lst.get("name")}
+ task["folder"] = task.get("folder") or lst.get("_folder") or {}
+ task["space"] = task.get("space") or lst.get("_space") or {}
+ await _ingestTask(
+ knowledgeService=knowledgeService,
+ connectionId=connectionId,
+ mandateId=mandateId,
+ userId=userId,
+ teamId=teamId,
+ task=task,
+ limits=limits,
+ result=result,
+ progressCb=progressCb,
+ )
+
+ if len(tasks) < 100: # ClickUp page-size hint: fewer than 100 => last page
+ return
+ page += 1
+
+
async def _ingestTask(
    *,
    knowledgeService,
    connectionId: str,
    mandateId: str,
    userId: str,
    teamId: str,
    task: Dict[str, Any],
    limits: ClickupBootstrapLimits,
    result: ClickupBootstrapResult,
    progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
    """Submit one ClickUp task for ingestion and update the counters in ``result``.

    A task without an id is counted as ``skippedPolicy``. The task's
    ``date_updated`` (falling back to ``date_created``) is passed as
    ``contentVersion``. Any exception raised while building or submitting the
    job is logged, counted as ``failed`` and recorded in ``result.errors``.
    Otherwise the returned handle's status decides which counter is bumped:
    ``"duplicate"``, ``"indexed"``, anything else counts as ``failed``.

    When a progress callback is set, progress is reported and logged whenever
    ``indexed + skippedDuplicate`` is a multiple of 50.
    """
    from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob

    taskId = str(task.get("id", "") or "")
    if not taskId:
        result.skippedPolicy += 1
        return

    revision = str(task.get("date_updated") or task.get("date_created") or "")
    title = task.get("name") or f"Task {taskId}"
    syntheticId = _syntheticTaskId(connectionId, taskId)
    fileName = f"{title[:80].strip() or taskId}.task.json"
    contentObjects = _buildContentObjects(task, limits)

    try:
        # The job is assembled inside the try block so that malformed task
        # fields are counted as a failed item instead of aborting the walk.
        job = IngestionJob(
            sourceKind="clickup_task",
            sourceId=syntheticId,
            fileName=fileName,
            mimeType="application/vnd.clickup.task+json",
            userId=userId,
            mandateId=mandateId,
            contentObjects=contentObjects,
            contentVersion=revision or None,
            neutralize=limits.neutralize,
            provenance={
                "connectionId": connectionId,
                "authority": "clickup",
                "service": "clickup",
                "externalItemId": taskId,
                "teamId": teamId,
                "listId": (task.get("list") or {}).get("id"),
                "spaceId": (task.get("space") or {}).get("id"),
                "url": task.get("url"),
                "status": (task.get("status") or {}).get("status"),
                "tier": limits.clickupScope,
            },
        )
        handle = await knowledgeService.requestIngestion(job)
    except Exception as exc:
        logger.error("clickup ingestion %s failed: %s", taskId, exc, exc_info=True)
        result.failed += 1
        result.errors.append(f"ingest({taskId}): {exc}")
        return

    outcome = handle.status
    if outcome == "indexed":
        result.indexed += 1
    elif outcome == "duplicate":
        result.skippedDuplicate += 1
    else:
        result.failed += 1

    if progressCb is None:
        return
    processed = result.indexed + result.skippedDuplicate
    # NOTE(review): this is also true when processed is 0 or when the item
    # just handled failed, so the same progress value can be reported more
    # than once.
    if processed % 50 != 0:
        return

    try:
        progressCb(
            min(90, 10 + int(80 * processed / max(1, limits.maxTasks))),
            f"clickup processed={processed}",
        )
    except Exception:
        # Progress reporting is best-effort; a failing callback is ignored.
        pass
    logger.info(
        "ingestion.connection.bootstrap.progress part=clickup processed=%d skippedDup=%d failed=%d",
        processed, result.skippedDuplicate, result.failed,
        extra={
            "event": "ingestion.connection.bootstrap.progress",
            "part": "clickup",
            "connectionId": connectionId,
            "processed": processed,
            "skippedDup": result.skippedDuplicate,
            "failed": result.failed,
        },
    )
+
+
def _finalizeResult(connectionId: str, result: ClickupBootstrapResult, startMs: float) -> Dict[str, Any]:
    """Log the ClickUp bootstrap summary and return it as a plain dict.

    Args:
        connectionId: Connection id used in the log line and its ``extra``.
        result: Counters accumulated during the walk.
        startMs: ``time.time()`` value captured when the bootstrap started.

    Returns:
        Dict with the counters, the elapsed ``durationMs`` and at most the
        first 20 entries of ``result.errors``.
    """
    durationMs = int((time.time() - startMs) * 1000)

    summary: Dict[str, Any] = {
        "connectionId": result.connectionId,
        "indexed": result.indexed,
        "skippedDuplicate": result.skippedDuplicate,
        "skippedPolicy": result.skippedPolicy,
        "failed": result.failed,
        "workspaces": result.workspaces,
        "lists": result.lists,
        "durationMs": durationMs,
    }

    logger.info(
        "ingestion.connection.bootstrap.done part=clickup connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d failed=%d workspaces=%d lists=%d durationMs=%d",
        connectionId,
        summary["indexed"], summary["skippedDuplicate"], summary["skippedPolicy"],
        summary["failed"], summary["workspaces"], summary["lists"], durationMs,
        extra={
            "event": "ingestion.connection.bootstrap.done",
            "part": "clickup",
            "connectionId": connectionId,
            "indexed": summary["indexed"],
            "skippedDup": summary["skippedDuplicate"],
            "skippedPolicy": summary["skippedPolicy"],
            "failed": summary["failed"],
            "workspaces": summary["workspaces"],
            "lists": summary["lists"],
            "durationMs": durationMs,
        },
    )

    # Only the first 20 error messages are returned to the caller.
    summary["errors"] = result.errors[:20]
    return summary
diff --git a/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncGdrive.py b/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncGdrive.py
new file mode 100644
index 00000000..5e4e659b
--- /dev/null
+++ b/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncGdrive.py
@@ -0,0 +1,443 @@
+# Copyright (c) 2025 Patrick Motsch
+# All rights reserved.
+"""Google Drive bootstrap for the unified knowledge ingestion lane.
+
+Mirrors the SharePoint pilot (see subConnectorSyncSharepoint.py). Walks the
+user's *My Drive* tree from the virtual `root` folder, downloads each
+file-like item via `DriveAdapter.download` (which handles native Google docs
+via export), runs the standard extraction pipeline and routes results through
+`KnowledgeService.requestIngestion` with `sourceKind="gdrive_item"` and
+`contentVersion = modifiedTime` (monotonic per-revision).
+"""
+
+from __future__ import annotations
+
+import hashlib
+import logging
+import time
+from dataclasses import dataclass, field
+from datetime import datetime, timedelta, timezone
+from typing import Any, Callable, Dict, List, Optional
+
+from modules.datamodels.datamodelExtraction import ExtractionOptions
+
+logger = logging.getLogger(__name__)
+
+MAX_ITEMS_DEFAULT = 500
+MAX_BYTES_DEFAULT = 200 * 1024 * 1024
+MAX_FILE_SIZE_DEFAULT = 25 * 1024 * 1024
+SKIP_MIME_PREFIXES_DEFAULT = ("video/", "audio/")
+MAX_DEPTH_DEFAULT = 4
+MAX_AGE_DAYS_DEFAULT = 365
+
+# Google Drive uses virtual mime-types for folders and non-downloadable assets.
+FOLDER_MIME = "application/vnd.google-apps.folder"
+
+
@dataclass
class GdriveBootstrapLimits:
    """Caps and filters applied while walking a Google Drive tree."""

    # The walk stops once indexed + duplicate items reach this count.
    maxItems: int = MAX_ITEMS_DEFAULT
    # The walk stops once this many downloaded bytes have been processed.
    maxBytes: int = MAX_BYTES_DEFAULT
    # Files larger than this (listed size or downloaded size) are skipped.
    maxFileSize: int = MAX_FILE_SIZE_DEFAULT
    # Files whose mime type starts with one of these prefixes are skipped.
    skipMimePrefixes: tuple = SKIP_MIME_PREFIXES_DEFAULT
    # Folders nested deeper than this are not browsed (the start folder is depth 0).
    maxDepth: int = MAX_DEPTH_DEFAULT
    # Only files modified within the last N days are ingested; None disables the filter.
    maxAgeDays: Optional[int] = MAX_AGE_DAYS_DEFAULT
    # Forwarded unchanged to IngestionJob.neutralize.
    neutralize: bool = False
    # Filled from the connection preference of the same name.
    # NOTE(review): presumably True means binary/non-text files are indexed too,
    # but nothing else in this module reads this flag - confirm intended use.
    filesIndexBinaries: bool = True
+
+
@dataclass
class GdriveBootstrapResult:
    """Running counters for one Google Drive bootstrap run."""

    # Connection the run belongs to.
    connectionId: str
    # Items the knowledge service reported as "indexed".
    indexed: int = 0
    # Items the knowledge service reported as "duplicate".
    skippedDuplicate: int = 0
    # Items skipped by a filter (mime prefix, size, age) or yielding no content.
    skippedPolicy: int = 0
    # Items whose download, extraction or ingestion did not succeed.
    failed: int = 0
    # Total size of the downloaded payloads handed to extraction.
    bytesProcessed: int = 0
    # Human-readable error messages collected along the way.
    errors: List[str] = field(default_factory=list)
+
+
+def _syntheticFileId(connectionId: str, externalItemId: str) -> str:
+ token = hashlib.sha256(f"{connectionId}:{externalItemId}".encode("utf-8")).hexdigest()[:16]
+ return f"gd:{connectionId[:8]}:{token}"
+
+
+def _toContentObjects(extracted, fileName: str) -> List[Dict[str, Any]]:
+ parts = getattr(extracted, "parts", None) or []
+ out: List[Dict[str, Any]] = []
+ for part in parts:
+ data = getattr(part, "data", None) or ""
+ if not data or not str(data).strip():
+ continue
+ typeGroup = getattr(part, "typeGroup", "text") or "text"
+ contentType = "text"
+ if typeGroup == "image":
+ contentType = "image"
+ elif typeGroup in ("binary", "container"):
+ contentType = "other"
+ out.append({
+ "contentObjectId": getattr(part, "id", ""),
+ "contentType": contentType,
+ "data": data,
+ "contextRef": {
+ "containerPath": fileName,
+ "location": getattr(part, "label", None) or "file",
+ **(getattr(part, "metadata", None) or {}),
+ },
+ })
+ return out
+
+
+def _isRecent(modifiedIso: Optional[str], maxAgeDays: Optional[int]) -> bool:
+ if not maxAgeDays:
+ return True
+ if not modifiedIso:
+ # No timestamp -> be permissive (Drive native docs sometimes omit it on export).
+ return True
+ try:
+ # Google returns RFC 3339 with `Z` or offset; python 3.11+ parses both.
+ ts = datetime.fromisoformat(modifiedIso.replace("Z", "+00:00"))
+ except Exception:
+ return True
+ cutoff = datetime.now(timezone.utc) - timedelta(days=maxAgeDays)
+ if ts.tzinfo is None:
+ ts = ts.replace(tzinfo=timezone.utc)
+ return ts >= cutoff
+
+
async def bootstrapGdrive(
    connectionId: str,
    *,
    progressCb: Optional[Callable[[int, Optional[str]], None]] = None,
    adapter: Any = None,
    connection: Any = None,
    knowledgeService: Any = None,
    limits: Optional[GdriveBootstrapLimits] = None,
    runExtractionFn: Optional[Callable[..., Any]] = None,
) -> Dict[str, Any]:
    """Walk My Drive from the virtual root folder and ingest what is found.

    Args:
        connectionId: Connection whose Drive is walked.
        progressCb: Optional callback receiving a percentage and a message.
        adapter: Drive adapter; resolved from the connection when missing.
        connection: Connection record; resolved when missing.
        knowledgeService: Knowledge service; resolved when missing.
        limits: Walk limits; derived from the connection preferences when
            not given.
        runExtractionFn: Extraction callable taking
            ``(bytesData, name, mime, options)``; defaults to the standard
            extraction pipeline with fresh registries.

    Returns:
        Summary dict produced by ``_finalizeResult``. A failure of the walk
        itself is logged and recorded under ``errors`` instead of raised.
    """
    from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs

    prefs = loadConnectionPrefs(connectionId)
    if not limits:
        # A non-positive preference value means "no age filter".
        ageLimit = prefs.maxAgeDays if prefs.maxAgeDays > 0 else None
        limits = GdriveBootstrapLimits(
            maxAgeDays=ageLimit,
            neutralize=prefs.neutralizeBeforeEmbed,
            filesIndexBinaries=prefs.filesIndexBinaries,
        )

    startMs = time.time()
    result = GdriveBootstrapResult(connectionId=connectionId)

    logger.info(
        "ingestion.connection.bootstrap.started part=gdrive connectionId=%s",
        connectionId,
        extra={
            "event": "ingestion.connection.bootstrap.started",
            "part": "gdrive",
            "connectionId": connectionId,
        },
    )

    # If any one of the three collaborators is missing, all three are
    # resolved from the connection id (injected ones are replaced too).
    if adapter is None or knowledgeService is None or connection is None:
        adapter, connection, knowledgeService = await _resolveDependencies(connectionId)

    if runExtractionFn is None:
        from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
        from modules.serviceCenter.services.serviceExtraction.subRegistry import (
            ExtractorRegistry, ChunkerRegistry,
        )
        extractorRegistry = ExtractorRegistry()
        chunkerRegistry = ChunkerRegistry()

        def _defaultExtraction(bytesData, name, mime, options):
            return runExtraction(extractorRegistry, chunkerRegistry, bytesData, name, mime, options)

        runExtractionFn = _defaultExtraction

    if connection is None:
        mandateId = ""
        userId = ""
    else:
        mandateId = str(getattr(connection, "mandateId", "") or "")
        userId = str(getattr(connection, "userId", "") or "")

    try:
        await _walkFolder(
            adapter=adapter,
            knowledgeService=knowledgeService,
            runExtractionFn=runExtractionFn,
            connectionId=connectionId,
            mandateId=mandateId,
            userId=userId,
            folderPath="/",  # DriveAdapter.browse maps "" / "/" -> "root"
            depth=0,
            limits=limits,
            result=result,
            progressCb=progressCb,
        )
    except Exception as exc:
        logger.error("gdrive walk failed for %s: %s", connectionId, exc, exc_info=True)
        result.errors.append(f"walk: {exc}")

    return _finalizeResult(connectionId, result, startMs)
+
+
async def _resolveDependencies(connectionId: str):
    """Resolve the Drive adapter, connection record and knowledge service.

    Args:
        connectionId: Identifier of the UserConnection to bootstrap.

    Returns:
        Tuple ``(adapter, connection, knowledgeService)``.

    Raises:
        ValueError: If the connection cannot be found, or no token with a
            non-empty ``tokenAccess`` is available for it.
    """
    from modules.interfaces.interfaceDbApp import getRootInterface
    from modules.auth import TokenManager
    from modules.connectors.providerGoogle.connectorGoogle import GoogleConnector
    from modules.serviceCenter import getService
    from modules.serviceCenter.context import ServiceCenterContext
    from modules.security.rootAccess import getRootUser

    db = getRootInterface()
    connection = db.getUserConnectionById(connectionId)
    if connection is None:
        raise ValueError(f"UserConnection not found: {connectionId}")

    accessToken = TokenManager().getFreshToken(connectionId)
    if not (accessToken and accessToken.tokenAccess):
        raise ValueError(f"No valid token for connection {connectionId}")

    driveAdapter = GoogleConnector(connection, accessToken.tokenAccess).getServiceAdapter("drive")

    # The knowledge service is created for the root user, scoped to the
    # mandate stored on the connection (empty string when it has none).
    context = ServiceCenterContext(
        user=getRootUser(),
        mandate_id=str(getattr(connection, "mandateId", "") or ""),
    )
    return driveAdapter, connection, getService("knowledge", context)
+
+
+async def _walkFolder(
+ *,
+ adapter,
+ knowledgeService,
+ runExtractionFn,
+ connectionId: str,
+ mandateId: str,
+ userId: str,
+ folderPath: str,
+ depth: int,
+ limits: GdriveBootstrapLimits,
+ result: GdriveBootstrapResult,
+ progressCb: Optional[Callable[[int, Optional[str]], None]],
+) -> None:
+ if depth > limits.maxDepth:
+ return
+ try:
+ entries = await adapter.browse(folderPath)
+ except Exception as exc:
+ logger.warning("gdrive browse %s failed: %s", folderPath, exc)
+ result.errors.append(f"browse({folderPath}): {exc}")
+ return
+
+ for entry in entries:
+ if result.indexed + result.skippedDuplicate >= limits.maxItems:
+ return
+ if result.bytesProcessed >= limits.maxBytes:
+ return
+
+ entryPath = getattr(entry, "path", "") or ""
+ metadata = getattr(entry, "metadata", {}) or {}
+ mimeType = getattr(entry, "mimeType", None) or metadata.get("mimeType")
+
+ if getattr(entry, "isFolder", False) or mimeType == FOLDER_MIME:
+ await _walkFolder(
+ adapter=adapter,
+ knowledgeService=knowledgeService,
+ runExtractionFn=runExtractionFn,
+ connectionId=connectionId,
+ mandateId=mandateId,
+ userId=userId,
+ folderPath=entryPath,
+ depth=depth + 1,
+ limits=limits,
+ result=result,
+ progressCb=progressCb,
+ )
+ continue
+
+ effectiveMime = mimeType or "application/octet-stream"
+ if any(effectiveMime.startswith(prefix) for prefix in limits.skipMimePrefixes):
+ result.skippedPolicy += 1
+ continue
+ size = int(getattr(entry, "size", 0) or 0)
+ if size and size > limits.maxFileSize:
+ result.skippedPolicy += 1
+ continue
+ modifiedTime = metadata.get("modifiedTime")
+ if not _isRecent(modifiedTime, limits.maxAgeDays):
+ result.skippedPolicy += 1
+ continue
+
+ externalItemId = metadata.get("id") or entryPath
+ revision = modifiedTime
+
+ await _ingestOne(
+ adapter=adapter,
+ knowledgeService=knowledgeService,
+ runExtractionFn=runExtractionFn,
+ connectionId=connectionId,
+ mandateId=mandateId,
+ userId=userId,
+ entry=entry,
+ entryPath=entryPath,
+ mimeType=effectiveMime,
+ externalItemId=externalItemId,
+ revision=revision,
+ limits=limits,
+ result=result,
+ progressCb=progressCb,
+ )
+
+
async def _ingestOne(
    *,
    adapter,
    knowledgeService,
    runExtractionFn,
    connectionId: str,
    mandateId: str,
    userId: str,
    entry,
    entryPath: str,
    mimeType: str,
    externalItemId: str,
    revision: Optional[str],
    limits: GdriveBootstrapLimits,
    result: GdriveBootstrapResult,
    progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
    """Download, extract and ingest a single Drive file.

    Outcome per stage, all reflected in ``result``:
      * download raises -> ``failed`` plus an error entry
      * empty download -> ``failed``
      * download larger than ``limits.maxFileSize`` -> ``skippedPolicy``
      * extraction raises -> ``failed`` plus an error entry
      * extraction yields no content objects -> ``skippedPolicy``
      * ingestion raises -> ``failed`` plus an error entry
      * otherwise the handle status picks ``skippedDuplicate``, ``indexed``
        or ``failed``

    When a progress callback is set, progress is reported and logged whenever
    ``indexed + skippedDuplicate`` is a multiple of 50.
    """
    from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob

    syntheticFileId = _syntheticFileId(connectionId, externalItemId)
    fileName = getattr(entry, "name", "") or externalItemId

    try:
        downloaded = await adapter.download(entryPath)
    except Exception as exc:
        logger.warning("gdrive download %s failed: %s", entryPath, exc)
        result.failed += 1
        result.errors.append(f"download({entryPath}): {exc}")
        return

    # The adapter may hand back raw bytes or an object exposing `data`
    # (and optionally `mimeType`); both shapes are accepted.
    if isinstance(downloaded, (bytes, bytearray)):
        payload = bytes(downloaded)
    else:
        payload = bytes(getattr(downloaded, "data", b"") or b"")
        exportedMime = getattr(downloaded, "mimeType", None)
        if exportedMime:
            mimeType = exportedMime  # export may have changed the type

    payloadSize = len(payload)
    if payloadSize == 0:
        result.failed += 1
        return
    if payloadSize > limits.maxFileSize:
        result.skippedPolicy += 1
        return
    result.bytesProcessed += payloadSize

    try:
        extracted = runExtractionFn(
            payload, fileName, mimeType,
            ExtractionOptions(mergeStrategy=None),
        )
    except Exception as exc:
        logger.warning("gdrive extraction %s failed: %s", entryPath, exc)
        result.failed += 1
        result.errors.append(f"extract({entryPath}): {exc}")
        return

    contentObjects = _toContentObjects(extracted, fileName)
    if not contentObjects:
        result.skippedPolicy += 1
        return

    try:
        job = IngestionJob(
            sourceKind="gdrive_item",
            sourceId=syntheticFileId,
            fileName=fileName,
            mimeType=mimeType,
            userId=userId,
            mandateId=mandateId,
            contentObjects=contentObjects,
            contentVersion=revision,
            neutralize=limits.neutralize,
            provenance={
                "connectionId": connectionId,
                "authority": "google",
                "service": "drive",
                "externalItemId": externalItemId,
                "entryPath": entryPath,
                "tier": "body",
            },
        )
        handle = await knowledgeService.requestIngestion(job)
    except Exception as exc:
        logger.error("gdrive ingestion %s failed: %s", entryPath, exc, exc_info=True)
        result.failed += 1
        result.errors.append(f"ingest({entryPath}): {exc}")
        return

    outcome = handle.status
    if outcome == "indexed":
        result.indexed += 1
    elif outcome == "duplicate":
        result.skippedDuplicate += 1
    else:
        result.failed += 1

    if progressCb is None:
        return
    processed = result.indexed + result.skippedDuplicate
    # NOTE(review): this is also true when processed is 0 or when the item
    # just handled failed, so the same progress value can be reported more
    # than once.
    if processed % 50 != 0:
        return

    try:
        progressCb(
            min(90, 10 + int(80 * processed / max(1, limits.maxItems))),
            f"gdrive processed={processed}",
        )
    except Exception:
        # Progress reporting is best-effort; a failing callback is ignored.
        pass
    logger.info(
        "ingestion.connection.bootstrap.progress part=gdrive processed=%d skippedDup=%d failed=%d",
        processed, result.skippedDuplicate, result.failed,
        extra={
            "event": "ingestion.connection.bootstrap.progress",
            "part": "gdrive",
            "connectionId": connectionId,
            "processed": processed,
            "skippedDup": result.skippedDuplicate,
            "failed": result.failed,
        },
    )
+
+
def _finalizeResult(connectionId: str, result: GdriveBootstrapResult, startMs: float) -> Dict[str, Any]:
    """Log the Drive bootstrap summary and return it as a plain dict.

    Args:
        connectionId: Connection id used in the log line and its ``extra``.
        result: Counters accumulated during the walk.
        startMs: ``time.time()`` value captured when the bootstrap started.

    Returns:
        Dict with the counters, the elapsed ``durationMs`` and at most the
        first 20 entries of ``result.errors``.
    """
    durationMs = int((time.time() - startMs) * 1000)

    summary: Dict[str, Any] = {
        "connectionId": result.connectionId,
        "indexed": result.indexed,
        "skippedDuplicate": result.skippedDuplicate,
        "skippedPolicy": result.skippedPolicy,
        "failed": result.failed,
        "bytesProcessed": result.bytesProcessed,
        "durationMs": durationMs,
    }

    logger.info(
        "ingestion.connection.bootstrap.done part=gdrive connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d failed=%d bytes=%d durationMs=%d",
        connectionId,
        summary["indexed"], summary["skippedDuplicate"], summary["skippedPolicy"],
        summary["failed"], summary["bytesProcessed"], durationMs,
        extra={
            "event": "ingestion.connection.bootstrap.done",
            "part": "gdrive",
            "connectionId": connectionId,
            "indexed": summary["indexed"],
            "skippedDup": summary["skippedDuplicate"],
            "skippedPolicy": summary["skippedPolicy"],
            "failed": summary["failed"],
            "bytes": summary["bytesProcessed"],
            "durationMs": durationMs,
        },
    )

    # Only the first 20 error messages are returned to the caller.
    summary["errors"] = result.errors[:20]
    return summary
diff --git a/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncGmail.py b/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncGmail.py
new file mode 100644
index 00000000..21fec83d
--- /dev/null
+++ b/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncGmail.py
@@ -0,0 +1,606 @@
+# Copyright (c) 2025 Patrick Motsch
+# All rights reserved.
+"""Gmail bootstrap for the unified knowledge ingestion lane.
+
+Mirrors the Outlook pilot (see subConnectorSyncOutlook.py) but talks to Google
+Mail's REST API. Messages become `sourceKind="gmail_message"` virtual documents
+with header / snippet / cleaned body content-objects; attachments are optional
+child jobs with `sourceKind="gmail_attachment"`.
+
+Idempotency: Gmail's stable `historyId` (or `internalDate` as fallback) is
+passed as `contentVersion`, so rerunning the bootstrap yields
+`ingestion.skipped.duplicate` for unchanged messages.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import base64
+import hashlib
+import logging
+import time
+from dataclasses import dataclass, field
+from datetime import datetime, timedelta, timezone
+from typing import Any, Callable, Dict, List, Optional
+
+from modules.serviceCenter.services.serviceKnowledge.subTextClean import cleanEmailBody
+
+logger = logging.getLogger(__name__)
+
+MAX_MESSAGES_DEFAULT = 500
+MAX_BODY_CHARS_DEFAULT = 8000
+MAX_ATTACHMENT_BYTES_DEFAULT = 10 * 1024 * 1024
+DEFAULT_LABELS = ("INBOX", "SENT")
+
+
@dataclass
class GmailBootstrapLimits:
    """Caps and content options applied while ingesting Gmail messages."""

    # Ingestion stops once indexed + duplicate messages reach this count.
    maxMessages: int = MAX_MESSAGES_DEFAULT
    # Label ids that are enumerated, in this order.
    labels: tuple = DEFAULT_LABELS
    # Upper bound handed to the body cleaner as `maxChars`.
    maxBodyChars: int = MAX_BODY_CHARS_DEFAULT
    # When True, attachments of each message are ingested as well.
    includeAttachments: bool = False
    # NOTE(review): presumably the per-attachment size cap; its use is not
    # visible in this part of the module - confirm.
    maxAttachmentBytes: int = MAX_ATTACHMENT_BYTES_DEFAULT
    # Only messages newer than N days are fetched; None disables the filter.
    maxAgeDays: Optional[int] = 90
    # How much of each message is embedded: "metadata" | "snippet" | "full".
    mailContentDepth: str = "full"
    # Forwarded unchanged to IngestionJob.neutralize.
    neutralize: bool = False
+
+
@dataclass
class GmailBootstrapResult:
    """Running counters for one Gmail bootstrap run."""

    # Connection the run belongs to.
    connectionId: str
    # Messages the knowledge service reported as "indexed".
    indexed: int = 0
    # Messages the knowledge service reported as "duplicate".
    skippedDuplicate: int = 0
    # Messages skipped without an ingestion attempt (e.g. missing id).
    skippedPolicy: int = 0
    # Messages whose fetch or ingestion did not succeed.
    failed: int = 0
    # Counter reserved for ingested attachments.
    attachmentsIndexed: int = 0
    # Human-readable error messages collected along the way.
    errors: List[str] = field(default_factory=list)
+
+
+def _syntheticMessageId(connectionId: str, messageId: str) -> str:
+ token = hashlib.sha256(f"{connectionId}:{messageId}".encode("utf-8")).hexdigest()[:16]
+ return f"gm:{connectionId[:8]}:{token}"
+
+
+def _syntheticAttachmentId(connectionId: str, messageId: str, attachmentId: str) -> str:
+ token = hashlib.sha256(
+ f"{connectionId}:{messageId}:{attachmentId}".encode("utf-8")
+ ).hexdigest()[:16]
+ return f"ga:{connectionId[:8]}:{token}"
+
+
+def _decodeBase64Url(data: str) -> bytes:
+ if not data:
+ return b""
+ # Gmail uses URL-safe base64 without padding.
+ padding = 4 - (len(data) % 4)
+ if padding != 4:
+ data = data + ("=" * padding)
+ try:
+ return base64.urlsafe_b64decode(data)
+ except Exception:
+ return b""
+
+
+def _walkPayloadForBody(payload: Dict[str, Any]) -> Dict[str, str]:
+ """Return {"text": ..., "html": ...} by walking MIME parts.
+
+ Gmail `payload` is a tree of parts. We prefer `text/plain` for the cleaned
+ body, but capture `text/html` as a fallback so `cleanEmailBody` can strip
+ markup if plain is missing.
+ """
+ found: Dict[str, str] = {"text": "", "html": ""}
+
+ def _walk(part: Dict[str, Any]) -> None:
+ mime = (part.get("mimeType") or "").lower()
+ body = part.get("body") or {}
+ raw = body.get("data") or ""
+ if raw and mime.startswith("text/"):
+ decoded = _decodeBase64Url(raw).decode("utf-8", errors="replace")
+ key = "text" if mime == "text/plain" else ("html" if mime == "text/html" else "")
+ if key and not found[key]:
+ found[key] = decoded
+ for sub in part.get("parts") or []:
+ _walk(sub)
+
+ _walk(payload or {})
+ return found
+
+
+def _headerMap(payload: Dict[str, Any]) -> Dict[str, str]:
+ return {
+ (h.get("name") or "").lower(): (h.get("value") or "")
+ for h in (payload.get("headers") or [])
+ }
+
+
def _buildContentObjects(
    message: Dict[str, Any],
    maxBodyChars: int,
    mailContentDepth: str = "full",
) -> List[Dict[str, Any]]:
    """Build the content objects for one Gmail message.

    `mailContentDepth` selects how much of the message is embedded:
      - "metadata": header block only (subject, from, to, optional cc, date)
      - "snippet": header block plus the Gmail snippet, when there is one
      - "full": header block, snippet and the cleaned body (default)

    The body prefers the plain-text part and falls back to HTML; it is
    cleaned with `cleanEmailBody` and limited to `maxBodyChars`. An empty
    cleaned body is left out.
    """
    payload = message.get("payload") or {}
    headers = _headerMap(payload)

    def _textObject(objectId: str, text: str) -> Dict[str, Any]:
        # Every object here is plain text whose context part equals its id.
        return {
            "contentObjectId": objectId,
            "contentType": "text",
            "data": text,
            "contextRef": {"part": objectId},
        }

    subject = headers.get("subject") or "(no subject)"
    fromAddr = headers.get("from") or ""
    toAddr = headers.get("to") or ""
    ccAddr = headers.get("cc") or ""
    date = headers.get("date") or ""

    headerPieces = [
        f"Subject: {subject}\n",
        f"From: {fromAddr}\n",
        f"To: {toAddr}\n",
    ]
    if ccAddr:
        headerPieces.append(f"Cc: {ccAddr}\n")
    headerPieces.append(f"Date: {date}")

    contentObjects: List[Dict[str, Any]] = [_textObject("header", "".join(headerPieces))]

    snippet = message.get("snippet") or ""
    if snippet and mailContentDepth in ("snippet", "full"):
        contentObjects.append(_textObject("snippet", snippet))

    if mailContentDepth == "full":
        bodies = _walkPayloadForBody(payload)
        rawBody = bodies["text"] or bodies["html"]
        cleanedBody = cleanEmailBody(rawBody, maxChars=maxBodyChars) if rawBody else ""
        if cleanedBody:
            contentObjects.append(_textObject("body", cleanedBody))

    return contentObjects
+
+
async def bootstrapGmail(
    connectionId: str,
    *,
    progressCb: Optional[Callable[[int, Optional[str]], None]] = None,
    adapter: Any = None,
    connection: Any = None,
    knowledgeService: Any = None,
    limits: Optional[GmailBootstrapLimits] = None,
    googleGetFn: Optional[Callable[..., Any]] = None,
) -> Dict[str, Any]:
    """Enumerate the configured Gmail labels and ingest their messages.

    Args:
        connectionId: Connection whose mailbox is read.
        progressCb: Optional callback receiving a percentage and a message.
        adapter: Gmail adapter; resolved from the connection when missing.
        connection: Connection record; resolved when missing.
        knowledgeService: Knowledge service; resolved when missing.
        limits: Ingestion limits; derived from the connection preferences
            when not given (labels then default to INBOX and SENT).
        googleGetFn: Async callable taking a URL and returning the decoded
            response; defaults to the Google connector's GET helper using the
            adapter's token.

    Returns:
        Summary dict produced by ``_finalizeResult``. A failure while
        processing one label is logged and recorded under ``errors``; the
        remaining labels are still processed.
    """
    from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs

    prefs = loadConnectionPrefs(connectionId)
    if not limits:
        # A non-positive preference value means "no age filter".
        ageLimit = prefs.maxAgeDays if prefs.maxAgeDays > 0 else None
        limits = GmailBootstrapLimits(
            includeAttachments=prefs.mailIndexAttachments,
            maxAgeDays=ageLimit,
            mailContentDepth=prefs.mailContentDepth,
            neutralize=prefs.neutralizeBeforeEmbed,
        )

    startMs = time.time()
    result = GmailBootstrapResult(connectionId=connectionId)

    logger.info(
        "ingestion.connection.bootstrap.started part=gmail connectionId=%s",
        connectionId,
        extra={
            "event": "ingestion.connection.bootstrap.started",
            "part": "gmail",
            "connectionId": connectionId,
        },
    )

    # If any one of the three collaborators is missing, all three are
    # resolved from the connection id (injected ones are replaced too).
    if adapter is None or knowledgeService is None or connection is None:
        adapter, connection, knowledgeService = await _resolveDependencies(connectionId)

    if googleGetFn is None:
        from modules.connectors.providerGoogle.connectorGoogle import _googleGet as _defaultGet

        token = getattr(adapter, "_token", "")

        async def _tokenBoundGet(url: str) -> Dict[str, Any]:
            return await _defaultGet(token, url)

        googleGetFn = _tokenBoundGet

    if connection is None:
        mandateId = ""
        userId = ""
    else:
        mandateId = str(getattr(connection, "mandateId", "") or "")
        userId = str(getattr(connection, "userId", "") or "")

    for labelId in limits.labels:
        if result.indexed + result.skippedDuplicate >= limits.maxMessages:
            break
        try:
            await _ingestLabel(
                googleGetFn=googleGetFn,
                knowledgeService=knowledgeService,
                connectionId=connectionId,
                mandateId=mandateId,
                userId=userId,
                labelId=labelId,
                limits=limits,
                result=result,
                progressCb=progressCb,
            )
        except Exception as exc:
            logger.error("gmail ingestion label %s failed: %s", labelId, exc, exc_info=True)
            result.errors.append(f"label({labelId}): {exc}")

    return _finalizeResult(connectionId, result, startMs)
+
+
async def _resolveDependencies(connectionId: str):
    """Resolve the Gmail adapter, connection record and knowledge service.

    Args:
        connectionId: Identifier of the UserConnection to bootstrap.

    Returns:
        Tuple ``(adapter, connection, knowledgeService)``.

    Raises:
        ValueError: If the connection cannot be found, or no token with a
            non-empty ``tokenAccess`` is available for it.
    """
    from modules.interfaces.interfaceDbApp import getRootInterface
    from modules.auth import TokenManager
    from modules.connectors.providerGoogle.connectorGoogle import GoogleConnector
    from modules.serviceCenter import getService
    from modules.serviceCenter.context import ServiceCenterContext
    from modules.security.rootAccess import getRootUser

    connection = getRootInterface().getUserConnectionById(connectionId)
    if connection is None:
        raise ValueError(f"UserConnection not found: {connectionId}")

    tokenRecord = TokenManager().getFreshToken(connectionId)
    hasAccessToken = bool(tokenRecord) and bool(tokenRecord.tokenAccess)
    if not hasAccessToken:
        raise ValueError(f"No valid token for connection {connectionId}")

    googleConnector = GoogleConnector(connection, tokenRecord.tokenAccess)
    gmailAdapter = googleConnector.getServiceAdapter("gmail")

    # The knowledge service is created for the root user, scoped to the
    # mandate stored on the connection (empty string when it has none).
    rootUser = getRootUser()
    mandate = str(getattr(connection, "mandateId", "") or "")
    knowledgeService = getService(
        "knowledge",
        ServiceCenterContext(user=rootUser, mandate_id=mandate),
    )
    return gmailAdapter, connection, knowledgeService
+
+
+async def _ingestLabel(
+ *,
+ googleGetFn,
+ knowledgeService,
+ connectionId: str,
+ mandateId: str,
+ userId: str,
+ labelId: str,
+ limits: GmailBootstrapLimits,
+ result: GmailBootstrapResult,
+ progressCb: Optional[Callable[[int, Optional[str]], None]],
+) -> None:
+ remaining = limits.maxMessages - (result.indexed + result.skippedDuplicate)
+ if remaining <= 0:
+ return
+
+ pageSize = min(100, remaining)
+ query = ""
+ if limits.maxAgeDays:
+ cutoff = datetime.now(timezone.utc) - timedelta(days=limits.maxAgeDays)
+ # Gmail uses YYYY/MM/DD.
+ query = f"after:{cutoff.strftime('%Y/%m/%d')}"
+
+ baseUrl = (
+ "https://gmail.googleapis.com/gmail/v1/users/me/messages"
+ f"?labelIds={labelId}&maxResults={pageSize}"
+ )
+ if query:
+ baseUrl = f"{baseUrl}&q={query}"
+
+ nextPageToken: Optional[str] = None
+ while (result.indexed + result.skippedDuplicate) < limits.maxMessages:
+ url = baseUrl if not nextPageToken else f"{baseUrl}&pageToken={nextPageToken}"
+ page = await googleGetFn(url)
+ if not isinstance(page, dict) or "error" in page:
+ err = (page or {}).get("error") if isinstance(page, dict) else "unknown"
+ logger.warning("gmail list page error for label %s: %s", labelId, err)
+ result.errors.append(f"list({labelId}): {err}")
+ return
+
+ messageStubs = page.get("messages") or []
+ for stub in messageStubs:
+ if result.indexed + result.skippedDuplicate >= limits.maxMessages:
+ break
+ msgId = stub.get("id")
+ if not msgId:
+ continue
+ detailUrl = (
+ f"https://gmail.googleapis.com/gmail/v1/users/me/messages/{msgId}?format=full"
+ )
+ detail = await googleGetFn(detailUrl)
+ if not isinstance(detail, dict) or "error" in detail:
+ result.failed += 1
+ continue
+ await _ingestMessage(
+ googleGetFn=googleGetFn,
+ knowledgeService=knowledgeService,
+ connectionId=connectionId,
+ mandateId=mandateId,
+ userId=userId,
+ labelId=labelId,
+ message=detail,
+ limits=limits,
+ result=result,
+ progressCb=progressCb,
+ )
+
+ nextPageToken = page.get("nextPageToken")
+ if not nextPageToken:
+ break
+
+
+async def _ingestMessage(
+ *,
+ googleGetFn,
+ knowledgeService,
+ connectionId: str,
+ mandateId: str,
+ userId: str,
+ labelId: str,
+ message: Dict[str, Any],
+ limits: GmailBootstrapLimits,
+ result: GmailBootstrapResult,
+ progressCb: Optional[Callable[[int, Optional[str]], None]],
+) -> None:
+ from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
+
+ messageId = message.get("id")
+ if not messageId:
+ result.skippedPolicy += 1
+ return
+ revision = message.get("historyId") or message.get("internalDate")
+ headers = _headerMap(message.get("payload") or {})
+ subject = headers.get("subject") or "(no subject)"
+ syntheticId = _syntheticMessageId(connectionId, messageId)
+ fileName = f"{subject[:80].strip()}.eml" if subject else f"{messageId}.eml"
+
+ contentObjects = _buildContentObjects(
+ message, limits.maxBodyChars, mailContentDepth=limits.mailContentDepth
+ )
+ try:
+ handle = await knowledgeService.requestIngestion(
+ IngestionJob(
+ sourceKind="gmail_message",
+ sourceId=syntheticId,
+ fileName=fileName,
+ mimeType="message/rfc822",
+ userId=userId,
+ mandateId=mandateId,
+ contentObjects=contentObjects,
+ contentVersion=str(revision) if revision else None,
+ neutralize=limits.neutralize,
+ provenance={
+ "connectionId": connectionId,
+ "authority": "google",
+ "service": "gmail",
+ "externalItemId": messageId,
+ "label": labelId,
+ "threadId": message.get("threadId"),
+ "tier": limits.mailContentDepth,
+ },
+ )
+ )
+ except Exception as exc:
+ logger.error("gmail ingestion %s failed: %s", messageId, exc, exc_info=True)
+ result.failed += 1
+ result.errors.append(f"ingest({messageId}): {exc}")
+ return
+
+ if handle.status == "duplicate":
+ result.skippedDuplicate += 1
+ elif handle.status == "indexed":
+ result.indexed += 1
+ else:
+ result.failed += 1
+
+ if limits.includeAttachments:
+ try:
+ await _ingestAttachments(
+ googleGetFn=googleGetFn,
+ knowledgeService=knowledgeService,
+ connectionId=connectionId,
+ mandateId=mandateId,
+ userId=userId,
+ message=message,
+ parentSyntheticId=syntheticId,
+ limits=limits,
+ result=result,
+ )
+ except Exception as exc:
+ logger.warning("gmail attachments %s failed: %s", messageId, exc)
+ result.errors.append(f"attachments({messageId}): {exc}")
+
+ if progressCb is not None and (result.indexed + result.skippedDuplicate) % 50 == 0:
+ processed = result.indexed + result.skippedDuplicate
+ try:
+ progressCb(
+ min(90, 10 + int(80 * processed / max(1, limits.maxMessages))),
+ f"gmail processed={processed}",
+ )
+ except Exception:
+ pass
+ logger.info(
+ "ingestion.connection.bootstrap.progress part=gmail processed=%d skippedDup=%d failed=%d",
+ processed, result.skippedDuplicate, result.failed,
+ extra={
+ "event": "ingestion.connection.bootstrap.progress",
+ "part": "gmail",
+ "connectionId": connectionId,
+ "processed": processed,
+ "skippedDup": result.skippedDuplicate,
+ "failed": result.failed,
+ },
+ )
+
+ await asyncio.sleep(0)
+
+
+async def _ingestAttachments(
+ *,
+ googleGetFn,
+ knowledgeService,
+ connectionId: str,
+ mandateId: str,
+ userId: str,
+ message: Dict[str, Any],
+ parentSyntheticId: str,
+ limits: GmailBootstrapLimits,
+ result: GmailBootstrapResult,
+) -> None:
+ """Child ingestion jobs for file attachments. Skips inline images (cid: refs)."""
+ from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
+ from modules.datamodels.datamodelExtraction import ExtractionOptions
+ from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
+ from modules.serviceCenter.services.serviceExtraction.subRegistry import (
+ ExtractorRegistry, ChunkerRegistry,
+ )
+
+ messageId = message.get("id") or ""
+
+ def _collectAttachmentStubs(part: Dict[str, Any], acc: List[Dict[str, Any]]) -> None:
+ filename = part.get("filename") or ""
+ body = part.get("body") or {}
+ attId = body.get("attachmentId")
+ if filename and attId:
+ acc.append({
+ "filename": filename,
+ "mimeType": part.get("mimeType") or "application/octet-stream",
+ "attachmentId": attId,
+ "size": int(body.get("size") or 0),
+ })
+ for sub in part.get("parts") or []:
+ _collectAttachmentStubs(sub, acc)
+
+ stubs: List[Dict[str, Any]] = []
+ _collectAttachmentStubs(message.get("payload") or {}, stubs)
+ if not stubs:
+ return
+
+ extractorRegistry = ExtractorRegistry()
+ chunkerRegistry = ChunkerRegistry()
+
+ for stub in stubs:
+ if stub["size"] and stub["size"] > limits.maxAttachmentBytes:
+ result.skippedPolicy += 1
+ continue
+ attUrl = (
+ f"https://gmail.googleapis.com/gmail/v1/users/me/messages/{messageId}"
+ f"/attachments/{stub['attachmentId']}"
+ )
+ detail = await googleGetFn(attUrl)
+ if not isinstance(detail, dict) or "error" in detail:
+ result.failed += 1
+ continue
+ rawBytes = _decodeBase64Url(detail.get("data") or "")
+ if not rawBytes:
+ continue
+ fileName = stub["filename"]
+ mimeType = stub["mimeType"]
+ syntheticId = _syntheticAttachmentId(connectionId, messageId, stub["attachmentId"])
+
+ try:
+ extracted = runExtraction(
+ extractorRegistry, chunkerRegistry,
+ rawBytes, fileName, mimeType,
+ ExtractionOptions(mergeStrategy=None),
+ )
+ except Exception as exc:
+ logger.warning("gmail attachment extract %s failed: %s", stub["attachmentId"], exc)
+ result.failed += 1
+ continue
+
+ contentObjects: List[Dict[str, Any]] = []
+ for part in getattr(extracted, "parts", None) or []:
+ data = getattr(part, "data", None) or ""
+ if not data or not str(data).strip():
+ continue
+ typeGroup = getattr(part, "typeGroup", "text") or "text"
+ contentType = "text"
+ if typeGroup == "image":
+ contentType = "image"
+ elif typeGroup in ("binary", "container"):
+ contentType = "other"
+ contentObjects.append({
+ "contentObjectId": getattr(part, "id", ""),
+ "contentType": contentType,
+ "data": data,
+ "contextRef": {
+ "containerPath": fileName,
+ "location": getattr(part, "label", None) or "attachment",
+ **(getattr(part, "metadata", None) or {}),
+ },
+ })
+ if not contentObjects:
+ result.skippedPolicy += 1
+ continue
+
+ try:
+ await knowledgeService.requestIngestion(
+ IngestionJob(
+ sourceKind="gmail_attachment",
+ sourceId=syntheticId,
+ fileName=fileName,
+ mimeType=mimeType,
+ userId=userId,
+ mandateId=mandateId,
+ contentObjects=contentObjects,
+ provenance={
+ "connectionId": connectionId,
+ "authority": "google",
+ "service": "gmail",
+ "parentId": parentSyntheticId,
+ "externalItemId": stub["attachmentId"],
+ "parentMessageId": messageId,
+ },
+ )
+ )
+ result.attachmentsIndexed += 1
+ except Exception as exc:
+ logger.warning("gmail attachment ingest %s failed: %s", stub["attachmentId"], exc)
+ result.failed += 1
+
+
+ def _finalizeResult(connectionId: str, result: GmailBootstrapResult, startMs: float) -> Dict[str, Any]:
+ """Log the final Gmail bootstrap summary and return it as a plain dict.
+
+ `startMs` is compared against `time.time()`, i.e. it is a timestamp in
+ seconds; the reported `durationMs` is the elapsed time in milliseconds.
+ Only the first 20 entries of `result.errors` are returned.
+ """
+ durationMs = int((time.time() - startMs) * 1000)
+ # The counters go into both the message and `extra` for structured logging.
+ logger.info(
+ "ingestion.connection.bootstrap.done part=gmail connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d attachments=%d failed=%d durationMs=%d",
+ connectionId,
+ result.indexed, result.skippedDuplicate, result.skippedPolicy,
+ result.attachmentsIndexed, result.failed, durationMs,
+ extra={
+ "event": "ingestion.connection.bootstrap.done",
+ "part": "gmail",
+ "connectionId": connectionId,
+ "indexed": result.indexed,
+ "skippedDup": result.skippedDuplicate,
+ "skippedPolicy": result.skippedPolicy,
+ "attachmentsIndexed": result.attachmentsIndexed,
+ "failed": result.failed,
+ "durationMs": durationMs,
+ },
+ )
+ # The returned connectionId comes from `result`, not from the argument.
+ return {
+ "connectionId": result.connectionId,
+ "indexed": result.indexed,
+ "skippedDuplicate": result.skippedDuplicate,
+ "skippedPolicy": result.skippedPolicy,
+ "attachmentsIndexed": result.attachmentsIndexed,
+ "failed": result.failed,
+ "durationMs": durationMs,
+ "errors": result.errors[:20],
+ }
diff --git a/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncOutlook.py b/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncOutlook.py
new file mode 100644
index 00000000..64a3545f
--- /dev/null
+++ b/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncOutlook.py
@@ -0,0 +1,576 @@
+# Copyright (c) 2025 Patrick Motsch
+# All rights reserved.
+"""Outlook bootstrap for the unified knowledge ingestion lane.
+
+Unlike SharePoint, Outlook messages are "virtual documents" — we never persist
+file bytes in the store. Each message becomes a `sourceKind="outlook_message"`
+IngestionJob whose `contentObjects` carry the header, snippet and cleaned body
+so retrieval can show a compact answer without fetching Graph again.
+
+Attachments are optional (`includeAttachments` limit flag) and enqueued as
+child jobs with `sourceKind="outlook_attachment"` + `provenance.parentId`.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import hashlib
+import logging
+import time
+from dataclasses import dataclass, field
+from typing import Any, Callable, Dict, List, Optional
+
+from modules.serviceCenter.services.serviceKnowledge.subTextClean import cleanEmailBody
+
+logger = logging.getLogger(__name__)
+
+ # Default values for the corresponding OutlookBootstrapLimits fields.
+ MAX_MESSAGES_DEFAULT = 500
+ MAX_FOLDERS_DEFAULT = 5
+ MAX_BODY_CHARS_DEFAULT = 8000
+ MAX_ATTACHMENT_BYTES_DEFAULT = 10 * 1024 * 1024
+ # Folder names looked up first (via me/mailFolders/{name}) by _selectFolderIds.
+ WELL_KNOWN_FOLDERS = ("inbox", "sentitems")
+
+
+ @dataclass
+ class OutlookBootstrapLimits:
+ """Caps and flags that bound a single Outlook bootstrap run."""
+ # Ingestion stops once indexed + skippedDuplicate reaches this count.
+ maxMessages: int = MAX_MESSAGES_DEFAULT
+ # Upper bound on the number of folders selected for the walk.
+ maxFolders: int = MAX_FOLDERS_DEFAULT
+ # Passed as `maxChars` to cleanEmailBody when building the body part.
+ maxBodyChars: int = MAX_BODY_CHARS_DEFAULT
+ # When True, messages flagged `hasAttachments` get child attachment jobs.
+ includeAttachments: bool = False
+ # Attachments whose reported size exceeds this are skipped (skippedPolicy).
+ maxAttachmentBytes: int = MAX_ATTACHMENT_BYTES_DEFAULT
+ # Only fetch messages newer than N days. None disables filter.
+ maxAgeDays: Optional[int] = 90
+ # Content depth: "metadata" | "snippet" | "full"
+ mailContentDepth: str = "full"
+ # Pass-through to IngestionJob.neutralize
+ neutralize: bool = False
+
+
+ @dataclass
+ class OutlookBootstrapResult:
+ """Mutable counters accumulated during one Outlook bootstrap run."""
+ connectionId: str
+ # Messages whose ingestion handle reported status "indexed".
+ indexed: int = 0
+ # Messages whose ingestion handle reported status "duplicate".
+ skippedDuplicate: int = 0
+ # Items skipped by rule: no id, oversized, undecodable or empty content.
+ skippedPolicy: int = 0
+ # Ingestion/extraction failures and handles with any other status.
+ failed: int = 0
+ # Attachment jobs submitted without an exception.
+ attachmentsIndexed: int = 0
+ # Human-readable error strings; _finalizeResult returns the first 20.
+ errors: List[str] = field(default_factory=list)
+
+
+ def _syntheticMessageId(connectionId: str, messageId: str) -> str:
+ """Return a deterministic id of the form `om:<connectionId[:8]>:<token>`.
+
+ `<token>` is the first 16 hex characters of the SHA-256 digest of
+ "connectionId:messageId", so equal inputs always give the same id.
+ """
+ token = hashlib.sha256(f"{connectionId}:{messageId}".encode("utf-8")).hexdigest()[:16]
+ return f"om:{connectionId[:8]}:{token}"
+
+
+ def _syntheticAttachmentId(connectionId: str, messageId: str, attachmentId: str) -> str:
+ """Return a deterministic id of the form `oa:<connectionId[:8]>:<token>`.
+
+ `<token>` is the first 16 hex characters of the SHA-256 digest of
+ "connectionId:messageId:attachmentId".
+ """
+ token = hashlib.sha256(
+ f"{connectionId}:{messageId}:{attachmentId}".encode("utf-8")
+ ).hexdigest()[:16]
+ return f"oa:{connectionId[:8]}:{token}"
+
+
+ def _extractRecipient(recipient: Dict[str, Any]) -> str:
+ """Format one recipient dict as "Name <address>".
+
+ Reads `recipient["emailAddress"]["name"]` and `["address"]`. When only one
+ of the two is present that value is returned alone; when neither is
+ present (or `recipient` is falsy) the result is the empty string.
+ """
+ email = (recipient or {}).get("emailAddress") or {}
+ name = email.get("name") or ""
+ addr = email.get("address") or ""
+ if name and addr:
+ return f"{name} <{addr}>"
+ # Prefer the address over the display name when only one exists.
+ return addr or name
+
+
+ def _joinRecipients(recipients: List[Dict[str, Any]]) -> str:
+ """Format each recipient with _extractRecipient and join them with ", ".
+
+ Recipients that format to an empty string are dropped; a falsy
+ `recipients` yields the empty string.
+ """
+ return ", ".join(filter(None, [_extractRecipient(r) for r in recipients or []]))
+
+
+ def _buildContentObjects(
+ message: Dict[str, Any],
+ maxBodyChars: int,
+ mailContentDepth: str = "full",
+ ) -> List[Dict[str, Any]]:
+ """Build content objects for an Outlook message.
+
+ `mailContentDepth` mirrors the Gmail walker:
+ - "metadata": header only
+ - "snippet": header + bodyPreview (~255 chars)
+ - "full": header + snippet + cleaned body (default)
+
+ Any other value behaves like "metadata". The returned list always starts
+ with the "header" object; "snippet" and "body" are appended only when
+ their text is non-empty. Each object has the keys `contentObjectId`,
+ `contentType`, `data` and `contextRef`.
+ """
+ subject = message.get("subject") or "(no subject)"
+ fromAddr = _extractRecipient(message.get("from") or {})
+ toAddr = _joinRecipients(message.get("toRecipients") or [])
+ ccAddr = _joinRecipients(message.get("ccRecipients") or [])
+ received = message.get("receivedDateTime") or ""
+ snippet = message.get("bodyPreview") or ""
+
+ parts: List[Dict[str, Any]] = []
+ # The Cc line is only emitted when there is at least one cc recipient.
+ header = (
+ f"Subject: {subject}\n"
+ f"From: {fromAddr}\n"
+ f"To: {toAddr}\n"
+ + (f"Cc: {ccAddr}\n" if ccAddr else "")
+ + f"Date: {received}"
+ )
+ parts.append({
+ "contentObjectId": "header",
+ "contentType": "text",
+ "data": header,
+ "contextRef": {"part": "header"},
+ })
+ if mailContentDepth in ("snippet", "full") and snippet:
+ parts.append({
+ "contentObjectId": "snippet",
+ "contentType": "text",
+ "data": snippet,
+ "contextRef": {"part": "snippet"},
+ })
+ if mailContentDepth == "full":
+ body = message.get("body") or {}
+ bodyContent = body.get("content") or ""
+ # cleanEmailBody is skipped for an empty body; its result may still be empty.
+ cleanedBody = cleanEmailBody(bodyContent, maxChars=maxBodyChars) if bodyContent else ""
+ if cleanedBody:
+ parts.append({
+ "contentObjectId": "body",
+ "contentType": "text",
+ "data": cleanedBody,
+ "contextRef": {"part": "body"},
+ })
+ return parts
+
+
+ async def bootstrapOutlook(
+ connectionId: str,
+ *,
+ progressCb: Optional[Callable[[int, Optional[str]], None]] = None,
+ adapter: Any = None,
+ connection: Any = None,
+ knowledgeService: Any = None,
+ limits: Optional[OutlookBootstrapLimits] = None,
+ ) -> Dict[str, Any]:
+ """Enumerate Outlook folders (inbox + sent by default) and ingest messages.
+
+ When `limits` is not given, limits are derived from the connection
+ preferences. Folders are processed in the order returned by
+ `_selectFolderIds` until `limits.maxMessages` messages have been indexed or
+ reported as duplicates. A failure inside one folder is logged and recorded
+ in the result; the remaining folders are still processed.
+
+ Returns the summary dict produced by `_finalizeResult`.
+ Exceptions raised by `_resolveDependencies` are not caught here.
+ """
+ from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs
+ # NOTE(review): prefs are loaded even when `limits` is supplied by the caller.
+ prefs = loadConnectionPrefs(connectionId)
+
+ if not limits:
+ limits = OutlookBootstrapLimits(
+ includeAttachments=prefs.mailIndexAttachments,
+ # A non-positive maxAgeDays preference disables the age filter.
+ maxAgeDays=prefs.maxAgeDays if prefs.maxAgeDays > 0 else None,
+ mailContentDepth=prefs.mailContentDepth,
+ neutralize=prefs.neutralizeBeforeEmbed,
+ )
+
+ startMs = time.time()
+ result = OutlookBootstrapResult(connectionId=connectionId)
+
+ logger.info(
+ "ingestion.connection.bootstrap.started part=outlook connectionId=%s",
+ connectionId,
+ extra={
+ "event": "ingestion.connection.bootstrap.started",
+ "part": "outlook",
+ "connectionId": connectionId,
+ },
+ )
+
+ # NOTE(review): if any one dependency is missing, all three are replaced,
+ # including ones the caller did inject.
+ if adapter is None or knowledgeService is None or connection is None:
+ adapter, connection, knowledgeService = await _resolveDependencies(connectionId)
+
+ mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else ""
+ userId = str(getattr(connection, "userId", "") or "") if connection is not None else ""
+
+ folderIds = await _selectFolderIds(adapter, limits)
+ for folderId in folderIds:
+ # Stop early once the overall message budget is used up.
+ if result.indexed + result.skippedDuplicate >= limits.maxMessages:
+ break
+ try:
+ await _ingestFolder(
+ adapter=adapter,
+ knowledgeService=knowledgeService,
+ connectionId=connectionId,
+ mandateId=mandateId,
+ userId=userId,
+ folderId=folderId,
+ limits=limits,
+ result=result,
+ progressCb=progressCb,
+ )
+ except Exception as exc:
+ logger.error("outlook ingestion folder %s failed: %s", folderId, exc, exc_info=True)
+ result.errors.append(f"folder({folderId}): {exc}")
+
+ return _finalizeResult(connectionId, result, startMs)
+
+
+ async def _resolveDependencies(connectionId: str):
+ """Resolve the Outlook adapter, connection and knowledge service.
+
+ Loads the UserConnection through the root interface, obtains a fresh
+ token, builds the "outlook" service adapter from an MsftConnector, and
+ requests the "knowledge" service with a context built from the root user
+ and the connection's mandate id.
+
+ Returns:
+ Tuple `(adapter, connection, knowledgeService)`.
+
+ Raises:
+ ValueError: if the connection does not exist, or no token with a
+ `tokenAccess` value is available.
+ """
+ from modules.interfaces.interfaceDbApp import getRootInterface
+ from modules.auth import TokenManager
+ from modules.connectors.providerMsft.connectorMsft import MsftConnector
+ from modules.serviceCenter import getService
+ from modules.serviceCenter.context import ServiceCenterContext
+ from modules.security.rootAccess import getRootUser
+
+ rootInterface = getRootInterface()
+ connection = rootInterface.getUserConnectionById(connectionId)
+ if connection is None:
+ raise ValueError(f"UserConnection not found: {connectionId}")
+
+ token = TokenManager().getFreshToken(connectionId)
+ if not token or not token.tokenAccess:
+ raise ValueError(f"No valid token for connection {connectionId}")
+
+ provider = MsftConnector(connection, token.tokenAccess)
+ adapter = provider.getServiceAdapter("outlook")
+
+ # The knowledge service is requested under the root user, scoped to the
+ # connection's mandate.
+ rootUser = getRootUser()
+ ctx = ServiceCenterContext(
+ user=rootUser,
+ mandate_id=str(getattr(connection, "mandateId", "") or ""),
+ )
+ knowledgeService = getService("knowledge", ctx)
+ return adapter, connection, knowledgeService
+
+
+ async def _selectFolderIds(adapter, limits: OutlookBootstrapLimits) -> List[str]:
+ """Prefer well-known folders (inbox, sentitems); fall back to browse().
+
+ Returns at most `limits.maxFolders` folder ids: first the ids resolved
+ for WELL_KNOWN_FOLDERS, then ids taken from `adapter.browse("/")` entries
+ (read from `entry.metadata["id"]`) that are not already in the list.
+ Lookup failures are tolerated and just leave the folder out.
+ """
+ folderIds: List[str] = []
+ for wellKnown in WELL_KNOWN_FOLDERS:
+ if len(folderIds) >= limits.maxFolders:
+ break
+ try:
+ row = await adapter._graphGet(f"me/mailFolders/{wellKnown}")
+ except Exception:
+ # Best-effort: a failing lookup is treated as "folder not available".
+ row = None
+ if isinstance(row, dict) and "error" not in row and row.get("id"):
+ folderIds.append(row["id"])
+
+ # Top up with browsed folders while there is room left.
+ if len(folderIds) < limits.maxFolders:
+ try:
+ entries = await adapter.browse("/")
+ except Exception:
+ entries = []
+ for entry in entries:
+ metadata = getattr(entry, "metadata", {}) or {}
+ fid = metadata.get("id")
+ if fid and fid not in folderIds:
+ folderIds.append(fid)
+ if len(folderIds) >= limits.maxFolders:
+ break
+ return folderIds
+
+
+ async def _ingestFolder(
+ *,
+ adapter,
+ knowledgeService,
+ connectionId: str,
+ mandateId: str,
+ userId: str,
+ folderId: str,
+ limits: OutlookBootstrapLimits,
+ result: OutlookBootstrapResult,
+ progressCb: Optional[Callable[[int, Optional[str]], None]],
+ ) -> None:
+ """Page through one mail folder and ingest its messages, newest first.
+
+ Requests `me/mailFolders/{folderId}/messages` ordered by
+ `receivedDateTime desc`, follows `@odata.nextLink` for further pages, and
+ hands every message to `_ingestMessage`. Stops when the overall
+ `limits.maxMessages` budget is reached, when no next link is returned, or
+ on the first page error (which is logged and appended to `result.errors`).
+ """
+ remaining = limits.maxMessages - (result.indexed + result.skippedDuplicate)
+ if remaining <= 0:
+ return
+
+ # Request at most 100 messages per page, fewer when the budget is smaller.
+ pageSize = min(100, remaining)
+ select = (
+ "id,subject,from,toRecipients,ccRecipients,receivedDateTime,"
+ "bodyPreview,body,internetMessageId,hasAttachments,changeKey"
+ )
+ endpoint: Optional[str] = (
+ f"me/mailFolders/{folderId}/messages"
+ f"?$top={pageSize}&$orderby=receivedDateTime desc&$select={select}"
+ )
+
+ # Keep header-based age filter in Graph itself to avoid shipping ancient
+ # messages we'd discard client-side.
+ if limits.maxAgeDays:
+ from datetime import datetime, timezone, timedelta
+
+ # Cutoff is computed in UTC and formatted with a trailing "Z".
+ cutoff = datetime.now(timezone.utc) - timedelta(days=limits.maxAgeDays)
+ cutoffIso = cutoff.strftime("%Y-%m-%dT%H:%M:%SZ")
+ endpoint = f"{endpoint}&$filter=receivedDateTime ge {cutoffIso}"
+
+ while endpoint and (result.indexed + result.skippedDuplicate) < limits.maxMessages:
+ try:
+ page = await adapter._graphGet(endpoint)
+ except Exception as exc:
+ logger.warning("outlook graph page failed for folder %s: %s", folderId, exc)
+ result.errors.append(f"graph({folderId}): {exc}")
+ return
+ # Errors can also arrive as a dict carrying an "error" key.
+ if not isinstance(page, dict) or "error" in page:
+ err = (page or {}).get("error") if isinstance(page, dict) else "unknown"
+ logger.warning("outlook graph page error for folder %s: %s", folderId, err)
+ result.errors.append(f"graph({folderId}): {err}")
+ return
+
+ for message in page.get("value", []) or []:
+ if result.indexed + result.skippedDuplicate >= limits.maxMessages:
+ break
+ await _ingestMessage(
+ adapter=adapter,
+ knowledgeService=knowledgeService,
+ connectionId=connectionId,
+ mandateId=mandateId,
+ userId=userId,
+ message=message,
+ limits=limits,
+ result=result,
+ progressCb=progressCb,
+ )
+
+ nextLink = page.get("@odata.nextLink")
+ if not nextLink:
+ break
+ # Strip Graph base so adapter._graphGet accepts the relative path.
+ from modules.connectors.providerMsft.connectorMsft import _stripGraphBase
+
+ endpoint = _stripGraphBase(nextLink)
+
+
+async def _ingestMessage(
+ *,
+ adapter,
+ knowledgeService,
+ connectionId: str,
+ mandateId: str,
+ userId: str,
+ message: Dict[str, Any],
+ limits: OutlookBootstrapLimits,
+ result: OutlookBootstrapResult,
+ progressCb: Optional[Callable[[int, Optional[str]], None]],
+) -> None:
+ from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
+
+ messageId = message.get("id")
+ if not messageId:
+ result.skippedPolicy += 1
+ return
+ revision = message.get("changeKey") or message.get("internetMessageId")
+ subject = message.get("subject") or "(no subject)"
+ syntheticId = _syntheticMessageId(connectionId, messageId)
+ fileName = f"{subject[:80].strip()}.eml" if subject else f"{messageId}.eml"
+
+ contentObjects = _buildContentObjects(
+ message, limits.maxBodyChars, mailContentDepth=limits.mailContentDepth
+ )
+ # Always at least the header is emitted, so `contentObjects` is non-empty.
+ try:
+ handle = await knowledgeService.requestIngestion(
+ IngestionJob(
+ sourceKind="outlook_message",
+ sourceId=syntheticId,
+ fileName=fileName,
+ mimeType="message/rfc822",
+ userId=userId,
+ mandateId=mandateId,
+ contentObjects=contentObjects,
+ contentVersion=revision,
+ neutralize=limits.neutralize,
+ provenance={
+ "connectionId": connectionId,
+ "authority": "msft",
+ "service": "outlook",
+ "externalItemId": messageId,
+ "internetMessageId": message.get("internetMessageId"),
+ "tier": limits.mailContentDepth,
+ },
+ )
+ )
+ except Exception as exc:
+ logger.error("outlook ingestion %s failed: %s", messageId, exc, exc_info=True)
+ result.failed += 1
+ result.errors.append(f"ingest({messageId}): {exc}")
+ return
+
+ if handle.status == "duplicate":
+ result.skippedDuplicate += 1
+ elif handle.status == "indexed":
+ result.indexed += 1
+ else:
+ result.failed += 1
+
+ if limits.includeAttachments and message.get("hasAttachments"):
+ try:
+ await _ingestAttachments(
+ adapter=adapter,
+ knowledgeService=knowledgeService,
+ connectionId=connectionId,
+ mandateId=mandateId,
+ userId=userId,
+ messageId=messageId,
+ parentSyntheticId=syntheticId,
+ limits=limits,
+ result=result,
+ )
+ except Exception as exc:
+ logger.warning("outlook attachments %s failed: %s", messageId, exc)
+ result.errors.append(f"attachments({messageId}): {exc}")
+
+ if progressCb is not None and (result.indexed + result.skippedDuplicate) % 50 == 0:
+ processed = result.indexed + result.skippedDuplicate
+ try:
+ progressCb(
+ min(90, 10 + int(80 * processed / max(1, limits.maxMessages))),
+ f"outlook processed={processed}",
+ )
+ except Exception:
+ pass
+ logger.info(
+ "ingestion.connection.bootstrap.progress part=outlook processed=%d skippedDup=%d failed=%d",
+ processed, result.skippedDuplicate, result.failed,
+ extra={
+ "event": "ingestion.connection.bootstrap.progress",
+ "part": "outlook",
+ "connectionId": connectionId,
+ "processed": processed,
+ "skippedDup": result.skippedDuplicate,
+ "failed": result.failed,
+ },
+ )
+
+ await asyncio.sleep(0)
+
+
+ async def _ingestAttachments(
+ *,
+ adapter,
+ knowledgeService,
+ connectionId: str,
+ mandateId: str,
+ userId: str,
+ messageId: str,
+ parentSyntheticId: str,
+ limits: OutlookBootstrapLimits,
+ result: OutlookBootstrapResult,
+ ) -> None:
+ """Child ingestion jobs for file attachments (skip inline & oversized).
+
+ Lists `me/messages/{messageId}/attachments`, keeps only non-inline
+ `#microsoft.graph.fileAttachment` entries that carry `contentBytes`, runs
+ each through the extraction pipeline and submits the extracted parts as a
+ `sourceKind="outlook_attachment"` job linked to the parent via
+ `provenance.parentId`.
+
+ Counter effects on `result`: oversized, undecodable or content-less
+ attachments bump `skippedPolicy`; extraction and ingestion failures bump
+ `failed`; successful submissions bump `attachmentsIndexed`.
+ """
+ from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
+ from modules.datamodels.datamodelExtraction import ExtractionOptions
+ from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
+ from modules.serviceCenter.services.serviceExtraction.subRegistry import (
+ ExtractorRegistry, ChunkerRegistry,
+ )
+ import base64
+
+ page = await adapter._graphGet(f"me/messages/{messageId}/attachments")
+ # NOTE(review): a failed listing returns silently, with no log line, counter
+ # or `result.errors` entry; only the first response page is read.
+ if not isinstance(page, dict) or "error" in page:
+ return
+
+ extractorRegistry = ExtractorRegistry()
+ chunkerRegistry = ChunkerRegistry()
+
+ for attachment in page.get("value", []) or []:
+ # Only plain file attachments are handled; other attachment types are ignored.
+ if attachment.get("@odata.type") != "#microsoft.graph.fileAttachment":
+ continue
+ if attachment.get("isInline"):
+ continue
+ # A reported size of 0 is treated as unknown and is not filtered.
+ size = int(attachment.get("size") or 0)
+ if size and size > limits.maxAttachmentBytes:
+ result.skippedPolicy += 1
+ continue
+ contentBytesB64 = attachment.get("contentBytes")
+ if not contentBytesB64:
+ continue
+ try:
+ rawBytes = base64.b64decode(contentBytesB64)
+ except Exception:
+ result.skippedPolicy += 1
+ continue
+ fileName = attachment.get("name") or "attachment"
+ mimeType = attachment.get("contentType") or "application/octet-stream"
+ # Fall back to the file name when Graph supplies no attachment id.
+ attachmentId = attachment.get("id") or fileName
+ syntheticId = _syntheticAttachmentId(connectionId, messageId, attachmentId)
+
+ try:
+ extracted = runExtraction(
+ extractorRegistry, chunkerRegistry,
+ rawBytes, fileName, mimeType,
+ ExtractionOptions(mergeStrategy=None),
+ )
+ except Exception as exc:
+ logger.warning("outlook attachment extract %s failed: %s", attachmentId, exc)
+ result.failed += 1
+ continue
+
+ # Map extracted parts to content objects, dropping empty/whitespace parts.
+ contentObjects: List[Dict[str, Any]] = []
+ for part in getattr(extracted, "parts", None) or []:
+ data = getattr(part, "data", None) or ""
+ if not data or not str(data).strip():
+ continue
+ typeGroup = getattr(part, "typeGroup", "text") or "text"
+ contentType = "text"
+ if typeGroup == "image":
+ contentType = "image"
+ elif typeGroup in ("binary", "container"):
+ contentType = "other"
+ contentObjects.append({
+ "contentObjectId": getattr(part, "id", ""),
+ "contentType": contentType,
+ "data": data,
+ "contextRef": {
+ "containerPath": fileName,
+ "location": getattr(part, "label", None) or "attachment",
+ **(getattr(part, "metadata", None) or {}),
+ },
+ })
+ if not contentObjects:
+ result.skippedPolicy += 1
+ continue
+
+ try:
+ await knowledgeService.requestIngestion(
+ IngestionJob(
+ sourceKind="outlook_attachment",
+ sourceId=syntheticId,
+ fileName=fileName,
+ mimeType=mimeType,
+ userId=userId,
+ mandateId=mandateId,
+ contentObjects=contentObjects,
+ neutralize=limits.neutralize,
+ provenance={
+ "connectionId": connectionId,
+ "authority": "msft",
+ "service": "outlook",
+ "parentId": parentSyntheticId,
+ "externalItemId": attachmentId,
+ "parentMessageId": messageId,
+ },
+ )
+ )
+ # Counted as soon as the request returns; the handle status is not inspected.
+ result.attachmentsIndexed += 1
+ except Exception as exc:
+ logger.warning("outlook attachment ingest %s failed: %s", attachmentId, exc)
+ result.failed += 1
+
+
+ def _finalizeResult(connectionId: str, result: OutlookBootstrapResult, startMs: float) -> Dict[str, Any]:
+ """Log the final Outlook bootstrap summary and return it as a plain dict.
+
+ `startMs` is compared against `time.time()`, i.e. it is a timestamp in
+ seconds; the reported `durationMs` is the elapsed time in milliseconds.
+ Only the first 20 entries of `result.errors` are returned.
+ """
+ durationMs = int((time.time() - startMs) * 1000)
+ # The counters go into both the message and `extra` for structured logging.
+ logger.info(
+ "ingestion.connection.bootstrap.done part=outlook connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d attachments=%d failed=%d durationMs=%d",
+ connectionId,
+ result.indexed, result.skippedDuplicate, result.skippedPolicy,
+ result.attachmentsIndexed, result.failed, durationMs,
+ extra={
+ "event": "ingestion.connection.bootstrap.done",
+ "part": "outlook",
+ "connectionId": connectionId,
+ "indexed": result.indexed,
+ "skippedDup": result.skippedDuplicate,
+ "skippedPolicy": result.skippedPolicy,
+ "attachmentsIndexed": result.attachmentsIndexed,
+ "failed": result.failed,
+ "durationMs": durationMs,
+ },
+ )
+ # The returned connectionId comes from `result`, not from the argument.
+ return {
+ "connectionId": result.connectionId,
+ "indexed": result.indexed,
+ "skippedDuplicate": result.skippedDuplicate,
+ "skippedPolicy": result.skippedPolicy,
+ "attachmentsIndexed": result.attachmentsIndexed,
+ "failed": result.failed,
+ "durationMs": durationMs,
+ "errors": result.errors[:20],
+ }
diff --git a/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncSharepoint.py b/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncSharepoint.py
new file mode 100644
index 00000000..07fef7a8
--- /dev/null
+++ b/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncSharepoint.py
@@ -0,0 +1,433 @@
+# Copyright (c) 2025 Patrick Motsch
+# All rights reserved.
+"""SharePoint bootstrap for the unified knowledge ingestion lane.
+
+Walks the SharePoint drive(s) reachable via a UserConnection, downloads each
+file-like item, runs the standard content extraction pipeline and hands the
+result to `KnowledgeService.requestIngestion`. Idempotency is provided by the
+ingestion façade itself; repeat bootstraps therefore produce
+`ingestion.skipped.duplicate` for every unchanged item because we pass the
+Graph `eTag` as `contentVersion`.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import hashlib
+import logging
+import time
+from dataclasses import dataclass, field
+from typing import Any, Callable, Dict, List, Optional
+
+from modules.datamodels.datamodelExtraction import ExtractionOptions
+
+logger = logging.getLogger(__name__)
+
+ # Default values for the corresponding SharepointBootstrapLimits fields.
+ MAX_ITEMS_DEFAULT = 500
+ MAX_BYTES_DEFAULT = 200 * 1024 * 1024
+ MAX_FILE_SIZE_DEFAULT = 25 * 1024 * 1024
+ SKIP_MIME_PREFIXES_DEFAULT = ("video/", "audio/")
+ MAX_DEPTH_DEFAULT = 4
+ MAX_SITES_DEFAULT = 3
+
+
+ @dataclass
+ class SharepointBootstrapLimits:
+ """Caps and flags that bound a single SharePoint bootstrap run."""
+ # The walk stops once indexed + skippedDuplicate reaches this count.
+ maxItems: int = MAX_ITEMS_DEFAULT
+ # The walk stops once `bytesProcessed` reaches this total.
+ maxBytes: int = MAX_BYTES_DEFAULT
+ # Presumably the per-file size cap — TODO confirm against the file handler.
+ maxFileSize: int = MAX_FILE_SIZE_DEFAULT
+ # Presumably MIME-type prefixes to skip — TODO confirm against the file handler.
+ skipMimePrefixes: tuple = SKIP_MIME_PREFIXES_DEFAULT
+ # Folders nested deeper than this (site root = depth 0) are not browsed.
+ maxDepth: int = MAX_DEPTH_DEFAULT
+ # Maximum number of sites requested from, and walked after, site discovery.
+ maxSites: int = MAX_SITES_DEFAULT
+ # Pass-through to IngestionJob.neutralize
+ neutralize: bool = False
+
+
+ @dataclass
+ class SharepointBootstrapResult:
+ """Mutable counters accumulated during one SharePoint bootstrap run."""
+ connectionId: str
+ # indexed + skippedDuplicate is the quantity compared against maxItems.
+ indexed: int = 0
+ skippedDuplicate: int = 0
+ skippedPolicy: int = 0
+ failed: int = 0
+ # Running byte total compared against maxBytes during the walk.
+ bytesProcessed: int = 0
+ # Human-readable error strings (site discovery, walk and browse failures).
+ errors: List[str] = field(default_factory=list)
+
+
+ def _syntheticFileId(connectionId: str, externalItemId: str) -> str:
+ """Deterministic synthetic FileContentIndex id for a SharePoint item.
+
+ Stable across bootstraps → idempotency works; independent of file name so
+ moves/renames don't duplicate chunks.
+
+ The id has the form `sp:<connectionId[:8]>:<token>`, where `<token>` is the
+ first 16 hex characters of the SHA-256 digest of
+ "connectionId:externalItemId".
+ """
+ token = hashlib.sha256(f"{connectionId}:{externalItemId}".encode("utf-8")).hexdigest()[:16]
+ return f"sp:{connectionId[:8]}:{token}"
+
+
+ def _toContentObjects(extracted, fileName: str) -> List[Dict[str, Any]]:
+ """Translate ExtractionResult → content objects accepted by requestIngestion.
+
+ Reads `extracted.parts` (a missing or empty attribute yields `[]`). Parts
+ whose `data` is empty or whitespace-only are dropped. `typeGroup` maps to
+ `contentType` as: "image" → "image", "binary"/"container" → "other",
+ anything else → "text".
+ """
+ parts = getattr(extracted, "parts", None) or []
+ out: List[Dict[str, Any]] = []
+ for part in parts:
+ data = getattr(part, "data", None) or ""
+ if not data or not str(data).strip():
+ continue
+ typeGroup = getattr(part, "typeGroup", "text") or "text"
+ contentType = "text"
+ if typeGroup == "image":
+ contentType = "image"
+ elif typeGroup in ("binary", "container"):
+ contentType = "other"
+ out.append({
+ "contentObjectId": getattr(part, "id", ""),
+ "contentType": contentType,
+ "data": data,
+ # Part metadata is merged last, so its keys can override
+ # "containerPath" and "location".
+ "contextRef": {
+ "containerPath": fileName,
+ "location": getattr(part, "label", None) or "file",
+ **(getattr(part, "metadata", None) or {}),
+ },
+ })
+ return out
+
+
+ async def bootstrapSharepoint(
+ connectionId: str,
+ *,
+ progressCb: Optional[Callable[[int, Optional[str]], None]] = None,
+ adapter: Any = None,
+ connection: Any = None,
+ knowledgeService: Any = None,
+ limits: Optional[SharepointBootstrapLimits] = None,
+ runExtractionFn: Optional[Callable[..., Any]] = None,
+ ) -> Dict[str, Any]:
+ """Enumerate SharePoint drives and ingest every reachable file via the façade.
+
+ Parameters allow injection for tests; production callers pass only
+ `connectionId` (and optionally a progressCb) and everything else is
+ resolved against the registered services.
+
+ Site discovery failure ends the run early with the error recorded; a
+ failure while walking one site is logged and recorded and the remaining
+ sites are still walked. Returns the summary dict from `_finalizeResult`.
+ Exceptions raised by `_resolveDependencies` are not caught here.
+ """
+ from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs
+ # NOTE(review): prefs are loaded even when `limits` is supplied by the caller.
+ prefs = loadConnectionPrefs(connectionId)
+
+ if not limits:
+ limits = SharepointBootstrapLimits(neutralize=prefs.neutralizeBeforeEmbed)
+
+ startMs = time.time()
+ result = SharepointBootstrapResult(connectionId=connectionId)
+
+ logger.info(
+ "ingestion.connection.bootstrap.started part=sharepoint connectionId=%s",
+ connectionId,
+ extra={
+ "event": "ingestion.connection.bootstrap.started",
+ "part": "sharepoint",
+ "connectionId": connectionId,
+ },
+ )
+
+ # NOTE(review): if any one dependency is missing, all three are replaced,
+ # including ones the caller did inject.
+ if adapter is None or knowledgeService is None or connection is None:
+ adapter, connection, knowledgeService = await _resolveDependencies(connectionId)
+ if runExtractionFn is None:
+ from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
+ from modules.serviceCenter.services.serviceExtraction.subRegistry import (
+ ExtractorRegistry, ChunkerRegistry,
+ )
+ extractorRegistry = ExtractorRegistry()
+ chunkerRegistry = ChunkerRegistry()
+
+ # Default extraction callable: binds one shared pair of registries so the
+ # callers only pass (bytes, name, mime, options).
+ def runExtractionFn(bytesData, name, mime, options): # type: ignore[no-redef]
+ return runExtraction(extractorRegistry, chunkerRegistry, bytesData, name, mime, options)
+
+ mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else ""
+ userId = str(getattr(connection, "userId", "") or "") if connection is not None else ""
+
+ try:
+ sites = await adapter.browse("/", limit=limits.maxSites)
+ except Exception as exc:
+ logger.error("sharepoint site discovery failed for %s: %s", connectionId, exc, exc_info=True)
+ result.errors.append(f"site_discovery: {exc}")
+ return _finalizeResult(connectionId, result, startMs)
+
+ # The slice re-applies maxSites in case browse() returned more entries.
+ for site in sites[: limits.maxSites]:
+ if result.indexed + result.skippedDuplicate >= limits.maxItems:
+ break
+ sitePath = getattr(site, "path", "") or ""
+ try:
+ await _walkFolder(
+ adapter=adapter,
+ knowledgeService=knowledgeService,
+ runExtractionFn=runExtractionFn,
+ connectionId=connectionId,
+ mandateId=mandateId,
+ userId=userId,
+ folderPath=sitePath,
+ depth=0,
+ limits=limits,
+ result=result,
+ progressCb=progressCb,
+ )
+ except Exception as exc:
+ logger.error("sharepoint walk failed for site %s: %s", sitePath, exc, exc_info=True)
+ result.errors.append(f"walk({sitePath}): {exc}")
+
+ return _finalizeResult(connectionId, result, startMs)
+
+
+ async def _resolveDependencies(connectionId: str):
+ """Load connection, instantiate SharepointAdapter, and build a KnowledgeService.
+
+ Runs with root privileges: bootstrap is a system operation triggered by an
+ authenticated user via callback; it must not be gated by a per-user
+ service-center context.
+
+ Returns:
+ Tuple `(adapter, connection, knowledgeService)`.
+
+ Raises:
+ ValueError: if the connection does not exist, or no token with a
+ `tokenAccess` value is available.
+ """
+ from modules.interfaces.interfaceDbApp import getRootInterface
+ from modules.auth import TokenManager
+ from modules.connectors.providerMsft.connectorMsft import MsftConnector
+ from modules.serviceCenter import getService
+ from modules.serviceCenter.context import ServiceCenterContext
+ from modules.security.rootAccess import getRootUser
+
+ rootInterface = getRootInterface()
+ connection = rootInterface.getUserConnectionById(connectionId)
+ if connection is None:
+ raise ValueError(f"UserConnection not found: {connectionId}")
+
+ token = TokenManager().getFreshToken(connectionId)
+ if not token or not token.tokenAccess:
+ raise ValueError(f"No valid token for connection {connectionId}")
+
+ provider = MsftConnector(connection, token.tokenAccess)
+ adapter = provider.getServiceAdapter("sharepoint")
+
+ # The knowledge service is requested under the root user, scoped to the
+ # connection's mandate.
+ rootUser = getRootUser()
+ ctx = ServiceCenterContext(
+ user=rootUser,
+ mandate_id=str(getattr(connection, "mandateId", "") or ""),
+ )
+ knowledgeService = getService("knowledge", ctx)
+ return adapter, connection, knowledgeService
+
+
+async def _walkFolder(
+    *,
+    adapter,
+    knowledgeService,
+    runExtractionFn,
+    connectionId: str,
+    mandateId: str,
+    userId: str,
+    folderPath: str,
+    depth: int,
+    limits: SharepointBootstrapLimits,
+    result: SharepointBootstrapResult,
+    progressCb: Optional[Callable[[int, Optional[str]], None]],
+) -> None:
+    """Walk one SharePoint folder depth-first and ingest the files it contains.
+
+    Sub-folders are visited recursively with ``depth + 1``; nothing is done
+    once ``depth`` exceeds ``limits.maxDepth``. Files whose MIME type starts
+    with one of ``limits.skipMimePrefixes`` or whose reported size exceeds
+    ``limits.maxFileSize`` are counted in ``result.skippedPolicy``; all other
+    files are handed to ``_ingestOne``.
+
+    The walk of this folder stops as soon as the item budget
+    (``indexed + skippedDuplicate >= limits.maxItems``) or the byte budget
+    (``bytesProcessed >= limits.maxBytes``) is reached. A failing ``browse``
+    is logged, recorded in ``result.errors`` and ends the walk of this folder
+    without raising.
+    """
+    if depth > limits.maxDepth:
+        return
+    try:
+        children = await adapter.browse(folderPath)
+    except Exception as exc:
+        logger.warning("sharepoint browse %s failed: %s", folderPath, exc)
+        result.errors.append(f"browse({folderPath}): {exc}")
+        return
+
+    # Arguments passed through unchanged to the recursive call and _ingestOne.
+    shared = dict(
+        adapter=adapter,
+        knowledgeService=knowledgeService,
+        runExtractionFn=runExtractionFn,
+        connectionId=connectionId,
+        mandateId=mandateId,
+        userId=userId,
+        limits=limits,
+        result=result,
+        progressCb=progressCb,
+    )
+
+    for child in children:
+        itemBudgetSpent = result.indexed + result.skippedDuplicate >= limits.maxItems
+        if itemBudgetSpent or result.bytesProcessed >= limits.maxBytes:
+            return
+
+        childPath = getattr(child, "path", "") or ""
+        if getattr(child, "isFolder", False):
+            await _walkFolder(folderPath=childPath, depth=depth + 1, **shared)
+            continue
+
+        mimeType = getattr(child, "mimeType", None) or "application/octet-stream"
+        rejected = any(mimeType.startswith(prefix) for prefix in limits.skipMimePrefixes)
+        if not rejected:
+            # A missing or zero size is treated as "unknown" and is not rejected.
+            declaredSize = int(getattr(child, "size", 0) or 0)
+            rejected = bool(declaredSize) and declaredSize > limits.maxFileSize
+        if rejected:
+            result.skippedPolicy += 1
+            continue
+
+        itemMeta = getattr(child, "metadata", {}) or {}
+        await _ingestOne(
+            entry=child,
+            entryPath=childPath,
+            mimeType=mimeType,
+            # Fall back to the path when the adapter supplies no stable id.
+            externalItemId=itemMeta.get("id") or childPath,
+            revision=itemMeta.get("revision") or itemMeta.get("lastModifiedDateTime"),
+            **shared,
+        )
+
+
+async def _ingestOne(
+    *,
+    adapter,
+    knowledgeService,
+    runExtractionFn,
+    connectionId: str,
+    mandateId: str,
+    userId: str,
+    entry,
+    entryPath: str,
+    mimeType: str,
+    externalItemId: str,
+    revision: Optional[str],
+    limits: SharepointBootstrapLimits,
+    result: SharepointBootstrapResult,
+    progressCb: Optional[Callable[[int, Optional[str]], None]],
+) -> None:
+    """Download, extract and ingest one SharePoint file, updating ``result``.
+
+    Every stage (download, extraction, ingestion request) is best-effort: a
+    failure is counted in ``result.failed``, recorded in ``result.errors`` and
+    the function returns without raising. Files that yield no content objects
+    are counted in ``result.skippedPolicy``.
+
+    The ingestion handle's status decides the final counter: ``"duplicate"``
+    increments ``result.skippedDuplicate``, ``"indexed"`` increments
+    ``result.indexed``, anything else increments ``result.failed``.
+
+    When ``progressCb`` is given, progress is reported (and logged) each time
+    an item moves ``indexed + skippedDuplicate`` onto a multiple of 50.
+    """
+    from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
+
+    syntheticFileId = _syntheticFileId(connectionId, externalItemId)
+    fileName = getattr(entry, "name", "") or externalItemId
+
+    try:
+        # NOTE(review): assumes adapter.download returns a bytes-like payload
+        # (len() is taken below) — confirm for the SharePoint adapter.
+        fileBytes = await adapter.download(entryPath)
+    except Exception as exc:
+        logger.warning("sharepoint download %s failed: %s", entryPath, exc)
+        result.failed += 1
+        result.errors.append(f"download({entryPath}): {exc}")
+        return
+    if not fileBytes:
+        # Record why the item failed so the summary is not silently lossy.
+        result.failed += 1
+        result.errors.append(f"download({entryPath}): empty content")
+        return
+
+    result.bytesProcessed += len(fileBytes)
+
+    try:
+        extracted = runExtractionFn(
+            fileBytes, fileName, mimeType,
+            ExtractionOptions(mergeStrategy=None),
+        )
+    except Exception as exc:
+        logger.warning("sharepoint extraction %s failed: %s", entryPath, exc)
+        result.failed += 1
+        result.errors.append(f"extract({entryPath}): {exc}")
+        return
+
+    contentObjects = _toContentObjects(extracted, fileName)
+    if not contentObjects:
+        result.skippedPolicy += 1
+        return
+
+    provenance: Dict[str, Any] = {
+        "connectionId": connectionId,
+        "authority": "msft",
+        "service": "sharepoint",
+        "externalItemId": externalItemId,
+        "externalPath": entryPath,
+        "revision": revision,
+    }
+    try:
+        handle = await knowledgeService.requestIngestion(
+            IngestionJob(
+                sourceKind="sharepoint_item",
+                sourceId=syntheticFileId,
+                fileName=fileName,
+                mimeType=mimeType,
+                userId=userId,
+                mandateId=mandateId,
+                contentObjects=contentObjects,
+                contentVersion=revision,
+                neutralize=limits.neutralize,
+                provenance=provenance,
+            )
+        )
+    except Exception as exc:
+        logger.error("sharepoint ingestion %s failed: %s", entryPath, exc, exc_info=True)
+        result.failed += 1
+        result.errors.append(f"ingest({entryPath}): {exc}")
+        return
+
+    advanced = True
+    if handle.status == "duplicate":
+        result.skippedDuplicate += 1
+    elif handle.status == "indexed":
+        result.indexed += 1
+    else:
+        advanced = False
+        result.failed += 1
+        if handle.error:
+            result.errors.append(f"ingest({entryPath}): {handle.error}")
+
+    processed = result.indexed + result.skippedDuplicate
+    # Report only when THIS item moved the counter onto a multiple of 50.
+    # Without the `advanced` guard a failed item re-emits the previous
+    # milestone, and every failure while processed == 0 emits one as well.
+    if progressCb is not None and advanced and processed % 50 == 0:
+        try:
+            progressCb(
+                min(90, 10 + int(80 * processed / max(1, limits.maxItems))),
+                f"sharepoint processed={processed}",
+            )
+        except Exception as exc:
+            # Progress reporting is best-effort and must never abort ingestion.
+            logger.debug("sharepoint progress callback failed: %s", exc)
+        logger.info(
+            "ingestion.connection.bootstrap.progress part=sharepoint processed=%d skippedDup=%d failed=%d",
+            processed, result.skippedDuplicate, result.failed,
+            extra={
+                "event": "ingestion.connection.bootstrap.progress",
+                "part": "sharepoint",
+                "connectionId": connectionId,
+                "processed": processed,
+                "skippedDup": result.skippedDuplicate,
+                "failed": result.failed,
+            },
+        )
+
+    # Yield so the event loop can interleave other tasks (download/extract are
+    # CPU-ish and extraction uses sync libs; cooperative scheduling prevents
+    # starving other workers).
+    await asyncio.sleep(0)
+
+
+def _finalizeResult(connectionId: str, result: SharepointBootstrapResult, startMs: float) -> Dict[str, Any]:
+    """Log the bootstrap completion event and return the summary payload.
+
+    The duration is ``(time.time() - startMs) * 1000`` truncated to an int,
+    i.e. ``startMs`` is used as a ``time.time()`` style epoch value.
+
+    Returns:
+        Dict with the counters of ``result``, ``bytesProcessed``,
+        ``durationMs`` and at most the first 20 entries of ``result.errors``.
+    """
+    elapsedMs = int((time.time() - startMs) * 1000)
+
+    # Structured fields mirrored into the log record for machine consumption.
+    logFields = {
+        "event": "ingestion.connection.bootstrap.done",
+        "part": "sharepoint",
+        "connectionId": connectionId,
+        "indexed": result.indexed,
+        "skippedDup": result.skippedDuplicate,
+        "skippedPolicy": result.skippedPolicy,
+        "failed": result.failed,
+        "durationMs": elapsedMs,
+    }
+    logger.info(
+        "ingestion.connection.bootstrap.done part=sharepoint connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d failed=%d durationMs=%d",
+        connectionId,
+        result.indexed,
+        result.skippedDuplicate,
+        result.skippedPolicy,
+        result.failed,
+        elapsedMs,
+        extra=logFields,
+    )
+
+    summary = {
+        "connectionId": result.connectionId,
+        "indexed": result.indexed,
+        "skippedDuplicate": result.skippedDuplicate,
+        "skippedPolicy": result.skippedPolicy,
+        "failed": result.failed,
+        "bytesProcessed": result.bytesProcessed,
+        "durationMs": elapsedMs,
+        # Cap the error list so the payload stays small.
+        "errors": result.errors[:20],
+    }
+    return summary
diff --git a/modules/serviceCenter/services/serviceKnowledge/subTextClean.py b/modules/serviceCenter/services/serviceKnowledge/subTextClean.py
new file mode 100644
index 00000000..2d352cfa
--- /dev/null
+++ b/modules/serviceCenter/services/serviceKnowledge/subTextClean.py
@@ -0,0 +1,107 @@
+# Copyright (c) 2025 Patrick Motsch
+# All rights reserved.
+"""Text normalisation utilities used by knowledge ingestion.
+
+The email body cleaning logic is intentionally regex-based and works on plain
+text after an HTML→text pass so we never store unsanitised HTML/JS in the
+knowledge store and retrieval stays robust (no extraneous markup tokens
+eating embedding budget).
+"""
+
+from __future__ import annotations
+
+import re
+from typing import Optional
+
+# Default upper bound (in characters) used by cleanEmailBody when the caller
+# does not pass maxChars.
+DEFAULT_MAX_CHARS = 8000
+
+
+# Line-anchored patterns (English and German) that mark the start of a quoted
+# reply chain; _stripQuotedThread cuts the text at the earliest match.
+# NOTE(review): the From:/Von:/Sent:/Gesendet: patterns match ANY line that
+# starts with such a header, so a body that cites one inline is cut there too.
+_QUOTE_MARKER_PATTERNS = [
+    re.compile(r"^\s*(?:On\s.+?\swrote:)\s*$", re.MULTILINE | re.IGNORECASE),
+    re.compile(r"^\s*(?:Am\s.+?\sschrieb.+?:)\s*$", re.MULTILINE | re.IGNORECASE),
+    re.compile(r"^\s*-{2,}\s*Original\s*Message\s*-{2,}\s*$", re.MULTILINE | re.IGNORECASE),
+    re.compile(r"^\s*-{2,}\s*Urspr.+Nachricht\s*-{2,}\s*$", re.MULTILINE | re.IGNORECASE),
+    re.compile(r"^\s*From:\s+.+$", re.MULTILINE | re.IGNORECASE),
+    re.compile(r"^\s*Von:\s+.+$", re.MULTILINE | re.IGNORECASE),
+    re.compile(r"^\s*Sent:\s+.+$", re.MULTILINE | re.IGNORECASE),
+    re.compile(r"^\s*Gesendet:\s+.+$", re.MULTILINE | re.IGNORECASE),
+]
+
+# Line-anchored patterns that mark the start of a signature block: a line made
+# only of two or more dashes, a lone em dash, or a common English/German
+# closing formula. _stripSignature cuts the text at the earliest match.
+_SIGNATURE_MARKERS = [
+    re.compile(r"^\s*-{2,}\s*$", re.MULTILINE),
+    re.compile(r"^\s*—\s*$", re.MULTILINE),
+    re.compile(r"^\s*Best regards\b.*$", re.MULTILINE | re.IGNORECASE),
+    re.compile(r"^\s*Kind regards\b.*$", re.MULTILINE | re.IGNORECASE),
+    re.compile(r"^\s*Mit freundlichen Gr[üu]ßen\b.*$", re.MULTILINE | re.IGNORECASE),
+    re.compile(r"^\s*Viele Gr[üu]ße\b.*$", re.MULTILINE | re.IGNORECASE),
+    re.compile(r"^\s*Best,\s*$", re.MULTILINE | re.IGNORECASE),
+]
+
+
+def _htmlToText(html: str) -> str:
+    """Convert an HTML fragment to plain text.
+
+    Prefers BeautifulSoup when it can be imported and parses the input;
+    on any exception a regex-based fallback is used instead. Both paths drop
+    ``<script>``/``<style>``/``<head>`` content, turn ``<br>`` and the end of
+    ``p``/``div``/``li``/``tr`` elements into newlines and remove all other
+    tags. Finally non-breaking spaces become regular spaces and zero-width
+    spaces are removed.
+    """
+    try:
+        from bs4 import BeautifulSoup  # type: ignore
+
+        soup = BeautifulSoup(html, "html.parser")
+        for tag in soup(["script", "style", "head"]):
+            tag.decompose()
+        for br in soup.find_all(["br"]):
+            br.replace_with("\n")
+        for p in soup.find_all(["p", "div", "li", "tr"]):
+            p.append("\n")
+        text = soup.get_text()
+    except Exception:
+        # Minimal fallback: strip tags crudely.
+        # Remove script/style/head together with their content first, so the
+        # fallback does not leak JS/CSS source into the text (the
+        # BeautifulSoup path above decomposes the same elements).
+        text = re.sub(
+            r"<(script|style|head)\b[^>]*>.*?</\1\s*>",
+            "",
+            html,
+            flags=re.IGNORECASE | re.DOTALL,
+        )
+        text = re.sub(r"<br\s*/?>", "\n", text, flags=re.IGNORECASE)
+        text = re.sub(r"</(?:p|div|li|tr)>", "\n", text, flags=re.IGNORECASE)
+        text = re.sub(r"<[^>]+>", "", text)
+    # Collapse non-breaking + zero-width whitespace.
+    text = text.replace("\u00a0", " ").replace("\u200b", "")
+    return text
+
+
+def _stripQuotedThread(text: str) -> str:
+    """Cut ``text`` where the quoted reply chain begins.
+
+    The cut position is the smallest start offset among the first match of
+    each quote-marker pattern and the first run of ``>``-prefixed lines; with
+    no match the text is kept whole. Trailing whitespace of the kept part is
+    removed.
+    """
+    cutCandidates = [len(text)]
+    for marker in _QUOTE_MARKER_PATTERNS:
+        hit = marker.search(text)
+        if hit:
+            cutCandidates.append(hit.start())
+    # Blocks of "> " quoted lines (often produced by Gmail/Thunderbird).
+    quotedLines = re.search(r"^(?:\s*>.*\n?)+", text, re.MULTILINE)
+    if quotedLines:
+        cutCandidates.append(quotedLines.start())
+    return text[: min(cutCandidates)].rstrip()
+
+
+def _stripSignature(text: str) -> str:
+    """Cut ``text`` at the earliest signature marker and strip trailing whitespace.
+
+    When no marker matches, the text is returned whole (right-stripped).
+    """
+    firstHits = (marker.search(text) for marker in _SIGNATURE_MARKERS)
+    cutAt = min((hit.start() for hit in firstHits if hit), default=len(text))
+    return text[:cutAt].rstrip()
+
+
+def _collapseWhitespace(text: str) -> str:
+    """Squeeze runs of spaces/tabs to one space and 3+ newlines to a blank line.
+
+    Leading and trailing whitespace of the result is removed.
+    """
+    singleSpaced = re.sub(r"[ \t]+", " ", text)
+    return re.sub(r"\n{3,}", "\n\n", singleSpaced).strip()
+
+
+def cleanEmailBody(html: str, maxChars: Optional[int] = DEFAULT_MAX_CHARS) -> str:
+    """Produce a compact plain-text rendering of an email body for embedding.
+
+    Pipeline: HTML → text (only when the input contains both ``<`` and
+    ``>``), drop the quoted reply chain, drop the signature, collapse
+    whitespace, then truncate. Empty input yields an empty string.
+
+    When ``maxChars`` is truthy and the cleaned text is longer, the text is
+    cut to ``maxChars`` characters, right-stripped and suffixed with ``…``.
+    """
+    if not html:
+        return ""
+    looksLikeMarkup = "<" in html and ">" in html
+    body = _htmlToText(html) if looksLikeMarkup else html
+    for step in (_stripQuotedThread, _stripSignature, _collapseWhitespace):
+        body = step(body)
+    if not maxChars or len(body) <= maxChars:
+        return body
+    return body[:maxChars].rstrip() + "…"
diff --git a/modules/workflows/automation2/executionEngine.py b/modules/workflows/automation2/executionEngine.py
index 1d0ca5c8..55a63281 100644
--- a/modules/workflows/automation2/executionEngine.py
+++ b/modules/workflows/automation2/executionEngine.py
@@ -302,6 +302,30 @@ async def _executeWithRetry(executor, node, context, maxRetries: int = 0, retryD
raise lastError
+def _substituteFeatureInstancePlaceholders(
+    graph: Dict[str, Any],
+    targetFeatureInstanceId: str,
+) -> Dict[str, Any]:
+    """Replace ``{{featureInstanceId}}`` placeholders in the serialised graph.
+
+    Works on the full JSON representation so that placeholders inside nested
+    parameter dicts, prompt strings, etc. are all caught. Already-resolved
+    concrete UUIDs (pre-baked by ``_copyTemplateWorkflows``) are left untouched
+    because the placeholder literal ``{{featureInstanceId}}`` will not match.
+
+    Args:
+        graph: JSON-serialisable workflow graph.
+        targetFeatureInstanceId: value substituted for every placeholder.
+
+    Returns:
+        The same ``graph`` object when it contains no placeholder, otherwise
+        a new graph decoded from the substituted JSON.
+    """
+    import json as _json
+    placeholder = "{{featureInstanceId}}"
+    raw = _json.dumps(graph)
+    occurrences = raw.count(placeholder)
+    if not occurrences:
+        return graph
+    # The id is spliced into JSON string literals, so it must be JSON-escaped
+    # first; a raw quote or backslash would otherwise corrupt the document or
+    # inject structure. dumps() adds surrounding quotes, which are dropped.
+    escapedId = _json.dumps(str(targetFeatureInstanceId))[1:-1]
+    replaced = raw.replace(placeholder, escapedId)
+    logger.debug(
+        "_substituteFeatureInstancePlaceholders: resolved %d occurrence(s) -> %s",
+        occurrences,
+        targetFeatureInstanceId,
+    )
+    return _json.loads(replaced)
+
+
async def executeGraph(
graph: Dict[str, Any],
services: Any,
@@ -315,6 +339,7 @@ async def executeGraph(
runId: Optional[str] = None,
run_envelope: Optional[Dict[str, Any]] = None,
label: Optional[str] = None,
+ targetFeatureInstanceId: Optional[str] = None,
) -> Dict[str, Any]:
"""
Execute automation2 graph. Returns { success, nodeOutputs, error?, stopped? }.
@@ -322,14 +347,16 @@ async def executeGraph(
pauses the run, and returns { success: False, paused: True, taskId, runId }.
For resume: pass initialNodeOutputs (with result for the human node) and startAfterNodeId.
For fresh runs: pass run_envelope (unified start payload for the start node); normalized with userId into context.runEnvelope.
+ targetFeatureInstanceId: resolves {{featureInstanceId}} placeholders in the graph JSON before execution.
"""
logger.info(
- "executeGraph start: instanceId=%s workflowId=%s userId=%s mandateId=%s resume=%s",
+ "executeGraph start: instanceId=%s workflowId=%s userId=%s mandateId=%s resume=%s targetInstance=%s",
instanceId,
workflowId,
userId,
mandateId,
startAfterNodeId is not None,
+ targetFeatureInstanceId,
)
from modules.workflows.processing.shared.methodDiscovery import discoverMethods
discoverMethods(services)
@@ -338,6 +365,9 @@ async def executeGraph(
materializeFeatureInstanceRefs,
)
+ if targetFeatureInstanceId:
+ graph = _substituteFeatureInstancePlaceholders(graph, targetFeatureInstanceId)
+
# Phase-5 Schicht-4: typed-ref envelopes are materialized FIRST so the
# subsequent connection-ref pass and validation see the canonical shape.
graph = materializeFeatureInstanceRefs(graph)
diff --git a/modules/workflows/automation2/executors/actionNodeExecutor.py b/modules/workflows/automation2/executors/actionNodeExecutor.py
index 6162aa2d..163ed3b2 100644
--- a/modules/workflows/automation2/executors/actionNodeExecutor.py
+++ b/modules/workflows/automation2/executors/actionNodeExecutor.py
@@ -377,7 +377,11 @@ class ActionNodeExecutor:
if nodeType.startswith("ai."):
out["prompt"] = promptText
out["response"] = extractedContext
- out["context"] = f"{promptText}\n\n{extractedContext}" if promptText and extractedContext else (extractedContext or promptText)
+ inputContext = resolvedParams.get("context")
+ if inputContext is not None:
+ out["context"] = inputContext if isinstance(inputContext, str) else json.dumps(inputContext, ensure_ascii=False, default=str)
+ else:
+ out["context"] = ""
# Structured output
if extractedContext:
try:
diff --git a/modules/workflows/methods/methodAi/_common.py b/modules/workflows/methods/methodAi/_common.py
new file mode 100644
index 00000000..9e77d431
--- /dev/null
+++ b/modules/workflows/methods/methodAi/_common.py
@@ -0,0 +1,18 @@
+# Copyright (c) 2025 Patrick Motsch
+# All rights reserved.
+
+"""Shared helpers for AI workflow actions."""
+
+
+def applyCommonAiParams(parameters: dict, request) -> None:
+    """Copy the node-level AI settings from ``parameters`` onto ``request``.
+
+    ``requireNeutralization`` is applied (coerced to ``bool``) whenever it is
+    present and not ``None``. ``allowedModels`` is applied only when it is a
+    non-empty list; ``request.options`` is created on demand in that case.
+    """
+    neutralizeFlag = parameters.get("requireNeutralization")
+    if neutralizeFlag is not None:
+        request.requireNeutralization = bool(neutralizeFlag)
+
+    modelWhitelist = parameters.get("allowedModels")
+    if not modelWhitelist or not isinstance(modelWhitelist, list):
+        return
+    if not request.options:
+        # Imported lazily: only needed when the request has no options yet.
+        from modules.datamodels.datamodelAi import AiCallOptions
+        request.options = AiCallOptions()
+    request.options.allowedModels = modelWhitelist
diff --git a/modules/workflows/methods/methodAi/actions/consolidate.py b/modules/workflows/methods/methodAi/actions/consolidate.py
index fa622507..7483507e 100644
--- a/modules/workflows/methods/methodAi/actions/consolidate.py
+++ b/modules/workflows/methods/methodAi/actions/consolidate.py
@@ -67,6 +67,8 @@ async def consolidate(self, parameters: Dict[str, Any]) -> ActionResult:
prompt=prompt,
options=AiCallOptions(operationType=OperationTypeEnum.DATA_ANALYSE),
)
+ from modules.workflows.methods.methodAi._common import applyCommonAiParams
+ applyCommonAiParams(parameters, req)
resp = await ai_service.callAi(req)
except (SubscriptionInactiveException, BillingContextError):
raise
diff --git a/modules/workflows/methods/methodAi/actions/convertDocument.py b/modules/workflows/methods/methodAi/actions/convertDocument.py
index 39d6e16f..b2ed908b 100644
--- a/modules/workflows/methods/methodAi/actions/convertDocument.py
+++ b/modules/workflows/methods/methodAi/actions/convertDocument.py
@@ -36,6 +36,10 @@ async def convertDocument(self, parameters: Dict[str, Any]) -> ActionResult:
}
if parentOperationId:
processParams["parentOperationId"] = parentOperationId
+ if parameters.get("allowedModels"):
+ processParams["allowedModels"] = parameters["allowedModels"]
+ if parameters.get("requireNeutralization") is not None:
+ processParams["requireNeutralization"] = parameters["requireNeutralization"]
return await self.process(processParams)
diff --git a/modules/workflows/methods/methodAi/actions/generateCode.py b/modules/workflows/methods/methodAi/actions/generateCode.py
index 313057a0..5ec6b51d 100644
--- a/modules/workflows/methods/methodAi/actions/generateCode.py
+++ b/modules/workflows/methods/methodAi/actions/generateCode.py
@@ -55,6 +55,16 @@ async def generateCode(self, parameters: Dict[str, Any]) -> ActionResult:
processingMode=ProcessingModeEnum.DETAILED
)
+ # Apply node-level AI params
+ allowedModels = parameters.get("allowedModels")
+ if allowedModels and isinstance(allowedModels, list):
+ options.allowedModels = allowedModels
+ requireNeutralization = parameters.get("requireNeutralization")
+ if requireNeutralization is not None:
+ _ctx = getattr(self.services, '_context', None)
+ if _ctx:
+ _ctx.requireNeutralization = bool(requireNeutralization)
+
# outputFormat: Optional - if None, formats determined from prompt by AI
aiResponse: AiResponse = await self.services.ai.callAiContent(
prompt=prompt,
diff --git a/modules/workflows/methods/methodAi/actions/generateDocument.py b/modules/workflows/methods/methodAi/actions/generateDocument.py
index 0709b924..18c158c1 100644
--- a/modules/workflows/methods/methodAi/actions/generateDocument.py
+++ b/modules/workflows/methods/methodAi/actions/generateDocument.py
@@ -59,6 +59,16 @@ async def generateDocument(self, parameters: Dict[str, Any]) -> ActionResult:
compressContext=False
)
+ # Apply node-level AI params
+ allowedModels = parameters.get("allowedModels")
+ if allowedModels and isinstance(allowedModels, list):
+ options.allowedModels = allowedModels
+ requireNeutralization = parameters.get("requireNeutralization")
+ if requireNeutralization is not None:
+ _ctx = getattr(self.services, '_context', None)
+ if _ctx:
+ _ctx.requireNeutralization = bool(requireNeutralization)
+
# outputFormat: Optional - if None, formats determined from prompt by AI
aiResponse: AiResponse = await self.services.ai.callAiContent(
prompt=prompt,
diff --git a/modules/workflows/methods/methodAi/actions/process.py b/modules/workflows/methods/methodAi/actions/process.py
index 63e0f33e..2af480e7 100644
--- a/modules/workflows/methods/methodAi/actions/process.py
+++ b/modules/workflows/methods/methodAi/actions/process.py
@@ -73,6 +73,49 @@ def _action_docs_to_content_parts(services, docs: List[Any]) -> List[ContentPart
logger.info(f"ai.process: Extracted {len(ec.parts)} parts from {name} (no persistence)")
return all_parts
+def _resolve_file_refs_to_content_parts(services, fileIdRefs) -> List[ContentPart]:
+    """Load the referenced files from the file store and extract their content.
+
+    Used ONLY for automation2 workflows, where documents are file-store
+    references rather than chat message attachments. In the agent/chat
+    context ``DocumentItemReference`` holds ChatDocument IDs, which must be
+    resolved via ``getChatDocumentsFromDocumentList`` instead.
+
+    References whose file metadata or data cannot be loaded are skipped with
+    a warning. Only parts that carry data, or whose ``typeGroup`` is
+    ``"image"``, are returned; each gets ``originalFileName`` set in its
+    metadata unless already present. Returns an empty list when ``services``
+    lacks ``interfaceDbComponent`` or ``extraction``.
+    """
+    from modules.datamodels.datamodelExtraction import ExtractionOptions, MergeStrategy
+
+    fileStore = getattr(services, 'interfaceDbComponent', None)
+    extractor = getattr(services, 'extraction', None)
+    if not (fileStore and extractor):
+        logger.warning("_resolve_file_refs_to_content_parts: missing interfaceDbComponent or extraction service")
+        return []
+
+    collected: List[ContentPart] = []
+    # One options object is shared by all extractions in this call.
+    extractionOptions = ExtractionOptions(prompt="", mergeStrategy=MergeStrategy())
+    for ref in fileIdRefs:
+        fileId = ref.documentId
+        fileMeta = fileStore.getFile(fileId)
+        if not fileMeta:
+            logger.warning(f"_resolve_file_refs_to_content_parts: file {fileId} not found")
+            continue
+        fileData = fileStore.getFileData(fileId)
+        if not fileData:
+            logger.warning(f"_resolve_file_refs_to_content_parts: no data for file {fileId}")
+            continue
+        fileName = getattr(fileMeta, 'fileName', fileId)
+        ec = extractor.extractContentFromBytes(
+            documentBytes=fileData,
+            fileName=fileName,
+            mimeType=getattr(fileMeta, 'mimeType', 'application/octet-stream'),
+            documentId=fileId,
+            options=extractionOptions,
+        )
+        for part in ec.parts:
+            isImage = getattr(part, "typeGroup", "") == "image"
+            if not (part.data or isImage):
+                continue
+            part.metadata.setdefault("originalFileName", fileName)
+            collected.append(part)
+        logger.info(f"_resolve_file_refs_to_content_parts: extracted {len(ec.parts)} parts from {fileName}")
+    return collected
+
+
async def process(self, parameters: Dict[str, Any]) -> ActionResult:
operationId = None
try:
@@ -129,6 +172,25 @@ async def process(self, parameters: Dict[str, Any]) -> ActionResult:
f"ai.process: Coerced documentList ({type(documentListParam).__name__}) "
f"to DocumentReferenceList with {len(documentList.references)} references"
)
+
+ # DocumentItemReferences carry either file-store IDs (automation2)
+ # or ChatDocument IDs (agent context with docItem: refs).
+ # Route based on context: if a chat workflow with messages exists,
+ # let getChatDocumentsFromDocumentList handle them (it resolves
+ # docItem:uuid via workflow.messages). Otherwise fall through to
+ # the file-store path for automation2.
+ from modules.datamodels.datamodelDocref import DocumentItemReference
+ fileIdRefs = [r for r in documentList.references if isinstance(r, DocumentItemReference)]
+ if fileIdRefs:
+ chatService = getattr(self.services, 'chat', None)
+ workflow = getattr(chatService, '_workflow', None) if chatService else None
+ hasChatContext = workflow and getattr(workflow, 'messages', None)
+ if not hasChatContext:
+ extractedParts = _resolve_file_refs_to_content_parts(self.services, fileIdRefs)
+ if extractedParts:
+ inline_content_parts = (inline_content_parts or []) + extractedParts
+ remaining = [r for r in documentList.references if not isinstance(r, DocumentItemReference)]
+ documentList = DocumentReferenceList(references=remaining)
# Optional: if omitted, formats determined from prompt. Default "txt" is validation fallback only.
resultType = parameters.get("resultType")
@@ -157,7 +219,19 @@ async def process(self, parameters: Dict[str, Any]) -> ActionResult:
mimeMap = {"txt": "text/plain", "json": "application/json", "html": "text/html", "md": "text/markdown", "csv": "text/csv", "xml": "application/xml"}
output_mime_type = mimeMap.get(normalized_result_type, "text/plain") if normalized_result_type else "text/plain"
-
+
+ # Normalize context: workflow refs may resolve to dict/list instead of str
+ paramContext = parameters.get("context")
+ if paramContext is not None and not isinstance(paramContext, str):
+ try:
+ paramContext = json.dumps(paramContext, ensure_ascii=False, default=str)
+ parameters["context"] = paramContext
+ logger.info(f"ai.process: Serialized non-string context ({type(parameters.get('context')).__name__}) to JSON ({len(paramContext)} chars)")
+ except Exception as e:
+ logger.warning(f"ai.process: Failed to serialize context: {e}")
+ paramContext = str(paramContext)
+ parameters["context"] = paramContext
+
# Phase 7.3: Pass documentList and/or contentParts to AI service
contentParts: Optional[List[ContentPart]] = inline_content_parts
if "contentParts" in parameters and not inline_content_parts:
@@ -212,6 +286,9 @@ async def process(self, parameters: Dict[str, Any]) -> ActionResult:
)
)
+ from modules.workflows.methods.methodAi._common import applyCommonAiParams
+ applyCommonAiParams(parameters, request)
+
aiResponse_obj = await self.services.ai.callAi(request)
# Convert AiCallResponse to AiResponse format
@@ -243,6 +320,16 @@ async def process(self, parameters: Dict[str, Any]) -> ActionResult:
operationType=OperationTypeEnum.IMAGE_GENERATE if isImageGeneration else OperationTypeEnum.DATA_GENERATE
)
+ # Apply node-level AI params (allowedModels, requireNeutralization)
+ allowedModels = parameters.get("allowedModels")
+ if allowedModels and isinstance(allowedModels, list):
+ options.allowedModels = allowedModels
+ requireNeutralization = parameters.get("requireNeutralization")
+ if requireNeutralization is not None:
+ _ctx = getattr(self.services, '_context', None)
+ if _ctx:
+ _ctx.requireNeutralization = bool(requireNeutralization)
+
# Get generationIntent from parameters (required for DATA_GENERATE)
# Default to "document" if not provided (most common use case)
# For code generation, use ai.generateCode action or explicitly pass generationIntent="code"
diff --git a/modules/workflows/methods/methodAi/actions/summarizeDocument.py b/modules/workflows/methods/methodAi/actions/summarizeDocument.py
index e32c1965..4c2bb2bc 100644
--- a/modules/workflows/methods/methodAi/actions/summarizeDocument.py
+++ b/modules/workflows/methods/methodAi/actions/summarizeDocument.py
@@ -39,6 +39,10 @@ async def summarizeDocument(self, parameters: Dict[str, Any]) -> ActionResult:
}
if parentOperationId:
processParams["parentOperationId"] = parentOperationId
+ if parameters.get("allowedModels"):
+ processParams["allowedModels"] = parameters["allowedModels"]
+ if parameters.get("requireNeutralization") is not None:
+ processParams["requireNeutralization"] = parameters["requireNeutralization"]
return await self.process(processParams)
diff --git a/modules/workflows/methods/methodAi/actions/translateDocument.py b/modules/workflows/methods/methodAi/actions/translateDocument.py
index bb6f8437..dc0533a9 100644
--- a/modules/workflows/methods/methodAi/actions/translateDocument.py
+++ b/modules/workflows/methods/methodAi/actions/translateDocument.py
@@ -41,6 +41,10 @@ async def translateDocument(self, parameters: Dict[str, Any]) -> ActionResult:
processParams["resultType"] = resultType
if parentOperationId:
processParams["parentOperationId"] = parentOperationId
+ if parameters.get("allowedModels"):
+ processParams["allowedModels"] = parameters["allowedModels"]
+ if parameters.get("requireNeutralization") is not None:
+ processParams["requireNeutralization"] = parameters["requireNeutralization"]
return await self.process(processParams)
diff --git a/modules/workflows/methods/methodAi/methodAi.py b/modules/workflows/methods/methodAi/methodAi.py
index 5265f5c9..ecd60b12 100644
--- a/modules/workflows/methods/methodAi/methodAi.py
+++ b/modules/workflows/methods/methodAi/methodAi.py
@@ -56,6 +56,23 @@ class MethodAi(MethodBase):
required=False,
description="Document reference(s) in any format to use as input/context"
),
+ "context": WorkflowActionParameter(
+ name="context",
+ type="str",
+ frontendType=FrontendType.TEXTAREA,
+ required=False,
+ default="",
+ description="Additional context data (string or upstream-bound dict/list, e.g. accounting data) appended to the prompt. Non-string values are JSON-serialized."
+ ),
+ "documentTheme": WorkflowActionParameter(
+ name="documentTheme",
+ type="str",
+ frontendType=FrontendType.SELECT,
+ frontendOptions=["general", "finance", "legal", "technical", "hr"],
+ required=False,
+ default="general",
+ description="Style hint for the document renderer (e.g. finance, legal). Used by the AI agent to choose colors and layout."
+ ),
"resultType": WorkflowActionParameter(
name="resultType",
type="str",
diff --git a/modules/workflows/methods/methodClickup/actions/list_tasks.py b/modules/workflows/methods/methodClickup/actions/list_tasks.py
index 4caf9e31..9ae57f94 100644
--- a/modules/workflows/methods/methodClickup/actions/list_tasks.py
+++ b/modules/workflows/methods/methodClickup/actions/list_tasks.py
@@ -31,8 +31,30 @@ async def list_tasks(self, parameters: Dict[str, Any]) -> ActionResult:
page = int(parameters.get("page") or 0)
include_closed = bool(parameters.get("includeClosed", False))
+
+ dateFilters = {}
+ for key in ("dateCreatedGt", "dateCreatedLt", "dateUpdatedGt", "dateUpdatedLt"):
+ val = parameters.get(key)
+ if val is not None and str(val).strip():
+ try:
+ dateFilters[key] = int(val)
+ except (ValueError, TypeError):
+ pass
+
+ rawCustomFields = parameters.get("customFields")
+ customFields = None
+ if rawCustomFields:
+ if isinstance(rawCustomFields, str):
+ try:
+ customFields = json.loads(rawCustomFields)
+ except json.JSONDecodeError:
+ return ActionResult.isFailure(error="customFields must be valid JSON array")
+ elif isinstance(rawCustomFields, list):
+ customFields = rawCustomFields
+
data = await self.services.clickup.getTasksInList(
- list_id, page=page, include_closed=include_closed, subtasks=True
+ list_id, page=page, include_closed=include_closed, subtasks=True,
+ **dateFilters, customFields=customFields,
)
if isinstance(data, dict) and data.get("error"):
return ActionResult.isFailure(error=str(data.get("error")) + (data.get("body") or ""))
diff --git a/modules/workflows/methods/methodClickup/methodClickup.py b/modules/workflows/methods/methodClickup/methodClickup.py
index 17f42300..725929dd 100644
--- a/modules/workflows/methods/methodClickup/methodClickup.py
+++ b/modules/workflows/methods/methodClickup/methodClickup.py
@@ -66,6 +66,41 @@ class MethodClickup(MethodBase):
default=False,
description="Include closed tasks",
),
+ "dateCreatedGt": WorkflowActionParameter(
+ name="dateCreatedGt",
+ type="int",
+ frontendType=FrontendType.NUMBER,
+ required=False,
+ description="Filter: created after this Unix ms timestamp",
+ ),
+ "dateCreatedLt": WorkflowActionParameter(
+ name="dateCreatedLt",
+ type="int",
+ frontendType=FrontendType.NUMBER,
+ required=False,
+ description="Filter: created before this Unix ms timestamp",
+ ),
+ "dateUpdatedGt": WorkflowActionParameter(
+ name="dateUpdatedGt",
+ type="int",
+ frontendType=FrontendType.NUMBER,
+ required=False,
+ description="Filter: updated after this Unix ms timestamp",
+ ),
+ "dateUpdatedLt": WorkflowActionParameter(
+ name="dateUpdatedLt",
+ type="int",
+ frontendType=FrontendType.NUMBER,
+ required=False,
+ description="Filter: updated before this Unix ms timestamp",
+ ),
+ "customFields": WorkflowActionParameter(
+ name="customFields",
+ type="str",
+ frontendType=FrontendType.TEXTAREA,
+ required=False,
+ description='JSON array of custom field filters per ClickUp API, e.g. [{"field_id":"abc","operator":"=","value":"123"}]',
+ ),
},
execute=list_tasks.__get__(self, self.__class__),
),
diff --git a/modules/workflows/scheduler/mainScheduler.py b/modules/workflows/scheduler/mainScheduler.py
index bf2cd0fd..0dce2ec5 100644
--- a/modules/workflows/scheduler/mainScheduler.py
+++ b/modules/workflows/scheduler/mainScheduler.py
@@ -243,6 +243,7 @@ class WorkflowScheduler:
runEnv = normalize_run_envelope(runEnv, user_id=str(eventUser.id) if eventUser else None)
_wfLabel = wf.get("label") if isinstance(wf, dict) else getattr(wf, "label", None)
+ _targetInstanceId = wf.get("targetFeatureInstanceId") if isinstance(wf, dict) else getattr(wf, "targetFeatureInstanceId", None)
result = await executeGraph(
graph=wf["graph"],
@@ -254,6 +255,7 @@ class WorkflowScheduler:
automation2_interface=iface,
run_envelope=runEnv,
label=_wfLabel,
+ targetFeatureInstanceId=_targetInstanceId,
)
logger.info(
"WorkflowScheduler: executed workflow %s success=%s paused=%s",
diff --git a/scripts/_archive/README.md b/scripts/_archive/README.md
new file mode 100644
index 00000000..dba3deef
--- /dev/null
+++ b/scripts/_archive/README.md
@@ -0,0 +1,19 @@
+# Archived one-shot scripts
+
+Diese Scripts haben einmal eine konkrete Daten- oder Code-Migration ausgefuehrt
+und werden nicht mehr aktiv aufgerufen. Sie bleiben hier liegen, falls jemand
+spaeter auf einem alten DB-Dump oder einem alten Branch nochmal denselben Stand
+herstellen muss.
+
+KEIN aktives Tool. Nicht aus CI, nicht aus Docs verlinken. Bei Aufraeumarbeiten
+(z.B. nach 6 Monaten ohne Anwendung) loeschen.
+
+## Inhalt
+
+| Datei | Migrationsthema | Archiviert am | Begruendung |
+|-------|-----------------|---------------|-------------|
+| `check_orphan_featureinstance.py` | Vor-Ort-Check mit hardcoded FeatureInstance-/Mandate-UUIDs | 2026-04-29 | Ad-hoc fuer einen konkreten Vorfall |
+| `script_db_cleanup_duplicate_roles.py` | Cleanup doppelter Roles wegen `IS NULL`-Bug in `connectorDbPostgre` | 2026-04-29 | Bug ist laengst gefixt, Cleanup ueberall durchgelaufen |
+| `migrate_async_to_sync.py` | One-shot Codemod `async def` -> `def` fuer FastAPI-Routes | 2026-04-29 | Refactor abgeschlossen |
+| `i18n_rekey_plaintext_keys.py` | Frontend `t('dot.notation')` -> `t('Klartext')` Rekey | 2026-04-29 | Frontend-Migration abgeschlossen (siehe `wiki/c-work/4-done/2026-04-ui-i18n-dynamic-language-sets.md`) |
+| `script_db_migrate_accessrules_objectkeys.py` | AccessRule-Items: kurz -> vollqualifiziert (Navigation-API) | 2026-04-29 | Navigation-API live, MIGRATION_MAP nur fuer trustee+realestate hardcoded |
diff --git a/scripts/check_orphan_featureinstance.py b/scripts/_archive/check_orphan_featureinstance.py
similarity index 100%
rename from scripts/check_orphan_featureinstance.py
rename to scripts/_archive/check_orphan_featureinstance.py
diff --git a/scripts/i18n_rekey_plaintext_keys.py b/scripts/_archive/i18n_rekey_plaintext_keys.py
similarity index 100%
rename from scripts/i18n_rekey_plaintext_keys.py
rename to scripts/_archive/i18n_rekey_plaintext_keys.py
diff --git a/scripts/migrate_async_to_sync.py b/scripts/_archive/migrate_async_to_sync.py
similarity index 100%
rename from scripts/migrate_async_to_sync.py
rename to scripts/_archive/migrate_async_to_sync.py
diff --git a/scripts/script_db_cleanup_duplicate_roles.py b/scripts/_archive/script_db_cleanup_duplicate_roles.py
similarity index 100%
rename from scripts/script_db_cleanup_duplicate_roles.py
rename to scripts/_archive/script_db_cleanup_duplicate_roles.py
diff --git a/scripts/script_db_migrate_accessrules_objectkeys.py b/scripts/_archive/script_db_migrate_accessrules_objectkeys.py
similarity index 100%
rename from scripts/script_db_migrate_accessrules_objectkeys.py
rename to scripts/_archive/script_db_migrate_accessrules_objectkeys.py
diff --git a/scripts/_listMandates.py b/scripts/_listMandates.py
deleted file mode 100644
index cf3e9bd2..00000000
--- a/scripts/_listMandates.py
+++ /dev/null
@@ -1,25 +0,0 @@
-import sys
-from pathlib import Path
-sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
-import psycopg2, psycopg2.extras
-from modules.shared.configuration import APP_CONFIG
-
-c = psycopg2.connect(
- host=APP_CONFIG.get('DB_HOST','localhost'),
- user=APP_CONFIG.get('DB_USER'),
- password=APP_CONFIG.get('DB_PASSWORD_SECRET'),
- port=int(APP_CONFIG.get('DB_PORT',5432)),
- dbname='poweron_app',
-)
-cur = c.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
-cur.execute('SELECT id, name, label, enabled, "deletedAt", "sysCreatedAt" FROM "Mandate" ORDER BY "sysCreatedAt"')
-print("All Mandates in poweron_app:")
-for r in cur.fetchall():
- print(f" id={r['id']} name={r['name']} label={r['label']} enabled={r['enabled']} deletedAt={r['deletedAt']}")
-
-cur.execute('SELECT COUNT(*) AS n FROM "FeatureInstance" WHERE "featureCode" = %s', ("redmine",))
-print(f"\nTotal redmine FeatureInstances in poweron_app: {cur.fetchone()['n']}")
-
-cur.execute('SELECT id, "mandateId", label, enabled FROM "FeatureInstance" WHERE "featureCode" = %s ORDER BY "sysCreatedAt"', ("redmine",))
-for r in cur.fetchall():
- print(f" fi={r['id']} mandate={r['mandateId']} label={r['label']} enabled={r['enabled']}")
diff --git a/scripts/script_db_audit_legacy_state.py b/scripts/script_db_audit_legacy_state.py
new file mode 100644
index 00000000..f51a132a
--- /dev/null
+++ b/scripts/script_db_audit_legacy_state.py
@@ -0,0 +1,382 @@
+#!/usr/bin/env python3
+"""Audit-Skript fuer Legacy-Bestaende vor Bootstrap-Cleanup (Plan C).
+
+Prueft fuer jede der 5 Bootstrap-Migrationsroutinen, ob noch Restbestand
+existiert. Wenn alle Checks 0 / GREEN liefern, kann die jeweilige Routine
+sicher aus ``interfaceBootstrap.py`` / ``interfaceDbKnowledge.py`` entfernt
+werden.
+
+Checks:
+ 1. Mandate.description != NULL und Mandate.label leer
+ -> _migrateMandateDescriptionToLabel
+ 2. Mandate.label leer ODER Mandate.name verstoesst gegen Slug-Regeln
+ -> _migrateMandateNameLabelSlugRules
+  3. Mandate mit name='Root' ODER name='root' mit isSystem=False
+ -> initRootMandate Legacy-Zweig
+ 4. Role mit roleLabel='sysadmin' im Root-Mandat
+ -> _migrateAndDropSysAdminRole
+ 5. FileContentIndex mit leerem mandateId UND leerem featureInstanceId
+ -> aggregateMandateRagTotalBytes Fallback-Block
+
+Verwendung:
+ python -m scripts.script_db_audit_legacy_state # text-output
+ python -m scripts.script_db_audit_legacy_state --json # JSON-output
+ python -m scripts.script_db_audit_legacy_state --purge-rag-orphans
+ # loescht FileContentIndex-Rows ohne mandateId UND ohne featureInstanceId
+ # (Voraussetzung fuer Removal des aggregateMandateRagTotalBytes-Fallback)
+
+Exit-Code:
+ 0 alle Checks GREEN (Removal sicher)
+ 1 mind. ein Check RED (erst Daten bereinigen)
+ 2 Skript-Fehler (DB nicht erreichbar etc.)
+
+Lese-Zugriffe sind die Default. Schreibzugriffe NUR mit explizitem
+``--purge-*``-Flag.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import logging
+import os
+import sys
+from dataclasses import dataclass, field
+from typing import Any, Callable, Dict, List, Optional
+
+
+_gatewayDir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+if _gatewayDir not in sys.path:
+ sys.path.insert(0, _gatewayDir)
+
+from dotenv import load_dotenv
+
+_envPath = os.path.join(_gatewayDir, "env_dev.env")
+if os.path.exists(_envPath):
+ load_dotenv(_envPath)
+
+from modules.datamodels.datamodelUam import Mandate
+from modules.datamodels.datamodelRbac import Role
+from modules.datamodels.datamodelKnowledge import FileContentIndex
+from modules.security.rootAccess import getRootDbAppConnector
+from modules.interfaces.interfaceDbKnowledge import KnowledgeObjects
+from modules.shared.mandateNameUtils import isValidMandateName
+
+logging.basicConfig(level=logging.WARNING, format="%(message)s")
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class _CheckResult:
+    """Outcome of a single audit check.
+
+    ``count`` is the number of legacy rows found (the runner stores ``-1``
+    when the check itself raised), ``status`` is ``GREEN``, ``RED`` or
+    ``ERROR``, ``samples`` holds a few offending rows for display and
+    ``error`` carries an optional note or exception text.
+    """
+
+    name: str
+    routine: str
+    location: str
+    count: int
+    status: str
+    samples: List[Dict[str, Any]] = field(default_factory=list)
+    error: Optional[str] = None
+
+    def toDict(self) -> Dict[str, Any]:
+        """Return all fields as a plain dict, in declaration order."""
+        fieldNames = ("name", "routine", "location", "count", "status", "samples", "error")
+        return {fieldName: getattr(self, fieldName) for fieldName in fieldNames}
+
+
+def _getAppDb():
+    """Return the connector provided by ``getRootDbAppConnector()``."""
+    appConnector = getRootDbAppConnector()
+    return appConnector
+
+
+def _getKnowledgeDb():
+    """Return the ``db`` attribute of a newly constructed ``KnowledgeObjects``."""
+    knowledgeObjects = KnowledgeObjects()
+    return knowledgeObjects.db
+
+
+def _checkMandateDescription(db) -> _CheckResult:
+    """Check for mandates that still have a description but no label.
+
+    Args:
+        db: Connector whose ``getRecordset(Mandate)`` yields dict-like rows.
+
+    Returns:
+        A result that is RED (with up to five sample rows) when such
+        mandates exist and GREEN otherwise.
+    """
+    offenders = []
+    for row in db.getRecordset(Mandate):
+        description = row.get("description")
+        # Only rows with a description and without a label count as legacy.
+        if not description or row.get("label"):
+            continue
+        offenders.append(
+            {
+                "id": row.get("id"),
+                "name": row.get("name"),
+                # Truncated to 60 characters to keep the sample output short.
+                "description": str(description)[:60],
+                "label": row.get("label"),
+            }
+        )
+    return _CheckResult(
+        name="mandate-description-to-label",
+        routine="_migrateMandateDescriptionToLabel",
+        location="interfaces/interfaceBootstrap.py:422-445",
+        count=len(offenders),
+        status="RED" if offenders else "GREEN",
+        samples=offenders[:5],
+    )
+
+
+def _checkMandateSlugRules(db) -> _CheckResult:
+    """Check for mandates with an empty label, an invalid name or a duplicate name.
+
+    Rows are processed in order of their stringified ``id``. A valid name is
+    remembered the first time it is seen; any later row with the same name
+    is flagged as a collision.
+
+    Args:
+        db: Connector whose ``getRecordset(Mandate)`` yields dict-like rows.
+
+    Returns:
+        A result that is RED (with up to five sample rows) when offending
+        mandates exist and GREEN otherwise.
+    """
+    ordered = list(db.getRecordset(Mandate))
+    ordered.sort(key=lambda row: str(row.get("id", "")))
+
+    knownNames: set[str] = set()
+    offenders = []
+    for row in ordered:
+        name = (row.get("name") or "").strip()
+        # A missing label and a whitespace-only label are both "empty".
+        labelEmpty = not (row.get("label") or "").strip()
+        nameInvalid = not isValidMandateName(name)
+        nameCollides = name in knownNames
+        if not (nameInvalid or nameCollides):
+            knownNames.add(name)
+        if not (labelEmpty or nameInvalid or nameCollides):
+            continue
+        offenders.append(
+            {
+                "id": row.get("id"),
+                "name": name,
+                "label": row.get("label"),
+                "labelEmpty": labelEmpty,
+                "nameInvalid": nameInvalid,
+                "nameCollides": nameCollides,
+            }
+        )
+
+    return _CheckResult(
+        name="mandate-name-slug-rules",
+        routine="_migrateMandateNameLabelSlugRules",
+        location="interfaces/interfaceBootstrap.py:448-511",
+        count=len(offenders),
+        status="RED" if offenders else "GREEN",
+        samples=offenders[:5],
+    )
+
+
+def _checkRootMandateLegacy(db) -> _CheckResult:
+    """Check for legacy root mandates.
+
+    A row counts as legacy when its name is exactly ``Root`` or when its
+    name is ``root`` and ``isSystem`` is falsy.
+
+    Args:
+        db: Connector whose ``getRecordset(Mandate, recordFilter=...)``
+            yields dict-like rows.
+
+    Returns:
+        A result that is RED (with up to five sample rows) when legacy rows
+        exist and GREEN otherwise.
+    """
+    capitalised = db.getRecordset(Mandate, recordFilter={"name": "Root"})
+    lowercase = db.getRecordset(Mandate, recordFilter={"name": "root"})
+
+    def _asSample(row):
+        return {
+            "id": row.get("id"),
+            "name": row.get("name"),
+            "isSystem": row.get("isSystem"),
+        }
+
+    found = [_asSample(row) for row in capitalised]
+    for row in lowercase:
+        if row.get("isSystem"):
+            continue
+        found.append(_asSample(row))
+
+    return _CheckResult(
+        name="root-mandate-legacy",
+        routine="initRootMandate-legacy-branch",
+        location="interfaces/interfaceBootstrap.py:406-412",
+        count=len(found),
+        status="RED" if found else "GREEN",
+        samples=found[:5],
+    )
+
+
+def _checkSysadminRole(db) -> _CheckResult:
+    """Check for a legacy ``sysadmin`` role in the root mandate.
+
+    The root mandate is looked up by ``name == "root"`` and
+    ``isSystem == True``. When none is found the check is reported GREEN
+    with an explanatory note instead of being evaluated.
+
+    Args:
+        db: Connector whose ``getRecordset(model, recordFilter=...)``
+            yields dict-like rows.
+
+    Returns:
+        A result that is RED (with up to five sample rows) when matching
+        roles exist and GREEN otherwise.
+    """
+    meta = {
+        "name": "sysadmin-role",
+        "routine": "_migrateAndDropSysAdminRole",
+        "location": "interfaces/interfaceBootstrap.py:840-932",
+    }
+
+    rootMandates = db.getRecordset(Mandate, recordFilter={"name": "root", "isSystem": True})
+    if not rootMandates:
+        return _CheckResult(
+            count=0,
+            status="GREEN",
+            samples=[],
+            error="kein Root-Mandat gefunden -- Check uebersprungen (kann nicht relevant sein)",
+            **meta,
+        )
+
+    # Only mandate-level roles of the first root mandate are considered.
+    roleFilter = {
+        "roleLabel": "sysadmin",
+        "mandateId": str(rootMandates[0].get("id")),
+        "featureInstanceId": None,
+    }
+    found = []
+    for role in db.getRecordset(Role, recordFilter=roleFilter):
+        found.append({"id": role.get("id"), "roleLabel": role.get("roleLabel")})
+
+    return _CheckResult(
+        count=len(found),
+        status="RED" if found else "GREEN",
+        samples=found[:5],
+        **meta,
+    )
+
+
+def _checkRagFallback(knowDb) -> _CheckResult:
+    """Check for FileContentIndex rows with neither mandateId nor featureInstanceId.
+
+    A field counts as missing when it is falsy or whitespace-only.
+
+    Args:
+        knowDb: Connector whose ``getRecordset(FileContentIndex)`` yields
+            dict-like rows.
+
+    Returns:
+        A result that is RED (with up to five sample rows) when such rows
+        exist and GREEN otherwise.
+    """
+    orphans = []
+    for row in knowDb.getRecordset(FileContentIndex):
+        if (row.get("mandateId") or "").strip():
+            continue
+        if (row.get("featureInstanceId") or "").strip():
+            continue
+        orphans.append(
+            {
+                "id": row.get("id"),
+                "fileName": row.get("fileName"),
+                "totalSize": row.get("totalSize"),
+            }
+        )
+
+    return _CheckResult(
+        name="rag-fallback-orphan-index",
+        routine="aggregateMandateRagTotalBytes-fallback",
+        location="interfaces/interfaceDbKnowledge.py:609-635",
+        count=len(orphans),
+        status="RED" if orphans else "GREEN",
+        samples=orphans[:5],
+    )
+
+
+def _runChecks() -> List[_CheckResult]:
+    """Run every audit check and return one result per check.
+
+    A check that raises is recorded as an ``ERROR`` result with ``count=-1``
+    and the exception text, so the remaining checks still run. This includes
+    a failure to obtain the knowledge DB connector, which only affects the
+    RAG check.
+
+    Returns:
+        Results of the four app-DB checks followed by the RAG check.
+
+    Raises:
+        Exception: Whatever ``_getAppDb()`` raises; without the app DB none
+            of the app checks can run.
+    """
+    appDb = _getAppDb()
+
+    appChecks: List[Callable[[Any], _CheckResult]] = [
+        _checkMandateDescription,
+        _checkMandateSlugRules,
+        _checkRootMandateLegacy,
+        _checkSysadminRole,
+    ]
+
+    results: List[_CheckResult] = []
+    for fn in appChecks:
+        try:
+            results.append(fn(appDb))
+        except Exception as exc:
+            results.append(
+                _CheckResult(
+                    name=fn.__name__,
+                    routine="?",
+                    location="?",
+                    count=-1,
+                    status="ERROR",
+                    error=f"{type(exc).__name__}: {exc}",
+                )
+            )
+
+    # The knowledge DB is opened inside the try block so that a connection
+    # problem there becomes an ERROR result for this check instead of
+    # discarding the app-DB results collected above.
+    try:
+        results.append(_checkRagFallback(_getKnowledgeDb()))
+    except Exception as exc:
+        results.append(
+            _CheckResult(
+                name="rag-fallback-orphan-index",
+                routine="aggregateMandateRagTotalBytes-fallback",
+                location="interfaces/interfaceDbKnowledge.py:609-635",
+                count=-1,
+                status="ERROR",
+                error=f"{type(exc).__name__}: {exc}",
+            )
+        )
+
+    return results
+
+
+def _printText(results: List[_CheckResult]) -> None:
+    """Print a human-readable report of all check results to stdout.
+
+    Each result is printed with a status marker, its metadata, an optional
+    note and its samples. A summary line with the number of GREEN, RED and
+    ERROR results and a verdict follow.
+    """
+    ruler = "=" * 78
+    markers = {
+        "GREEN": "[OK]",
+        "RED": "[!!]",
+        "ERROR": "[ERR]",
+    }
+    tally = {"GREEN": 0, "RED": 0, "ERROR": 0}
+
+    print(ruler)
+    print("BOOTSTRAP-MIGRATIONS LEGACY-STATE-AUDIT")
+    print(ruler)
+    for res in results:
+        if res.status in tally:
+            tally[res.status] += 1
+        tag = markers.get(res.status, "[?]")
+        print(f"\n{tag} {res.name}")
+        print(f" Routine : {res.routine}")
+        print(f" Location: {res.location}")
+        print(f" Count : {res.count}")
+        print(f" Status : {res.status}")
+        if res.error:
+            print(f" Note : {res.error}")
+        if res.samples:
+            print(" Samples : (max 5)")
+            for sample in res.samples:
+                print(f" {sample}")
+
+    greens, reds, errs = tally["GREEN"], tally["RED"], tally["ERROR"]
+    print("\n" + ruler)
+    print(f"SUMMARY: {greens} GREEN {reds} RED {errs} ERROR ({len(results)} total)")
+    # Errors take precedence over RED results in the verdict.
+    if errs:
+        print("VERDICT: Audit unvollstaendig (Fehler) -- bitte Skript fixen.")
+    elif reds:
+        print("VERDICT: erst Daten bereinigen, dann Routinen entfernen.")
+    else:
+        print("VERDICT: alle Migrationsroutinen koennen entfernt werden.")
+    print(ruler)
+
+
+def _purgeRagOrphans() -> int:
+    """Delete every FileContentIndex row lacking both mandateId and featureInstanceId.
+
+    Progress is printed to stdout. A failed delete is reported on stderr and
+    the remaining rows are still processed.
+
+    Returns:
+        Number of rows that were deleted.
+    """
+    knowDb = _getKnowledgeDb()
+
+    def _isBlank(value) -> bool:
+        return not (value or "").strip()
+
+    orphans = []
+    for row in knowDb.getRecordset(FileContentIndex):
+        if _isBlank(row.get("mandateId")) and _isBlank(row.get("featureInstanceId")):
+            orphans.append(row)
+
+    total = len(orphans)
+    if total == 0:
+        print("Keine RAG-Orphans gefunden -- nichts zu purgen.")
+        return 0
+
+    print(f"Purge {total} RAG-Orphan(s):")
+    removed = 0
+    for row in orphans:
+        rowId = row.get("id")
+        try:
+            knowDb.recordDelete(FileContentIndex, str(rowId))
+            removed += 1
+            print(f" geloescht: {rowId} {row.get('fileName')}")
+        except Exception as exc:
+            print(f" FEHLER {rowId}: {type(exc).__name__}: {exc}", file=sys.stderr)
+    print(f"Purge abgeschlossen: {removed}/{total} geloescht.")
+    return removed
+
+
+def main() -> int:
+    """Parse the command line, optionally purge RAG orphans, then run the audit.
+
+    With ``--purge-rag-orphans`` the purge runs before the checks. In
+    ``--json`` mode its progress output is sent to stderr so that stdout
+    contains nothing but the JSON document.
+
+    Returns:
+        0 when all checks are GREEN, 1 when at least one check is RED, and
+        2 when a check reported ERROR or the purge/audit could not run.
+    """
+    import contextlib
+
+    parser = argparse.ArgumentParser(
+        description="Audit-Skript fuer Legacy-Bestaende (Bootstrap-Cleanup Plan C)"
+    )
+    parser.add_argument("--json", action="store_true", help="JSON-Output statt Text")
+    parser.add_argument(
+        "--purge-rag-orphans",
+        action="store_true",
+        help="WRITE: loescht FileContentIndex-Rows ohne mandateId UND featureInstanceId",
+    )
+    args = parser.parse_args()
+
+    if args.purge_rag_orphans:
+        # Keep stdout machine-readable: purge progress goes to stderr in JSON mode.
+        purgeOutput = (
+            contextlib.redirect_stdout(sys.stderr) if args.json else contextlib.nullcontext()
+        )
+        try:
+            with purgeOutput:
+                _purgeRagOrphans()
+        except Exception as exc:
+            print(f"FATAL: Purge fehlgeschlagen -- {type(exc).__name__}: {exc}", file=sys.stderr)
+            return 2
+        if not args.json:
+            # Blank separator between purge log and text report.
+            print()
+
+    try:
+        results = _runChecks()
+    except Exception as exc:
+        print(f"FATAL: konnte Audit nicht starten -- {type(exc).__name__}: {exc}", file=sys.stderr)
+        return 2
+
+    if args.json:
+        print(json.dumps([r.toDict() for r in results], indent=2, default=str))
+    else:
+        _printText(results)
+
+    if any(r.status == "ERROR" for r in results):
+        return 2
+    if any(r.status == "RED" for r in results):
+        return 1
+    return 0
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/scripts/stage0_filefolder_schema_check.py b/scripts/stage0_filefolder_schema_check.py
new file mode 100644
index 00000000..861d8671
--- /dev/null
+++ b/scripts/stage0_filefolder_schema_check.py
@@ -0,0 +1,58 @@
+"""Stage 0: verify FileFolder table + FileItem.folderId column in management DB.
+
+Run from the gateway directory (same as uvicorn):
+ python -m scripts.stage0_filefolder_schema_check
+"""
+from modules.connectors.connectorDbPostgre import getCachedConnector
+from modules.shared.configuration import APP_CONFIG
+
+managementDatabase = "poweron_management"
+
+# Connection settings are read from the application configuration.
+dbHost = APP_CONFIG.get("DB_HOST", "_no_config_default_data")
+dbUser = APP_CONFIG.get("DB_USER")
+dbPassword = APP_CONFIG.get("DB_PASSWORD_SECRET")
+dbPort = int(APP_CONFIG.get("DB_PORT", 5432))
+
+c = getCachedConnector(
+    dbHost=dbHost,
+    dbDatabase=managementDatabase,
+    dbUser=dbUser,
+    dbPassword=dbPassword,
+    dbPort=dbPort,
+    userId=None,
+)
+# Without a usable connection there is nothing to inspect: report and exit with code 2.
+if not (c and c.connection):
+    print("STAGE0: DB_CONNECTION=none (check config.ini / .env)")
+    raise SystemExit(2)
+
+cur = c.connection.cursor()
+
+
+def _scalar(cur):
+    """Fetch the next row from *cur* and return its first column.
+
+    Handles dict rows (first value) as well as sequence rows (index 0) and
+    returns None when no row is left.
+    """
+    fetched = cur.fetchone()
+    if fetched is None:
+        return None
+    return next(iter(fetched.values())) if isinstance(fetched, dict) else fetched[0]
+
+
+# Probe the schema. The cursor is closed even when one of the queries raises.
+try:
+    # Does a table named FileFolder exist?
+    cur.execute(
+        """
+        SELECT EXISTS (
+            SELECT 1 FROM information_schema.tables
+            WHERE table_name = 'FileFolder'
+        ) AS ok
+        """
+    )
+    print("STAGE0: FileFolder_table=", _scalar(cur))
+    # Does FileItem have a folderId column?
+    cur.execute(
+        """
+        SELECT EXISTS (
+            SELECT 1 FROM information_schema.columns
+            WHERE table_name = 'FileItem' AND column_name = 'folderId'
+        ) AS ok
+        """
+    )
+    print("STAGE0: FileItem_folderId_column=", _scalar(cur))
+finally:
+    cur.close()
diff --git a/tests/serviceAi/__init__.py b/tests/serviceAi/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/serviceAi/test_allowed_models_whitelist.py b/tests/serviceAi/test_allowed_models_whitelist.py
new file mode 100644
index 00000000..4593afd9
--- /dev/null
+++ b/tests/serviceAi/test_allowed_models_whitelist.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2025 Patrick Motsch
+# All rights reserved.
+import pytest
+from modules.datamodels.datamodelAi import AiCallOptions
+
+
+def test_allowed_models_field_exists():
+    """An explicitly passed ``allowedModels`` list is kept on the options object."""
+    whitelist = ["gpt-5-mini", "claude-4-7-opus"]
+    options = AiCallOptions(allowedModels=whitelist)
+    assert options.allowedModels == ["gpt-5-mini", "claude-4-7-opus"]
+
+
+def test_allowed_models_default_none():
+    """``allowedModels`` is None when it is not passed."""
+    assert AiCallOptions().allowedModels is None
diff --git a/tests/serviceGeneration/__init__.py b/tests/serviceGeneration/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/serviceGeneration/test_inline_image_paragraph.py b/tests/serviceGeneration/test_inline_image_paragraph.py
new file mode 100644
index 00000000..be0c5d19
--- /dev/null
+++ b/tests/serviceGeneration/test_inline_image_paragraph.py
@@ -0,0 +1,23 @@
+# Copyright (c) 2025 Patrick Motsch
+# All rights reserved.
+import pytest
+from modules.serviceCenter.services.serviceGeneration.subDocumentUtility import markdownToDocumentJson
+
+
+def test_inline_image_in_paragraph():
+    """An image inside a sentence yields text runs plus an image run with its fileId."""
+    document = markdownToDocumentJson("Results show  clearly.", "Test")
+    firstElement = document["documents"][0]["sections"][0]["elements"][0]
+    runs = firstElement["content"]["inlineRuns"]
+    runTypes = [run["type"] for run in runs]
+    assert "text" in runTypes
+    assert "image" in runTypes
+    imageRun = next(run for run in runs if run["type"] == "image")
+    assert imageRun.get("fileId") == "abc"
+
+
+def test_multiple_inline_images():
+    """Two images in one paragraph produce two image runs."""
+    document = markdownToDocumentJson("A  B  C", "Test")
+    firstElement = document["documents"][0]["sections"][0]["elements"][0]
+    imageRuns = [run for run in firstElement["content"]["inlineRuns"] if run["type"] == "image"]
+    assert len(imageRuns) == 2
diff --git a/tests/serviceGeneration/test_md_to_json_consolidation.py b/tests/serviceGeneration/test_md_to_json_consolidation.py
new file mode 100644
index 00000000..83118374
--- /dev/null
+++ b/tests/serviceGeneration/test_md_to_json_consolidation.py
@@ -0,0 +1,71 @@
+# Copyright (c) 2025 Patrick Motsch
+# All rights reserved.
+import pytest
+from modules.serviceCenter.services.serviceGeneration.subDocumentUtility import markdownToDocumentJson
+
+
+def test_basic_paragraph():
+    """Plain text becomes a paragraph section whose first run is that text."""
+    section = markdownToDocumentJson("Hello world", "Test")["documents"][0]["sections"][0]
+    assert section["content_type"] == "paragraph"
+    firstRun = section["elements"][0]["content"]["inlineRuns"][0]
+    assert firstRun == {"type": "text", "value": "Hello world"}
+
+
+def test_inline_bold():
+    """``**bold**`` produces a run of type ``bold`` with the inner text as value."""
+    document = markdownToDocumentJson("This is **bold** text", "Test")
+    runs = document["documents"][0]["sections"][0]["elements"][0]["content"]["inlineRuns"]
+    boldRuns = (run for run in runs if run["type"] == "bold" and run["value"] == "bold")
+    assert next(boldRuns, None) is not None
+
+
+def test_inline_image():
+    """A ``file:`` image reference inside text produces an image run with that fileId."""
+    document = markdownToDocumentJson("Text  more", "Test")
+    runs = document["documents"][0]["sections"][0]["elements"][0]["content"]["inlineRuns"]
+    imageRuns = (run for run in runs if run["type"] == "image" and run.get("fileId") == "abc123")
+    assert next(imageRuns, None) is not None
+
+
+def test_inline_link():
+    """A markdown link produces a run of type ``link`` carrying the target as ``href``."""
+    document = markdownToDocumentJson("Click [here](https://example.com)", "Test")
+    runs = document["documents"][0]["sections"][0]["elements"][0]["content"]["inlineRuns"]
+    linkRuns = (
+        run for run in runs if run["type"] == "link" and run.get("href") == "https://example.com"
+    )
+    assert next(linkRuns, None) is not None
+
+
+def test_table_cells_are_inline_runs():
+    """A markdown table becomes a table section whose cells are lists."""
+    markdown = "| A | B |\n| --- | --- |\n| **x** | y |"
+    section = markdownToDocumentJson(markdown, "Test")["documents"][0]["sections"][0]
+    assert section["content_type"] == "table"
+    firstCell = section["elements"][0]["content"]["rows"][0][0]
+    assert isinstance(firstCell, list)
+
+
+def test_bullet_list_inline_runs():
+    """A bullet list becomes a bullet_list section whose items are lists."""
+    markdown = "- Item **one**\n- Item two"
+    section = markdownToDocumentJson(markdown, "Test")["documents"][0]["sections"][0]
+    assert section["content_type"] == "bullet_list"
+    firstItem = section["elements"][0]["content"]["items"][0]
+    assert isinstance(firstItem, list)
+
+
+def test_standalone_image_block():
+    """An image on its own line becomes an image section."""
+    document = markdownToDocumentJson("", "Test")
+    firstSection = document["documents"][0]["sections"][0]
+    assert firstSection["content_type"] == "image"
+
+
+def test_heading_unchanged():
+    """``# Title`` becomes a heading section with text ``Title`` and level 1."""
+    section = markdownToDocumentJson("# Title", "Test")["documents"][0]["sections"][0]
+    assert section["content_type"] == "heading"
+    heading = section["elements"][0]["content"]
+    assert heading["text"] == "Title"
+    assert heading["level"] == 1
+
+
+def test_code_block_unchanged():
+    """A fenced code block becomes a code_block section containing the raw code."""
+    markdown = "```python\nprint('hi')\n```"
+    section = markdownToDocumentJson(markdown, "Test")["documents"][0]["sections"][0]
+    assert section["content_type"] == "code_block"
+    codeContent = section["elements"][0]["content"]
+    assert codeContent["code"] == "print('hi')"
diff --git a/tests/serviceGeneration/test_style_resolver.py b/tests/serviceGeneration/test_style_resolver.py
new file mode 100644
index 00000000..6b2b649a
--- /dev/null
+++ b/tests/serviceGeneration/test_style_resolver.py
@@ -0,0 +1,39 @@
+# Copyright (c) 2025 Patrick Motsch
+# All rights reserved.
+import pytest
+from modules.serviceCenter.services.serviceGeneration.styleDefaults import resolveStyle, DEFAULT_STYLE
+
+
+def test_resolve_none_returns_defaults():
+    """Resolving ``None`` yields a style equal to ``DEFAULT_STYLE``."""
+    assert resolveStyle(None) == DEFAULT_STYLE
+
+
+def test_resolve_empty_returns_defaults():
+    """Resolving an empty dict yields a style equal to ``DEFAULT_STYLE``."""
+    assert resolveStyle({}) == DEFAULT_STYLE
+
+
+def test_override_single_color():
+    """Overriding one colour changes it and leaves the secondary colour at its default."""
+    colors = resolveStyle({"colors": {"primary": "#FF0000"}})["colors"]
+    assert colors["primary"] == "#FF0000"
+    assert colors["secondary"] == DEFAULT_STYLE["colors"]["secondary"]
+
+
+def test_override_nested_heading():
+    """Overriding the h1 size keeps the h1 weight at ``bold``."""
+    h1 = resolveStyle({"headings": {"h1": {"sizePt": 30}}})["headings"]["h1"]
+    assert h1["sizePt"] == 30
+    assert h1["weight"] == "bold"
+
+
+def test_override_font():
+    """Overriding the primary font keeps the monospace font at ``Consolas``."""
+    fonts = resolveStyle({"fonts": {"primary": "Arial"}})["fonts"]
+    assert fonts["primary"] == "Arial"
+    assert fonts["monospace"] == "Consolas"
+
+
+def test_full_style_passthrough():
+    """Both font entries given by the caller are present in the resolved style."""
+    requested = {"fonts": {"primary": "Helvetica", "monospace": "Monaco"}}
+    fonts = resolveStyle(requested)["fonts"]
+    assert fonts["primary"] == "Helvetica"
+    assert fonts["monospace"] == "Monaco"
+ assert result["fonts"]["monospace"] == "Monaco"
diff --git a/tests/unit/bootstrap/test_mandateNameMigration.py b/tests/unit/bootstrap/test_mandateNameMigration.py
deleted file mode 100644
index d09a6846..00000000
--- a/tests/unit/bootstrap/test_mandateNameMigration.py
+++ /dev/null
@@ -1,133 +0,0 @@
-#!/usr/bin/env python3
-# Copyright (c) 2025 Patrick Motsch
-# All rights reserved.
-"""
-Unit tests for ``_migrateMandateNameLabelSlugRules`` in interfaceBootstrap.
-
-Covers:
-- legacy ``name``/``label`` rows get fixed (label fill, slug rename),
-- collisions across legacy rows resolve via -2/-3 suffixes in stable id order,
-- valid rows are left untouched (idempotency),
-- second invocation is a no-op.
-"""
-
-from typing import Any, Dict, List, Optional
-
-import pytest
-
-from modules.datamodels.datamodelUam import Mandate
-from modules.interfaces.interfaceBootstrap import _migrateMandateNameLabelSlugRules
-from modules.shared.mandateNameUtils import isValidMandateName
-
-
-class _FakeDb:
- """Minimal connector simulating getRecordset(Mandate)+recordModify(Mandate, id, data)."""
-
- def __init__(self, rows: List[Dict[str, Any]]):
- self.rows: List[Dict[str, Any]] = [dict(r) for r in rows]
- self.modifyCalls: List[Dict[str, Any]] = []
-
- def getRecordset(self, model, recordFilter: Optional[Dict[str, Any]] = None):
- if model is not Mandate:
- return []
- if not recordFilter:
- return [dict(r) for r in self.rows]
- out = []
- for r in self.rows:
- if all(r.get(k) == v for k, v in recordFilter.items()):
- out.append(dict(r))
- return out
-
- def recordModify(self, model, recordId: str, data: Dict[str, Any]):
- self.modifyCalls.append({"id": str(recordId), "data": dict(data)})
- for r in self.rows:
- if str(r.get("id")) == str(recordId):
- r.update(data)
- return r
- return None
-
-
-def _row(mid: str, name: Any, label: Any = None) -> Dict[str, Any]:
- return {"id": mid, "name": name, "label": label}
-
-
-class TestMigrationFillsLabel:
- def test_emptyLabelGetsNameAsLabel(self):
- db = _FakeDb([_row("a1", "good-name", None)])
- _migrateMandateNameLabelSlugRules(db)
- assert db.rows[0]["label"] == "good-name"
- assert db.rows[0]["name"] == "good-name"
-
- def test_emptyLabelAndEmptyNameFallsBackToMandate(self):
- db = _FakeDb([_row("a1", "", "")])
- _migrateMandateNameLabelSlugRules(db)
- assert db.rows[0]["label"] == "Mandate"
- assert isValidMandateName(db.rows[0]["name"])
-
-
-class TestMigrationRenamesInvalidNames:
- def test_invalidNameGetsSlugFromLabel(self):
- db = _FakeDb([_row("a1", "Home patrick", "Home Patrick")])
- _migrateMandateNameLabelSlugRules(db)
- assert db.rows[0]["name"] == "home-patrick"
- assert db.rows[0]["label"] == "Home Patrick"
-
- def test_umlautsTransliterated(self):
- db = _FakeDb([_row("a1", "Müller AG", "Müller AG")])
- _migrateMandateNameLabelSlugRules(db)
- assert db.rows[0]["name"] == "mueller-ag"
-
-
-class TestMigrationCollisions:
- def test_collisionsResolveByStableIdOrder(self):
- rows = [
- _row("z1", "Home patrick", "Home Patrick"),
- _row("a1", "home-patrick", "Home Patrick Two"),
- ]
- db = _FakeDb(rows)
- _migrateMandateNameLabelSlugRules(db)
- byId = {r["id"]: r for r in db.rows}
- assert byId["a1"]["name"] == "home-patrick"
- assert byId["z1"]["name"] == "home-patrick-2"
-
- def test_threeWayCollisionGetsThirdSuffix(self):
- rows = [
- _row("id-aaa", "home-patrick", "Home Patrick"),
- _row("id-bbb", "Home patrick", "Home Patrick"),
- _row("id-ccc", "home patrick", "Home Patrick"),
- ]
- db = _FakeDb(rows)
- _migrateMandateNameLabelSlugRules(db)
- names = sorted(r["name"] for r in db.rows)
- assert names == ["home-patrick", "home-patrick-2", "home-patrick-3"]
-
-
-class TestMigrationIdempotency:
- def test_secondRunIsNoop(self):
- rows = [
- _row("a1", "home-patrick", "Home Patrick"),
- _row("b1", "Home Müller", ""),
- ]
- db = _FakeDb(rows)
- _migrateMandateNameLabelSlugRules(db)
- assert all(isValidMandateName(r["name"]) for r in db.rows)
- firstChanges = list(db.modifyCalls)
- db.modifyCalls.clear()
- _migrateMandateNameLabelSlugRules(db)
- assert db.modifyCalls == [], (
- f"expected no further changes after first migration, got {db.modifyCalls}; "
- f"firstRun changes: {firstChanges}"
- )
-
- def test_validRowsLeftUntouched(self):
- rows = [_row("a1", "root", "Root"), _row("b1", "alpina-treuhand", "Alpina Treuhand AG")]
- db = _FakeDb(rows)
- _migrateMandateNameLabelSlugRules(db)
- assert db.modifyCalls == []
-
-
-class TestMigrationEmpty:
- def test_emptyDbDoesNothing(self):
- db = _FakeDb([])
- _migrateMandateNameLabelSlugRules(db)
- assert db.modifyCalls == []
diff --git a/tests/unit/interfaces/test_folderRbac.py b/tests/unit/interfaces/test_folderRbac.py
new file mode 100644
index 00000000..049f392d
--- /dev/null
+++ b/tests/unit/interfaces/test_folderRbac.py
@@ -0,0 +1,327 @@
+# Copyright (c) 2026 Patrick Motsch
+# All rights reserved.
+"""Unit tests for folder RBAC two-user matrix (ownership & scope visibility)."""
+
+import uuid
+import pytest
+from unittest.mock import Mock, patch, MagicMock
+from typing import Dict, Any, List, Optional
+
+from modules.datamodels.datamodelFiles import FileFolder, FileItem
+from modules.datamodels.datamodelUam import User, UserPermissions, AccessLevel
+from modules.interfaces.interfaceDbManagement import ComponentObjects, FileNotFoundError
+
+
+_MANDATE_ID = "mandate-test-1"
+_FEATURE_INSTANCE_ID = "fi-test-1"
+_USER_A = "user-a-id"
+_USER_B = "user-b-id"
+
+
+# ── Fakes & helpers ──────────────────────────────────────────────────────────
+
+class _FakeDb:
+    """In-memory database mock storing rows as {model class name: {row id: row}}."""
+
+    def __init__(self):
+        self.connection = MagicMock()
+        self._tables: Dict[str, Dict[str, Dict[str, Any]]] = {}
+
+    def getRecordset(self, modelClass, recordFilter=None):
+        """Return the model's rows; with a filter, keep rows equal on every key."""
+        rows = list(self._tables.get(modelClass.__name__, {}).values())
+        if recordFilter:
+            rows = [
+                row for row in rows
+                if all(row.get(key) == want for key, want in recordFilter.items())
+            ]
+        return rows
+
+    def recordCreate(self, modelClass, data):
+        """Store data (model_dump() or dict copy), adding a UUID "id" if absent."""
+        table = self._tables.setdefault(modelClass.__name__, {})
+        row = data.model_dump() if hasattr(data, "model_dump") else dict(data)
+        row.setdefault("id", str(uuid.uuid4()))
+        table[row["id"]] = row
+        return row
+
+    def recordModify(self, modelClass, recordId, updates):
+        """Merge updates into the stored row; return whether the row existed."""
+        row = self._tables.get(modelClass.__name__, {}).get(recordId)
+        if row is not None:
+            row.update(updates)
+        return row is not None
+
+    def recordDelete(self, modelClass, recordId):
+        """Drop the stored row; return whether the row existed."""
+        # Stored rows are always dicts, so None reliably means "not found".
+        table = self._tables.get(modelClass.__name__, {})
+        removed = table.pop(recordId, None)
+        return removed is not None
+
+    def updateContext(self, userId):
+        """No-op."""
+
+    def _ensure_connection(self):
+        """No-op."""
+
+    def _ensureTableExists(self, modelClass):
+        return True
+
+    def seed(self, modelClass, record: Dict[str, Any]):
+        """Store a shallow copy of record under record["id"], replacing any prior row."""
+        table = self._tables.setdefault(modelClass.__name__, {})
+        table[record["id"]] = dict(record)
+
+
+def _makeUser(userId, username="testuser"):
+    return User(language="en", username=username, id=userId)
+
+
+def _makeRbac(
+    createLevel=AccessLevel.ALL,
+    readLevel=AccessLevel.ALL,
+    updateLevel=AccessLevel.MY,
+    deleteLevel=AccessLevel.MY,
+):
+    """Return a Mock RBAC whose getUserPermissions() yields fixed permissions.
+
+    Defaults model a regular user: read all, but update/delete only own records.
+    """
+    levels = dict(
+        view=True, read=readLevel, create=createLevel,
+        update=updateLevel, delete=deleteLevel,
+    )
+    fakeRbac = Mock()
+    fakeRbac.getUserPermissions.return_value = UserPermissions(**levels)
+    return fakeRbac
+
+
+def _buildComponent(userId, fakeDb, rbac=None):
+    """Return a ComponentObjects built with __init__ stubbed out, wired to fakeDb."""
+    with patch.object(ComponentObjects, "__init__", lambda self: None):
+        component = ComponentObjects()
+    for attrName, attrValue in (
+        ("db", fakeDb), ("currentUser", _makeUser(userId)), ("userId", userId),
+        ("mandateId", _MANDATE_ID), ("featureInstanceId", _FEATURE_INSTANCE_ID),
+        ("rbac", rbac or _makeRbac()), ("userLanguage", "en"),
+    ):
+        setattr(component, attrName, attrValue)
+    return component
+
+
+def _makeFolder(
+    folderId=None, name="Folder", parentId=None,
+    userId=_USER_A, scope="personal", neutralize=False,
+):
+    """Return a row dict for seeding FileFolder, created by userId.
+
+    A random UUID is used when folderId is falsy; timestamps are fixed.
+    """
+    seededAt = 1700000000.0
+    folder = {"id": folderId or str(uuid.uuid4()), "name": name, "parentId": parentId}
+    folder.update(mandateId=_MANDATE_ID, featureInstanceId=_FEATURE_INSTANCE_ID)
+    folder.update(scope=scope, neutralize=neutralize)
+    folder.update(
+        sysCreatedBy=userId, sysCreatedAt=seededAt,
+        sysModifiedAt=seededAt, sysModifiedBy=None,
+    )
+    return folder
+
+
+def _makeFile(fileId=None, folderId=None, userId=_USER_A, scope="personal"):
+    """Return a row dict for seeding FileItem ("test.txt"), created by userId.
+
+    A random UUID is used when fileId is falsy; timestamps are fixed.
+    """
+    seededAt = 1700000000.0
+    fileRow = {
+        "id": fileId or str(uuid.uuid4()),
+        "fileName": "test.txt", "mimeType": "text/plain",
+        "fileHash": "abc123",
+        "fileSize": 100,
+        "folderId": folderId,
+        "mandateId": _MANDATE_ID, "featureInstanceId": _FEATURE_INSTANCE_ID,
+        "scope": scope, "neutralize": False,
+        "sysCreatedBy": userId, "sysCreatedAt": seededAt,
+        "sysModifiedAt": seededAt, "sysModifiedBy": None,
+    }
+    # Optional metadata keys are present but unset.
+    fileRow.update(dict.fromkeys(("tags", "description", "status")))
+    return fileRow
+
+
+def _scopeAwareMock(fakeDb):
+    """Build a getRecordsetWithRBAC side effect that filters fakeDb rows by scope.
+
+    A row is returned when at least one of these holds:
+    - sysCreatedBy == currentUser.id (the owner; this covers scope='personal')
+    - scope == 'global'
+    - scope == 'mandate' and mandateId matches the requested mandateId
+    - scope == 'featureInstance' and featureInstanceId matches the requested one
+    Requested ids are read from kwargs and default to the module's test constants.
+    """
+    def _sideEffect(connector, modelClass, currentUser, recordFilter=None, **kwargs):
+        mandateId = kwargs.get("mandateId", _MANDATE_ID)
+        fiId = kwargs.get("featureInstanceId", _FEATURE_INSTANCE_ID)
+
+        def _isVisible(row):
+            if row.get("sysCreatedBy") == currentUser.id:
+                return True
+            rowScope = row.get("scope", "personal")
+            return (
+                rowScope == "global"
+                or (rowScope == "mandate" and row.get("mandateId") == mandateId)
+                or (rowScope == "featureInstance" and row.get("featureInstanceId") == fiId)
+            )
+
+        rows = fakeDb.getRecordset(modelClass, recordFilter=recordFilter)
+        return [row for row in rows if _isVisible(row)]
+
+    return _sideEffect
+
+
+# ── Test class ───────────────────────────────────────────────────────────────
+
+@patch("modules.interfaces.interfaceDbManagement.getRecordsetWithRBAC")
+class TestFolderRbac:
+    """Two-user ownership, scope and write-guard matrix; mockRbacGet is the patched getRecordsetWithRBAC."""
+
+    # ── 1. Ownership visibility ───────────────────────────────────────────
+
+    def testUserAFolderInOwnTreeNotInUserBOwnTree(self, mockRbacGet):
+        """User A's personal folder appears in A's own tree, not in B's."""
+        fakeDb = _FakeDb()
+        fakeDb.seed(FileFolder, _makeFolder(folderId="fa-1", name="A-Folder", userId=_USER_A))
+        mockRbacGet.side_effect = _scopeAwareMock(fakeDb)
+
+        compA = _buildComponent(_USER_A, fakeDb)
+        ownA = compA.getOwnFolderTree()
+        assert any(f["id"] == "fa-1" for f in ownA)
+
+        compB = _buildComponent(_USER_B, fakeDb)
+        ownB = compB.getOwnFolderTree()
+        assert not any(f["id"] == "fa-1" for f in ownB)
+
+    # ── 2. Scope change -> shared visibility ──────────────────────────────
+
+    def testScopeChangeToMandateMakesVisibleToUserB(self, mockRbacGet):
+        """Changing scope from personal to mandate makes the folder appear
+        in User B's shared tree."""
+        fakeDb = _FakeDb()
+        fakeDb.seed(FileFolder, _makeFolder(folderId="fa-1", scope="personal", userId=_USER_A))
+        mockRbacGet.side_effect = _scopeAwareMock(fakeDb)
+
+        compB = _buildComponent(_USER_B, fakeDb)
+        sharedBefore = compB.getSharedFolderTree()
+        assert not any(f["id"] == "fa-1" for f in sharedBefore)
+
+        fakeDb.recordModify(FileFolder, "fa-1", {"scope": "mandate"})
+
+        sharedAfter = compB.getSharedFolderTree()
+        assert any(f["id"] == "fa-1" for f in sharedAfter)
+
+    # ── 3-7. Non-owner cannot mutate (default RBAC: update/delete = MY) ───
+
+    def testUserBCannotRenameFolderOfUserA(self, mockRbacGet):
+        fakeDb = _FakeDb()
+        fakeDb.seed(FileFolder, _makeFolder(folderId="fa-1", scope="mandate", userId=_USER_A))
+        mockRbacGet.side_effect = _scopeAwareMock(fakeDb)
+
+        compB = _buildComponent(_USER_B, fakeDb)
+        with pytest.raises(PermissionError):
+            compB.renameFolder("fa-1", "Hijacked")
+
+    def testUserBCannotMoveFolderOfUserA(self, mockRbacGet):
+        fakeDb = _FakeDb()
+        fakeDb.seed(FileFolder, _makeFolder(folderId="fa-1", scope="mandate", userId=_USER_A))
+        fakeDb.seed(FileFolder, _makeFolder(folderId="fb-1", scope="mandate", userId=_USER_B))
+        mockRbacGet.side_effect = _scopeAwareMock(fakeDb)
+
+        compB = _buildComponent(_USER_B, fakeDb)
+        with pytest.raises(PermissionError):
+            compB.moveFolder("fa-1", "fb-1")
+
+    def testUserBCannotDeleteFolderOfUserA(self, mockRbacGet):
+        fakeDb = _FakeDb()
+        fakeDb.seed(FileFolder, _makeFolder(folderId="fa-1", scope="mandate", userId=_USER_A))
+        mockRbacGet.side_effect = _scopeAwareMock(fakeDb)
+
+        compB = _buildComponent(_USER_B, fakeDb)
+        with pytest.raises(PermissionError):
+            compB.deleteFolderCascade("fa-1")
+
+    def testUserBCannotPatchScopeOnFolderOfUserA(self, mockRbacGet):
+        fakeDb = _FakeDb()
+        fakeDb.seed(FileFolder, _makeFolder(folderId="fa-1", scope="mandate", userId=_USER_A))
+        mockRbacGet.side_effect = _scopeAwareMock(fakeDb)
+
+        compB = _buildComponent(_USER_B, fakeDb)
+        with pytest.raises(PermissionError):
+            compB.patchFolderScope("fa-1", "personal")
+
+    def testUserBCannotPatchNeutralizeOnFolderOfUserA(self, mockRbacGet):
+        fakeDb = _FakeDb()
+        fakeDb.seed(FileFolder, _makeFolder(folderId="fa-1", scope="mandate", userId=_USER_A))
+        mockRbacGet.side_effect = _scopeAwareMock(fakeDb)
+
+        compB = _buildComponent(_USER_B, fakeDb)
+        with pytest.raises(PermissionError):
+            compB.patchFolderNeutralize("fa-1", True)
+
+    # ── 8. contextOrphan ──────────────────────────────────────────────────
+
+    def testContextOrphanWhenParentFolderNotShared(self, mockRbacGet):
+        """User A's parent folder is personal, child folder is mandate.
+        User B sees only the child, flagged as contextOrphan."""
+        fakeDb = _FakeDb()
+        fakeDb.seed(FileFolder, _makeFolder(
+            folderId="parent-f", name="Private Parent", userId=_USER_A, scope="personal",
+        ))
+        fakeDb.seed(FileFolder, _makeFolder(
+            folderId="child-f", name="Shared Child", userId=_USER_A,
+            parentId="parent-f", scope="mandate",
+        ))
+        mockRbacGet.side_effect = _scopeAwareMock(fakeDb)
+
+        compB = _buildComponent(_USER_B, fakeDb)
+        shared = compB.getSharedFolderTree()
+
+        assert len(shared) == 1
+        assert shared[0]["id"] == "child-f"
+        assert shared[0]["contextOrphan"] is True
+
+    # ── 9. Shared folder children visible ─────────────────────────────────
+
+    def testSharedFolderMakesChildrenVisible(self, mockRbacGet):
+        """When User A shares a folder tree (scope=mandate), all child folders
+        become visible in User B's shared tree."""
+        fakeDb = _FakeDb()
+        fakeDb.seed(FileFolder, _makeFolder(
+            folderId="root-f", name="Root", userId=_USER_A, scope="mandate",
+        ))
+        fakeDb.seed(FileFolder, _makeFolder(
+            folderId="child1-f", name="Child 1", userId=_USER_A,
+            parentId="root-f", scope="mandate",
+        ))
+        fakeDb.seed(FileFolder, _makeFolder(
+            folderId="child2-f", name="Child 2", userId=_USER_A,
+            parentId="root-f", scope="mandate",
+        ))
+        fakeDb.seed(FileFolder, _makeFolder(
+            folderId="grandchild-f", name="Grandchild", userId=_USER_A,
+            parentId="child1-f", scope="mandate",
+        ))
+        mockRbacGet.side_effect = _scopeAwareMock(fakeDb)
+
+        compB = _buildComponent(_USER_B, fakeDb)
+        shared = compB.getSharedFolderTree()
+
+        sharedIds = {f["id"] for f in shared}
+        assert sharedIds == {"root-f", "child1-f", "child2-f", "grandchild-f"}
+
+        byId = {f["id"]: f for f in shared}
+        assert byId["root-f"]["contextOrphan"] is False
+        assert byId["child1-f"]["contextOrphan"] is False
+        assert byId["child2-f"]["contextOrphan"] is False
+        assert byId["grandchild-f"]["contextOrphan"] is False
diff --git a/tests/unit/rbac/test_sysadmin_migration.py b/tests/unit/rbac/test_sysadmin_migration.py
deleted file mode 100644
index 8ca077bf..00000000
--- a/tests/unit/rbac/test_sysadmin_migration.py
+++ /dev/null
@@ -1,209 +0,0 @@
-# Copyright (c) 2025 Patrick Motsch
-# All rights reserved.
-"""
-Unit tests for the one-shot sysadmin role -> isPlatformAdmin migration.
-
-Covers acceptance criteria from
-``wiki/c-work/4-done/2026-04-sysadmin-authority-split.md``:
-
-- AC#4 -> Existing sysadmin role-holders are promoted to ``isPlatformAdmin=True``
- and the legacy role is removed (Role + UserMandateRole + AccessRules)
- when the gateway boots.
-- AC#10 -> The migration is idempotent and removes ALL artefacts (Role,
- AccessRules, UserMandateRole) of the legacy ``sysadmin`` role.
-
-Strategy: use an in-memory fake ``DatabaseConnector`` that records calls
-and returns deterministic recordsets for ``Role``/``UserMandateRole``/
-``UserMandate``/``UserInDB``/``AccessRule`` lookups.
-"""
-
-from __future__ import annotations
-
-from typing import Any, Dict, List
-from unittest.mock import Mock
-
-from modules.interfaces.interfaceBootstrap import _migrateAndDropSysAdminRole
-from modules.datamodels.datamodelMembership import UserMandate, UserMandateRole
-from modules.datamodels.datamodelRbac import AccessRule, Role
-from modules.datamodels.datamodelUam import UserInDB
-
-
-_ROOT_MANDATE_ID = "root-mandate-id"
-_SYSADMIN_ROLE_ID = "sysadmin-role-id"
-_USER_MANDATE_ID = "user-mandate-id"
-_USER_ID = "legacy-user-id"
-_UMR_ROW_ID = "umr-row-id"
-_ACCESS_RULE_ID = "access-rule-id"
-
-
-def _buildFakeDb(
- *,
- sysadminRoles: List[Dict[str, Any]],
- umRoleRows: List[Dict[str, Any]],
- userMandateRows: List[Dict[str, Any]],
- users: List[Dict[str, Any]],
- accessRules: List[Dict[str, Any]],
-) -> Mock:
- """Build a fake ``DatabaseConnector`` that maps model -> recordset."""
-
- deletes: List[tuple] = []
- modifies: List[tuple] = []
-
- def _getRecordset(model, recordFilter=None, **_): # noqa: ANN001
- recordFilter = recordFilter or {}
- if model is Role:
- label = recordFilter.get("roleLabel")
- mandateId = recordFilter.get("mandateId")
- if label == "sysadmin" and mandateId == _ROOT_MANDATE_ID:
- return list(sysadminRoles)
- return []
- if model is UserMandateRole:
- wanted = recordFilter.get("roleId")
- return [r for r in umRoleRows if r.get("roleId") == wanted]
- if model is UserMandate:
- wanted = recordFilter.get("id")
- return [r for r in userMandateRows if r.get("id") == wanted]
- if model is UserInDB:
- wanted = recordFilter.get("id")
- return [r for r in users if r.get("id") == wanted]
- if model is AccessRule:
- wanted = recordFilter.get("roleId")
- return [r for r in accessRules if r.get("roleId") == wanted]
- return []
-
- def _recordModify(model, recordId, payload): # noqa: ANN001
- modifies.append((model, recordId, payload))
- # Reflect the change so a subsequent migration call is idempotent.
- if model is UserInDB:
- for u in users:
- if u.get("id") == recordId:
- u.update(payload)
- return True
-
- def _recordDelete(model, recordId): # noqa: ANN001
- deletes.append((model, recordId))
- if model is UserMandateRole:
- umRoleRows[:] = [r for r in umRoleRows if r.get("id") != recordId]
- elif model is AccessRule:
- accessRules[:] = [r for r in accessRules if r.get("id") != recordId]
- elif model is Role:
- sysadminRoles[:] = [r for r in sysadminRoles if r.get("id") != recordId]
- return True
-
- db = Mock()
- db.getRecordset = Mock(side_effect=_getRecordset)
- db.recordModify = Mock(side_effect=_recordModify)
- db.recordDelete = Mock(side_effect=_recordDelete)
- db._modifies = modifies # exposed for assertions
- db._deletes = deletes
- return db
-
-
-def _seed():
- return {
- "sysadminRoles": [{"id": _SYSADMIN_ROLE_ID, "roleLabel": "sysadmin",
- "mandateId": _ROOT_MANDATE_ID}],
- "umRoleRows": [{"id": _UMR_ROW_ID, "roleId": _SYSADMIN_ROLE_ID,
- "userMandateId": _USER_MANDATE_ID}],
- "userMandateRows": [{"id": _USER_MANDATE_ID, "userId": _USER_ID,
- "mandateId": _ROOT_MANDATE_ID}],
- "users": [{"id": _USER_ID, "username": "legacy",
- "isSysAdmin": False, "isPlatformAdmin": False}],
- "accessRules": [{"id": _ACCESS_RULE_ID, "roleId": _SYSADMIN_ROLE_ID}],
- }
-
-
-# ---------------------------------------------------------------------------
-# AC #4 — promote + drop on first run
-# ---------------------------------------------------------------------------
-
-
-def testMigrationPromotesUserAndDropsArtefacts():
- """AC#4: legacy holder is promoted; Role+AccessRule+UMR are deleted."""
- seed = _seed()
- db = _buildFakeDb(**seed)
-
- _migrateAndDropSysAdminRole(db, _ROOT_MANDATE_ID)
-
- # User got isPlatformAdmin=True
- assert seed["users"][0]["isPlatformAdmin"] is True
- assert any(
- m[0] is UserInDB and m[2] == {"isPlatformAdmin": True}
- for m in db._modifies
- ), "Expected UserInDB.isPlatformAdmin promotion call"
-
- # All three artefact tables had their rows deleted.
- deletedModels = {m[0] for m in db._deletes}
- assert UserMandateRole in deletedModels, "UserMandateRole row not deleted"
- assert AccessRule in deletedModels, "AccessRule row not deleted"
- assert Role in deletedModels, "Sysadmin Role record not deleted"
-
- # And the seeded lists are empty after the migration.
- assert seed["umRoleRows"] == []
- assert seed["accessRules"] == []
- assert seed["sysadminRoles"] == []
-
-
-# ---------------------------------------------------------------------------
-# AC #10 — idempotent: a second run is a no-op
-# ---------------------------------------------------------------------------
-
-
-def testMigrationIsIdempotent():
- """AC#10: a second invocation finds no sysadmin role and exits silently."""
- seed = _seed()
- db = _buildFakeDb(**seed)
-
- _migrateAndDropSysAdminRole(db, _ROOT_MANDATE_ID)
- firstModifies = list(db._modifies)
- firstDeletes = list(db._deletes)
-
- _migrateAndDropSysAdminRole(db, _ROOT_MANDATE_ID)
-
- # No additional writes on the second call.
- assert db._modifies == firstModifies, (
- "Second migration call must not perform additional writes"
- )
- assert db._deletes == firstDeletes, (
- "Second migration call must not perform additional deletes"
- )
-
-
-def testMigrationSkipsAlreadyPromotedUsers():
- """If a user already has ``isPlatformAdmin=True``, no redundant write."""
- seed = _seed()
- seed["users"][0]["isPlatformAdmin"] = True # already promoted
- db = _buildFakeDb(**seed)
-
- _migrateAndDropSysAdminRole(db, _ROOT_MANDATE_ID)
-
- # No promotion write for an already-promoted user.
- promotionWrites = [
- m for m in db._modifies
- if m[0] is UserInDB and m[2].get("isPlatformAdmin") is True
- ]
- assert promotionWrites == [], (
- "Should not re-write isPlatformAdmin if user already has it"
- )
-
- # But role + access-rule cleanup still happens.
- deletedModels = {m[0] for m in db._deletes}
- assert Role in deletedModels
- assert AccessRule in deletedModels
- assert UserMandateRole in deletedModels
-
-
-def testMigrationOnEmptyDbIsNoop():
- """No legacy sysadmin role at all -> no calls, no errors."""
- db = _buildFakeDb(
- sysadminRoles=[],
- umRoleRows=[],
- userMandateRows=[],
- users=[],
- accessRules=[],
- )
-
- _migrateAndDropSysAdminRole(db, _ROOT_MANDATE_ID)
-
- assert db._modifies == []
- assert db._deletes == []
diff --git a/tests/unit/routes/test_folder_crud.py b/tests/unit/routes/test_folder_crud.py
new file mode 100644
index 00000000..86eaf480
--- /dev/null
+++ b/tests/unit/routes/test_folder_crud.py
@@ -0,0 +1,392 @@
+# Copyright (c) 2026 Patrick Motsch
+# All rights reserved.
+"""Unit tests for folder CRUD operations in ComponentObjects."""
+
+import uuid
+import pytest
+from unittest.mock import Mock, patch, MagicMock
+from typing import Dict, Any, List, Optional
+
+from modules.datamodels.datamodelFiles import FileFolder, FileItem
+from modules.datamodels.datamodelUam import User, UserPermissions, AccessLevel
+from modules.interfaces.interfaceDbManagement import ComponentObjects, FileNotFoundError
+
+
+_MANDATE_ID = "mandate-test-1"
+_FEATURE_INSTANCE_ID = "fi-test-1"
+_USER_ID = "user-a-id"
+
+
+# ── Fakes & helpers ──────────────────────────────────────────────────────────
+
+class _FakeDb:
+    """In-memory mock mimicking DatabaseConnector: {model class name: {row id: row}}."""
+
+    def __init__(self):
+        self.connection = MagicMock()
+        self._tables: Dict[str, Dict[str, Dict[str, Any]]] = {}
+
+    def getRecordset(self, modelClass, recordFilter=None):
+        """Return the model's rows; with a filter, keep rows equal on every key."""
+        rows = list(self._tables.get(modelClass.__name__, {}).values())
+        if recordFilter:
+            rows = [
+                row for row in rows
+                if all(row.get(key) == want for key, want in recordFilter.items())
+            ]
+        return rows
+
+    def recordCreate(self, modelClass, data):
+        """Store data (model_dump() or dict copy), adding a UUID "id" if absent."""
+        table = self._tables.setdefault(modelClass.__name__, {})
+        row = data.model_dump() if hasattr(data, "model_dump") else dict(data)
+        row.setdefault("id", str(uuid.uuid4()))
+        table[row["id"]] = row
+        return row
+
+    def recordModify(self, modelClass, recordId, updates):
+        """Merge updates into the stored row; False when the row is unknown."""
+        table = self._tables.get(modelClass.__name__, {})
+        if recordId not in table:
+            return False
+        table[recordId].update(updates)
+        return True
+
+    def recordDelete(self, modelClass, recordId):
+        """Drop the stored row; False when the row is unknown."""
+        table = self._tables.get(modelClass.__name__, {})
+        if recordId not in table:
+            return False
+        del table[recordId]
+        return True
+
+    def updateContext(self, userId):
+        """No-op."""
+
+    def _ensure_connection(self):
+        """No-op."""
+
+    def _ensureTableExists(self, modelClass):
+        return True
+
+    def seed(self, modelClass, record: Dict[str, Any]):
+        """Store a shallow copy of record under record["id"], replacing any prior row."""
+        table = self._tables.setdefault(modelClass.__name__, {})
+        table[record["id"]] = dict(record)
+
+
+def _makeUser(userId=_USER_ID, username="testuser"):
+    return User(language="en", username=username, id=userId)
+
+
+def _makeRbac(
+    createLevel=AccessLevel.ALL,
+    readLevel=AccessLevel.ALL,
+    updateLevel=AccessLevel.ALL,
+    deleteLevel=AccessLevel.ALL,
+):
+    """Return a Mock RBAC whose getUserPermissions() yields fixed permissions."""
+    # view is always granted; the four levels come from the arguments.
+    levels = dict(
+        view=True,
+        read=readLevel, create=createLevel,
+        update=updateLevel, delete=deleteLevel,
+    )
+    fakeRbac = Mock()
+    fakeRbac.getUserPermissions.return_value = UserPermissions(**levels)
+    return fakeRbac
+
+
+def _buildComponent(
+    userId=_USER_ID,
+    fakeDb=None,
+    rbac=None,
+    mandateId=_MANDATE_ID,
+    featureInstanceId=_FEATURE_INSTANCE_ID,
+):
+    """Build a ComponentObjects with __init__ stubbed out and mocks attached (no real DB)."""
+    with patch.object(ComponentObjects, "__init__", lambda self: None):
+        component = ComponentObjects()
+    for attrName, attrValue in (
+        ("db", fakeDb or _FakeDb()),
+        ("currentUser", _makeUser(userId)), ("userId", userId),
+        ("mandateId", mandateId), ("featureInstanceId", featureInstanceId),
+        ("rbac", rbac or _makeRbac()), ("userLanguage", "en"),
+    ):
+        setattr(component, attrName, attrValue)
+    return component
+
+
+def _rbacFromFakeDb(fakeDb):
+    """Side effect for getRecordsetWithRBAC: plain fakeDb lookup, no per-user filtering."""
+    def _delegate(connector, modelClass, currentUser, recordFilter=None, **kwargs):
+        return fakeDb.getRecordset(modelClass, recordFilter=recordFilter)
+    return _delegate
+
+
+def _makeFolder(
+    folderId=None, name="Folder", parentId=None,
+    userId=_USER_ID, scope="personal", neutralize=False,
+):
+    """Return a row dict for seeding FileFolder, created by userId.
+
+    A random UUID is used when folderId is falsy; timestamps are fixed.
+    """
+    seededAt = 1700000000.0
+    folder = {"id": folderId or str(uuid.uuid4()), "name": name, "parentId": parentId}
+    folder.update(mandateId=_MANDATE_ID, featureInstanceId=_FEATURE_INSTANCE_ID)
+    folder.update(scope=scope, neutralize=neutralize)
+    folder.update(
+        sysCreatedBy=userId, sysCreatedAt=seededAt,
+        sysModifiedAt=seededAt, sysModifiedBy=None,
+    )
+    return folder
+
+
+def _makeFile(fileId=None, folderId=None, userId=_USER_ID, scope="personal"):
+    """Return a row dict for seeding FileItem ("test.txt"), created by userId.
+
+    A random UUID is used when fileId is falsy; timestamps are fixed.
+    """
+    seededAt = 1700000000.0
+    fileRow = {
+        "id": fileId or str(uuid.uuid4()),
+        "fileName": "test.txt", "mimeType": "text/plain",
+        "fileHash": "abc123",
+        "fileSize": 100,
+        "folderId": folderId,
+        "mandateId": _MANDATE_ID, "featureInstanceId": _FEATURE_INSTANCE_ID,
+        "scope": scope, "neutralize": False,
+        "sysCreatedBy": userId, "sysCreatedAt": seededAt,
+        "sysModifiedAt": seededAt, "sysModifiedBy": None,
+    }
+    # Optional metadata keys are present but unset.
+    fileRow.update(dict.fromkeys(("tags", "description", "status")))
+    return fileRow
+
+
+# ── Test class ───────────────────────────────────────────────────────────────
+
+@patch("modules.interfaces.interfaceDbManagement.getRecordsetWithRBAC")
+class TestFolderCrud:
+    """Folder create/rename/move/delete/patch/tree tests; mockRbacGet is the patched getRecordsetWithRBAC."""
+
+    # ── Create ────────────────────────────────────────────────────────────
+
+    def testCreateFolderHappyPath(self, mockRbacGet):
+        fakeDb = _FakeDb()
+        comp = _buildComponent(fakeDb=fakeDb)
+        mockRbacGet.side_effect = _rbacFromFakeDb(fakeDb)
+
+        result = comp.createFolder("Test Folder")
+
+        assert result["name"] == "Test Folder"
+        assert result["scope"] == "personal"
+        assert result["parentId"] is None
+        assert result["mandateId"] == _MANDATE_ID
+
+    def testCreateFolderWithParent(self, mockRbacGet):
+        fakeDb = _FakeDb()
+        fakeDb.seed(FileFolder, _makeFolder(folderId="parent-1", name="Parent"))
+        comp = _buildComponent(fakeDb=fakeDb)
+        mockRbacGet.side_effect = _rbacFromFakeDb(fakeDb)
+
+        result = comp.createFolder("Child Folder", parentId="parent-1")
+
+        assert result["name"] == "Child Folder"
+        assert result["parentId"] == "parent-1"
+
+    def testCreateFolderMissingNameNoInterfaceValidation(self, mockRbacGet):
+        """Interface does not validate empty name; the route layer returns 400."""
+        fakeDb = _FakeDb()
+        comp = _buildComponent(fakeDb=fakeDb)
+        mockRbacGet.side_effect = _rbacFromFakeDb(fakeDb)
+
+        result = comp.createFolder("")
+        assert result["name"] == ""
+
+    # ── Rename ────────────────────────────────────────────────────────────
+
+    def testRenameFolderHappyPath(self, mockRbacGet):
+        fakeDb = _FakeDb()
+        fakeDb.seed(FileFolder, _makeFolder(folderId="f-1", name="Old Name"))
+        comp = _buildComponent(fakeDb=fakeDb)
+        mockRbacGet.side_effect = _rbacFromFakeDb(fakeDb)
+
+        result = comp.renameFolder("f-1", "New Name")
+
+        assert result["name"] == "New Name"
+        assert fakeDb.getRecordset(FileFolder, {"id": "f-1"})[0]["name"] == "New Name"
+
+    def testRenameFolderNotFound(self, mockRbacGet):
+        fakeDb = _FakeDb()
+        comp = _buildComponent(fakeDb=fakeDb)
+        mockRbacGet.side_effect = _rbacFromFakeDb(fakeDb)
+
+        with pytest.raises(FileNotFoundError):
+            comp.renameFolder("nonexistent", "New Name")
+
+    # ── Move ──────────────────────────────────────────────────────────────
+
+    def testMoveFolderHappyPath(self, mockRbacGet):
+        fakeDb = _FakeDb()
+        fakeDb.seed(FileFolder, _makeFolder(folderId="f-1", name="Movable"))
+        fakeDb.seed(FileFolder, _makeFolder(folderId="t-1", name="Target"))
+        comp = _buildComponent(fakeDb=fakeDb)
+        mockRbacGet.side_effect = _rbacFromFakeDb(fakeDb)
+
+        result = comp.moveFolder("f-1", "t-1")
+
+        assert result["parentId"] == "t-1"
+        assert fakeDb.getRecordset(FileFolder, {"id": "f-1"})[0]["parentId"] == "t-1"
+
+    def testMoveFolderToRoot(self, mockRbacGet):
+        fakeDb = _FakeDb()
+        fakeDb.seed(FileFolder, _makeFolder(folderId="f-1", name="Nested", parentId="old"))
+        comp = _buildComponent(fakeDb=fakeDb)
+        mockRbacGet.side_effect = _rbacFromFakeDb(fakeDb)
+
+        result = comp.moveFolder("f-1", None)
+
+        assert result["parentId"] is None
+
+    def testMoveFolderCircularReference(self, mockRbacGet):
+        """A -> B -> C: moving A under C creates a cycle."""
+        fakeDb = _FakeDb()
+        fakeDb.seed(FileFolder, _makeFolder(folderId="a", name="A", parentId=None))
+        fakeDb.seed(FileFolder, _makeFolder(folderId="b", name="B", parentId="a"))
+        fakeDb.seed(FileFolder, _makeFolder(folderId="c", name="C", parentId="b"))
+        comp = _buildComponent(fakeDb=fakeDb)
+        mockRbacGet.side_effect = _rbacFromFakeDb(fakeDb)
+
+        with pytest.raises(ValueError, match="circular reference"):
+            comp.moveFolder("a", "c")
+
+    # ── Delete cascade ────────────────────────────────────────────────────
+
+    def testDeleteFolderCascade(self, mockRbacGet):
+        """Deleting root folder removes root + child + their files."""
+        fakeDb = _FakeDb()
+        fakeDb.seed(FileFolder, _makeFolder(folderId="root", name="Root"))
+        fakeDb.seed(FileFolder, _makeFolder(folderId="child", name="Child", parentId="root"))
+        fakeDb.seed(FileItem, _makeFile(fileId="file-1", folderId="root"))
+        fakeDb.seed(FileItem, _makeFile(fileId="file-2", folderId="child"))
+        comp = _buildComponent(fakeDb=fakeDb)
+        mockRbacGet.side_effect = _rbacFromFakeDb(fakeDb)
+
+        result = comp.deleteFolderCascade("root")
+
+        assert result["deletedFolders"] == 2
+        assert result["deletedFiles"] == 2
+
+    def testDeleteFolderNotFound(self, mockRbacGet):
+        fakeDb = _FakeDb()
+        comp = _buildComponent(fakeDb=fakeDb)
+        mockRbacGet.side_effect = _rbacFromFakeDb(fakeDb)
+
+        with pytest.raises(FileNotFoundError):
+            comp.deleteFolderCascade("nonexistent")
+
+    # ── Patch scope ───────────────────────────────────────────────────────
+
+    def testPatchScopeNoCascade(self, mockRbacGet):
+        """Change folder scope without cascading to files."""
+        fakeDb = _FakeDb()
+        fakeDb.seed(FileFolder, _makeFolder(folderId="f-1", scope="personal"))
+        fakeDb.seed(FileItem, _makeFile(fileId="file-1", folderId="f-1"))
+        comp = _buildComponent(fakeDb=fakeDb)
+        mockRbacGet.side_effect = _rbacFromFakeDb(fakeDb)
+
+        result = comp.patchFolderScope("f-1", "mandate", cascadeToFiles=False)
+
+        assert result["scope"] == "mandate"
+        assert result["filesUpdated"] == 0
+        assert fakeDb.getRecordset(FileFolder, {"id": "f-1"})[0]["scope"] == "mandate"
+        assert fakeDb.getRecordset(FileItem, {"id": "file-1"})[0]["scope"] == "personal"
+
+    def testPatchScopeWithCascade(self, mockRbacGet):
+        """cascadeToFiles=True updates only owned files in the folder."""
+        fakeDb = _FakeDb()
+        fakeDb.seed(FileFolder, _makeFolder(folderId="f-1", scope="personal"))
+        fakeDb.seed(FileItem, _makeFile(fileId="own-file", folderId="f-1"))
+        fakeDb.seed(FileItem, _makeFile(fileId="other-file", folderId="f-1", userId="user-b"))
+        comp = _buildComponent(fakeDb=fakeDb)
+        mockRbacGet.side_effect = _rbacFromFakeDb(fakeDb)
+
+        result = comp.patchFolderScope("f-1", "mandate", cascadeToFiles=True)
+
+        assert result["filesUpdated"] == 1
+        assert fakeDb.getRecordset(FileItem, {"id": "own-file"})[0]["scope"] == "mandate"
+        assert fakeDb.getRecordset(FileItem, {"id": "other-file"})[0]["scope"] == "personal"
+
+    def testPatchScopeInvalid(self, mockRbacGet):
+        fakeDb = _FakeDb()
+        fakeDb.seed(FileFolder, _makeFolder(folderId="f-1"))
+        comp = _buildComponent(fakeDb=fakeDb)
+        mockRbacGet.side_effect = _rbacFromFakeDb(fakeDb)
+
+        with pytest.raises(ValueError, match="Invalid scope"):
+            comp.patchFolderScope("f-1", "invalid_scope")
+
+    # ── Patch neutralize ──────────────────────────────────────────────────
+
+    def testPatchNeutralizeToggle(self, mockRbacGet):
+        fakeDb = _FakeDb()
+        fakeDb.seed(FileFolder, _makeFolder(folderId="f-1", neutralize=False))
+        fakeDb.seed(FileItem, _makeFile(fileId="file-1", folderId="f-1"))
+        comp = _buildComponent(fakeDb=fakeDb)
+        mockRbacGet.side_effect = _rbacFromFakeDb(fakeDb)
+
+        resultOn = comp.patchFolderNeutralize("f-1", True)
+        assert resultOn["neutralize"] is True
+        assert resultOn["filesUpdated"] == 1
+        assert fakeDb.getRecordset(FileFolder, {"id": "f-1"})[0]["neutralize"] is True
+        assert fakeDb.getRecordset(FileItem, {"id": "file-1"})[0]["neutralize"] is True
+
+        resultOff = comp.patchFolderNeutralize("f-1", False)
+        assert resultOff["neutralize"] is False
+        assert fakeDb.getRecordset(FileItem, {"id": "file-1"})[0]["neutralize"] is False
+
+    # ── Tree queries ──────────────────────────────────────────────────────
+
+    def testGetOwnFolderTree(self, mockRbacGet):
+        fakeDb = _FakeDb()
+        fakeDb.seed(FileFolder, _makeFolder(folderId="own-1", name="Mine"))
+        fakeDb.seed(FileFolder, _makeFolder(folderId="other-1", name="Theirs", userId="user-b"))
+        comp = _buildComponent(fakeDb=fakeDb)
+        mockRbacGet.side_effect = _rbacFromFakeDb(fakeDb)
+
+        result = comp.getOwnFolderTree()
+
+        assert len(result) == 1
+        assert result[0]["id"] == "own-1"
+
+    def testGetSharedFolderTreeWithContextOrphan(self, mockRbacGet):
+        fakeDb = _FakeDb()
+        fakeDb.seed(FileFolder, _makeFolder(folderId="own", name="Own"))
+        fakeDb.seed(FileFolder, _makeFolder(
+            folderId="shared-root", name="Shared Root", userId="user-b", scope="mandate",
+        ))
+        fakeDb.seed(FileFolder, _makeFolder(
+            folderId="shared-child", name="Shared Child", userId="user-b",
+            parentId="shared-root", scope="mandate",
+        ))
+        fakeDb.seed(FileFolder, _makeFolder(
+            folderId="orphan", name="Orphan", userId="user-b",
+            parentId="invisible-parent", scope="mandate",
+        ))
+        comp = _buildComponent(fakeDb=fakeDb)
+        mockRbacGet.side_effect = _rbacFromFakeDb(fakeDb)
+
+        result = comp.getSharedFolderTree()
+
+        ids = {r["id"] for r in result}
+        assert "own" not in ids
+        assert "shared-root" in ids
+        assert "shared-child" in ids
+        assert "orphan" in ids
+
+        byId = {r["id"]: r for r in result}
+        assert byId["shared-root"]["contextOrphan"] is False
+        assert byId["shared-child"]["contextOrphan"] is False
+        assert byId["orphan"]["contextOrphan"] is True
diff --git a/tests/unit/services/test_bootstrap_clickup.py b/tests/unit/services/test_bootstrap_clickup.py
new file mode 100644
index 00000000..87c08c3d
--- /dev/null
+++ b/tests/unit/services/test_bootstrap_clickup.py
@@ -0,0 +1,203 @@
+#!/usr/bin/env python3
+# Copyright (c) 2025 Patrick Motsch
+# All rights reserved.
+"""Bootstrap ClickUp tests with a fake service + knowledge service.
+
+Verifies:
+- Teams → spaces → lists (folderless + folder-based) → tasks traversal.
+- Each task produces a `requestIngestion` call with `sourceKind="clickup_task"`
+ and header + description content-objects.
+- `date_updated` is forwarded as contentVersion → idempotency.
+- Recency filter drops tasks older than `maxAgeDays`.
+- The maxTasks cap is respected (maxWorkspaces / maxListsPerWorkspace are not exercised here).
+"""
+
+import asyncio
+import os
+import sys
+import time
+from types import SimpleNamespace
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
+
+from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncClickup import (
+ bootstrapClickup,
+ ClickupBootstrapLimits,
+ _syntheticTaskId,
+)
+
+
+def _nowMs(offsetDays: int = 0) -> str:
+ return str(int((time.time() + offsetDays * 86400) * 1000))
+
+
+class _FakeClickupService:
+ """Records API calls; serves a canned 1-team / 1-space / 2-list layout (one folderless list, one list in a folder) with `taskCount` tasks per list."""
+
+ def __init__(self, taskCount=2, oldTask=False):
+ self._taskCount = taskCount
+ self._oldTask = oldTask # when True, the second task is 400 days old
+ self.calls = []
+
+ async def getAuthorizedTeams(self):
+ self.calls.append(("getAuthorizedTeams",))
+ return {"teams": [{"id": "team-1", "name": "Acme"}]}
+
+ async def getSpaces(self, team_id: str):
+ self.calls.append(("getSpaces", team_id))
+ return {"spaces": [{"id": "space-1", "name": "Engineering"}]}
+
+ async def getFolderlessLists(self, space_id: str):
+ self.calls.append(("getFolderlessLists", space_id))
+ return {"lists": [{"id": "list-1", "name": "Sprint 1"}]}
+
+ async def getFolders(self, space_id: str):
+ self.calls.append(("getFolders", space_id))
+ return {"folders": [{"id": "folder-1", "name": "Subproject"}]}
+
+ async def getListsInFolder(self, folder_id: str):
+ self.calls.append(("getListsInFolder", folder_id))
+ return {"lists": [{"id": "list-2", "name": "Sub-tasks"}]}
+
+ async def getTasksInList(self, list_id: str, *, page=0, include_closed=False, subtasks=True):
+ self.calls.append(("getTasksInList", list_id, page, include_closed))
+ if page > 0:
+ return {"tasks": []}
+ tasks = []
+ for i in range(self._taskCount):
+ tid = f"{list_id}-task-{i}"
+ offsetDays = -400 if (self._oldTask and i == 1) else 0
+ tasks.append({
+ "id": tid,
+ "name": f"Task {i} of {list_id}",
+ "description": f"Plain description for task {i}",
+ "text_content": f"Rich content for task {i}",
+ "status": {"status": "open" if i == 0 else "closed"},
+ "assignees": [{"username": "alice"}],
+ "tags": [{"name": "urgent"}],
+ "date_updated": _nowMs(offsetDays),
+ "date_created": _nowMs(-1),
+ "url": f"https://app.clickup.com/t/{tid}",
+ })
+ return {"tasks": tasks}
+
+
+class _FakeKnowledgeService:
+ def __init__(self, duplicateIds=None):
+ self.calls = []
+ self._duplicates = duplicateIds or set()
+
+ async def requestIngestion(self, job):
+ self.calls.append(job)
+ status = "duplicate" if job.sourceId in self._duplicates else "indexed"
+ return SimpleNamespace(
+ jobId=job.sourceId, status=status, contentHash="h",
+ fileId=job.sourceId, index=None, error=None,
+ )
+
+
+def _adapter(svc):
+ return SimpleNamespace(_svc=svc)
+
+
+def test_bootstrap_walks_team_space_lists_and_tasks():
+ svc = _FakeClickupService(taskCount=2)
+ knowledge = _FakeKnowledgeService()
+ connection = SimpleNamespace(mandateId="m1", userId="u1")
+
+ async def _run():
+ return await bootstrapClickup(
+ connectionId="c1",
+ adapter=_adapter(svc),
+ connection=connection,
+ knowledgeService=knowledge,
+ limits=ClickupBootstrapLimits(maxAgeDays=None),
+ )
+
+ result = asyncio.run(_run())
+ # 2 lists (folderless list-1 + folder's list-2) × 2 tasks each = 4 tasks
+ assert result["indexed"] == 4
+ assert result["workspaces"] == 1
+ assert result["lists"] == 2
+ sourceIds = {c.sourceId for c in knowledge.calls}
+ assert len(sourceIds) == 4
+ for job in knowledge.calls:
+ assert job.sourceKind == "clickup_task"
+ assert job.mimeType == "application/vnd.clickup.task+json"
+ assert job.mandateId == "m1"
+ assert job.provenance["connectionId"] == "c1"
+ assert job.provenance["authority"] == "clickup"
+ assert job.provenance["teamId"] == "team-1"
+ assert job.contentVersion # numeric millisecond string
+ # At least the header content-object is present.
+ ids = [co["contentObjectId"] for co in job.contentObjects]
+ assert "header" in ids
+
+
+def test_bootstrap_reports_duplicates_on_second_run():
+ svc = _FakeClickupService(taskCount=1)
+ duplicates = {
+ _syntheticTaskId("c1", "list-1-task-0"),
+ _syntheticTaskId("c1", "list-2-task-0"),
+ }
+ knowledge = _FakeKnowledgeService(duplicateIds=duplicates)
+ connection = SimpleNamespace(mandateId="m1", userId="u1")
+
+ async def _run():
+ return await bootstrapClickup(
+ connectionId="c1",
+ adapter=_adapter(svc),
+ connection=connection,
+ knowledgeService=knowledge,
+ limits=ClickupBootstrapLimits(maxAgeDays=None),
+ )
+
+ result = asyncio.run(_run())
+ assert result["indexed"] == 0
+ assert result["skippedDuplicate"] == 2
+
+
+def test_bootstrap_skips_tasks_older_than_maxAgeDays():
+ svc = _FakeClickupService(taskCount=2, oldTask=True)
+ knowledge = _FakeKnowledgeService()
+ connection = SimpleNamespace(mandateId="m1", userId="u1")
+
+ async def _run():
+ return await bootstrapClickup(
+ connectionId="c1",
+ adapter=_adapter(svc),
+ connection=connection,
+ knowledgeService=knowledge,
+ limits=ClickupBootstrapLimits(maxAgeDays=180),
+ )
+
+ result = asyncio.run(_run())
+ # 2 lists × (1 recent + 1 skipped old) = 2 indexed + 2 skippedPolicy
+ assert result["indexed"] == 2
+ assert result["skippedPolicy"] == 2
+
+
+def test_bootstrap_maxTasks_caps_ingestion():
+ svc = _FakeClickupService(taskCount=2)
+ knowledge = _FakeKnowledgeService()
+ connection = SimpleNamespace(mandateId="m1", userId="u1")
+
+ async def _run():
+ return await bootstrapClickup(
+ connectionId="c1",
+ adapter=_adapter(svc),
+ connection=connection,
+ knowledgeService=knowledge,
+ limits=ClickupBootstrapLimits(maxAgeDays=None, maxTasks=3),
+ )
+
+ result = asyncio.run(_run())
+ assert result["indexed"] == 3
+
+
+if __name__ == "__main__":
+ test_bootstrap_walks_team_space_lists_and_tasks()
+ test_bootstrap_reports_duplicates_on_second_run()
+ test_bootstrap_skips_tasks_older_than_maxAgeDays()
+ test_bootstrap_maxTasks_caps_ingestion()
+ print("OK — bootstrapClickup tests passed")
diff --git a/tests/unit/services/test_bootstrap_gdrive.py b/tests/unit/services/test_bootstrap_gdrive.py
new file mode 100644
index 00000000..1b88677e
--- /dev/null
+++ b/tests/unit/services/test_bootstrap_gdrive.py
@@ -0,0 +1,225 @@
+#!/usr/bin/env python3
+# Copyright (c) 2025 Patrick Motsch
+# All rights reserved.
+"""Bootstrap Google Drive tests with a fake adapter + knowledge service.
+
+Verifies:
+- Drive walk traverses root → subfolders (the `maxDepth` limit itself is not exercised here).
+- Every file triggers `requestIngestion` with `sourceKind="gdrive_item"`.
+- Duplicate runs (same modifiedTime revision) report `skippedDuplicate`.
+- Provenance carries `authority="google"` and the Drive file id.
+- Recency filter skips files older than `maxAgeDays`.
+"""
+
+import asyncio
+import os
+import sys
+from dataclasses import dataclass
+from datetime import datetime, timedelta, timezone
+from types import SimpleNamespace
+from typing import Any, Dict, List, Optional
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
+
+from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGdrive import (
+ bootstrapGdrive,
+ GdriveBootstrapLimits,
+ _syntheticFileId,
+)
+
+
+@dataclass
+class _ExtEntry:
+ name: str
+ path: str
+ isFolder: bool = False
+ size: Optional[int] = None
+ mimeType: Optional[str] = None
+ metadata: Dict[str, Any] = None
+
+
+def _today_iso(offsetDays: int = 0) -> str:
+ return (datetime.now(timezone.utc) + timedelta(days=offsetDays)).strftime("%Y-%m-%dT%H:%M:%SZ")
+
+
+class _FakeDriveAdapter:
+ """Minimal DriveAdapter stand-in.
+
+ Layout:
+ "/" (root) → 2 files + 1 folder (sub)
+ "/sub_id" → 1 file
+ """
+
+ def __init__(self, recent_only: bool = True):
+ self.downloaded: List[str] = []
+ self._recent = _today_iso(0)
+ self._old = _today_iso(-400)
+ self._recent_only = recent_only
+
+ async def browse(self, path: str, filter=None, limit=None):
+ if path in ("/", "", "root"):
+ return [
+ _ExtEntry(
+ name="f1.txt", path="/f1", size=20,
+ mimeType="text/plain",
+ metadata={"id": "f1", "modifiedTime": self._recent},
+ ),
+ _ExtEntry(
+ name="f2.txt", path="/f2", size=20,
+ mimeType="text/plain",
+ metadata={"id": "f2", "modifiedTime": self._recent if self._recent_only else self._old},
+ ),
+ _ExtEntry(
+ name="Subfolder", path="/sub_id", isFolder=True,
+ mimeType="application/vnd.google-apps.folder",
+ metadata={"id": "sub_id", "modifiedTime": self._recent},
+ ),
+ ]
+ if path == "/sub_id":
+ return [
+ _ExtEntry(
+ name="f3.txt", path="/f3", size=20,
+ mimeType="text/plain",
+ metadata={"id": "f3", "modifiedTime": self._recent},
+ ),
+ ]
+ return []
+
+ async def download(self, path: str) -> bytes:
+ self.downloaded.append(path)
+ return path.encode("utf-8")
+
+
+class _FakeKnowledgeService:
+ def __init__(self, duplicateIds=None):
+ self.calls: List[SimpleNamespace] = []
+ self._duplicateIds = duplicateIds or set()
+
+ async def requestIngestion(self, job):
+ self.calls.append(job)
+ status = "duplicate" if job.sourceId in self._duplicateIds else "indexed"
+ return SimpleNamespace(
+ jobId=f"{job.sourceKind}:{job.sourceId}",
+ status=status, contentHash="h",
+ fileId=job.sourceId, index=None, error=None,
+ )
+
+
+def _fakeRunExtraction(data, name, mime, options):
+ return SimpleNamespace(
+ parts=[
+ SimpleNamespace(
+ id="p1",
+ data=data.decode("utf-8") if isinstance(data, bytes) else str(data),
+ typeGroup="text",
+ label="page:1",
+ metadata={"pageIndex": 0},
+ )
+ ]
+ )
+
+
+def test_bootstrap_walks_drive_and_subfolders():
+ adapter = _FakeDriveAdapter()
+ knowledge = _FakeKnowledgeService()
+ connection = SimpleNamespace(mandateId="m1", userId="u1")
+
+ async def _run():
+ return await bootstrapGdrive(
+ connectionId="c1",
+ adapter=adapter,
+ connection=connection,
+ knowledgeService=knowledge,
+ runExtractionFn=_fakeRunExtraction,
+ limits=GdriveBootstrapLimits(maxAgeDays=None),
+ )
+
+ result = asyncio.run(_run())
+ assert len(knowledge.calls) == 3
+ sourceIds = {c.sourceId for c in knowledge.calls}
+ assert sourceIds == {
+ _syntheticFileId("c1", "f1"),
+ _syntheticFileId("c1", "f2"),
+ _syntheticFileId("c1", "f3"),
+ }
+ assert result["indexed"] == 3
+ assert result["skippedDuplicate"] == 0
+ assert adapter.downloaded == ["/f1", "/f2", "/f3"]
+
+
+def test_bootstrap_reports_duplicates_on_second_run():
+ adapter = _FakeDriveAdapter()
+ duplicateIds = {
+ _syntheticFileId("c1", "f1"),
+ _syntheticFileId("c1", "f2"),
+ _syntheticFileId("c1", "f3"),
+ }
+ knowledge = _FakeKnowledgeService(duplicateIds=duplicateIds)
+ connection = SimpleNamespace(mandateId="m1", userId="u1")
+
+ async def _run():
+ return await bootstrapGdrive(
+ connectionId="c1",
+ adapter=adapter,
+ connection=connection,
+ knowledgeService=knowledge,
+ runExtractionFn=_fakeRunExtraction,
+ limits=GdriveBootstrapLimits(maxAgeDays=None),
+ )
+
+ result = asyncio.run(_run())
+ assert result["indexed"] == 0
+ assert result["skippedDuplicate"] == 3
+
+
+def test_bootstrap_skips_files_older_than_maxAgeDays():
+ adapter = _FakeDriveAdapter(recent_only=False) # f2 is 400 days old
+ knowledge = _FakeKnowledgeService()
+ connection = SimpleNamespace(mandateId="m1", userId="u1")
+
+ async def _run():
+ return await bootstrapGdrive(
+ connectionId="c1",
+ adapter=adapter,
+ connection=connection,
+ knowledgeService=knowledge,
+ runExtractionFn=_fakeRunExtraction,
+ limits=GdriveBootstrapLimits(maxAgeDays=180),
+ )
+
+ result = asyncio.run(_run())
+ assert result["indexed"] == 2 # f1, f3
+ assert result["skippedPolicy"] == 1 # f2 filtered out
+
+
+def test_bootstrap_passes_connection_provenance():
+ adapter = _FakeDriveAdapter()
+ knowledge = _FakeKnowledgeService()
+ connection = SimpleNamespace(mandateId="m1", userId="u1")
+
+ async def _run():
+ return await bootstrapGdrive(
+ connectionId="c1",
+ adapter=adapter,
+ connection=connection,
+ knowledgeService=knowledge,
+ runExtractionFn=_fakeRunExtraction,
+ limits=GdriveBootstrapLimits(maxAgeDays=None),
+ )
+
+ asyncio.run(_run())
+ for job in knowledge.calls:
+ assert job.sourceKind == "gdrive_item"
+ assert job.mandateId == "m1"
+ assert job.provenance["connectionId"] == "c1"
+ assert job.provenance["authority"] == "google"
+ assert job.provenance["service"] == "drive"
+ assert job.contentVersion # modifiedTime ISO string
+
+
+if __name__ == "__main__":
+ test_bootstrap_walks_drive_and_subfolders()
+ test_bootstrap_reports_duplicates_on_second_run()
+ test_bootstrap_skips_files_older_than_maxAgeDays()
+ test_bootstrap_passes_connection_provenance()
+ print("OK — bootstrapGdrive tests passed")
diff --git a/tests/unit/services/test_bootstrap_gmail.py b/tests/unit/services/test_bootstrap_gmail.py
new file mode 100644
index 00000000..4f7cfe4d
--- /dev/null
+++ b/tests/unit/services/test_bootstrap_gmail.py
@@ -0,0 +1,240 @@
+#!/usr/bin/env python3
+# Copyright (c) 2025 Patrick Motsch
+# All rights reserved.
+"""Bootstrap Gmail tests with a fake googleGet + knowledge service.
+
+Verifies:
+- Default labels (INBOX + SENT) are traversed.
+- Each message produces a requestIngestion call with sourceKind=gmail_message
+ and structured contentObjects (header / snippet / body).
+- Pagination via `nextPageToken` is followed.
+- historyId is forwarded as contentVersion → idempotency.
+- MIME body extraction walks nested parts (multipart/alternative).
+"""
+
+import asyncio
+import base64
+import os
+import sys
+from types import SimpleNamespace
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
+
+from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGmail import (
+ bootstrapGmail,
+ GmailBootstrapLimits,
+ _syntheticMessageId,
+ _buildContentObjects,
+ _walkPayloadForBody,
+)
+
+
+def _b64url(text: str) -> str:
+ return base64.urlsafe_b64encode(text.encode("utf-8")).decode("ascii").rstrip("=")
+
+
+def _msg(mid: str, subject: str = "Hi", body: str = "Hello world", historyId: str = "h1"):
+ return {
+ "id": mid,
+ "threadId": f"thread-{mid}",
+ "historyId": historyId,
+ "internalDate": "1700000000000",
+ "snippet": body[:120],
+ "payload": {
+ "headers": [
+ {"name": "Subject", "value": subject},
+ {"name": "From", "value": "Alice "},
+ {"name": "To", "value": "Bob "},
+ {"name": "Date", "value": "Tue, 01 Jan 2025 10:00:00 +0000"},
+ ],
+ "mimeType": "text/plain",
+ "body": {"data": _b64url(body), "size": len(body)},
+ "parts": [],
+ },
+ }
+
+
+class _FakeGoogleGet:
+ """Records URLs + returns the wired-up page or message response."""
+
+ def __init__(self, messages_by_label, paginated_label=None, page2=None):
+ self._messages = messages_by_label
+ self._paginated = paginated_label
+ self._page2 = page2 or []
+ self._served_first_page = set()
+ self.requested = []
+
+ async def __call__(self, url: str):
+ self.requested.append(url)
+ # List page: contains `/users/me/messages?labelIds=...`
+ if "/users/me/messages?" in url:
+ for label, msgs in self._messages.items():
+ if f"labelIds={label}" in url:
+ if (
+ label == self._paginated
+ and label not in self._served_first_page
+ ):
+ self._served_first_page.add(label)
+ return {
+ "messages": [{"id": m["id"]} for m in msgs],
+ "nextPageToken": "token-2",
+ }
+ if label == self._paginated and "pageToken=token-2" in url:
+ return {
+ "messages": [{"id": m["id"]} for m in self._page2],
+ }
+ return {"messages": [{"id": m["id"]} for m in msgs]}
+ return {"messages": []}
+ # Detail fetch: /users/me/messages/{id}?format=full
+ if "/users/me/messages/" in url and "format=full" in url:
+ msgId = url.split("/users/me/messages/")[-1].split("?")[0]
+ for msgs in self._messages.values():
+ for m in msgs:
+ if m["id"] == msgId:
+ return m
+ for m in self._page2:
+ if m["id"] == msgId:
+ return m
+ return {"error": "not found"}
+
+
+class _FakeKnowledgeService:
+ def __init__(self, duplicateIds=None):
+ self.calls = []
+ self._duplicates = duplicateIds or set()
+
+ async def requestIngestion(self, job):
+ self.calls.append(job)
+ status = "duplicate" if job.sourceId in self._duplicates else "indexed"
+ return SimpleNamespace(
+ jobId=job.sourceId, status=status, contentHash="h",
+ fileId=job.sourceId, index=None, error=None,
+ )
+
+
+def test_buildContentObjects_emits_header_snippet_body():
+ parts = _buildContentObjects(_msg("m1", body="Hello\nWorld"), maxBodyChars=8000)
+ ids = [p["contentObjectId"] for p in parts]
+ assert ids == ["header", "snippet", "body"]
+ header = parts[0]["data"]
+ assert "Subject: Hi" in header
+ assert "From: Alice " in header
+ assert "To: Bob " in header
+
+
+def test_walkPayloadForBody_prefers_plain_over_html():
+ payload = {
+ "mimeType": "multipart/alternative",
+ "parts": [
+ {"mimeType": "text/plain", "body": {"data": _b64url("plain body")}},
+ {"mimeType": "text/html", "body": {"data": _b64url("<p>html body</p>")}},
+ ],
+ }
+ bodies = _walkPayloadForBody(payload)
+ assert bodies["text"] == "plain body"
+ assert bodies["html"] == "<p>html body</p>"
+
+
+def test_walkPayloadForBody_falls_back_to_html():
+ payload = {
+ "mimeType": "multipart/alternative",
+ "parts": [
+ {"mimeType": "text/html", "body": {"data": _b64url("<p>only html</p>")}},
+ ],
+ }
+ bodies = _walkPayloadForBody(payload)
+ assert bodies["text"] == ""
+ assert "only html" in bodies["html"]
+
+
+def test_bootstrap_gmail_indexes_messages_from_inbox_and_sent():
+ fake_get = _FakeGoogleGet({
+ "INBOX": [_msg("m1"), _msg("m2")],
+ "SENT": [_msg("m3")],
+ })
+ knowledge = _FakeKnowledgeService()
+ connection = SimpleNamespace(mandateId="m1", userId="u1")
+
+ async def _run():
+ return await bootstrapGmail(
+ connectionId="c1",
+ adapter=SimpleNamespace(_token="t"),
+ connection=connection,
+ knowledgeService=knowledge,
+ limits=GmailBootstrapLimits(maxAgeDays=None),
+ googleGetFn=fake_get,
+ )
+
+ result = asyncio.run(_run())
+ assert result["indexed"] == 3
+ sourceIds = {c.sourceId for c in knowledge.calls}
+ assert sourceIds == {
+ _syntheticMessageId("c1", "m1"),
+ _syntheticMessageId("c1", "m2"),
+ _syntheticMessageId("c1", "m3"),
+ }
+ for job in knowledge.calls:
+ assert job.sourceKind == "gmail_message"
+ assert job.mimeType == "message/rfc822"
+ assert job.provenance["connectionId"] == "c1"
+ assert job.provenance["authority"] == "google"
+ assert job.provenance["service"] == "gmail"
+ assert job.contentVersion == "h1"
+ assert any(co["contentObjectId"] == "header" for co in job.contentObjects)
+
+
+def test_bootstrap_gmail_follows_pagination():
+ fake_get = _FakeGoogleGet(
+ messages_by_label={"INBOX": [_msg("m1")], "SENT": []},
+ paginated_label="INBOX",
+ page2=[_msg("m2"), _msg("m3")],
+ )
+ knowledge = _FakeKnowledgeService()
+ connection = SimpleNamespace(mandateId="m1", userId="u1")
+
+ async def _run():
+ return await bootstrapGmail(
+ connectionId="c1",
+ adapter=SimpleNamespace(_token="t"),
+ connection=connection,
+ knowledgeService=knowledge,
+ limits=GmailBootstrapLimits(maxAgeDays=None),
+ googleGetFn=fake_get,
+ )
+
+ result = asyncio.run(_run())
+ assert result["indexed"] == 3
+
+
+def test_bootstrap_gmail_reports_duplicates():
+ fake_get = _FakeGoogleGet({"INBOX": [_msg("m1"), _msg("m2")], "SENT": []})
+ duplicates = {
+ _syntheticMessageId("c1", "m1"),
+ _syntheticMessageId("c1", "m2"),
+ }
+ knowledge = _FakeKnowledgeService(duplicateIds=duplicates)
+ connection = SimpleNamespace(mandateId="m1", userId="u1")
+
+ async def _run():
+ return await bootstrapGmail(
+ connectionId="c1",
+ adapter=SimpleNamespace(_token="t"),
+ connection=connection,
+ knowledgeService=knowledge,
+ limits=GmailBootstrapLimits(maxAgeDays=None),
+ googleGetFn=fake_get,
+ )
+
+ result = asyncio.run(_run())
+ assert result["indexed"] == 0
+ assert result["skippedDuplicate"] == 2
+
+
+if __name__ == "__main__":
+ test_buildContentObjects_emits_header_snippet_body()
+ test_walkPayloadForBody_prefers_plain_over_html()
+ test_walkPayloadForBody_falls_back_to_html()
+ test_bootstrap_gmail_indexes_messages_from_inbox_and_sent()
+ test_bootstrap_gmail_follows_pagination()
+ test_bootstrap_gmail_reports_duplicates()
+ print("OK — bootstrapGmail tests passed")
diff --git a/tests/unit/services/test_bootstrap_outlook.py b/tests/unit/services/test_bootstrap_outlook.py
new file mode 100644
index 00000000..26664eaa
--- /dev/null
+++ b/tests/unit/services/test_bootstrap_outlook.py
@@ -0,0 +1,190 @@
+#!/usr/bin/env python3
+# Copyright (c) 2025 Patrick Motsch
+# All rights reserved.
+"""Bootstrap Outlook tests with a fake adapter + knowledge service.
+
+Verifies:
+- Well-known folders (inbox, sentitems) are discovered via Graph.
+- Each message produces a `requestIngestion` call with sourceKind=outlook_message
+ and structured contentObjects (header / snippet / body).
+- Pagination via `@odata.nextLink` is followed.
+- changeKey is forwarded as contentVersion → idempotency.
+"""
+
+import asyncio
+import os
+import sys
+from types import SimpleNamespace
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
+
+from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncOutlook import (
+ bootstrapOutlook,
+ OutlookBootstrapLimits,
+ _syntheticMessageId,
+ _buildContentObjects,
+)
+
+
+class _FakeOutlookAdapter:
+ def __init__(self, messages_by_folder, paginated_folder=None, page2=None):
+ self._folders = {"inbox": "INBOX-ID", "sentitems": "SENT-ID"}
+ self._messages = messages_by_folder
+ self._paginated_folder = paginated_folder
+ self._page2 = page2 or []
+ self.requested_endpoints = []
+
+ async def _graphGet(self, endpoint: str):
+ self.requested_endpoints.append(endpoint)
+ if endpoint.startswith("me/mailFolders/") and "/messages" not in endpoint:
+ wellKnown = endpoint.split("/")[-1]
+ fid = self._folders.get(wellKnown)
+ if not fid:
+ return {"error": "not found"}
+ return {"id": fid, "displayName": wellKnown}
+ # message page request: e.g. me/mailFolders/INBOX-ID/messages?...
+ for fid, messages in self._messages.items():
+ if f"me/mailFolders/{fid}/messages" in endpoint:
+ page = {"value": messages}
+ if fid == self._paginated_folder and "skiptoken" not in endpoint:
+ page["@odata.nextLink"] = (
+ "https://graph.microsoft.com/v1.0/"
+ f"me/mailFolders/{fid}/messages?$skiptoken=abc"
+ )
+ elif fid == self._paginated_folder and "skiptoken" in endpoint:
+ page = {"value": self._page2}
+ return page
+ return {"value": []}
+
+ async def browse(self, path):
+ return []
+
+
+class _FakeKnowledgeService:
+ def __init__(self, duplicateIds=None):
+ self.calls = []
+ self._duplicates = duplicateIds or set()
+
+ async def requestIngestion(self, job):
+ self.calls.append(job)
+ status = "duplicate" if job.sourceId in self._duplicates else "indexed"
+ return SimpleNamespace(
+ jobId=job.sourceId, status=status, contentHash="h",
+ fileId=job.sourceId, index=None, error=None,
+ )
+
+
+def _msg(mid: str, subject: str = "Hi", change: str = "ck1"):
+ return {
+ "id": mid,
+ "subject": subject,
+ "from": {"emailAddress": {"name": "Alice", "address": "a@x.com"}},
+ "toRecipients": [{"emailAddress": {"name": "Bob", "address": "b@x.com"}}],
+ "ccRecipients": [],
+ "receivedDateTime": "2025-01-01T10:00:00Z",
+ "bodyPreview": "Hello world",
+ "body": {"contentType": "text", "content": "Hello world\nThis is the body."},
+ "internetMessageId": f"<{mid}@local>",
+ "hasAttachments": False,
+ "changeKey": change,
+ }
+
+
+def test_buildContentObjects_emits_header_snippet_body():
+ parts = _buildContentObjects(_msg("m1"), maxBodyChars=8000)
+ ids = [p["contentObjectId"] for p in parts]
+ assert ids == ["header", "snippet", "body"]
+ header = parts[0]["data"]
+ assert "Subject: Hi" in header
+ assert "From: Alice <a@x.com>" in header
+ assert "To: Bob <b@x.com>" in header
+
+
+def test_bootstrap_outlook_indexes_messages_from_inbox_and_sent():
+ adapter = _FakeOutlookAdapter({
+ "INBOX-ID": [_msg("m1"), _msg("m2")],
+ "SENT-ID": [_msg("m3")],
+ })
+ knowledge = _FakeKnowledgeService()
+ connection = SimpleNamespace(mandateId="m1", userId="u1")
+
+ async def _run():
+ return await bootstrapOutlook(
+ connectionId="c1",
+ adapter=adapter,
+ connection=connection,
+ knowledgeService=knowledge,
+ limits=OutlookBootstrapLimits(maxAgeDays=None),
+ )
+
+ result = asyncio.run(_run())
+ assert result["indexed"] == 3
+ sourceIds = {c.sourceId for c in knowledge.calls}
+ assert sourceIds == {
+ _syntheticMessageId("c1", "m1"),
+ _syntheticMessageId("c1", "m2"),
+ _syntheticMessageId("c1", "m3"),
+ }
+ for job in knowledge.calls:
+ assert job.sourceKind == "outlook_message"
+ assert job.mimeType == "message/rfc822"
+ assert job.provenance["connectionId"] == "c1"
+ assert job.provenance["service"] == "outlook"
+ assert job.contentVersion == "ck1"
+ assert any(co["contentObjectId"] == "header" for co in job.contentObjects)
+
+
+def test_bootstrap_outlook_follows_pagination():
+ adapter = _FakeOutlookAdapter(
+ messages_by_folder={"INBOX-ID": [_msg("m1")], "SENT-ID": []},
+ paginated_folder="INBOX-ID",
+ page2=[_msg("m2"), _msg("m3")],
+ )
+ knowledge = _FakeKnowledgeService()
+ connection = SimpleNamespace(mandateId="m1", userId="u1")
+
+ async def _run():
+ return await bootstrapOutlook(
+ connectionId="c1",
+ adapter=adapter,
+ connection=connection,
+ knowledgeService=knowledge,
+ limits=OutlookBootstrapLimits(maxAgeDays=None),
+ )
+
+ result = asyncio.run(_run())
+ assert result["indexed"] == 3
+
+
+def test_bootstrap_outlook_reports_duplicates():
+ adapter = _FakeOutlookAdapter({
+ "INBOX-ID": [_msg("m1"), _msg("m2")],
+ "SENT-ID": [],
+ })
+ duplicates = {
+ _syntheticMessageId("c1", "m1"),
+ _syntheticMessageId("c1", "m2"),
+ }
+ knowledge = _FakeKnowledgeService(duplicateIds=duplicates)
+ connection = SimpleNamespace(mandateId="m1", userId="u1")
+
+ async def _run():
+ return await bootstrapOutlook(
+ connectionId="c1",
+ adapter=adapter,
+ connection=connection,
+ knowledgeService=knowledge,
+ limits=OutlookBootstrapLimits(maxAgeDays=None),
+ )
+
+ result = asyncio.run(_run())
+ assert result["indexed"] == 0
+ assert result["skippedDuplicate"] == 2
+
+
+if __name__ == "__main__":
+ test_buildContentObjects_emits_header_snippet_body()
+ test_bootstrap_outlook_indexes_messages_from_inbox_and_sent()
+ test_bootstrap_outlook_follows_pagination()
+ test_bootstrap_outlook_reports_duplicates()
+ print("OK — bootstrapOutlook tests passed")
diff --git a/tests/unit/services/test_bootstrap_sharepoint.py b/tests/unit/services/test_bootstrap_sharepoint.py
new file mode 100644
index 00000000..8b011357
--- /dev/null
+++ b/tests/unit/services/test_bootstrap_sharepoint.py
@@ -0,0 +1,209 @@
+#!/usr/bin/env python3
+# Copyright (c) 2025 Patrick Motsch
+# All rights reserved.
+"""Bootstrap SharePoint tests with a fake adapter + knowledge service.
+
+Verifies:
+- Every discovered file triggers `requestIngestion`.
+- Duplicate runs (same eTag revisions) report `skippedDuplicate`.
+- Synthetic fileIds are stable across runs so idempotency works end-to-end.
+"""
+
+import asyncio
+import os
+import sys
+from dataclasses import dataclass
+from types import SimpleNamespace
+from typing import Any, Dict, List, Optional
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
+
+from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncSharepoint import (
+ bootstrapSharepoint,
+ _syntheticFileId,
+)
+
+
+@dataclass
+class _ExtEntry:
+ name: str
+ path: str
+ isFolder: bool = False
+ size: Optional[int] = None
+ mimeType: Optional[str] = None
+ metadata: Dict[str, Any] = None
+
+
+class _FakeSpAdapter:
+ """Minimal SharepointAdapter stand-in.
+
+ Layout:
+ "/" → 1 site
+ "/sites/site-1" → 2 files (f1, f2) + 1 folder (sub)
+ "/sites/site-1/sub" → 1 file (f3)
+ """
+
+ def __init__(self):
+ self.downloaded: List[str] = []
+
+ async def browse(self, path: str, filter=None, limit=None):
+ if path == "/":
+ return [
+ _ExtEntry(
+ name="Site 1",
+ path="/sites/site-1",
+ isFolder=True,
+ metadata={"id": "site-1"},
+ ),
+ ]
+ if path == "/sites/site-1":
+ return [
+ _ExtEntry(
+ name="f1.txt", path="/sites/site-1/f1.txt",
+ mimeType="text/plain", size=20,
+ metadata={"id": "f1", "revision": "etag-f1"},
+ ),
+ _ExtEntry(
+ name="f2.txt", path="/sites/site-1/f2.txt",
+ mimeType="text/plain", size=20,
+ metadata={"id": "f2", "revision": "etag-f2"},
+ ),
+ _ExtEntry(
+ name="sub", path="/sites/site-1/sub",
+ isFolder=True, metadata={"id": "sub"},
+ ),
+ ]
+ if path == "/sites/site-1/sub":
+ return [
+ _ExtEntry(
+ name="f3.txt", path="/sites/site-1/sub/f3.txt",
+ mimeType="text/plain", size=20,
+ metadata={"id": "f3", "revision": "etag-f3"},
+ ),
+ ]
+ return []
+
+ async def download(self, path: str) -> bytes:
+ self.downloaded.append(path)
+ return path.encode("utf-8")
+
+
+class _FakeKnowledgeService:
+ """Records requestIngestion calls and returns the scripted handles."""
+
+ def __init__(self, duplicateIds=None):
+ self.calls: List[SimpleNamespace] = []
+ self._duplicateIds = duplicateIds or set()
+
+ async def requestIngestion(self, job):
+ self.calls.append(job)
+ status = "duplicate" if job.sourceId in self._duplicateIds else "indexed"
+ return SimpleNamespace(
+ jobId=f"{job.sourceKind}:{job.sourceId}",
+ status=status,
+ contentHash="h",
+ fileId=job.sourceId,
+ index=None,
+ error=None,
+ )
+
+
+def _fakeRunExtraction(data, name, mime, options):
+ """Produce a single synthetic text part so `_toContentObjects` returns one."""
+ return SimpleNamespace(
+ parts=[
+ SimpleNamespace(
+ id="p1",
+ data=data.decode("utf-8") if isinstance(data, bytes) else str(data),
+ typeGroup="text",
+ label="page:1",
+ metadata={"pageIndex": 0},
+ )
+ ]
+ )
+
+
+def test_bootstrap_walks_sites_and_subfolders():
+ adapter = _FakeSpAdapter()
+ knowledge = _FakeKnowledgeService()
+ connection = SimpleNamespace(mandateId="m1", userId="u1")
+
+ async def _run():
+ return await bootstrapSharepoint(
+ connectionId="c1",
+ adapter=adapter,
+ connection=connection,
+ knowledgeService=knowledge,
+ runExtractionFn=_fakeRunExtraction,
+ )
+
+ result = asyncio.run(_run())
+ assert len(knowledge.calls) == 3
+ sourceIds = {c.sourceId for c in knowledge.calls}
+ assert sourceIds == {
+ _syntheticFileId("c1", "f1"),
+ _syntheticFileId("c1", "f2"),
+ _syntheticFileId("c1", "f3"),
+ }
+ assert result["indexed"] == 3
+ assert result["skippedDuplicate"] == 0
+ assert adapter.downloaded == [
+ "/sites/site-1/f1.txt",
+ "/sites/site-1/f2.txt",
+ "/sites/site-1/sub/f3.txt",
+ ]
+
+
+def test_bootstrap_reports_duplicates_on_second_run():
+ adapter = _FakeSpAdapter()
+ duplicateIds = {
+ _syntheticFileId("c1", "f1"),
+ _syntheticFileId("c1", "f2"),
+ _syntheticFileId("c1", "f3"),
+ }
+ knowledge = _FakeKnowledgeService(duplicateIds=duplicateIds)
+ connection = SimpleNamespace(mandateId="m1", userId="u1")
+
+ async def _run():
+ return await bootstrapSharepoint(
+ connectionId="c1",
+ adapter=adapter,
+ connection=connection,
+ knowledgeService=knowledge,
+ runExtractionFn=_fakeRunExtraction,
+ )
+
+ result = asyncio.run(_run())
+ assert result["indexed"] == 0
+ assert result["skippedDuplicate"] == 3
+
+
+def test_bootstrap_passes_connection_provenance():
+ adapter = _FakeSpAdapter()
+ knowledge = _FakeKnowledgeService()
+ connection = SimpleNamespace(mandateId="m1", userId="u1")
+
+ async def _run():
+ return await bootstrapSharepoint(
+ connectionId="c1",
+ adapter=adapter,
+ connection=connection,
+ knowledgeService=knowledge,
+ runExtractionFn=_fakeRunExtraction,
+ )
+
+ asyncio.run(_run())
+ for job in knowledge.calls:
+ assert job.sourceKind == "sharepoint_item"
+ assert job.mandateId == "m1"
+ assert job.provenance["connectionId"] == "c1"
+ assert job.provenance["authority"] == "msft"
+ assert job.provenance["service"] == "sharepoint"
+ assert job.contentVersion and job.contentVersion.startswith("etag-")
+
+
+if __name__ == "__main__":
+ test_bootstrap_walks_sites_and_subfolders()
+ test_bootstrap_reports_duplicates_on_second_run()
+ test_bootstrap_passes_connection_provenance()
+ print("OK — bootstrapSharepoint tests passed")
diff --git a/tests/unit/services/test_clean_email_body.py b/tests/unit/services/test_clean_email_body.py
new file mode 100644
index 00000000..a3ee01df
--- /dev/null
+++ b/tests/unit/services/test_clean_email_body.py
@@ -0,0 +1,110 @@
+#!/usr/bin/env python3
+# Copyright (c) 2025 Patrick Motsch
+# All rights reserved.
+"""Unit tests for cleanEmailBody.
+
+Covers: HTML→text normalisation, quoted-reply removal, signature removal,
+whitespace collapse and truncation. The utility is used during Outlook
+bootstrap; buggy cleaning would leak quoted threads / signatures into every
+embedding.
+"""
+
+import os
+import sys
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
+
+from modules.serviceCenter.services.serviceKnowledge.subTextClean import (
+ cleanEmailBody,
+)
+
+
+def test_strips_html_tags_and_scripts():
+    """HTML input: visible text survives, tags and script payloads do not."""
+    # NOTE(review): markup below is reconstructed — the original literal was garbled.
+    html = (
+        "<html><body><p>Hello <b>world</b></p><br>"
+        "<script>alert('x')</script></body></html>"
+    )
+    cleaned = cleanEmailBody(html)
+    assert "Hello" in cleaned and "world" in cleaned
+    assert "<" not in cleaned
+    assert "alert" not in cleaned
+
+
+def test_strips_quoted_reply_english():
+ body = (
+ "Actual answer from me.\n\n"
+ "On Mon, 1 Jan 2024 at 10:00, Someone wrote:\n"
+ "> Original question?\n"
+ "> Second line.\n"
+ )
+ cleaned = cleanEmailBody(body)
+ assert "Actual answer" in cleaned
+ assert "Original question" not in cleaned
+ assert "wrote:" not in cleaned
+
+
+def test_strips_quoted_reply_german():
+ body = (
+ "Meine Antwort.\n\n"
+ "Am 1. Januar 2024 um 10:00 schrieb Max Muster :\n"
+ "> Ursprüngliche Frage?\n"
+ )
+ cleaned = cleanEmailBody(body)
+ assert "Meine Antwort" in cleaned
+ assert "Ursprüngliche Frage" not in cleaned
+
+
+def test_strips_signature_after_dashes():
+ body = (
+ "Kurze Nachricht.\n"
+ "\n"
+ "--\n"
+ "Max Muster\n"
+ "Vorstand, Beispiel GmbH\n"
+ )
+ cleaned = cleanEmailBody(body)
+ assert "Kurze Nachricht" in cleaned
+ assert "Beispiel GmbH" not in cleaned
+
+
+def test_strips_signature_salutation_de():
+ body = (
+ "Die eigentliche Information steht hier.\n\n"
+ "Mit freundlichen Grüßen\n"
+ "Max Muster"
+ )
+ cleaned = cleanEmailBody(body)
+ assert "eigentliche Information" in cleaned
+ assert "Max Muster" not in cleaned
+
+
+def test_truncate_to_max_chars():
+ body = "abc " * 5000
+ cleaned = cleanEmailBody(body, maxChars=200)
+ assert len(cleaned) <= 201 # includes trailing ellipsis
+
+
+def test_empty_input_returns_empty_string():
+ assert cleanEmailBody("") == ""
+ assert cleanEmailBody(None) == "" # type: ignore[arg-type]
+
+
+def test_collapses_whitespace():
+    """Runs of spaces and runs of three or more newlines must not survive cleaning."""
+    cleaned = cleanEmailBody("A  lot   of    spaces\n\n\n\nand blank lines")
+    assert "  " not in cleaned  # double space; single spaces between words are expected
+    assert "\n\n\n" not in cleaned
+
+
+if __name__ == "__main__":
+ test_strips_html_tags_and_scripts()
+ test_strips_quoted_reply_english()
+ test_strips_quoted_reply_german()
+ test_strips_signature_after_dashes()
+ test_strips_signature_salutation_de()
+ test_truncate_to_max_chars()
+ test_empty_input_returns_empty_string()
+ test_collapses_whitespace()
+ print("OK — cleanEmailBody tests passed")
diff --git a/tests/unit/services/test_connection_purge.py b/tests/unit/services/test_connection_purge.py
new file mode 100644
index 00000000..c32cb5b3
--- /dev/null
+++ b/tests/unit/services/test_connection_purge.py
@@ -0,0 +1,119 @@
+#!/usr/bin/env python3
+# Copyright (c) 2025 Patrick Motsch
+# All rights reserved.
+"""Purge tests for KnowledgeObjects.deleteFileContentIndexByConnectionId.
+
+Ensures that a `connection.revoked` event wipes every FileContentIndex + chunk
+linked to the given connectionId while leaving entries from other connections
+(or upload-files with connectionId=None) intact.
+"""
+
+import os
+import sys
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
+
+from modules.datamodels.datamodelKnowledge import FileContentIndex, ContentChunk
+from modules.interfaces.interfaceDbKnowledge import KnowledgeObjects
+
+
+class _FakeDb:
+ """Minimal in-memory stand-in for ``KnowledgeObjects.db``.
+
+ Supports just the subset of APIs that deleteFileContentIndexByConnectionId
+ touches: getRecordset(FileContentIndex|ContentChunk, ...) + recordDelete.
+ """
+
+ def __init__(self):
+ self.indexRows: dict = {}
+ self.chunks: dict = {}
+
+ def addIndex(self, row: dict) -> None:
+ self.indexRows[row["id"]] = row
+
+ def addChunk(self, row: dict) -> None:
+ self.chunks[row["id"]] = row
+
+ def getRecordset(self, modelClass, recordFilter=None, **_):
+ filter_ = recordFilter or {}
+ if modelClass is FileContentIndex:
+ rows = list(self.indexRows.values())
+ elif modelClass is ContentChunk:
+ rows = list(self.chunks.values())
+ else:
+ return []
+
+ def match(row):
+ for k, v in filter_.items():
+ if row.get(k) != v:
+ return False
+ return True
+
+ return [r for r in rows if match(r)]
+
+ def recordDelete(self, modelClass, recordId):
+ if modelClass is FileContentIndex:
+ return self.indexRows.pop(recordId, None) is not None
+ if modelClass is ContentChunk:
+ return self.chunks.pop(recordId, None) is not None
+ return False
+
+
+def _buildKnowledge():
+ """Instantiate KnowledgeObjects without triggering the real DB bootstrap."""
+ ko = KnowledgeObjects.__new__(KnowledgeObjects)
+ ko.currentUser = None
+ ko.userId = None
+ ko._scopeCache = {}
+ ko.db = _FakeDb()
+ return ko
+
+
+def test_purge_by_connection_removes_only_matching_rows():
+ ko = _buildKnowledge()
+ ko.db.addIndex({"id": "sp1", "connectionId": "cx", "mandateId": "m1", "sourceKind": "sharepoint_item"})
+ ko.db.addIndex({"id": "sp2", "connectionId": "cx", "mandateId": "m1", "sourceKind": "sharepoint_item"})
+ ko.db.addIndex({"id": "upload", "connectionId": None, "mandateId": "m1", "sourceKind": "file"})
+ ko.db.addIndex({"id": "other", "connectionId": "cy", "mandateId": "m1", "sourceKind": "outlook_message"})
+ ko.db.addChunk({"id": "c1", "fileId": "sp1"})
+ ko.db.addChunk({"id": "c2", "fileId": "sp1"})
+ ko.db.addChunk({"id": "c3", "fileId": "sp2"})
+ ko.db.addChunk({"id": "c4", "fileId": "upload"})
+ ko.db.addChunk({"id": "c5", "fileId": "other"})
+
+ result = ko.deleteFileContentIndexByConnectionId("cx")
+
+ assert result == {"indexRows": 2, "chunks": 3}
+ assert "sp1" not in ko.db.indexRows
+ assert "sp2" not in ko.db.indexRows
+ assert "upload" in ko.db.indexRows
+ assert "other" in ko.db.indexRows
+ assert set(ko.db.chunks.keys()) == {"c4", "c5"}
+
+
+def test_purge_with_empty_connection_id_is_a_noop():
+ ko = _buildKnowledge()
+ ko.db.addIndex({"id": "sp1", "connectionId": "cx"})
+ ko.db.addChunk({"id": "c1", "fileId": "sp1"})
+
+ result = ko.deleteFileContentIndexByConnectionId("")
+
+ assert result == {"indexRows": 0, "chunks": 0}
+ assert "sp1" in ko.db.indexRows
+
+
+def test_purge_unknown_connection_returns_zero():
+ ko = _buildKnowledge()
+ ko.db.addIndex({"id": "sp1", "connectionId": "cx"})
+
+ result = ko.deleteFileContentIndexByConnectionId("nope")
+
+ assert result == {"indexRows": 0, "chunks": 0}
+ assert "sp1" in ko.db.indexRows
+
+
+if __name__ == "__main__":
+ test_purge_by_connection_removes_only_matching_rows()
+ test_purge_with_empty_connection_id_is_a_noop()
+ test_purge_unknown_connection_returns_zero()
+ print("OK — connection-purge tests passed")
diff --git a/tests/unit/services/test_extraction_merge_strategy.py b/tests/unit/services/test_extraction_merge_strategy.py
new file mode 100644
index 00000000..784bb783
--- /dev/null
+++ b/tests/unit/services/test_extraction_merge_strategy.py
@@ -0,0 +1,124 @@
+#!/usr/bin/env python3
+# Copyright (c) 2025 Patrick Motsch
+# All rights reserved.
+"""Test that runExtraction preserves per-part granularity when mergeStrategy=None.
+
+The default MergeStrategy concatenates all text parts into a single ContentPart, which
+collapses multi-page documents into one blob. This destroys RAG retrieval because every
+document ends up as a single ContentChunk with a "blurred average" embedding.
+
+Ingestion pipelines (requestIngestion callers) MUST pass mergeStrategy=None to preserve
+per-page / per-section chunks.
+"""
+
+import os
+import sys
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
+
+from modules.datamodels.datamodelExtraction import (
+ ContentPart,
+ ExtractionOptions,
+ MergeStrategy,
+)
+from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
+from modules.serviceCenter.services.serviceExtraction.subRegistry import (
+ ChunkerRegistry,
+ Extractor,
+ ExtractorRegistry,
+)
+
+
+class _FakeMultiPagePdfExtractor(Extractor):
+ """Emits one text ContentPart per simulated page."""
+
+ def __init__(self, pageCount: int = 10):
+ self.pageCount = pageCount
+
+ def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
+ return mimeType == "application/pdf"
+
+ def getSupportedExtensions(self):
+ return [".pdf"]
+
+ def getSupportedMimeTypes(self):
+ return ["application/pdf"]
+
+ def extract(self, fileBytes: bytes, context):
+ return [
+ ContentPart(
+ id=f"page-{i}",
+ parentId=None,
+ label=f"page_{i + 1}",
+ typeGroup="text",
+ mimeType="text/plain",
+ data=f"Page {i + 1} content — distinct semantic anchor #{i}",
+ metadata={"pageIndex": i, "size": 64},
+ )
+ for i in range(self.pageCount)
+ ]
+
+
+def _buildRegistry(pageCount: int) -> ExtractorRegistry:
+ registry = ExtractorRegistry()
+ fake = _FakeMultiPagePdfExtractor(pageCount)
+ registry.register("application/pdf", fake)
+ registry.register("pdf", fake)
+ return registry
+
+
+def test_default_options_merge_all_text_parts_into_one():
+ """Regression safeguard: default ExtractionOptions still merges (legacy behaviour).
+
+ Non-ingestion callers (AI processing, summarization) rely on this default.
+ """
+ registry = _buildRegistry(pageCount=5)
+ extracted = runExtraction(
+ registry, ChunkerRegistry(), b"", "sample.pdf", "application/pdf",
+ ExtractionOptions(),
+ )
+ textParts = [p for p in extracted.parts if p.typeGroup == "text"]
+ assert len(textParts) == 1, (
+ f"Default options should merge all text parts into one, got {len(textParts)}"
+ )
+ assert "Page 1" in textParts[0].data and "Page 5" in textParts[0].data, (
+ "Merged text should contain content from all pages"
+ )
+ print("test_default_options_merge_all_text_parts_into_one [PASS]")
+
+
+def test_merge_none_preserves_all_text_parts():
+ """Core fix: mergeStrategy=None preserves per-page granularity for RAG ingestion."""
+ registry = _buildRegistry(pageCount=500)
+ extracted = runExtraction(
+ registry, ChunkerRegistry(), b"", "sample.pdf", "application/pdf",
+ ExtractionOptions(mergeStrategy=None),
+ )
+ textParts = [p for p in extracted.parts if p.typeGroup == "text"]
+ assert len(textParts) == 500, (
+ f"mergeStrategy=None should preserve all 500 text parts, got {len(textParts)}"
+ )
+ assert textParts[0].label == "page_1"
+ assert textParts[-1].label == "page_500"
+ print("test_merge_none_preserves_all_text_parts [PASS]")
+
+
+def test_explicit_merge_strategy_still_merges():
+ """Callers can still opt in to merging by passing an explicit MergeStrategy."""
+ registry = _buildRegistry(pageCount=3)
+ extracted = runExtraction(
+ registry, ChunkerRegistry(), b"", "sample.pdf", "application/pdf",
+ ExtractionOptions(mergeStrategy=MergeStrategy()),
+ )
+ textParts = [p for p in extracted.parts if p.typeGroup == "text"]
+ assert len(textParts) == 1, (
+ f"Explicit MergeStrategy should merge, got {len(textParts)} parts"
+ )
+ print("test_explicit_merge_strategy_still_merges [PASS]")
+
+
+if __name__ == "__main__":
+ test_default_options_merge_all_text_parts_into_one()
+ test_merge_none_preserves_all_text_parts()
+ test_explicit_merge_strategy_still_merges()
+ print("\nAll merge-strategy tests passed.")
diff --git a/tests/unit/services/test_ingestion_hash_stability.py b/tests/unit/services/test_ingestion_hash_stability.py
new file mode 100644
index 00000000..df25a4f0
--- /dev/null
+++ b/tests/unit/services/test_ingestion_hash_stability.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python3
+# Copyright (c) 2025 Patrick Motsch
+# All rights reserved.
+"""Test that _computeIngestionHash is stable across re-extractions of the same source.
+
+Extractors generate fresh contentObjectIds (uuid.uuid4()) per run. The ingestion
+hash MUST therefore be derived from content (contentType + data + order) only —
+otherwise idempotency (AC4) silently fails: every re-extraction looks "new" and
+triggers full re-embedding.
+"""
+
+import os
+import sys
+import uuid
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
+
+from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import (
+ _computeIngestionHash,
+)
+
+
+def _makeObjects(seed: str = "alpha"):
+ """Build a synthetic contentObjects list as routeDataFiles._autoIndexFile would."""
+ return [
+ {
+ "contentObjectId": str(uuid.uuid4()),
+ "contentType": "text",
+ "data": f"Page 1 of {seed}",
+ },
+ {
+ "contentObjectId": str(uuid.uuid4()),
+ "contentType": "text",
+ "data": f"Page 2 of {seed}",
+ },
+ {
+ "contentObjectId": str(uuid.uuid4()),
+ "contentType": "binary",
+ "data": "",
+ },
+ ]
+
+
+def test_hash_stable_across_uuid_regeneration():
+ """Same content + different contentObjectIds → same hash."""
+ a = _makeObjects("alpha")
+ b = _makeObjects("alpha") # identical data, fresh UUIDs
+ assert [o["contentObjectId"] for o in a] != [o["contentObjectId"] for o in b]
+ assert _computeIngestionHash(a) == _computeIngestionHash(b)
+
+
+def test_hash_changes_when_data_changes():
+ a = _makeObjects("alpha")
+ b = _makeObjects("beta")
+ assert _computeIngestionHash(a) != _computeIngestionHash(b)
+
+
+def test_hash_is_order_sensitive():
+ """Reordered pages produce a different hash (different document)."""
+ a = _makeObjects("alpha")
+ b = list(reversed(a))
+ assert _computeIngestionHash(a) != _computeIngestionHash(b)
+
+
+def test_hash_distinguishes_text_vs_binary_with_same_payload():
+ a = [{"contentObjectId": "x", "contentType": "text", "data": "hello"}]
+ b = [{"contentObjectId": "x", "contentType": "binary", "data": "hello"}]
+ assert _computeIngestionHash(a) != _computeIngestionHash(b)
+
+
+def test_hash_handles_empty_input():
+ assert _computeIngestionHash([]) == _computeIngestionHash([])
+
+
+if __name__ == "__main__":
+ test_hash_stable_across_uuid_regeneration()
+ test_hash_changes_when_data_changes()
+ test_hash_is_order_sensitive()
+ test_hash_distinguishes_text_vs_binary_with_same_payload()
+ test_hash_handles_empty_input()
+ print("OK — all 5 ingestion-hash stability tests passed")
diff --git a/tests/unit/services/test_knowledge_ingest_consumer.py b/tests/unit/services/test_knowledge_ingest_consumer.py
new file mode 100644
index 00000000..6b27a6e8
--- /dev/null
+++ b/tests/unit/services/test_knowledge_ingest_consumer.py
@@ -0,0 +1,235 @@
+#!/usr/bin/env python3
+# Copyright (c) 2025 Patrick Motsch
+# All rights reserved.
+"""Unit tests for KnowledgeIngestionConsumer event dispatch.
+
+- `connection.established` → enqueue a `connection.bootstrap` job.
+- `connection.revoked` → synchronous purge via KnowledgeObjects.
+"""
+
+import asyncio
+import os
+import sys
+import types
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
+
+from modules.serviceCenter.services.serviceKnowledge import subConnectorIngestConsumer as consumer
+
+
+def _resetRegistration(monkeypatch):
+ """Force the module-level guard to register fresh in each test."""
+ monkeypatch.setattr(consumer, "_registered", False)
+
+
+def test_onConnectionEstablished_enqueues_bootstrap(monkeypatch):
+    """A ``connection.established`` event must enqueue exactly one bootstrap job.
+
+    ``startJob`` is replaced with a recording coroutine; the assertions below
+    expect the consumer to have invoked it by the time the handler returns.
+    """
+    startedJobs = []
+
+    async def _fakeStartJob(jobType, payload, **kwargs):
+        startedJobs.append({"jobType": jobType, "payload": payload, "kwargs": kwargs})
+        return "job-1"
+
+    monkeypatch.setattr(consumer, "startJob", _fakeStartJob)
+    consumer._onConnectionEstablished(
+        connectionId="c1", authority="msft", userId="u1"
+    )
+
+    # No manual event-loop juggling here: creating, installing and closing a
+    # fresh loop drains nothing and leaves a *closed* loop as the thread's
+    # current loop, which can break later asyncio.get_event_loop() users.
+    assert len(startedJobs) == 1
+    job = startedJobs[0]
+    assert job["jobType"] == consumer.BOOTSTRAP_JOB_TYPE
+    assert job["payload"]["connectionId"] == "c1"
+    assert job["payload"]["authority"] == "msft"
+    assert job["kwargs"]["triggeredBy"] == "u1"
+
+
+def test_onConnectionEstablished_ignores_missing_id(monkeypatch):
+ called = []
+
+ async def _fakeStartJob(*a, **kw):
+ called.append(1)
+ return "x"
+
+ monkeypatch.setattr(consumer, "startJob", _fakeStartJob)
+ consumer._onConnectionEstablished(connectionId="", authority="msft")
+ assert called == []
+
+
+def test_onConnectionRevoked_runs_sync_purge(monkeypatch):
+ class _FakeKnowledge:
+ def __init__(self):
+ self.calls = []
+
+ def deleteFileContentIndexByConnectionId(self, cid):
+ self.calls.append(cid)
+ return {"indexRows": 2, "chunks": 5}
+
+ fakeKnow = _FakeKnowledge()
+
+ def _fakeGetInterface(_user=None):
+ return fakeKnow
+
+ monkeypatch.setattr(consumer, "getKnowledgeInterface", _fakeGetInterface)
+ consumer._onConnectionRevoked(
+ connectionId="c1", authority="msft", userId="u1", reason="disconnected"
+ )
+ assert fakeKnow.calls == ["c1"]
+
+
+def test_onConnectionRevoked_ignores_missing_id(monkeypatch):
+ seen = []
+
+ def _fakeGetInterface(_user=None):
+ class _K:
+ def deleteFileContentIndexByConnectionId(self, cid):
+ seen.append(cid)
+ return {"indexRows": 0, "chunks": 0}
+
+ return _K()
+
+ monkeypatch.setattr(consumer, "getKnowledgeInterface", _fakeGetInterface)
+ consumer._onConnectionRevoked(connectionId="")
+ assert seen == []
+
+
+def test_bootstrap_job_skips_unsupported_authority(monkeypatch):
+ async def _run():
+ result = await consumer._bootstrapJobHandler(
+ {"payload": {"connectionId": "c1", "authority": "slack"}},
+ lambda *_: None,
+ )
+ return result
+
+ result = asyncio.run(_run())
+ assert result["skipped"] is True
+ assert result["authority"] == "slack"
+ assert result["reason"] == "unsupported_authority"
+
+
+def test_bootstrap_job_dispatches_msft_parts(monkeypatch):
+ calls = {"sp": 0, "ol": 0}
+
+ async def _fakeSp(connectionId, progressCb=None):
+ calls["sp"] += 1
+ return {"indexed": 1}
+
+ async def _fakeOl(connectionId, progressCb=None):
+ calls["ol"] += 1
+ return {"indexed": 2}
+
+ fakeSharepoint = types.ModuleType("subConnectorSyncSharepoint")
+ fakeSharepoint.bootstrapSharepoint = _fakeSp
+ fakeOutlook = types.ModuleType("subConnectorSyncOutlook")
+ fakeOutlook.bootstrapOutlook = _fakeOl
+ monkeypatch.setitem(
+ sys.modules,
+ "modules.serviceCenter.services.serviceKnowledge.subConnectorSyncSharepoint",
+ fakeSharepoint,
+ )
+ monkeypatch.setitem(
+ sys.modules,
+ "modules.serviceCenter.services.serviceKnowledge.subConnectorSyncOutlook",
+ fakeOutlook,
+ )
+
+ async def _run():
+ return await consumer._bootstrapJobHandler(
+ {"payload": {"connectionId": "c1", "authority": "msft"}},
+ lambda *_: None,
+ )
+
+ result = asyncio.run(_run())
+ assert calls == {"sp": 1, "ol": 1}
+ assert result["sharepoint"] == {"indexed": 1}
+ assert result["outlook"] == {"indexed": 2}
+
+
+def test_bootstrap_job_dispatches_google_parts(monkeypatch):
+ calls = {"gd": 0, "gm": 0}
+
+ async def _fakeGd(connectionId, progressCb=None):
+ calls["gd"] += 1
+ return {"indexed": 7}
+
+ async def _fakeGm(connectionId, progressCb=None):
+ calls["gm"] += 1
+ return {"indexed": 11}
+
+ fakeGdrive = types.ModuleType("subConnectorSyncGdrive")
+ fakeGdrive.bootstrapGdrive = _fakeGd
+ fakeGmail = types.ModuleType("subConnectorSyncGmail")
+ fakeGmail.bootstrapGmail = _fakeGm
+ monkeypatch.setitem(
+ sys.modules,
+ "modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGdrive",
+ fakeGdrive,
+ )
+ monkeypatch.setitem(
+ sys.modules,
+ "modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGmail",
+ fakeGmail,
+ )
+
+ async def _run():
+ return await consumer._bootstrapJobHandler(
+ {"payload": {"connectionId": "c1", "authority": "google"}},
+ lambda *_: None,
+ )
+
+ result = asyncio.run(_run())
+ assert calls == {"gd": 1, "gm": 1}
+ assert result["drive"] == {"indexed": 7}
+ assert result["gmail"] == {"indexed": 11}
+
+
+def test_bootstrap_job_dispatches_clickup_part(monkeypatch):
+ calls = {"cu": 0}
+
+ async def _fakeCu(connectionId, progressCb=None):
+ calls["cu"] += 1
+ return {"indexed": 4}
+
+ fakeClickup = types.ModuleType("subConnectorSyncClickup")
+ fakeClickup.bootstrapClickup = _fakeCu
+ monkeypatch.setitem(
+ sys.modules,
+ "modules.serviceCenter.services.serviceKnowledge.subConnectorSyncClickup",
+ fakeClickup,
+ )
+
+ async def _run():
+ return await consumer._bootstrapJobHandler(
+ {"payload": {"connectionId": "c1", "authority": "clickup"}},
+ lambda *_: None,
+ )
+
+ result = asyncio.run(_run())
+ assert calls == {"cu": 1}
+ assert result["clickup"] == {"indexed": 4}
+
+
+if __name__ == "__main__":
+ # Usable without pytest fixtures for a quick smoke run.
+ class _MP:
+ def __init__(self):
+ self.undos = []
+
+ def setattr(self, target, name_or_value, value=None):
+ if value is None:
+ # target is an object, name_or_value is value → no, original signature
+ raise SystemExit("use pytest monkeypatch in CLI")
+ self.undos.append((target, name_or_value, getattr(target, name_or_value)))
+ setattr(target, name_or_value, value)
+
+ def setitem(self, mapping, key, value):
+ self.undos.append((mapping, key, mapping.get(key)))
+ mapping[key] = value
+
+ print("Run via pytest: pytest tests/unit/services/test_knowledge_ingest_consumer.py")
diff --git a/tests/unit/services/test_p1d_consent_prefs.py b/tests/unit/services/test_p1d_consent_prefs.py
new file mode 100644
index 00000000..e00b0dfc
--- /dev/null
+++ b/tests/unit/services/test_p1d_consent_prefs.py
@@ -0,0 +1,298 @@
+#!/usr/bin/env python3
+"""Unit tests for P1d: consent gating, preference parsing, and walker behaviour.
+
+Tests
+-----
+1. Bootstrap runner skips when ``knowledgeIngestionEnabled=False``.
+2. ``loadConnectionPrefs`` returns safe defaults when preferences are absent.
+3. ``loadConnectionPrefs`` maps all §2.6 keys correctly from a full prefs dict.
+4. Gmail walker passes ``neutralize=True`` and ``mailContentDepth`` to IngestionJob.
+5. Gmail walker produces only a header content-object when depth="metadata".
+6. ClickUp walker skips description when scope="titles".
+"""
+
+from __future__ import annotations
+
+import asyncio
+import os
+import sys
+import types
+import unittest
+from typing import Any, Dict, Optional
+from unittest.mock import AsyncMock, MagicMock, patch
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
+
+
+# ---------------------------------------------------------------------------
+# 1. Bootstrap runner consent gate
+# ---------------------------------------------------------------------------
+
+class TestBootstrapConsentGate(unittest.TestCase):
+    """_bootstrapJobHandler must no-op when knowledgeIngestionEnabled is False."""
+
+    def _makeJob(self, connectionId="c-test", authority="google"):
+        """Build the job dict shape these tests pass to the handler."""
+        return {"payload": {"connectionId": connectionId, "authority": authority}}
+
+    def _makeConn(self, enabled: bool):
+        """Return a mock connection whose consent flag is ``enabled``."""
+        conn = MagicMock()
+        conn.knowledgeIngestionEnabled = enabled
+        return conn
+
+    def test_skips_when_consent_disabled(self):
+        """Consent off: result is flagged skipped with reason ``consent_disabled``."""
+        from modules.serviceCenter.services.serviceKnowledge import subConnectorIngestConsumer as sut
+
+        fake_root = MagicMock()
+        fake_root.getUserConnectionById.return_value = self._makeConn(False)
+
+        with patch("modules.interfaces.interfaceDbApp.getRootInterface", return_value=fake_root):
+            # asyncio.run owns its loop; get_event_loop() raises once an earlier asyncio.run cleared it.
+            result = asyncio.run(
+                sut._bootstrapJobHandler(self._makeJob(), lambda *a: None)
+            )
+
+        assert result.get("skipped") is True
+        assert result.get("reason") == "consent_disabled"
+        fake_root.getUserConnectionById.assert_called_once_with("c-test")
+
+    def test_proceeds_when_consent_enabled(self):
+        """When consent is enabled, the handler must not return a top-level skip result."""
+        from modules.serviceCenter.services.serviceKnowledge import subConnectorIngestConsumer as sut
+
+        fake_root = MagicMock()
+        fake_root.getUserConnectionById.return_value = self._makeConn(True)
+
+        # Patch the Google walkers so the handler doesn't do real I/O.
+        with (
+            patch("modules.interfaces.interfaceDbApp.getRootInterface", return_value=fake_root),
+            patch(
+                "modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGdrive.bootstrapGdrive",
+                new=AsyncMock(return_value={"indexed": 0}),
+            ),
+            patch(
+                "modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGmail.bootstrapGmail",
+                new=AsyncMock(return_value={"indexed": 0}),
+            ),
+        ):
+            result = asyncio.run(
+                sut._bootstrapJobHandler(self._makeJob(authority="google"), lambda *a: None)
+            )
+
+        assert result.get("skipped") is not True
+        assert result.get("authority") == "google"
+
+
+# ---------------------------------------------------------------------------
+# 2 + 3. loadConnectionPrefs
+# ---------------------------------------------------------------------------
+
+class TestLoadConnectionPrefs(unittest.TestCase):
+ def _makeConn(self, prefs: Optional[Dict[str, Any]]):
+ conn = MagicMock()
+ conn.knowledgePreferences = prefs
+ return conn
+
+ def _mockRoot(self, prefs):
+ root = MagicMock()
+ root.getUserConnectionById.return_value = self._makeConn(prefs)
+ return root
+
+ def test_returns_safe_defaults_when_prefs_none(self):
+ from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import (
+ ConnectionIngestionPrefs,
+ loadConnectionPrefs,
+ )
+
+ with patch("modules.interfaces.interfaceDbApp.getRootInterface", return_value=self._mockRoot(None)):
+ prefs = loadConnectionPrefs("x")
+
+ assert prefs.neutralizeBeforeEmbed is False
+ assert prefs.mailContentDepth == "full"
+ assert prefs.mailIndexAttachments is False
+ assert prefs.maxAgeDays == 90
+ assert prefs.clickupScope == "title_description"
+ assert prefs.gmailEnabled is True
+ assert prefs.driveEnabled is True
+
+ def test_maps_all_keys(self):
+ from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs
+
+ raw = {
+ "neutralizeBeforeEmbed": True,
+ "mailContentDepth": "metadata",
+ "mailIndexAttachments": True,
+ "filesIndexBinaries": False,
+ "clickupScope": "with_comments",
+ "maxAgeDays": 30,
+ "surfaceToggles": {
+ "google": {"gmail": False, "drive": True},
+ "msft": {"sharepoint": False, "outlook": True},
+ },
+ }
+
+ with patch("modules.interfaces.interfaceDbApp.getRootInterface", return_value=self._mockRoot(raw)):
+ prefs = loadConnectionPrefs("x")
+
+ assert prefs.neutralizeBeforeEmbed is True
+ assert prefs.mailContentDepth == "metadata"
+ assert prefs.mailIndexAttachments is True
+ assert prefs.filesIndexBinaries is False
+ assert prefs.clickupScope == "with_comments"
+ assert prefs.maxAgeDays == 30
+ assert prefs.gmailEnabled is False
+ assert prefs.driveEnabled is True
+ assert prefs.sharepointEnabled is False
+ assert prefs.outlookEnabled is True
+
+ def test_invalid_depth_falls_back_to_default(self):
+ from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs
+
+ raw = {"mailContentDepth": "everything_please"}
+
+ with patch("modules.interfaces.interfaceDbApp.getRootInterface", return_value=self._mockRoot(raw)):
+ prefs = loadConnectionPrefs("x")
+
+ assert prefs.mailContentDepth == "full"
+
+
+# ---------------------------------------------------------------------------
+# 4 + 5. Gmail walker: neutralize forwarding and mailContentDepth handling
+# ---------------------------------------------------------------------------
+
+class TestGmailWalkerPrefs(unittest.TestCase):
+    """Gmail walker behaviour under the neutralize / mailContentDepth preferences."""
+
+    def _make_message(self, *, subject="Test", snippet="hello", body_text="full body"):
+        """Build a message dict whose text/plain part holds ``body_text`` base64url-encoded."""
+        import base64
+        encoded = base64.urlsafe_b64encode(body_text.encode()).decode()
+        return {
+            "id": "msg-1",
+            "historyId": "h-42",
+            "threadId": "t-1",
+            "snippet": snippet,
+            "payload": {
+                "mimeType": "multipart/alternative",
+                "headers": [
+                    {"name": "Subject", "value": subject},
+                    {"name": "From", "value": "alice@example.com"},
+                    {"name": "To", "value": "bob@example.com"},
+                    {"name": "Date", "value": "Mon, 20 Apr 2026 10:00:00 +0000"},
+                ],
+                "parts": [{"mimeType": "text/plain", "body": {"data": encoded}}],
+            },
+        }
+
+    def test_neutralize_flag_forwarded(self):
+        """``limits.neutralize=True`` must show up on the job given to requestIngestion."""
+        from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGmail import (
+            GmailBootstrapLimits,
+            _ingestMessage,
+            GmailBootstrapResult,
+        )
+        from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
+
+        captured_jobs = []
+
+        # Stand-in for the knowledge service: records each submitted job.
+        async def fake_requestIngestion(job: IngestionJob):
+            captured_jobs.append(job)
+            return MagicMock(status="indexed", error=None)
+
+        ks = MagicMock()
+        ks.requestIngestion = fake_requestIngestion
+
+        limits = GmailBootstrapLimits(neutralize=True, mailContentDepth="full")
+        result = GmailBootstrapResult(connectionId="c-1")
+
+        # asyncio.run creates and closes its own loop; asyncio.get_event_loop()
+        # is deprecated without a current loop and raises after a prior asyncio.run.
+        asyncio.run(
+            _ingestMessage(
+                googleGetFn=AsyncMock(return_value={}),
+                knowledgeService=ks,
+                connectionId="c-1",
+                mandateId="",
+                userId="u-1",
+                labelId="INBOX",
+                message=self._make_message(),
+                limits=limits,
+                result=result,
+                progressCb=None,
+            )
+        )
+
+        assert len(captured_jobs) == 1
+        assert captured_jobs[0].neutralize is True
+
+    def test_metadata_depth_yields_only_header(self):
+        """``mailContentDepth="metadata"`` yields only the ``header`` content object."""
+        from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGmail import _buildContentObjects
+
+        message = self._make_message(snippet="hi", body_text="should be excluded")
+        parts = _buildContentObjects(message, maxBodyChars=4000, mailContentDepth="metadata")
+        ids = [p["contentObjectId"] for p in parts]
+        assert ids == ["header"]
+
+    def test_snippet_depth_yields_header_and_snippet(self):
+        """``mailContentDepth="snippet"`` yields header and snippet but no body."""
+        from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGmail import _buildContentObjects
+
+        message = self._make_message(snippet="hi", body_text="should be excluded")
+        parts = _buildContentObjects(message, maxBodyChars=4000, mailContentDepth="snippet")
+        ids = [p["contentObjectId"] for p in parts]
+        assert "header" in ids
+        assert "snippet" in ids
+        assert "body" not in ids
+
+
+# ---------------------------------------------------------------------------
+# 6. ClickUp walker respects clickupScope="titles"
+# ---------------------------------------------------------------------------
+
+class TestClickupWalkerScope(unittest.TestCase):
+ def _make_task(self):
+ return {
+ "id": "task-1",
+ "name": "Ship feature X",
+ "date_updated": "1713888000000",
+ "description": "This should be omitted",
+ "text_content": "Also omitted",
+ "status": {"status": "open"},
+ "assignees": [],
+ "tags": [],
+ "list": {"name": "Backlog"},
+ "folder": {},
+ "space": {"name": "Engineering"},
+ }
+
+ def test_titles_scope_omits_description(self):
+ from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncClickup import (
+ ClickupBootstrapLimits,
+ _buildContentObjects,
+ )
+
+ limits = ClickupBootstrapLimits(clickupScope="titles")
+ parts = _buildContentObjects(self._make_task(), limits)
+ ids = [p["contentObjectId"] for p in parts]
+ assert ids == ["header"]
+ assert "description" not in ids
+
+ def test_with_description_scope_includes_description(self):
+ from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncClickup import (
+ ClickupBootstrapLimits,
+ _buildContentObjects,
+ )
+
+ limits = ClickupBootstrapLimits(clickupScope="title_description")
+ parts = _buildContentObjects(self._make_task(), limits)
+ ids = [p["contentObjectId"] for p in parts]
+ assert "header" in ids
+ assert "description" in ids
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/tests/unit/workflows/test_automation2_graphUtils.py b/tests/unit/workflows/test_automation2_graphUtils.py
index ff5df2cc..5ea7126a 100644
--- a/tests/unit/workflows/test_automation2_graphUtils.py
+++ b/tests/unit/workflows/test_automation2_graphUtils.py
@@ -66,6 +66,17 @@ class TestResolveParameterReferences:
value = "Land: {{n1.country}}"
assert resolveParameterReferences(value, node_outputs) == "Land: CH"
+ def test_legacy_string_template_loop_current_item_nested(self):
+ """Same shape as executionEngine sets on loop node id during body iteration."""
+ node_outputs = {
+ "loop93": {
+ "currentItem": {"subject": "Hello", "body": {"content": "World"}},
+ "currentIndex": 0,
+ },
+ }
+ value = "Subj: {{loop93.currentItem.subject}} Body: {{loop93.currentItem.body.content}}"
+ assert resolveParameterReferences(value, node_outputs) == "Subj: Hello Body: World"
+
class TestWildcardIteration:
"""Phase-4 typed Bindings-Resolver: ``*`` segment iterates over a list.