Merge pull request #152 from valueonag/int

Int
This commit is contained in:
Patrick Motsch 2026-05-03 22:26:04 +02:00 committed by GitHub
commit 2f87fae44d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
128 changed files with 11513 additions and 2786 deletions

13
app.py
View file

@ -405,6 +405,16 @@ async def lifespan(app: FastAPI):
except Exception as e:
logger.warning(f"BackgroundJob recovery failed (non-critical): {e}")
# Subscribe knowledge ingestion to connection lifecycle events so OAuth
# connect/disconnect reliably trigger bootstrap/purge.
try:
from modules.serviceCenter.services.serviceKnowledge.subConnectorIngestConsumer import (
registerKnowledgeIngestionConsumer,
)
registerKnowledgeIngestionConsumer()
except Exception as e:
logger.warning(f"KnowledgeIngestionConsumer registration failed (non-critical): {e}")
yield
# --- Stop Managers ---
@ -672,6 +682,9 @@ app.include_router(navigationRouter)
from modules.routes.routeWorkflowDashboard import router as workflowDashboardRouter
app.include_router(workflowDashboardRouter)
from modules.routes.routeAutomationWorkspace import router as automationWorkspaceRouter
app.include_router(automationWorkspaceRouter)
# ============================================================================
# PLUG&PLAY FEATURE ROUTERS
# Dynamically load routers from feature containers in modules/features/

View file

@ -210,6 +210,9 @@ class ClickupListsAdapter(ServiceAdapter):
data = await self._svc.getTask(task_id)
if isinstance(data, dict) and data.get("error"):
return json.dumps(data).encode("utf-8")
returnedId = data.get("id", "") if isinstance(data, dict) else ""
if returnedId and returnedId != task_id:
logger.warning(f"ClickUp download: requested task_id={task_id} but API returned id={returnedId}")
payload = json.dumps(data, indent=2).encode("utf-8")
return DownloadResult(data=payload, fileName=f"task-{task_id}.json", mimeType="application/json")

View file

@ -126,6 +126,11 @@ def _stripGraphBase(url: str) -> str:
def _graphItemToExternalEntry(item: Dict[str, Any], basePath: str = "") -> ExternalEntry:
isFolder = "folder" in item
# Graph exposes the driveItem content hash as ``eTag`` (quoted) or
# ``cTag``; we normalise to a "revision" string so callers can use it as a
# stable ``contentVersion`` for idempotent ingestion without re-downloading
# file bytes.
revision = item.get("eTag") or item.get("cTag")
return ExternalEntry(
name=item.get("name", ""),
path=f"{basePath}/{item.get('name', '')}" if basePath else item.get("name", ""),
@ -137,6 +142,9 @@ def _graphItemToExternalEntry(item: Dict[str, Any], basePath: str = "") -> Exter
"id": item.get("id"),
"webUrl": item.get("webUrl"),
"childCount": item.get("folder", {}).get("childCount") if isFolder else None,
"revision": revision,
"lastModifiedDateTime": item.get("lastModifiedDateTime"),
"parentReference": item.get("parentReference", {}),
},
)
@ -167,21 +175,36 @@ class SharepointAdapter(_GraphApiMixin, ServiceAdapter):
return await self._discoverSites()
if not folderPath or folderPath == "/":
endpoint = f"sites/{siteId}/drive/root/children"
endpoint: Optional[str] = f"sites/{siteId}/drive/root/children?$top=200"
else:
cleanPath = folderPath.lstrip("/")
endpoint = f"sites/{siteId}/drive/root:/{cleanPath}:/children"
endpoint = f"sites/{siteId}/drive/root:/{cleanPath}:/children?$top=200"
result = await self._graphGet(endpoint)
if "error" in result:
logger.warning(f"SharePoint browse failed: {result['error']}")
return []
# Follow @odata.nextLink until a hard cap is reached so large libraries
# are fully enumerated (required for bootstrap). Per-page size uses
# Graph's max supported value to minimise round-trips.
effectiveLimit = int(limit) if limit is not None else None
items: List[Dict[str, Any]] = []
hardCap = 5000
while endpoint and len(items) < hardCap:
result = await self._graphGet(endpoint)
if "error" in result:
logger.warning(f"SharePoint browse failed: {result['error']}")
break
for raw in result.get("value", []) or []:
items.append(raw)
if effectiveLimit is not None and len(items) >= effectiveLimit:
break
if effectiveLimit is not None and len(items) >= effectiveLimit:
break
nextLink = result.get("@odata.nextLink")
endpoint = _stripGraphBase(nextLink) if nextLink else None
entries = [_graphItemToExternalEntry(item, path) for item in result.get("value", [])]
entries = [_graphItemToExternalEntry(item, path) for item in items]
if filter:
entries = [e for e in entries if _matchFilter(e, filter)]
if limit is not None:
entries = entries[: max(1, int(limit))]
if effectiveLimit is not None:
entries = entries[: max(1, effectiveLimit)]
return entries
async def _discoverSites(self) -> List[ExternalEntry]:

View file

@ -162,6 +162,7 @@ class AiCallOptions(BaseModel):
# Provider filtering (from UI multiselect or automation config)
allowedProviders: Optional[List[str]] = Field(default=None, description="List of allowed AI providers to use (empty = all RBAC-permitted)")
allowedModels: Optional[List[str]] = Field(default=None, description="Whitelist of allowed model names (AND-filter with allowedProviders). None/empty = all allowed.")
class AiCallRequest(BaseModel):

View file

@ -110,11 +110,13 @@ class DocumentReferenceList(BaseModel):
# docItem:documentId
references.append(DocumentItemReference(documentId=parts[0]))
# Unknown format - skip or log warning
else:
# Try to parse as simple string (backward compatibility)
# Assume it's a label if it doesn't match known patterns
if refStr:
if not refStr:
continue
import re
if re.match(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', refStr, re.I):
references.append(DocumentItemReference(documentId=refStr))
else:
references.append(DocumentListReference(label=refStr))
return cls(references=references)
@ -153,9 +155,12 @@ def coerceDocumentReferenceList(value: Any) -> DocumentReferenceList:
return coerceDocumentReferenceList(value[innerKey])
docId = value.get("documentId") or value.get("id")
if docId:
docIdStr = str(docId)
if docIdStr.startswith("docItem:") or docIdStr.startswith("docList:"):
return DocumentReferenceList.from_string_list([docIdStr])
return DocumentReferenceList(references=[
DocumentItemReference(
documentId=str(docId),
documentId=docIdStr,
fileName=value.get("fileName") or value.get("name"),
)
])
@ -178,10 +183,15 @@ def coerceDocumentReferenceList(value: Any) -> DocumentReferenceList:
continue
docId = item.get("documentId") or item.get("id")
if docId:
references.append(DocumentItemReference(
documentId=str(docId),
fileName=item.get("fileName") or item.get("name"),
))
docIdStr = str(docId)
if docIdStr.startswith("docItem:") or docIdStr.startswith("docList:"):
parsed = DocumentReferenceList.from_string_list([docIdStr])
references.extend(parsed.references)
else:
references.append(DocumentItemReference(
documentId=docIdStr,
fileName=item.get("fileName") or item.get("name"),
))
elif item.get("label"):
references.append(DocumentListReference(
label=str(item["label"]),

View file

@ -95,7 +95,14 @@ class ExtractionOptions(BaseModel):
imageQuality: int = Field(default=85, ge=1, le=100, description="Image quality (1-100)")
# Merging strategy
mergeStrategy: MergeStrategy = Field(default_factory=MergeStrategy, description="Strategy for merging extraction results")
mergeStrategy: Optional[MergeStrategy] = Field(
default_factory=MergeStrategy,
description=(
"Strategy for merging extraction results. Pass None to skip merging entirely "
"(required for per-chunk ingestion pipelines like RAG, where per-page/per-section "
"granularity must be preserved for embedding)."
),
)
# Optional chunking parameters (for backward compatibility)
chunkAllowed: Optional[bool] = Field(default=None, description="Whether chunking is allowed")

View file

@ -1,82 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""FileFolder: hierarchical folder structure for file organization."""
from typing import Optional
from pydantic import BaseModel, Field
from modules.datamodels.datamodelBase import PowerOnModel
from modules.shared.i18nRegistry import i18nModel
import uuid
@i18nModel("Dateiordner")
class FileFolder(PowerOnModel):
    """Hierarchical folder for file management."""

    # NOTE(review): the ``json_schema_extra`` dicts below look like UI/form
    # metadata (label, widget type, read-only / required flags) and the
    # ``fk_target`` entries like foreign-key hints -- confirm against whatever
    # consumes these schemas.

    # Primary key; a fresh UUID4 string is generated when no id is supplied.
    id: str = Field(
        default_factory=lambda: str(uuid.uuid4()),
        description="Primary key",
        json_schema_extra={"label": "ID", "frontend_type": "text", "frontend_readonly": True, "frontend_required": False},
    )
    # Folder name; the only field without a default, so it must be provided.
    name: str = Field(
        description="Folder name",
        json_schema_extra={"label": "Name", "frontend_type": "text", "frontend_readonly": False, "frontend_required": True},
    )
    # Self-reference to another FileFolder; None marks a root folder.
    parentId: Optional[str] = Field(
        default=None,
        description="Parent folder ID (null = root)",
        json_schema_extra={
            "label": "Uebergeordneter Ordner",
            "frontend_type": "text",
            "frontend_readonly": False,
            "frontend_required": False,
            "fk_target": {"db": "poweron_management", "table": "FileFolder", "labelField": "name"},
        },
    )
    # Mandate this folder is associated with (optional, defaults to None).
    mandateId: Optional[str] = Field(
        default=None,
        description="Mandate context",
        json_schema_extra={
            "label": "Mandanten-ID",
            "frontend_type": "text",
            "frontend_readonly": True,
            "frontend_required": False,
            "fk_target": {"db": "poweron_app", "table": "Mandate", "labelField": "label"},
        },
    )
    # Feature instance this folder is associated with (optional, defaults to None).
    featureInstanceId: Optional[str] = Field(
        default=None,
        description="Feature instance context",
        json_schema_extra={
            "label": "Feature-Instanz-ID",
            "frontend_type": "text",
            "frontend_readonly": True,
            "frontend_required": False,
            "fk_target": {"db": "poweron_app", "table": "FeatureInstance", "labelField": "label"},
        },
    )
    # Visibility scope; the four values offered to the UI are listed in
    # ``frontend_options``. The type is a plain ``str``, so nothing in this
    # declaration restricts the value to those four.
    scope: str = Field(
        default="personal",
        description="Data visibility scope: personal, featureInstance, mandate, global. Inherited by files in this folder.",
        json_schema_extra={
            "label": "Sichtbarkeit",
            "frontend_type": "select",
            "frontend_readonly": False,
            "frontend_required": False,
            "frontend_options": [
                {"value": "personal", "label": "Persönlich"},
                {"value": "featureInstance", "label": "Feature-Instanz"},
                {"value": "mandate", "label": "Mandant"},
                {"value": "global", "label": "Global"},
            ],
        },
    )
    # Flag marking the folder's files for neutralization before AI processing;
    # off by default.
    neutralize: bool = Field(
        default=False,
        description="Whether files in this folder should be neutralized before AI processing. Inherited by new/moved files.",
        json_schema_extra={
            "label": "Neutralisieren",
            "frontend_type": "checkbox",
            "frontend_readonly": False,
            "frontend_required": False,
        },
    )

View file

@ -10,6 +10,69 @@ import uuid
import base64
@i18nModel("Ordner")
class FileFolder(PowerOnModel):
    """Persistent file folder in the management-DB context (RBAC as for FileItem)."""

    # NOTE(review): the ``json_schema_extra`` dicts below look like UI/form
    # metadata (label, widget type, read-only / required flags) and the
    # ``fk_target`` entries like foreign-key hints -- confirm against whatever
    # consumes these schemas.

    # Primary key; a fresh UUID4 string is generated when no id is supplied.
    id: str = Field(
        default_factory=lambda: str(uuid.uuid4()),
        description="Primary key",
        json_schema_extra={"label": "ID", "frontend_type": "text", "frontend_readonly": True, "frontend_required": False},
    )
    # Display name; the only field without a default, so it must be provided.
    name: str = Field(
        description="Display name of the folder",
        json_schema_extra={"label": "Name", "frontend_type": "text", "frontend_readonly": False, "frontend_required": True},
    )
    # Self-reference to another FileFolder; None (or empty) marks a root folder.
    parentId: Optional[str] = Field(
        default=None,
        description="Parent folder id; empty or None for root",
        json_schema_extra={
            "label": "Uebergeordneter Ordner",
            "frontend_type": "text",
            "frontend_readonly": False,
            "frontend_required": False,
            "fk_target": {"db": "poweron_management", "table": "FileFolder", "labelField": "name"},
        },
    )
    # NOTE(review): typed Optional[str] but defaults to "" rather than None,
    # unlike parentId above -- confirm the empty-string default is intentional.
    mandateId: Optional[str] = Field(
        default="",
        description="ID of the mandate this folder belongs to",
        json_schema_extra={
            "label": "Mandant",
            "frontend_type": "text",
            "frontend_readonly": True,
            "frontend_required": False,
            "fk_target": {"db": "poweron_app", "table": "Mandate", "labelField": "label"},
        },
    )
    # NOTE(review): same Optional[str]-with-"" default as mandateId -- confirm.
    featureInstanceId: Optional[str] = Field(
        default="",
        description="ID of the feature instance this folder belongs to",
        json_schema_extra={
            "label": "Feature-Instanz",
            "frontend_type": "text",
            "frontend_readonly": True,
            "frontend_required": False,
            "fk_target": {"db": "poweron_app", "table": "FeatureInstance", "labelField": "label"},
        },
    )
    # Visibility scope; the four values offered to the UI are listed in
    # ``frontend_options``. The type is a plain ``str``, so nothing in this
    # declaration restricts the value to those four.
    scope: str = Field(
        default="personal",
        description="Data visibility scope: personal, featureInstance, mandate, global",
        json_schema_extra={"label": "Sichtbarkeit", "frontend_type": "select", "frontend_readonly": False, "frontend_required": False, "frontend_options": [
            {"value": "personal", "label": "Persönlich"},
            {"value": "featureInstance", "label": "Feature-Instanz"},
            {"value": "mandate", "label": "Mandant"},
            {"value": "global", "label": "Global"},
        ]},
    )
    # Flag marking the folder's files for neutralization before AI processing;
    # off by default.
    neutralize: bool = Field(
        default=False,
        description="Whether files in this folder should be neutralized before AI processing",
        json_schema_extra={"label": "Neutralisieren", "frontend_type": "checkbox", "frontend_readonly": False, "frontend_required": False},
    )
@i18nModel("Datei")
class FileItem(PowerOnModel):
"""Metadaten einer gespeicherten Datei."""
@ -44,6 +107,17 @@ class FileItem(PowerOnModel):
"fk_target": {"db": "poweron_app", "table": "FeatureInstance", "labelField": "label"},
},
)
folderId: Optional[str] = Field(
default=None,
description="ID of the folder containing this file (if any)",
json_schema_extra={
"label": "Ordner",
"frontend_type": "text",
"frontend_readonly": False,
"frontend_required": False,
"fk_target": {"db": "poweron_management", "table": "FileFolder", "labelField": "name"},
},
)
mimeType: str = Field(
description="MIME type of the file",
json_schema_extra={"label": "MIME-Typ", "frontend_type": "text", "frontend_readonly": True, "frontend_required": False},
@ -68,17 +142,6 @@ class FileItem(PowerOnModel):
description="Tags for categorization and search",
json_schema_extra={"label": "Tags", "frontend_type": "tags", "frontend_readonly": False, "frontend_required": False},
)
folderId: Optional[str] = Field(
default=None,
description="ID of the parent folder",
json_schema_extra={
"label": "Ordner-ID",
"frontend_type": "text",
"frontend_readonly": False,
"frontend_required": False,
"fk_target": {"db": "poweron_management", "table": "FileFolder", "labelField": "name"},
},
)
description: Optional[str] = Field(
default=None,
description="User-provided description of the file",

View file

@ -6,7 +6,7 @@ Unified JSON document schema and helpers used by both generation prompts and ren
This defines a single canonical template and the supported section types.
"""
from typing import List
from typing import List, Literal, TypedDict
# Canonical list of supported section types across the system
supportedSectionTypes: List[str] = [
@ -18,6 +18,21 @@ supportedSectionTypes: List[str] = [
"image",
]
# Imported here (not at the top of the file) so this block is self-contained.
from typing import get_args

# Closed set of inline run kinds. This alias is the single source of truth for
# both the ``InlineRun.type`` annotation and the runtime list below, so the two
# can no longer drift apart when a kind is added or removed.
InlineRunType = Literal["text", "image", "link", "bold", "italic", "code"]


class InlineRun(TypedDict, total=False):
    """Single inline content run. Every paragraph/cell/list-item is a List[InlineRun].

    All keys are optional (``total=False``); which ones are meaningful depends
    on ``type`` as noted per key.
    """

    type: InlineRunType
    value: str  # text content (for text/bold/italic/code/link-label)
    fileId: str  # for type=image: reference to FileItem
    base64Data: str  # for type=image: resolved base64 (post-processing)
    mimeType: str  # for type=image: e.g. "image/png"
    widthPt: int  # for type=image: optional render width
    href: str  # for type=link: URL target


# Runtime-checkable list of the run kinds, in the order declared in InlineRunType.
supportedInlineRunTypes: List[str] = list(get_args(InlineRunType))
# Canonical JSON template used for AI generation (documents array + sections)
# This template is used for STRUCTURE generation - sections have empty elements arrays.
# For content generation, elements arrays will be populated later.

View file

@ -90,6 +90,16 @@ class FileContentIndex(PowerOnModel):
description="Data visibility scope: personal, featureInstance, mandate, global",
json_schema_extra={"label": "Sichtbarkeit"},
)
sourceKind: str = Field(
default="file",
description="Origin of the indexed content: file, sharepoint_item, outlook_message, outlook_attachment, ...",
json_schema_extra={"label": "Quellenart"},
)
connectionId: Optional[str] = Field(
default=None,
description="UserConnection ID if this index entry originates from an external connector",
json_schema_extra={"label": "Connection-ID"},
)
neutralizationStatus: Optional[str] = Field(
default=None,
description="Neutralization status: completed, failed, skipped, None = not required",

View file

@ -13,6 +13,42 @@ import math
T = TypeVar('T')
# ---------------------------------------------------------------------------
# Table Grouping models
# ---------------------------------------------------------------------------
class TableGroupNode(BaseModel):
    """
    A single node in a user-defined group tree for a FormGeneratorTable.
    Items belong to exactly one group (no multi-membership).
    Groups can be nested to arbitrary depth via subGroups.
    """
    # Identifier of this group node (required).
    id: str
    # Name of the group (required).
    name: str
    # Ids of the items assigned directly to this group; empty by default.
    itemIds: List[str] = Field(default_factory=list)
    # Child groups. The element type is a string forward reference because the
    # class is still being defined at this point.
    subGroups: List['TableGroupNode'] = Field(default_factory=list)
    # NOTE(review): presumably the sort position among sibling groups -- confirm.
    order: int = 0
    # Expanded/collapsed flag; groups start expanded.
    isExpanded: bool = True


# Rebuild the model schema now that the class exists, so the self-referential
# 'TableGroupNode' annotation on subGroups can be resolved.
TableGroupNode.model_rebuild()
class TableGrouping(BaseModel):
    """
    Persisted grouping configuration for one (user, contextKey) pair.
    Stored in table_groupings in poweron_app (auto-created).
    contextKey convention: API path without /api/ prefix and without trailing slash.
    Examples: "connections", "prompts", "admin/users", "trustee/{instanceId}/documents"
    """
    # Identifier of this grouping record (required).
    id: str
    # User the grouping belongs to (required).
    userId: str
    # Table/endpoint the grouping applies to; see the convention in the docstring.
    contextKey: str
    # Top-level groups of the tree; nested groups hang off each node's subGroups.
    rootGroups: List[TableGroupNode] = Field(default_factory=list)
    # NOTE(review): presumably a Unix timestamp of the last save -- confirm unit.
    updatedAt: Optional[float] = None
class SortField(BaseModel):
"""
Single sort field configuration.
@ -24,12 +60,23 @@ class SortField(BaseModel):
class PaginationParams(BaseModel):
"""
Complete pagination state including page, sorting, and filters.
Grouping extensions (both optional omit when not using grouping):
groupId Scope the request to items belonging to this group.
The backend resolves it to an itemIds IN-filter before
applying normal pagination/search/filter logic.
Also applied for mode=ids and mode=filterValues so that
bulk-select and filter-dropdowns respect the group scope.
saveGroupTree If present the backend persists this tree for the current
(user, contextKey) pair *before* fetching, then returns
the confirmed tree in the response groupTree field.
Omit on every request that does not change the group tree.
"""
page: int = Field(ge=1, description="Current page number (1-based)")
pageSize: int = Field(ge=1, le=1000, description="Number of items per page")
sort: List[SortField] = Field(default_factory=list, description="List of sort fields in priority order")
filters: Optional[Dict[str, Any]] = Field(
default=None,
default=None,
description="""Filter criteria dictionary. Supports:
- General search: {"search": "text"} - searches across all text fields (case-insensitive)
- Field-specific filters:
@ -38,6 +85,14 @@ class PaginationParams(BaseModel):
- Supported operators: equals/eq, contains, startsWith, endsWith, gt, gte, lt, lte, in, notIn
- Multiple filters are combined with AND logic"""
)
groupId: Optional[str] = Field(
default=None,
description="Scope request to items of this group (resolved server-side to itemIds IN-filter)",
)
saveGroupTree: Optional[List[Dict[str, Any]]] = Field(
default=None,
description="If set, persist this group tree before fetching (optimistic save)",
)
class PaginationRequest(BaseModel):
@ -74,10 +129,19 @@ class PaginationMetadata(BaseModel):
class PaginatedResponse(BaseModel, Generic[T]):
    """
    Response containing paginated data and metadata.
    groupTree is included when the endpoint supports table grouping and the
    current user has a saved group tree for the requested contextKey.
    It is None when grouping is not configured for the endpoint or the user
    has not created any groups yet. Frontend must treat None as an empty tree.
    """
    # Required: the items of the current page, typed by the generic parameter T.
    items: List[T] = Field(..., description="Array of items for current page")
    # Required (no default) but nullable: callers must pass it explicitly,
    # using None when pagination was not applied.
    pagination: Optional[PaginationMetadata] = Field(..., description="Pagination metadata (None if pagination not applied)")
    # Optional group tree; defaults to None, so responses that never set it
    # keep validating.
    groupTree: Optional[List[TableGroupNode]] = Field(
        default=None,
        description="Current group tree for this (user, contextKey) pair — None if no grouping configured",
    )
    # Lets T be a type pydantic has no built-in validation for.
    model_config = ConfigDict(arbitrary_types_allowed=True)
def normalize_pagination_dict(pagination_dict: Dict[str, Any]) -> Dict[str, Any]:
    """
    Normalize pagination dictionary to handle frontend variations.

    Moves top-level "search" field into filters if present.
    Grouping fields (groupId, saveGroupTree) are passed through as-is.

    The input is never modified: both the top-level dict and the nested
    "filters" dict are copied before anything is written to them.

    Args:
        pagination_dict: Raw pagination dictionary from frontend

    Returns:
        Normalized pagination dictionary ready for PaginationParams parsing.
        A falsy input (None or an empty dict) is returned unchanged.
    """
    if not pagination_dict:
        return pagination_dict

    # Copy the top level so the caller's dict is left untouched.
    normalized = dict(pagination_dict)

    # Ensure required fields have sensible defaults.
    normalized.setdefault("page", 1)
    normalized.setdefault("pageSize", 25)

    # Move top-level "search" into filters if present.
    if "search" in normalized:
        # dict() above is a shallow copy, so "filters" is still the caller's
        # object; copy it too before adding the search term.
        existingFilters = normalized.get("filters")
        filters = {} if existingFilters is None else dict(existingFilters)
        filters["search"] = normalized.pop("search")
        normalized["filters"] = filters

    # groupId / saveGroupTree are valid PaginationParams fields; they pass
    # through unchanged and are validated by Pydantic.
    return normalized

View file

@ -475,7 +475,23 @@ class UserConnection(PowerOnModel):
description="OAuth scopes granted for this connection",
json_schema_extra={"frontend_type": "list", "frontend_readonly": True, "frontend_required": False, "label": "Gewährte Berechtigungen"},
)
knowledgeIngestionEnabled: bool = Field(
default=False,
description="Whether the user has consented to knowledge ingestion for this connection",
json_schema_extra={"frontend_type": "boolean", "frontend_readonly": False, "frontend_required": False, "label": "Wissensdatenbank aktiv"},
)
knowledgePreferences: Optional[Dict[str, Any]] = Field(
default=None,
description=(
"Per-connection knowledge ingestion preferences. schemaVersion=1 keys: "
"neutralizeBeforeEmbed (bool), mailContentDepth (metadata|snippet|full), "
"mailIndexAttachments (bool), filesIndexBinaries (bool), mimeAllowlist (list[str]), "
"clickupScope (titles|title_description|with_comments), "
"surfaceToggles (dict per authority), maxAgeDays (int)."
),
json_schema_extra={"frontend_type": "json", "frontend_readonly": False, "frontend_required": False, "label": "Wissenspräferenzen"},
)
@computed_field
@property
def connectionReference(self) -> str:

View file

@ -174,14 +174,26 @@ async def indexSessionData(
for c in chunks
]
await knowledgeService.indexFile(
fileId=syntheticFileId,
fileName=f"coaching-session-{sessionId[:8]}",
mimeType="application/x-coaching-session",
userId=userId,
featureInstanceId=featureInstanceId,
mandateId=mandateId,
contentObjects=contentObjects,
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
await knowledgeService.requestIngestion(
IngestionJob(
sourceKind="coaching_session",
sourceId=syntheticFileId,
fileName=f"coaching-session-{sessionId[:8]}",
mimeType="application/x-coaching-session",
userId=userId,
featureInstanceId=featureInstanceId,
mandateId=mandateId,
contentObjects=contentObjects,
provenance={
"lane": "feature",
"feature": "commcoach",
"sessionId": sessionId,
"contextId": contextId,
"messageCount": len(messages or []),
},
)
)
logger.info(f"Successfully indexed coaching session {sessionId} ({len(chunks)} chunks)")
except Exception as e:

View file

@ -72,7 +72,7 @@ class AutoWorkflow(PowerOnModel):
},
)
featureInstanceId: str = Field(
description="Feature instance ID",
description="Feature instance ID (GE owner instance / RBAC scope)",
json_schema_extra={
"frontend_type": "text",
"frontend_readonly": True,
@ -81,6 +81,17 @@ class AutoWorkflow(PowerOnModel):
"fk_target": {"db": "poweron_app", "table": "FeatureInstance", "labelField": "label"},
},
)
targetFeatureInstanceId: Optional[str] = Field(
default=None,
description="Target feature instance for execution data scope. NULL for templates, mandatory for non-templates.",
json_schema_extra={
"frontend_type": "select",
"frontend_readonly": False,
"frontend_required": False,
"label": "Ziel-Instanz",
"fk_target": {"db": "poweron_app", "table": "FeatureInstance", "labelField": "label"},
},
)
label: str = Field(
description="User-friendly workflow name",
json_schema_extra={"frontend_type": "text", "frontend_required": True, "label": "Bezeichnung"},

View file

@ -217,6 +217,8 @@ class GraphicalEditorObjects:
data["id"] = str(uuid.uuid4())
data["mandateId"] = self.mandateId
data["featureInstanceId"] = self.featureInstanceId
if not data.get("targetFeatureInstanceId") and not data.get("isTemplate"):
data["targetFeatureInstanceId"] = self.featureInstanceId
if "active" not in data or data.get("active") is None:
data["active"] = True
data["invocations"] = normalize_invocations_list(data.get("invocations"))

View file

@ -3,6 +3,15 @@
from modules.shared.i18nRegistry import t
# Parameter definitions shared by the AI nodes: each node's own "parameters"
# list is concatenated with this list.
_AI_COMMON_PARAMS = [
    {
        "name": "requireNeutralization",
        "type": "boolean",
        "required": False,
        "frontendType": "checkbox",
        "default": False,
        "description": t("Eingaben fuer diesen Call neutralisieren"),
    },
    {
        "name": "allowedModels",
        "type": "array",
        "required": False,
        "frontendType": "modelMultiSelect",
        "default": [],
        "description": t("Erlaubte LLM-Modelle (leer = alle erlaubten)"),
    },
]
AI_NODES = [
{
"id": "ai.prompt",
@ -10,16 +19,21 @@ AI_NODES = [
"label": t("Prompt"),
"description": t("Prompt eingeben und KI führt aus"),
"parameters": [
{"name": "aiPrompt", "type": "string", "required": True, "frontendType": "textarea",
{"name": "aiPrompt", "type": "string", "required": True, "frontendType": "templateTextarea",
"description": t("KI-Prompt")},
{"name": "resultType", "type": "string", "required": False, "frontendType": "select",
"frontendOptions": {"options": ["txt", "json", "md", "csv", "xml", "html", "pdf", "docx", "xlsx", "pptx", "png", "jpg"]},
"description": t("Ausgabeformat"), "default": "txt"},
{"name": "documentList", "type": "string", "required": False, "frontendType": "hidden",
"description": t("Dokumentenliste (via Wire oder DataRef)"), "default": ""},
{"name": "documentList", "type": "DocumentList", "required": False, "frontendType": "dataRef",
"description": t("Dokumentenliste (Upstream-Output binden)"), "default": ""},
{"name": "context", "type": "string", "required": False, "frontendType": "dataRef",
"description": t("Kontextdaten fuer den Prompt (Upstream-Output binden)"), "default": ""},
{"name": "documentTheme", "type": "string", "required": False, "frontendType": "select",
"frontendOptions": {"options": ["general", "finance", "legal", "technical", "hr"]},
"description": t("Dokument-Thema (Style-Hinweis fuer den Renderer)"), "default": "general"},
{"name": "simpleMode", "type": "boolean", "required": False, "frontendType": "checkbox",
"description": t("Einfacher Modus"), "default": True},
],
] + _AI_COMMON_PARAMS,
"inputs": 1,
"outputs": 1,
"inputPorts": {0: {"accepts": [
@ -38,7 +52,7 @@ AI_NODES = [
"parameters": [
{"name": "prompt", "type": "string", "required": True, "frontendType": "textarea",
"description": t("Recherche-Anfrage")},
],
] + _AI_COMMON_PARAMS,
"inputs": 1,
"outputs": 1,
"inputPorts": {0: {"accepts": ["Transit"]}},
@ -53,12 +67,12 @@ AI_NODES = [
"label": t("Dokument zusammenfassen"),
"description": t("Dokumentinhalt zusammenfassen"),
"parameters": [
{"name": "documentList", "type": "string", "required": True, "frontendType": "hidden",
"description": t("Dokumentenliste (via Wire oder DataRef)"), "default": ""},
{"name": "documentList", "type": "DocumentList", "required": True, "frontendType": "dataRef",
"description": t("Dokumentenliste (Upstream-Output binden)"), "default": ""},
{"name": "summaryLength", "type": "string", "required": False, "frontendType": "select",
"frontendOptions": {"options": ["brief", "medium", "detailed"]},
"description": t("Kurz, mittel oder ausführlich"), "default": "medium"},
],
] + _AI_COMMON_PARAMS,
"inputs": 1,
"outputs": 1,
"inputPorts": {0: {"accepts": ["DocumentList", "Transit"]}},
@ -73,11 +87,11 @@ AI_NODES = [
"label": t("Dokument übersetzen"),
"description": t("Dokument in Zielsprache übersetzen"),
"parameters": [
{"name": "documentList", "type": "string", "required": True, "frontendType": "hidden",
"description": t("Dokumentenliste (via Wire oder DataRef)"), "default": ""},
{"name": "documentList", "type": "DocumentList", "required": True, "frontendType": "dataRef",
"description": t("Dokumentenliste (Upstream-Output binden)"), "default": ""},
{"name": "targetLanguage", "type": "string", "required": True, "frontendType": "text",
"description": t("Zielsprache (z.B. de, en, French)")},
],
] + _AI_COMMON_PARAMS,
"inputs": 1,
"outputs": 1,
"inputPorts": {0: {"accepts": ["DocumentList", "Transit"]}},
@ -92,12 +106,12 @@ AI_NODES = [
"label": t("Dokument konvertieren"),
"description": t("Dokument in anderes Format konvertieren"),
"parameters": [
{"name": "documentList", "type": "string", "required": True, "frontendType": "hidden",
"description": t("Dokumentenliste (via Wire oder DataRef)"), "default": ""},
{"name": "documentList", "type": "DocumentList", "required": True, "frontendType": "dataRef",
"description": t("Dokumentenliste (Upstream-Output binden)"), "default": ""},
{"name": "targetFormat", "type": "string", "required": True, "frontendType": "select",
"frontendOptions": {"options": ["docx", "pdf", "xlsx", "csv", "txt", "html", "json", "md"]},
"description": t("Zielformat")},
],
] + _AI_COMMON_PARAMS,
"inputs": 1,
"outputs": 1,
"inputPorts": {0: {"accepts": ["DocumentList", "Transit"]}},
@ -114,7 +128,7 @@ AI_NODES = [
"parameters": [
{"name": "prompt", "type": "string", "required": True, "frontendType": "textarea",
"description": t("Generierungs-Prompt")},
],
] + _AI_COMMON_PARAMS,
"inputs": 1,
"outputs": 1,
"inputPorts": {0: {"accepts": ["Transit"]}},
@ -134,7 +148,7 @@ AI_NODES = [
{"name": "resultType", "type": "string", "required": False, "frontendType": "select",
"frontendOptions": {"options": ["py", "js", "ts", "html", "java", "cpp", "txt", "json", "csv", "xml"]},
"description": t("Datei-Endung der erzeugten Code-Datei"), "default": "py"},
],
] + _AI_COMMON_PARAMS,
"inputs": 1,
"outputs": 1,
"inputPorts": {0: {"accepts": ["Transit"]}},
@ -154,7 +168,7 @@ AI_NODES = [
"description": t("Konsolidierungsmodus"), "default": "summarize"},
{"name": "prompt", "type": "string", "required": False, "frontendType": "textarea",
"description": t("Optionaler Prompt für die Konsolidierung"), "default": ""},
],
] + _AI_COMMON_PARAMS,
"inputs": 1,
"outputs": 1,
"inputPorts": {0: {"accepts": ["AggregateResult", "Transit"]}},

View file

@ -62,7 +62,7 @@ EMAIL_NODES = [
{"name": "connectionReference", "type": "string", "required": True, "frontendType": "userConnection",
"frontendOptions": {"authority": "msft"},
"description": t("E-Mail-Konto")},
{"name": "context", "type": "string", "required": False, "frontendType": "textarea",
{"name": "context", "type": "string", "required": False, "frontendType": "templateTextarea",
"description": t("Kontext / Brief-Beschreibung für die KI-Komposition"), "default": ""},
{"name": "to", "type": "string", "required": False, "frontendType": "text",
"description": t("Empfänger (komma-separiert, optional für Entwurf)"), "default": ""},

View file

@ -83,7 +83,7 @@ PORT_TYPE_CATALOG: Dict[str, PortSchema] = {
PortField(name="listId", type="str", description="ClickUp-Listen-ID"),
PortField(name="name", type="str", required=False, description="Listenname"),
PortField(name="spaceId", type="str", required=False, description="Space-ID"),
PortField(name="folderId", type="str", required=False, description="Ordner-ID"),
PortField(name="groupId", type="str", required=False, description="Gruppen-ID für die Gruppierungszuordnung"),
PortField(name="connection", type="ConnectionRef", required=False,
description="ClickUp-Verbindung"),
]),

View file

@ -111,6 +111,44 @@ def _validateInstanceAccess(instanceId: str, context: RequestContext) -> str:
return str(instance.mandateId) if instance.mandateId else ""
def _validateTargetInstance(
    workflowData: Dict[str, Any],
    ownerInstanceId: str,
    context: RequestContext,
) -> None:
    """Check that the caller may point a workflow at its target feature instance.

    No further checks are performed when any of the following holds:
    - ``workflowData["isTemplate"]`` is truthy,
    - ``targetFeatureInstanceId`` is missing or empty,
    - ``targetFeatureInstanceId`` equals ``ownerInstanceId``.

    Otherwise the target feature instance must exist and the requesting user
    must hold an enabled FeatureAccess record for it.

    NOTE(review): a non-template workflow with an empty
    targetFeatureInstanceId is accepted silently here -- confirm whether it
    should be rejected instead.

    Args:
        workflowData: Workflow payload (or a dict carrying at least
            ``targetFeatureInstanceId``).
        ownerInstanceId: Feature instance that owns the workflow.
        context: Request context; ``context.user.id`` identifies the caller.

    Raises:
        HTTPException: 400 when the target instance does not exist,
            403 when the caller has no enabled access to it.
    """
    if workflowData.get("isTemplate"):
        return

    requestedTargetId = workflowData.get("targetFeatureInstanceId")
    if not requestedTargetId or requestedTargetId == ownerInstanceId:
        return

    from modules.interfaces.interfaceDbApp import getRootInterface

    appInterface = getRootInterface()
    if not appInterface.getFeatureInstance(requestedTargetId):
        raise HTTPException(
            status_code=400,
            detail=routeApiMsg("targetFeatureInstanceId refers to a non-existent feature instance"),
        )

    access = appInterface.getFeatureAccess(str(context.user.id), requestedTargetId)
    if not (access and access.enabled):
        raise HTTPException(
            status_code=403,
            detail=routeApiMsg("Access denied to target feature instance"),
        )
@router.get("/{instanceId}/node-types")
@limiter.limit("60/minute")
def get_node_types(
@ -318,9 +356,12 @@ async def post_execute(
workflowId = body.get("workflowId")
req_nodes = graph.get("nodes") or []
workflow_for_envelope: Optional[Dict[str, Any]] = None
targetFeatureInstanceId: Optional[str] = None
if workflowId and not str(workflowId).startswith("transient-"):
iface = getGraphicalEditorInterface(context.user, mandateId, instanceId)
workflow_for_envelope = iface.getWorkflow(workflowId)
if workflow_for_envelope:
targetFeatureInstanceId = workflow_for_envelope.get("targetFeatureInstanceId")
if workflowId and len(req_nodes) == 0:
iface = getGraphicalEditorInterface(context.user, mandateId, instanceId)
wf = iface.getWorkflow(workflowId)
@ -328,10 +369,18 @@ async def post_execute(
graph = wf["graph"]
logger.info("graphicalEditor execute: loaded graph from workflow %s", workflowId)
workflow_for_envelope = wf
targetFeatureInstanceId = wf.get("targetFeatureInstanceId")
if not workflowId:
import uuid
workflowId = f"transient-{uuid.uuid4().hex[:12]}"
logger.info("graphicalEditor execute: using transient workflowId=%s", workflowId)
if targetFeatureInstanceId and targetFeatureInstanceId != instanceId:
_validateTargetInstance(
{"targetFeatureInstanceId": targetFeatureInstanceId},
instanceId,
context,
)
nodes_count = len(graph.get("nodes") or [])
connections_count = len(graph.get("connections") or [])
logger.info(
@ -363,6 +412,7 @@ async def post_execute(
automation2_interface=ge_interface,
run_envelope=run_env,
label=_wfLabel,
targetFeatureInstanceId=targetFeatureInstanceId,
)
logger.info(
"graphicalEditor execute result: success=%s error=%s nodeOutputs_keys=%s failedNode=%s paused=%s",
@ -1371,6 +1421,7 @@ def create_workflow(
) -> dict:
"""Create a new workflow."""
mandateId = _validateInstanceAccess(instanceId, context)
_validateTargetInstance(body, instanceId, context)
iface = getGraphicalEditorInterface(context.user, mandateId, instanceId)
created = iface.createWorkflow(body)
return created
@ -1388,6 +1439,11 @@ def update_workflow(
"""Update a workflow."""
mandateId = _validateInstanceAccess(instanceId, context)
iface = getGraphicalEditorInterface(context.user, mandateId, instanceId)
existing = iface.getWorkflow(workflowId)
if not existing:
raise HTTPException(status_code=404, detail=routeApiMsg("Workflow not found"))
merged = {**existing, **body}
_validateTargetInstance(merged, instanceId, context)
updated = iface.updateWorkflow(workflowId, body)
if not updated:
raise HTTPException(status_code=404, detail=routeApiMsg("Workflow not found"))

View file

@ -361,6 +361,17 @@ QUICK_ACTIONS = [
# The placeholder {{featureInstanceId}} is replaced by _copyTemplateWorkflows.
# ---------------------------------------------------------------------------
_FINANCE_STYLE_HINT = (
"\n\nWenn du ein Dokument erstellst, verwende einen professionellen Finanz-Stil:\n"
"- Schriftart: Calibri\n"
"- Primaerfarbe: #1F3864 (Dunkelblau)\n"
"- Akzentfarbe: #2980B9\n"
"- Tabellen mit dunklem Header (#1F3864, weisse Schrift)\n"
"- Konservatives, seriöses Layout\n"
"Nutze den style-Parameter von renderDocument um diese Vorgaben umzusetzen."
)
def _buildAnalysisWorkflowGraph(prompt: str) -> Dict[str, Any]:
"""Build a standard analysis graph: trigger -> refreshAccountingData -> ai.prompt."""
return {
@ -370,8 +381,9 @@ def _buildAnalysisWorkflowGraph(prompt: str) -> Dict[str, Any]:
"parameters": {"featureInstanceId": "{{featureInstanceId}}", "forceRefresh": False}, "position": {"x": 250, "y": 0}},
{"id": "analyse", "type": "ai.prompt", "label": "Analyse", "_method": "ai", "_action": "process",
"parameters": {
"aiPrompt": prompt,
"aiPrompt": prompt + _FINANCE_STYLE_HINT,
"context": {"type": "ref", "nodeId": "refresh", "path": ["data", "accountingData"]},
"requireNeutralization": False,
"simpleMode": False,
}, "position": {"x": 500, "y": 0}},
],
@ -440,15 +452,33 @@ TEMPLATE_WORKFLOWS = [
{"id": "analyse", "type": "ai.prompt", "label": "Budget-Analyse", "_method": "ai", "_action": "process",
"parameters": {
"aiPrompt": (
"Fuehre einen Budget-Soll/Ist-Vergleich durch.\n"
"Die Budget-Datei (Excel) wurde als Dokument uebergeben. "
"Die aktuellen Buchhaltungsdaten sind im Kontext verfuegbar.\n"
"1. Lies die Soll-Werte aus dem uebergebenen Budget-Dokument\n"
"2. Vergleiche sie mit den Ist-Werten aus der Buchhaltung pro Konto\n"
"3. Berechne die Abweichung (absolut und prozentual)\n"
"4. Erstelle ein Abweichungs-Chart (Balkendiagramm: Soll vs. Ist pro Konto)\n"
"5. Markiere kritische Abweichungen (>10%) und gib eine kurze Einschaetzung"
"Fuehre einen Budget-Soll/Ist-Vergleich durch und liefere EIN Excel-Dokument "
"mit folgender Struktur:\n\n"
"1. Tabelle \"Konten-Vergleich\" -- EINE Tabelle, EINE Zeile pro Konto:\n"
" Spalten: Konto-Nr | Konto-Name | Soll | Ist | Abweichung absolut | "
"Abweichung % | Status (OK / Warnung / Kritisch).\n"
"2. EINE Visualisierung \"Soll vs. Ist gesamt\" -- ein einziges "
"Balkendiagramm UNTER der Tabelle, das ALLE Konten in einer Grafik "
"gegenueberstellt (gruppierte Balken: Soll und Ist je Konto).\n"
"3. Kurzer Management-Summary-Absatz (3-5 Saetze) UNTER dem Chart "
"mit den 3 groessten Abweichungen (>10%) und einer fachlichen "
"Einschaetzung.\n\n"
"Verwende die uebergebene Budget-Datei als Soll-Quelle und die im "
"Kontext bereitgestellten Buchhaltungsdaten als Ist-Quelle.\n"
"WICHTIG: Erstelle KEINEN separaten Chart pro Konto. Nur EIN "
"Uebersichts-Chart ueber alle Konten ist gewuenscht.\n\n"
"Hinweis: Das documentTheme ist 'finance'. Wenn du ein Dokument erstellst, "
"verwende einen professionellen Finanz-Stil:\n"
"- Schriftart: Calibri\n"
"- Primaerfarbe: #1F3864 (Dunkelblau)\n"
"- Akzentfarbe: #2980B9\n"
"- Tabellen mit dunklem Header (#1F3864, weisse Schrift)\n"
"- Konservatives, seriöses Layout\n"
"Nutze den style-Parameter von renderDocument um diese Vorgaben umzusetzen."
),
"resultType": "xlsx",
"documentTheme": "finance",
"requireNeutralization": False,
"documentList": {"type": "ref", "nodeId": "trigger", "path": ["payload", "documentList"]},
"context": {"type": "ref", "nodeId": "refresh", "path": ["data", "accountingData"]},
"simpleMode": False,

View file

@ -2,8 +2,8 @@
# All rights reserved.
"""Workspace feature data models — WorkspaceUserSettings."""
from typing import Optional
from pydantic import BaseModel, Field
from typing import List, Optional
from pydantic import Field
from modules.datamodels.datamodelBase import PowerOnModel
from modules.shared.i18nRegistry import i18nModel
import uuid
@ -52,3 +52,18 @@ class WorkspaceUserSettings(PowerOnModel):
description="Max agent rounds override (None = instance default)",
json_schema_extra={"label": "Max. Agenten-Runden", "frontend_type": "number", "frontend_readonly": False, "frontend_required": False},
)
requireNeutralization: bool = Field(
default=False,
description="Default neutralization setting for this user",
json_schema_extra={"label": "Neutralisierung", "frontend_type": "checkbox", "frontend_readonly": False, "frontend_required": False},
)
allowedProviders: List[str] = Field(
default_factory=list,
description="Allowed AI providers (empty = all permitted by RBAC)",
json_schema_extra={"label": "Erlaubte Provider", "frontend_type": "multiselect", "frontend_readonly": False, "frontend_required": False},
)
allowedModels: List[str] = Field(
default_factory=list,
description="Allowed AI models (empty = all permitted)",
json_schema_extra={"label": "Erlaubte Modelle", "frontend_type": "modelMultiSelect", "frontend_readonly": False, "frontend_required": False},
)

View file

@ -110,6 +110,7 @@ class WorkspaceInputRequest(BaseModel):
workflowId: Optional[str] = Field(default=None, description="Continue existing workflow")
userLanguage: str = Field(default="en", description="User language code")
allowedProviders: List[str] = Field(default_factory=list, description="Restrict AI to these providers")
allowedModels: List[str] = Field(default_factory=list, description="Restrict AI to these models")
requireNeutralization: Optional[bool] = Field(default=None, description="Per-request neutralization override")
@ -635,6 +636,7 @@ async def streamWorkspaceStart(
userLanguage=userInput.userLanguage,
instanceConfig=instanceConfig,
allowedProviders=userInput.allowedProviders,
allowedModels=userInput.allowedModels,
requireNeutralization=userInput.requireNeutralization,
billingFeatureCode=wsBillingFeatureCode,
)
@ -692,6 +694,7 @@ async def _runWorkspaceAgent(
userLanguage: str = "en",
instanceConfig: Dict[str, Any] = None,
allowedProviders: List[str] = None,
allowedModels: List[str] = None,
requireNeutralization: Optional[bool] = None,
billingFeatureCode: Optional[str] = None,
):
@ -715,6 +718,9 @@ async def _runWorkspaceAgent(
logger.info(f"Workspace agent: allowedProviders={allowedProviders}")
else:
logger.debug("Workspace agent: no allowedProviders in request")
if allowedModels:
aiService.services.allowedModels = allowedModels
logger.info(f"Workspace agent: allowedModels={allowedModels}")
if requireNeutralization is not None:
ctx.requireNeutralization = requireNeutralization
@ -1202,7 +1208,7 @@ async def patchWorkspaceWorkflowAttachments(
# ---------------------------------------------------------------------------
# File and folder list endpoints
# File endpoints
# ---------------------------------------------------------------------------
@router.get("/{instanceId}/files")
@ -1210,7 +1216,6 @@ async def patchWorkspaceWorkflowAttachments(
async def listWorkspaceFiles(
request: Request,
instanceId: str = Path(...),
folderId: Optional[str] = Query(None),
tags: Optional[str] = Query(None),
search: Optional[str] = Query(None),
context: RequestContext = Depends(getRequestContext),
@ -1265,30 +1270,6 @@ async def getFileContent(
return Response(content=content, media_type=mimeType)
@router.get("/{instanceId}/folders")
@limiter.limit("300/minute")
async def listWorkspaceFolders(
    request: Request,
    instanceId: str = Path(...),
    parentId: Optional[str] = Query(None),
    context: RequestContext = Depends(getRequestContext),
):
    """List workspace folders below ``parentId`` for the given instance.

    Access to the instance is validated first. Folder lookup is best-effort:
    when the chat service fails, the failure is logged and an empty folder
    list is returned instead of an error response.

    Returns:
        JSONResponse with a single ``folders`` key (list, possibly empty).
    """
    _mandateId, _ = _validateInstanceAccess(instanceId, context)
    try:
        from modules.serviceCenter import getService
        from modules.serviceCenter.context import ServiceCenterContext

        ctx = ServiceCenterContext(
            user=context.user,
            mandate_id=_mandateId or "",
            feature_instance_id=instanceId,
        )
        chatService = getService("chat", ctx)
        folders = chatService.listFolders(parentId=parentId)
        return JSONResponse({"folders": folders or []})
    except Exception as e:
        # Deliberately best-effort, but no longer silent: an empty list caused
        # by a failure must be distinguishable from "no folders" in the logs.
        logger.warning(f"listWorkspaceFolders failed for instance {instanceId}: {e}")
        return JSONResponse({"folders": []})
@router.get("/{instanceId}/datasources")
@limiter.limit("300/minute")
async def listWorkspaceDataSources(
@ -2139,6 +2120,76 @@ async def updateGeneralSettings(
return await getGeneralSettings(request, instanceId, context)
# =========================================================================
# User-level AI settings (neutralisation, providers, models)
# =========================================================================
@router.get("/{instanceId}/user-settings")
@limiter.limit("120/minute")
async def getWorkspaceUserSettings(
    request: Request,
    instanceId: str = Path(...),
    context: RequestContext = Depends(getRequestContext),
):
    """Return the calling user's workspace AI settings.

    When no settings record exists yet for the user, one is saved with only
    the identifying fields (userId, mandateId, featureInstanceId) and the
    saved record is returned.

    Returns:
        JSONResponse with ``requireNeutralization``, ``allowedProviders`` and
        ``allowedModels``.
    """
    _mandateId, _ = _validateInstanceAccess(instanceId, context)
    wsInterface = _getWorkspaceInterface(context, instanceId)
    currentUserId = str(context.user.id)

    record = wsInterface.getWorkspaceUserSettings(currentUserId)
    if not record:
        # First access: persist a record carrying only the identifying fields.
        record = wsInterface.saveWorkspaceUserSettings({
            "userId": currentUserId,
            "mandateId": str(context.mandateId) if context.mandateId else "",
            "featureInstanceId": instanceId,
        })

    return JSONResponse({
        "requireNeutralization": record.requireNeutralization,
        "allowedProviders": record.allowedProviders,
        "allowedModels": record.allowedModels,
    })
@router.put("/{instanceId}/user-settings")
@limiter.limit("120/minute")
async def putWorkspaceUserSettings(
    request: Request,
    instanceId: str = Path(...),
    body: dict = Body(...),
    context: RequestContext = Depends(getRequestContext),
):
    """Persist the calling user's workspace AI settings.

    Only the keys present in ``body`` (``requireNeutralization``,
    ``allowedProviders``, ``allowedModels``) are forwarded to the save call;
    ``requireNeutralization`` is coerced with ``bool()``.

    Returns:
        JSONResponse with the three settings as stored by the save call.
    """
    _mandateId, _ = _validateInstanceAccess(instanceId, context)
    wsInterface = _getWorkspaceInterface(context, instanceId)

    payload = {
        "userId": str(context.user.id),
        "mandateId": str(context.mandateId) if context.mandateId else "",
        "featureInstanceId": instanceId,
    }
    if "requireNeutralization" in body:
        payload["requireNeutralization"] = bool(body["requireNeutralization"])
    # The list-valued settings are passed through unchanged when supplied.
    for listKey in ("allowedProviders", "allowedModels"):
        if listKey in body:
            payload[listKey] = body[listKey]

    stored = wsInterface.saveWorkspaceUserSettings(payload)
    return JSONResponse({
        "requireNeutralization": stored.requireNeutralization,
        "allowedProviders": stored.allowedProviders,
        "allowedModels": stored.allowedModels,
    })
# =========================================================================
# RAG / Knowledge — anonymised instance statistics (presentation / KPIs)
# =========================================================================

View file

@ -0,0 +1,198 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Lightweight Bootstrap-Telemetrie fuer entfernte Migrationsroutinen.

When an idempotent bootstrap migration (e.g. ``_migrateAndDropSysAdminRole``)
is removed from the boot path, a theoretical edge case (old DB restore, manual
INSERT) could bring legacy data back into the system. So that this does not go
unnoticed, ``initBootstrap`` calls ``runLegacyDataChecks`` once after all init
steps have completed -- it logs a WARN when leftovers are found.
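
Design principles:
- The ``_check*`` routines are read-only. Exception:
  ``_backfillTargetFeatureInstanceId`` writes ``targetFeatureInstanceId`` on
  AutoWorkflow rows that still lack it.
- Guarded per process by the ``_alreadyRan`` flag, so repeated boots/reloads in
  the same process run the checks only once.
- Each check issues only a small number of recordset queries; exceptions are
  logged as WARN and not re-raised, so telemetry never crashes the boot.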
"""
from __future__ import annotations
import logging
from typing import Any
from modules.connectors.connectorDbPostgre import DatabaseConnector
from modules.datamodels.datamodelRbac import Role
from modules.datamodels.datamodelUam import Mandate
from modules.shared.mandateNameUtils import isValidMandateName
logger = logging.getLogger(__name__)
_alreadyRan: bool = False
def runLegacyDataChecks(db: DatabaseConnector) -> None:
    """Run the legacy-data checks once per process.

    Each check logs a WARN when data that used to be handled by a removed
    migration routine is still present. After the checks, the
    targetFeatureInstanceId backfill for AutoWorkflow rows is executed.
    Subsequent calls in the same process return immediately.

    Intended call site: the end of ``initBootstrap``.

    Args:
        db: Database connector passed to each ``_check*`` routine.
    """
    global _alreadyRan
    if _alreadyRan:
        return
    # Flag is set before the checks run, so a re-entrant call is a no-op.
    _alreadyRan = True

    for check in (
        _checkMandateDescription,
        _checkMandateSlugRules,
        _checkLegacyRootMandate,
        _checkSysadminRole,
    ):
        check(db)
    _backfillTargetFeatureInstanceId()
def _safe(checkName: str, fn) -> Any:
try:
return fn()
except Exception as exc:
logger.warning(
"Legacy-data telemetry check '%s' failed: %s: %s",
checkName, type(exc).__name__, exc,
)
return None
def _checkMandateDescription(db: DatabaseConnector) -> None:
    """Warn about Mandate rows that have a description but no label.

    Read-only: fetches all Mandate rows and logs one warning with the count
    and up to five offending IDs. Failures are absorbed by ``_safe``.
    """
    def _scan() -> None:
        offendingIds = []
        for row in db.getRecordset(Mandate):
            if row.get("description") and not row.get("label"):
                offendingIds.append(row.get("id"))
        if not offendingIds:
            return
        logger.warning(
            "Legacy-data check: %d Mandate row(s) still have description "
            "but empty label (removed migration: _migrateMandateDescriptionToLabel). "
            "Run scripts/script_db_audit_legacy_state.py for details. IDs: %s",
            len(offendingIds), offendingIds[:5],
        )

    _safe("mandate-description", _scan)
def _checkMandateSlugRules(db: DatabaseConnector) -> None:
    """Warn about Mandate rows whose name/label violate the slug rules.

    Rows are visited in stable id order. A row is reported when its label is
    empty, its name fails ``isValidMandateName``, or its name was already
    taken by an earlier valid row. Read-only; failures are absorbed by
    ``_safe``.
    """
    def _scan() -> None:
        acceptedNames: set[str] = set()
        violatingIds: list[str] = []
        orderedRows = sorted(
            db.getRecordset(Mandate), key=lambda row: str(row.get("id", ""))
        )
        for row in orderedRows:
            mandateId = row.get("id")
            if not mandateId:
                continue
            slug = (row.get("name") or "").strip()
            # None and whitespace-only labels both count as missing.
            labelMissing = not (row.get("label") or "").strip()
            slugOk = isValidMandateName(slug) and slug not in acceptedNames
            if slugOk:
                acceptedNames.add(slug)
            if labelMissing or not slugOk:
                violatingIds.append(str(mandateId))
        if violatingIds:
            logger.warning(
                "Legacy-data check: %d Mandate row(s) violate slug/label rules "
                "(removed migration: _migrateMandateNameLabelSlugRules). "
                "Run scripts/script_db_audit_legacy_state.py for details. IDs: %s",
                len(violatingIds), violatingIds[:5],
            )

    _safe("mandate-slug-rules", _scan)
def _checkLegacyRootMandate(db: DatabaseConnector) -> None:
    """Warn about root mandates that are still stored in legacy form.

    Two shapes are reported: rows named ``"Root"`` and rows named ``"root"``
    whose ``isSystem`` flag is not set. Read-only; failures are absorbed by
    ``_safe``.
    """
    def _scan() -> None:
        capitalised = db.getRecordset(Mandate, recordFilter={"name": "Root"})
        lowercase = db.getRecordset(Mandate, recordFilter={"name": "root"})
        suspects = list(capitalised)
        suspects += [row for row in lowercase if not row.get("isSystem")]
        if not suspects:
            return
        logger.warning(
            "Legacy-data check: %d Root-Mandate row(s) still in legacy form "
            "(removed migration: initRootMandate-legacy-branch). IDs: %s",
            len(suspects), [row.get("id") for row in suspects][:5],
        )

    _safe("root-mandate-legacy", _scan)
def _checkSysadminRole(db: DatabaseConnector) -> None:
    """Warn when a 'sysadmin' role still exists in the system root mandate.

    Looks up the mandate with ``name == "root"`` and ``isSystem == True``;
    when none exists the check does nothing. Otherwise it queries Role rows
    labelled ``sysadmin`` in that mandate with no feature instance. Read-only;
    failures are absorbed by ``_safe``.
    """
    def _scan() -> None:
        systemRoots = db.getRecordset(
            Mandate, recordFilter={"name": "root", "isSystem": True}
        )
        if not systemRoots:
            return
        roleFilter = {
            "roleLabel": "sysadmin",
            "mandateId": str(systemRoots[0].get("id")),
            "featureInstanceId": None,
        }
        legacyRoles = db.getRecordset(Role, recordFilter=roleFilter)
        if not legacyRoles:
            return
        logger.warning(
            "Legacy-data check: %d 'sysadmin' role(s) still present in root mandate "
            "(removed migration: _migrateAndDropSysAdminRole). "
            "Authority is now User.isPlatformAdmin -- migrate manually. IDs: %s",
            len(legacyRoles), [role.get("id") for role in legacyRoles],
        )

    _safe("sysadmin-role", _scan)
def _backfillTargetFeatureInstanceId() -> None:
    """Backfill ``targetFeatureInstanceId`` on non-template AutoWorkflow rows.

    For every AutoWorkflow row that is not a template, has no
    ``targetFeatureInstanceId`` and has a ``featureInstanceId``, the target is
    set to the row's own ``featureInstanceId``. Rows that already carry a
    target are left alone, so repeated runs change nothing further.

    Opens its own connection to the ``poweron_graphicaleditor`` database and
    closes it again when done. Unlike the ``_check*`` routines this function
    WRITES to the database. Any exception is logged by ``_safe`` and not
    re-raised.
    """
    def _do() -> None:
        from modules.shared.configuration import APP_CONFIG
        from modules.features.graphicalEditor.datamodelFeatureGraphicalEditor import AutoWorkflow

        geDb = DatabaseConnector(
            dbHost=APP_CONFIG.get("DB_HOST", "localhost"),
            dbDatabase="poweron_graphicaleditor",
            dbUser=APP_CONFIG.get("DB_USER"),
            dbPassword=APP_CONFIG.get("DB_PASSWORD_SECRET") or APP_CONFIG.get("DB_PASSWORD"),
            dbPort=int(APP_CONFIG.get("DB_PORT", 5432)),
            userId=None,
        )
        try:
            if not geDb._ensureTableExists(AutoWorkflow):
                return

            backfilled = 0
            for r in geDb.getRecordset(AutoWorkflow) or []:
                if r.get("isTemplate") or r.get("targetFeatureInstanceId"):
                    continue
                srcId = r.get("featureInstanceId")
                if not srcId:
                    continue
                geDb.recordModify(AutoWorkflow, r["id"], {"targetFeatureInstanceId": srcId})
                backfilled += 1

            if backfilled:
                logger.info(
                    "targetFeatureInstanceId backfill: set %d non-template AutoWorkflow row(s) "
                    "to their featureInstanceId",
                    backfilled,
                )
        finally:
            # The connector is private to this routine; release it on every
            # exit path (early return, success, or exception).
            geDb.close()

    _safe("backfill-targetFeatureInstanceId", _do)

View file

@ -111,6 +111,19 @@ class AiObjects:
processingTime=0.0, bytesSent=0, bytesReceived=0, errorCount=1,
)
allowedModels = getattr(options, 'allowedModels', None) if options else None
if allowedModels:
filteredModels = [m for m in availableModels if m.name in allowedModels]
if filteredModels:
availableModels = filteredModels
else:
errorMsg = f"No models match allowedModels {allowedModels} (providers={allowedProviders}) for operation {options.operationType}"
logger.error(errorMsg)
return AiCallResponse(
content=errorMsg, modelName="error", priceCHF=0.0,
processingTime=0.0, bytesSent=0, bytesReceived=0, errorCount=1,
)
failoverModelList = modelSelector.getFailoverModelList(prompt, context, options, availableModels)
if not failoverModelList:
@ -364,6 +377,19 @@ class AiObjects:
)
return
allowedModels = getattr(options, 'allowedModels', None) if options else None
if allowedModels:
filtered = [m for m in availableModels if m.name in allowedModels]
if filtered:
availableModels = filtered
else:
yield AiCallResponse(
content=f"No models match allowedModels {allowedModels} (providers={allowedProviders}) for operation {options.operationType}",
modelName="error", priceCHF=0.0, processingTime=0.0,
bytesSent=0, bytesReceived=0, errorCount=1,
)
return
failoverModelList = modelSelector.getFailoverModelList(
request.prompt, request.context or "", options, availableModels
)
@ -516,6 +542,14 @@ class AiObjects:
else:
logger.warning(f"No embedding models match allowedProviders {allowedProviders}")
allowedModels = getattr(options, 'allowedModels', None) if options else None
if allowedModels:
filtered = [m for m in availableModels if m.name in allowedModels]
if filtered:
availableModels = filtered
else:
logger.warning(f"No embedding models match allowedModels {allowedModels}")
failoverModelList = modelSelector.getFailoverModelList(
combinedText, "", options, availableModels
)

View file

@ -56,14 +56,8 @@ def initBootstrap(db: DatabaseConnector) -> None:
logger.info("Starting system bootstrap")
# Initialize root mandate
mandateId = initRootMandate(db)
# Migrate existing mandate records: description -> label
_migrateMandateDescriptionToLabel(db)
_migrateMandateNameLabelSlugRules(db)
# Clean up duplicate roles and fix corrupted templates FIRST
_deduplicateRoles(db)
# Initialize system role TEMPLATES (mandateId=None, isSystemRole=True)
@ -76,14 +70,6 @@ def initBootstrap(db: DatabaseConnector) -> None:
# This also serves as migration for existing mandates that don't have instance roles yet
_ensureAllMandatesHaveSystemRoles(db)
# Migration: eliminate the legacy ``sysadmin`` role in root mandate
# (replaced by ``User.isPlatformAdmin`` flag — see
# wiki/c-work/4-done/2026-04-sysadmin-authority-split.md).
# Idempotent: noop after first successful run.
if mandateId:
_migrateAndDropSysAdminRole(db, mandateId)
# Ensure UI rules for navigation items (admin/user/viewer roles)
_ensureUiContextRules(db)
# Initialize admin user
@ -129,9 +115,22 @@ def initBootstrap(db: DatabaseConnector) -> None:
# Bootstrap system workflow templates for graphical editor
_bootstrapSystemTemplates(db)
# Sync feature template workflows (update graph of existing instance workflows
# whose templateSourceId matches a current code-defined template)
_syncFeatureTemplateWorkflows()
# Ensure billing settings and accounts exist for all mandates
_bootstrapBilling()
# Telemetrie: warne falls Restbestaende der entfernten idempotenten
# Migrationen wieder auftauchen (Edge-Case: alter DB-Restore o.ae.).
# Schreibt nicht, scheitert nicht den Boot.
try:
from modules.interfaces._legacyMigrationTelemetry import runLegacyDataChecks
runLegacyDataChecks(db)
except Exception as e:
logger.warning(f"Legacy-data telemetry skipped: {e}")
def _bootstrapBilling() -> None:
"""
@ -195,6 +194,97 @@ def _bootstrapSystemTemplates(db: DatabaseConnector) -> None:
logger.warning(f"System workflow template bootstrap failed: {e}")
def _syncFeatureTemplateWorkflows() -> None:
    """Sync instance-scoped workflows with the current code-defined templates.

    Collects the templates of every feature module exposing
    ``getTemplateWorkflows()``, then, for each non-template AutoWorkflow row
    whose ``templateSourceId`` matches a template ID, replaces the stored
    ``graph`` with the template graph when the two differ. Only the ``graph``
    field is written; all other fields of the row are left untouched.

    The ``{{featureInstanceId}}`` placeholder in the template graph is
    substituted with the row's ``targetFeatureInstanceId``, falling back to
    its ``featureInstanceId``. A row for which neither is set is skipped when
    the template actually contains the placeholder, so no graph with a blank
    instance id is written.

    Runs on every boot. Any failure is logged as a warning and never raised.
    """
    import json

    placeholder = "{{featureInstanceId}}"

    def _field(row, key):
        """Read ``key`` from a recordset row that may be a dict or an object."""
        return row.get(key) if isinstance(row, dict) else getattr(row, key, None)

    try:
        from modules.system.registry import loadFeatureMainModules
        from modules.features.graphicalEditor.datamodelFeatureGraphicalEditor import AutoWorkflow
        from modules.features.graphicalEditor.interfaceFeatureGraphicalEditor import graphicalEditorDatabase

        mainModules = loadFeatureMainModules()
        templatesBySourceId: dict = {}
        for featureCode, mod in mainModules.items():
            getTemplateWorkflows = getattr(mod, "getTemplateWorkflows", None)
            if not getTemplateWorkflows:
                continue
            try:
                templates = getTemplateWorkflows() or []
            except Exception as e:
                # One broken feature must not block the others, but say so.
                logger.warning(
                    f"_syncFeatureTemplateWorkflows: getTemplateWorkflows failed for feature '{featureCode}': {e}"
                )
                continue
            for tpl in templates:
                tplId = tpl.get("id")
                if tplId:
                    templatesBySourceId[tplId] = tpl

        if not templatesBySourceId:
            logger.info("_syncFeatureTemplateWorkflows: no templates found, skipping")
            return

        logger.info(f"_syncFeatureTemplateWorkflows: found {len(templatesBySourceId)} template(s): {list(templatesBySourceId.keys())}")

        greenfieldDb = DatabaseConnector(
            dbHost=APP_CONFIG.get("DB_HOST", "localhost"),
            dbDatabase=graphicalEditorDatabase,
            dbUser=APP_CONFIG.get("DB_USER"),
            dbPassword=APP_CONFIG.get("DB_PASSWORD_SECRET") or APP_CONFIG.get("DB_PASSWORD"),
        )
        try:
            updated = 0
            for sourceId, tpl in templatesBySourceId.items():
                instances = greenfieldDb.getRecordset(AutoWorkflow, recordFilter={
                    "templateSourceId": sourceId,
                    "isTemplate": False,
                })
                if not instances:
                    continue

                # Serialise the template once per template, not once per row.
                canonicalJson = json.dumps(tpl.get("graph", {}))
                needsInstanceId = placeholder in canonicalJson

                for inst in instances:
                    instId = _field(inst, "id")
                    # Rows not yet backfilled have no target; their own
                    # featureInstanceId is the value the backfill would set.
                    targetInstanceId = (
                        _field(inst, "targetFeatureInstanceId")
                        or _field(inst, "featureInstanceId")
                        or ""
                    )
                    if needsInstanceId and not targetInstanceId:
                        logger.warning(
                            f"_syncFeatureTemplateWorkflows: workflow {instId} (template={sourceId}) "
                            f"has no feature instance id, skipping"
                        )
                        continue

                    # JSON-escape the id so the substituted document stays valid.
                    escapedId = json.dumps(str(targetInstanceId))[1:-1]
                    newGraph = json.loads(canonicalJson.replace(placeholder, escapedId))

                    existingGraph = _field(inst, "graph")
                    if isinstance(existingGraph, str):
                        try:
                            existingGraph = json.loads(existingGraph)
                        except Exception:
                            # Unparseable stored graph: treat as different.
                            existingGraph = None

                    if existingGraph == newGraph:
                        logger.debug(f"_syncFeatureTemplateWorkflows: graph unchanged for workflow {instId} (template={sourceId})")
                        continue

                    logger.debug(f"_syncFeatureTemplateWorkflows: graph DIFFERS for workflow {instId} (template={sourceId}), updating")
                    greenfieldDb.recordModify(AutoWorkflow, instId, {"graph": newGraph})
                    updated += 1
                    logger.info(f"_syncFeatureTemplateWorkflows: updated graph for workflow {instId} (template={sourceId})")

            if updated:
                logger.info(f"_syncFeatureTemplateWorkflows: synced {updated} workflow(s) with current templates")
            else:
                logger.info("_syncFeatureTemplateWorkflows: all instance graphs already match current templates")
        finally:
            # Release the connector even when a query or write fails.
            greenfieldDb.close()
    except Exception as e:
        logger.warning(f"Feature template workflow sync failed: {e}")
def _buildSystemTemplates():
"""Build the graph definitions for platform system templates."""
return [
@ -396,21 +486,12 @@ def initRootMandate(db: DatabaseConnector) -> Optional[str]:
Returns:
Mandate ID if created or found, None otherwise
"""
# Find existing root mandate by name AND isSystem flag
existingMandates = db.getRecordset(Mandate, recordFilter={"name": "root", "isSystem": True})
if existingMandates:
mandateId = existingMandates[0].get("id")
logger.info(f"Root mandate already exists with ID {mandateId}")
return mandateId
# Check for legacy root mandates (name="Root" without isSystem flag) and migrate
legacyMandates = db.getRecordset(Mandate, recordFilter={"name": "Root"})
if legacyMandates:
mandateId = legacyMandates[0].get("id")
logger.info(f"Migrating legacy Root mandate {mandateId}: setting name='root', isSystem=True")
db.recordModify(Mandate, mandateId, {"name": "root", "isSystem": True})
return mandateId
logger.info("Creating Root mandate")
rootMandate = Mandate(name="root", label="Root", isSystem=True, enabled=True)
createdMandate = db.recordCreate(Mandate, rootMandate)
@ -419,98 +500,6 @@ def initRootMandate(db: DatabaseConnector) -> Optional[str]:
return mandateId
def _migrateMandateDescriptionToLabel(db: DatabaseConnector) -> None:
    """Copy ``description`` into ``label`` for Mandate rows that lack a label.

    A row is migrated when its ``description`` is not None and its ``label``
    is None (or absent). The ``description`` value itself is left in place.
    Rows that already have a label are skipped, so a repeated run changes
    nothing further.

    Args:
        db: Database connector used to read and modify Mandate rows.
    """
    migratedCount = 0
    for row in db.getRecordset(Mandate):
        legacyText = row.get("description")
        if legacyText is None or row.get("label") is not None:
            continue
        mandateId = row.get("id")
        db.recordModify(Mandate, mandateId, {"label": legacyText})
        migratedCount += 1
        logger.info(f"Migrated mandate {mandateId}: description -> label")

    if migratedCount > 0:
        logger.info(f"Migrated {migratedCount} mandate(s) from description to label")
    else:
        logger.debug("No mandate description->label migration needed")
def _migrateMandateNameLabelSlugRules(db: DatabaseConnector) -> None:
    """Normalise Mandate names to valid, unique slugs and fill empty labels.

    Rows are processed in stable id order:
    1. An empty or missing ``label`` is set to the row's name, or to
       ``"Mandate"`` when the name is empty as well.
    2. A ``name`` that fails ``isValidMandateName`` or duplicates the name of
       an earlier row is replaced by a slug derived from the label via
       ``slugifyMandateName`` (falling back to ``"mn"``) and made unique with
       ``allocateUniqueMandateSlug``.

    Every label fill-in and rename is logged. Rows whose name is already a
    valid, unused slug and whose label is non-empty are not modified.

    Args:
        db: Database connector used to read and modify Mandate rows.
    """
    from modules.shared.mandateNameUtils import (
        allocateUniqueMandateSlug,
        isValidMandateName,
        slugifyMandateName,
    )

    mandateRows = db.getRecordset(Mandate)
    if not mandateRows:
        return

    takenSlugs: set[str] = set()
    filledLabels = 0
    renames: list[tuple[str, str, str]] = []

    for row in sorted(mandateRows, key=lambda r: str(r.get("id", ""))):
        mid = row.get("id")
        if not mid:
            continue

        name = (row.get("name") or "").strip()
        label = (row.get("label") or "").strip()
        if not label:
            label = name or "Mandate"
            db.recordModify(Mandate, mid, {"label": label})
            filledLabels += 1
            logger.info(f"Mandate {mid}: filled empty label with '{label}'")

        if isValidMandateName(name) and name not in takenSlugs:
            # Already a valid, unused slug: reserve it and move on.
            takenSlugs.add(name)
            continue

        newName = allocateUniqueMandateSlug(slugifyMandateName(label) or "mn", takenSlugs)
        takenSlugs.add(newName)
        if newName != name:
            db.recordModify(Mandate, mid, {"name": newName})
            renames.append((str(mid), name, newName))
            logger.info(f"Mandate {mid}: renamed name '{name}' -> '{newName}'")

    if filledLabels or renames:
        logger.info(
            "Mandate name/label slug migration: %d label fill-in(s), %d name rename(s)",
            filledLabels, len(renames),
        )
    else:
        logger.debug("No mandate name/label slug migration needed")
def initAdminUser(db: DatabaseConnector, mandateId: Optional[str]) -> Optional[str]:
"""
Creates the Admin user if it doesn't exist.
@ -837,101 +826,6 @@ def copySystemRolesToMandate(db: DatabaseConnector, mandateId: str) -> int:
return copiedCount
def _migrateAndDropSysAdminRole(db: DatabaseConnector, mandateId: str) -> None:
    """
    One-shot migration: eliminate the legacy ``sysadmin`` role in the root mandate.

    Authority semantics moved to two orthogonal flags on User:
      - ``isSysAdmin``      -> Infrastructure-Operator (RBAC bypass)
      - ``isPlatformAdmin`` -> Cross-Mandate-Governance (no bypass)

    Migration steps (idempotent):
      1. Find sysadmin role(s) in root mandate. If none exist, done.
      2. For every UserMandateRole row referencing such a role: set
         ``user.isPlatformAdmin = True`` (preserves cross-mandate authority).
      3. Delete those UserMandateRole rows.
      4. Delete AccessRules attached to the sysadmin role.
      5. Delete the sysadmin Role record.

    Steps 3-5 run for every legacy role, including roles that currently have
    no holder; only the promotion step depends on holders being present.
    Individual delete failures are logged and do not abort the migration.

    Args:
        db: Database connector instance
        mandateId: Root mandate ID
    """

    def _field(record, key):
        # Recordset rows are handled both as dicts and as model objects.
        return record.get(key) if isinstance(record, dict) else getattr(record, key, None)

    sysadminRoles = db.getRecordset(
        Role,
        recordFilter={"roleLabel": "sysadmin", "mandateId": mandateId, "featureInstanceId": None},
    )
    if not sysadminRoles:
        logger.debug("Sysadmin role migration: no legacy sysadmin role present, nothing to do")
        return

    sysadminRoleIds = [str(_field(r, "id")) for r in sysadminRoles if _field(r, "id")]
    logger.warning(
        f"Sysadmin role migration: found {len(sysadminRoleIds)} legacy sysadmin role(s) "
        f"in root mandate, migrating to isPlatformAdmin flag"
    )

    promoted = 0
    for sysadminRoleId in sysadminRoleIds:
        # 1) Promote every holder to isPlatformAdmin=True
        umRoleRows = db.getRecordset(
            UserMandateRole, recordFilter={"roleId": sysadminRoleId}
        )
        userMandateIds = [
            str(_field(r, "userMandateId")) for r in umRoleRows if _field(r, "userMandateId")
        ]

        # Resolve userIds via UserMandate. A role without holders simply yields
        # an empty set here; the cleanup below must still run for it.
        userIds = set()
        for umId in userMandateIds:
            for um in db.getRecordset(UserMandate, recordFilter={"id": umId}):
                uid = _field(um, "userId")
                if uid:
                    userIds.add(str(uid))

        for userId in userIds:
            users = db.getRecordset(UserInDB, recordFilter={"id": userId})
            if not users:
                continue
            if not (_field(users[0], "isPlatformAdmin") or False):
                db.recordModify(UserInDB, userId, {"isPlatformAdmin": True})
                promoted += 1
                logger.warning(
                    f"Sysadmin role migration: granted isPlatformAdmin=True to user {userId}"
                )

        # 2) Delete UserMandateRole rows
        for umRow in umRoleRows:
            rowId = _field(umRow, "id")
            if rowId:
                try:
                    db.recordDelete(UserMandateRole, str(rowId))
                except Exception as e:
                    logger.error(f"Sysadmin role migration: failed to drop UserMandateRole {rowId}: {e}")

        # 3) Delete AccessRules
        accessRules = db.getRecordset(AccessRule, recordFilter={"roleId": sysadminRoleId})
        for ar in accessRules:
            arId = _field(ar, "id")
            if arId:
                try:
                    db.recordDelete(AccessRule, str(arId))
                except Exception as e:
                    logger.error(f"Sysadmin role migration: failed to drop AccessRule {arId}: {e}")

        # 4) Delete the Role
        try:
            db.recordDelete(Role, sysadminRoleId)
        except Exception as e:
            logger.error(f"Sysadmin role migration: failed to drop Role {sysadminRoleId}: {e}")

    logger.warning(
        f"Sysadmin role migration: completed; promoted {promoted} user(s) to isPlatformAdmin"
    )
def _getRoleId(db: DatabaseConnector, roleLabel: str) -> Optional[str]:
"""
Get role ID by label, using cache or database lookup.

View file

@ -1268,19 +1268,7 @@ class AppObjects:
result = []
for conn_dict in connections:
try:
# Create UserConnection object
connection = UserConnection(
id=conn_dict["id"],
userId=conn_dict["userId"],
authority=conn_dict.get("authority"),
externalId=conn_dict.get("externalId", ""),
externalUsername=conn_dict.get("externalUsername", ""),
externalEmail=conn_dict.get("externalEmail"),
status=conn_dict.get("status", "pending"),
connectedAt=conn_dict.get("connectedAt"),
lastChecked=conn_dict.get("lastChecked"),
expiresAt=conn_dict.get("expiresAt"),
)
connection = UserConnection.model_validate(conn_dict)
result.append(connection)
except Exception as e:
logger.error(
@ -1293,6 +1281,28 @@ class AppObjects:
logger.error(f"Error getting user connections: {str(e)}")
return []
def getActiveKnowledgeConnections(self) -> List[UserConnection]:
    """Collect every UserConnection that is active and has knowledge ingestion enabled.

    Used by the daily re-sync scheduler to determine which connections to re-index.
    Rows that cannot be parsed are logged and skipped; any other failure is
    logged and yields an empty list.
    """
    try:
        activeFilter = {
            "knowledgeIngestionEnabled": True,
            "status": ConnectionStatus.ACTIVE.value,
        }
        records = self.db.getRecordset(UserConnection, recordFilter=activeFilter) or []
        connections = []
        for record in records:
            try:
                if isinstance(record, dict):
                    parsed = UserConnection.model_validate(record)
                else:
                    parsed = record
            except Exception as _e:
                logger.warning(f"getActiveKnowledgeConnections: could not parse row: {_e}")
                continue
            connections.append(parsed)
        return connections
    except Exception as e:
        logger.error(f"getActiveKnowledgeConnections failed: {e}")
        return []
def getUserConnectionById(self, connectionId: str) -> Optional[UserConnection]:
"""Get a single UserConnection by ID or by reference string (connection:authority:username)."""
try:
@ -1317,18 +1327,21 @@ class AppObjects:
if connections:
conn_dict = connections[0]
return UserConnection(
id=conn_dict["id"],
userId=conn_dict["userId"],
authority=conn_dict.get("authority"),
externalId=conn_dict.get("externalId", ""),
externalUsername=conn_dict.get("externalUsername", ""),
externalEmail=conn_dict.get("externalEmail"),
status=conn_dict.get("status", "pending"),
connectedAt=conn_dict.get("connectedAt"),
lastChecked=conn_dict.get("lastChecked"),
expiresAt=conn_dict.get("expiresAt"),
)
try:
return UserConnection.model_validate(conn_dict)
except Exception:
return UserConnection(
id=conn_dict["id"],
userId=conn_dict["userId"],
authority=conn_dict.get("authority"),
externalId=conn_dict.get("externalId", ""),
externalUsername=conn_dict.get("externalUsername", ""),
externalEmail=conn_dict.get("externalEmail"),
status=conn_dict.get("status", "pending"),
connectedAt=conn_dict.get("connectedAt"),
lastChecked=conn_dict.get("lastChecked"),
expiresAt=conn_dict.get("expiresAt"),
)
return None
except Exception as e:
logger.error(f"Error getting user connection by ID: {str(e)}")
@ -4014,6 +4027,59 @@ class AppObjects:
logger.error(f"Error deleting role {roleId}: {str(e)}")
raise
# -------------------------------------------------------------------------
# Table Grouping (user-defined groups for FormGeneratorTable instances)
# -------------------------------------------------------------------------
def getTableGrouping(self, contextKey: str):
    """
    Fetch the saved group tree of the current user for one table instance.

    contextKey identifies the table instance, e.g. "connections", "prompts",
    "admin/users", "trustee/{instanceId}/documents".

    Returns a TableGrouping instance, or None when nothing has been saved yet
    or when the lookup fails (the failure is logged).
    """
    from modules.datamodels.datamodelPagination import TableGrouping

    try:
        lookup = {"userId": str(self.userId), "contextKey": contextKey}
        matches = self.db.getRecordset(TableGrouping, recordFilter=lookup)
        if matches:
            first = matches[0]
            if isinstance(first, dict):
                return TableGrouping.model_validate(first)
            return first
        return None
    except Exception as e:
        logger.error(f"getTableGrouping failed for user={self.userId} key={contextKey}: {e}")
        return None
def upsertTableGrouping(self, contextKey: str, rootGroups: list):
    """
    Create or replace the group tree for the current user and contextKey.

    rootGroups is a list of TableGroupNode-compatible dicts (the full tree).
    The payload is validated against TableGrouping before anything is written,
    so a malformed tree raises without being persisted.

    Returns the saved TableGrouping instance.
    Raises whatever validation or the database layer raises (after logging).
    """
    from modules.datamodels.datamodelPagination import TableGrouping
    from modules.shared.timeUtils import getUtcTimestamp

    try:
        existing = self.getTableGrouping(contextKey)
        data = {
            # Reuse the id of the stored row so the record is replaced in place.
            "id": existing.id if existing else str(uuid.uuid4()),
            "userId": str(self.userId),
            "contextKey": contextKey,
            "rootGroups": rootGroups,
            "updatedAt": getUtcTimestamp(),
        }
        # Validate first: an invalid tree must fail before it reaches the database.
        saved = TableGrouping.model_validate(data)
        if existing:
            self.db.recordModify(TableGrouping, existing.id, data)
        else:
            self.db.recordCreate(TableGrouping, data)
        return saved
    except Exception as e:
        logger.error(f"upsertTableGrouping failed for user={self.userId} key={contextKey}: {e}")
        raise
# Public Methods

View file

@ -93,6 +93,46 @@ class KnowledgeObjects:
self.db.recordModify(FileContentIndex, fileId, {"status": status})
return True
def deleteFileContentIndexByConnectionId(self, connectionId: str) -> Dict[str, int]:
    """Delete all FileContentIndex rows (and their ContentChunks) for a connection.

    Used when a UserConnection is revoked / disconnected so the knowledge corpus
    no longer references data the user no longer grants access to. After the
    purge, storage billing is reconciled for every affected mandate; a failing
    reconciliation is logged and does not abort the purge.

    Args:
        connectionId: ID of the connection whose index should be purged. An
            empty value is a no-op.

    Returns:
        ``{"indexRows": <deleted index rows>, "chunks": <deleted chunks>}``,
        counting only deletes the database layer reported as successful.
    """
    if not connectionId:
        return {"indexRows": 0, "chunks": 0}

    def _field(record, key):
        # Rows are handled both as dicts and as model objects.
        return record.get(key) if isinstance(record, dict) else getattr(record, key, None)

    rows = self.db.getRecordset(
        FileContentIndex, recordFilter={"connectionId": connectionId}
    ) or []

    mandateIds: set = set()
    chunkCount = 0
    indexCount = 0
    for row in rows:
        fid = _field(row, "id")
        if not fid:
            continue
        chunks = self.db.getRecordset(ContentChunk, recordFilter={"fileId": fid}) or []
        for chunk in chunks:
            chunkId = _field(chunk, "id")
            if chunkId and self.db.recordDelete(ContentChunk, chunkId):
                chunkCount += 1
        if self.db.recordDelete(FileContentIndex, fid):
            indexCount += 1
        mid = _field(row, "mandateId")
        if mid:
            mandateIds.add(str(mid))

    for mid in mandateIds:
        try:
            from modules.interfaces.interfaceDbBilling import _getRootInterface
            _getRootInterface().reconcileMandateStorageBilling(mid)
        except Exception as ex:
            logger.warning("reconcileMandateStorageBilling after connection purge failed: %s", ex)

    return {"indexRows": indexCount, "chunks": chunkCount}
def deleteFileContentIndex(self, fileId: str) -> bool:
"""Delete a FileContentIndex and all associated ContentChunks."""
existing = self.getFileContentIndex(fileId)
@ -603,41 +643,10 @@ def aggregateMandateRagTotalBytes(mandateId: str) -> int:
if rid and str(rid) not in byId:
byId[str(rid)] = row
# DEPRECATED: file-ID-correlation fallback from poweron_management.
# Only needed for pre-migration data where mandateId/featureInstanceId on the
# FileContentIndex are empty. Safe to remove once all environments are migrated.
_fallbackCount = 0
try:
from modules.datamodels.datamodelFiles import FileItem
from modules.interfaces.interfaceDbManagement import ComponentObjects
mgmtDb = ComponentObjects().db
knowledgeIf = getInterface(None)
fileIds: set = set()
for f in mgmtDb.getRecordset(FileItem, recordFilter={"mandateId": mandateId}):
fid = f.get("id") if isinstance(f, dict) else getattr(f, "id", None)
if fid:
fileIds.add(str(fid))
for instId in instIds:
for f in mgmtDb.getRecordset(FileItem, recordFilter={"featureInstanceId": instId}):
fid = f.get("id") if isinstance(f, dict) else getattr(f, "id", None)
if fid:
fileIds.add(str(fid))
for fid in fileIds:
if fid in byId:
continue
row = knowledgeIf.getFileContentIndex(fid)
if row:
byId[fid] = row
_fallbackCount += 1
except Exception as e:
logger.warning("aggregateMandateRagTotalBytes fallback failed: %s", e)
total = sum(int(r.get("totalSize") or 0) for r in byId.values())
logger.info(
"aggregateMandateRagTotalBytes(%s): %d indexes, %d bytes (fallback: %d)",
mandateId, len(byId), total, _fallbackCount,
"aggregateMandateRagTotalBytes(%s): %d indexes, %d bytes",
mandateId, len(byId), total,
)
return total

View file

@ -19,8 +19,7 @@ from modules.interfaces.interfaceRbac import getRecordsetWithRBAC, getRecordsetP
from modules.security.rbac import RbacClass
from modules.datamodels.datamodelRbac import AccessRuleContext
from modules.datamodels.datamodelUam import AccessLevel
from modules.datamodels.datamodelFiles import FilePreview, FileItem, FileData
from modules.datamodels.datamodelFileFolder import FileFolder
from modules.datamodels.datamodelFiles import FilePreview, FileItem, FileData, FileFolder
from modules.datamodels.datamodelUtils import Prompt
from modules.datamodels.datamodelMessaging import (
MessagingSubscription,
@ -1068,7 +1067,242 @@ class ComponentObjects:
except Exception as e:
logger.error(f"Error converting file record: {str(e)}")
return None
# ── Folder methods ─────────────────────────────────────────────────────────
def getOwnFolderTree(self) -> List[Dict[str, Any]]:
    """Return the RBAC-filtered folders whose creator is the current user."""
    ownedOnly = {"sysCreatedBy": self.userId}
    ownFolders = getRecordsetWithRBAC(
        self.db,
        FileFolder,
        self.currentUser,
        recordFilter=ownedOnly,
        mandateId=self.mandateId,
        featureInstanceId=self.featureInstanceId,
    )
    return ownFolders
def getSharedFolderTree(self) -> List[Dict[str, Any]]:
    """Return RBAC-visible folders that were NOT created by the current user.

    Each returned dict gets a ``contextOrphan`` flag: True when the folder has
    a parentId that is not itself part of the returned set.
    """
    visible = getRecordsetWithRBAC(
        self.db,
        FileFolder,
        self.currentUser,
        mandateId=self.mandateId,
        featureInstanceId=self.featureInstanceId,
    )
    foreign = []
    for folder in visible:
        if folder.get("sysCreatedBy") != self.userId:
            foreign.append(folder)

    foreignIds = {folder["id"] for folder in foreign}
    for folder in foreign:
        parentId = folder.get("parentId")
        if parentId and parentId not in foreignIds:
            folder["contextOrphan"] = True
        else:
            folder["contextOrphan"] = False
    return foreign
def getFolder(self, folderId: str) -> Optional[Dict[str, Any]]:
    """Look up one folder by id through the RBAC filter; None when no row comes back."""
    matches = getRecordsetWithRBAC(
        self.db,
        FileFolder,
        self.currentUser,
        recordFilter={"id": folderId},
        mandateId=self.mandateId,
        featureInstanceId=self.featureInstanceId,
    )
    if matches:
        return matches[0]
    return None
def _isFolderOwner(self, folder) -> bool:
createdBy = (
getattr(folder, "sysCreatedBy", None)
or (folder.get("sysCreatedBy") if isinstance(folder, dict) else None)
)
return createdBy == self.userId
def _requireFolderWriteAccess(self, folder, folderId: str, operation: str = "update"):
    """Guard a folder mutation.

    The owner of the folder always passes. Anyone else passes only when their
    RBAC level for ``operation`` on FileFolder is ALL.

    Raises:
        PermissionError: the caller is neither owner nor holds the ALL level.
    """
    if not self._isFolderOwner(folder):
        from modules.interfaces.interfaceRbac import buildDataObjectKey

        folderObjectKey = buildDataObjectKey("FileFolder")
        granted = self.rbac.getUserPermissions(
            self.currentUser,
            AccessRuleContext.DATA,
            folderObjectKey,
            mandateId=self.mandateId,
            featureInstanceId=self.featureInstanceId,
        )
        level = getattr(granted, operation, None)
        if level == AccessLevel.ALL:
            return
        raise PermissionError(
            f"No permission to {operation} folder {folderId} (not owner, access level: {level})"
        )
def createFolder(self, name: str, parentId: Optional[str] = None) -> Dict[str, Any]:
    """Create a personal-scope, non-neutralized folder and return it as a dict.

    Raises:
        PermissionError: the caller lacks the RBAC "create" permission on FileFolder.
    """
    allowed = self.checkRbacPermission(FileFolder, "create")
    if not allowed:
        raise PermissionError("No permission to create folders")

    attributes = {
        "name": name,
        "parentId": parentId,
        "mandateId": self.mandateId or "",
        "featureInstanceId": self.featureInstanceId or "",
        "scope": "personal",
        "neutralize": False,
    }
    newFolder = FileFolder(**attributes)
    self.db.recordCreate(FileFolder, newFolder)
    return newFolder.model_dump()
def renameFolder(self, folderId: str, newName: str) -> Dict[str, Any]:
    """Set a new name on a folder and return the updated folder dict.

    Raises:
        FileNotFoundError: the folder is not visible to the caller.
        PermissionError: the caller may not update the folder.
    """
    target = self.getFolder(folderId)
    if target:
        self._requireFolderWriteAccess(target, folderId, "update")
        self.db.recordModify(FileFolder, folderId, {"name": newName})
        target["name"] = newName
        return target
    raise FileNotFoundError(f"Folder {folderId} not found")
def moveFolder(self, folderId: str, newParentId: Optional[str] = None) -> Dict[str, Any]:
    """Re-parent a folder and return the updated folder dict.

    A falsy ``newParentId`` skips all parent checks and stores the value as given.

    Raises:
        FileNotFoundError: the folder or the target parent is not visible.
        PermissionError: the caller may not update the folder or the target parent.
        ValueError: the target parent lies inside the folder's own subtree.
    """
    moving = self.getFolder(folderId)
    if not moving:
        raise FileNotFoundError(f"Folder {folderId} not found")
    self._requireFolderWriteAccess(moving, folderId, "update")

    if newParentId:
        targetParent = self.getFolder(newParentId)
        if not targetParent:
            raise FileNotFoundError(f"Target parent folder {newParentId} not found")
        self._requireFolderWriteAccess(targetParent, newParentId, "update")
        # Moving a folder below itself would detach the subtree into a cycle.
        wouldCycle = self._isDescendant(newParentId, folderId)
        if wouldCycle:
            raise ValueError("Cannot move folder into its own subtree (circular reference)")

    self.db.recordModify(FileFolder, folderId, {"parentId": newParentId})
    moving["parentId"] = newParentId
    return moving
def _isDescendant(self, candidateId: str, ancestorId: str) -> bool:
"""Return True if candidateId is a descendant of (or equal to) ancestorId."""
visited = set()
current = candidateId
while current:
if current == ancestorId:
return True
if current in visited:
break
visited.add(current)
f = self.getFolder(current)
current = f.get("parentId") if f else None
return False
def deleteFolderCascade(self, folderId: str) -> Dict[str, Any]:
    """Delete a folder, all of its sub-folders and the files inside them.

    The whole tree must belong to the caller: the cascade is refused as soon
    as one sub-folder or one file has a different creator. Ownership of
    sub-folders is checked on the unfiltered rows, so a sub-folder the caller
    cannot see is refused rather than silently deleted.

    All deletes run in one transaction that is rolled back on any error.

    Args:
        folderId: ID of the root of the tree to delete.

    Returns:
        ``{"deletedFolders": <folder count>, "deletedFiles": <file count>}``.

    Raises:
        FileNotFoundError: the root folder is not visible to the caller.
        PermissionError: the caller may not delete the root folder, or the
            tree contains a sub-folder or file created by someone else.
    """
    folder = self.getFolder(folderId)
    if not folder:
        raise FileNotFoundError(f"Folder {folderId} not found")
    self._requireFolderWriteAccess(folder, folderId, "delete")

    def _field(record, key):
        # Rows are handled both as dicts and as model objects.
        return record.get(key) if isinstance(record, dict) else getattr(record, key, None)

    folderIds = self._collectChildFolderIds(folderId)

    # Verify all child folders are owned. Read the raw rows: the tree was
    # collected without RBAC filtering, so the check must not skip folders
    # that the RBAC-filtered lookup would hide.
    for fid in folderIds:
        if fid == folderId:
            continue
        childRows = self.db.getRecordset(FileFolder, recordFilter={"id": fid})
        if childRows and not self._isFolderOwner(childRows[0]):
            raise PermissionError(f"Cannot delete folder tree: sub-folder {fid} is not owned by you")

    # Collect files in those folders
    fileRows = []
    for fid in folderIds:
        fileRows.extend(self.db.getRecordset(FileItem, recordFilter={"folderId": fid}))

    fileIds = []
    for item in fileRows:
        itemId = _field(item, "id")
        if _field(item, "sysCreatedBy") != self.userId:
            raise PermissionError(f"Cannot delete folder tree: file {itemId} is not owned by you")
        if itemId:
            fileIds.append(itemId)

    # Single transaction: delete FileData, FileItem, then FileFolder.
    self.db._ensure_connection()
    try:
        with self.db.connection.cursor() as cursor:
            if fileIds:
                cursor.execute('DELETE FROM "FileData" WHERE "id" = ANY(%s)', (fileIds,))
                cursor.execute('DELETE FROM "FileItem" WHERE "id" = ANY(%s)', (fileIds,))
            # One statement removes the whole tree; the order of ids inside
            # the array has no effect on "= ANY(...)".
            cursor.execute('DELETE FROM "FileFolder" WHERE "id" = ANY(%s)', (list(folderIds),))
        self.db.connection.commit()
    except Exception:
        self.db.connection.rollback()
        raise

    return {"deletedFolders": len(folderIds), "deletedFiles": len(fileIds)}
def _collectChildFolderIds(self, folderId: str) -> List[str]:
    """Collect folderId plus the ids of all descendant folders, breadth-first.

    Children are read straight from the database by ``parentId`` without any
    owner or RBAC filter; callers that need an ownership guarantee must check
    the returned ids themselves.

    Returns:
        The ids in discovery order, starting with ``folderId``. Every id
        appears once, even if the stored parent links contain a loop.
    """
    from collections import deque

    ordered = [folderId]
    seen = {folderId}
    pending = deque([folderId])
    while pending:
        parentId = pending.popleft()
        children = self.db.getRecordset(FileFolder, recordFilter={"parentId": parentId})
        for child in children:
            cid = child.get("id") if isinstance(child, dict) else getattr(child, "id", None)
            if cid and cid not in seen:
                seen.add(cid)
                ordered.append(cid)
                pending.append(cid)
    return ordered
def patchFolderScope(self, folderId: str, scope: str, cascadeToFiles: bool = False) -> Dict[str, Any]:
    """Change the scope of a folder and, optionally, of the caller's files in it.

    Args:
        folderId: Folder to change.
        scope: One of "personal", "featureInstance", "mandate", "global".
        cascadeToFiles: When True, files directly in the folder that were
            created by the caller receive the same scope.

    Returns:
        ``{"folderId": ..., "scope": ..., "filesUpdated": <count>}``.

    Raises:
        ValueError: ``scope`` is not one of the accepted values.
        FileNotFoundError: the folder is not visible to the caller.
        PermissionError: the caller may not update the folder, or requests
            "global" without the ALL update level on FileFolder.
    """
    validScopes = {"personal", "featureInstance", "mandate", "global"}
    if scope not in validScopes:
        raise ValueError(f"Invalid scope: {scope}. Must be one of {validScopes}")

    target = self.getFolder(folderId)
    if not target:
        raise FileNotFoundError(f"Folder {folderId} not found")
    self._requireFolderWriteAccess(target, folderId, "update")

    if scope == "global":
        from modules.interfaces.interfaceRbac import buildDataObjectKey

        granted = self.rbac.getUserPermissions(
            self.currentUser,
            AccessRuleContext.DATA,
            buildDataObjectKey("FileFolder"),
            mandateId=self.mandateId,
            featureInstanceId=self.featureInstanceId,
        )
        updateLevel = getattr(granted, "update", None)
        if updateLevel != AccessLevel.ALL:
            raise PermissionError("Setting global scope requires ALL permission")

    self.db.recordModify(FileFolder, folderId, {"scope": scope})

    filesUpdated = 0
    if cascadeToFiles:
        for entry in self.db.getRecordset(FileItem, recordFilter={"folderId": folderId}):
            isDict = isinstance(entry, dict)
            creator = entry.get("sysCreatedBy") if isDict else getattr(entry, "sysCreatedBy", None)
            if creator != self.userId:
                continue
            entryId = entry.get("id") if isDict else getattr(entry, "id", None)
            self.db.recordModify(FileItem, entryId, {"scope": scope})
            filesUpdated += 1

    return {"folderId": folderId, "scope": scope, "filesUpdated": filesUpdated}
def patchFolderNeutralize(self, folderId: str, neutralize: bool) -> Dict[str, Any]:
    """Set the ``neutralize`` flag on a folder and on the caller's files directly in it.

    Returns:
        ``{"folderId": ..., "neutralize": ..., "filesUpdated": <count>}``.

    Raises:
        FileNotFoundError: the folder is not visible to the caller.
        PermissionError: the caller may not update the folder.
    """
    target = self.getFolder(folderId)
    if not target:
        raise FileNotFoundError(f"Folder {folderId} not found")
    self._requireFolderWriteAccess(target, folderId, "update")

    flagUpdate = {"neutralize": neutralize}
    self.db.recordModify(FileFolder, folderId, flagUpdate)

    filesUpdated = 0
    for entry in self.db.getRecordset(FileItem, recordFilter={"folderId": folderId}):
        isDict = isinstance(entry, dict)
        creator = entry.get("sysCreatedBy") if isDict else getattr(entry, "sysCreatedBy", None)
        if creator != self.userId:
            # Files created by other users keep their own setting.
            continue
        entryId = entry.get("id") if isDict else getattr(entry, "id", None)
        self.db.recordModify(FileItem, entryId, {"neutralize": neutralize})
        filesUpdated += 1

    return {"folderId": folderId, "neutralize": neutralize, "filesUpdated": filesUpdated}
def _isfileNameUnique(self, fileName: str, excludeFileId: Optional[str] = None) -> bool:
"""Checks if a fileName is unique for the current user."""
# Get all files filtered by RBAC (will be filtered by user's access level)
@ -1103,15 +1337,12 @@ class ComponentObjects:
return newfileName
counter += 1
def createFile(self, name: str, mimeType: str, content: bytes, folderId: Optional[str] = None) -> FileItem:
def createFile(self, name: str, mimeType: str, content: bytes) -> FileItem:
"""Creates a new file entry if user has permission. Computes fileHash and fileSize from content.
Duplicate check: if a file with the same user + fileHash + fileName already exists,
the existing file is returned instead of creating a new one.
Same hash with different name is allowed (intentional copy by user).
Args:
folderId: Optional parent folder ID. None/empty means the root folder.
"""
if not self.checkRbacPermission(FileItem, "create"):
raise PermissionError("No permission to create files")
@ -1139,11 +1370,6 @@ class ComponentObjects:
else:
scope = "personal"
# Normalize folderId: treat empty string as "no folder" (= root) NULL in DB
normalizedFolderId: Optional[str] = folderId
if isinstance(normalizedFolderId, str) and not normalizedFolderId.strip():
normalizedFolderId = None
fileItem = FileItem(
mandateId=mandateId,
featureInstanceId=featureInstanceId,
@ -1152,7 +1378,6 @@ class ComponentObjects:
mimeType=mimeType,
fileSize=fileSize,
fileHash=fileHash,
folderId=normalizedFolderId,
)
# Store in database
@ -1277,382 +1502,47 @@ class ComponentObjects:
self.db.connection.rollback()
raise FileDeletionError(f"Error deleting files in batch: {str(e)}")
# ---- Folder methods ----
# Folder names that _validateFolderName rejects as reserved.
_RESERVED_FOLDER_NAMES = {"(Global)"}
def _validateFolderName(self, name: str, parentId: Optional[str], excludeFolderId: Optional[str] = None):
"""Ensures folder name is not reserved and is unique within parent."""
if name in self._RESERVED_FOLDER_NAMES:
raise ValueError(f"Folder name '{name}' is reserved")
if not name or not name.strip():
raise ValueError("Folder name cannot be empty")
existingFolders = self.db.getRecordset(FileFolder, recordFilter={"parentId": parentId or ""})
for f in existingFolders:
if f.get("name") == name and f.get("id") != excludeFolderId:
raise ValueError(f"Folder '{name}' already exists in this directory")
def _isDescendantOf(self, folderId: str, ancestorId: str) -> bool:
    """Follow stored parent links upward from folderId and report whether ancestorId is reached.

    Returns True when folderId equals ancestorId or has it in its parent
    chain. Used as the circular-reference check before re-parenting.
    """
    seen = set()
    nodeId = folderId
    while nodeId:
        if nodeId == ancestorId:
            return True
        if nodeId in seen:
            # The stored parent chain loops; stop instead of spinning forever.
            return False
        seen.add(nodeId)
        rows = self.db.getRecordset(FileFolder, recordFilter={"id": nodeId})
        if not rows:
            return False
        nodeId = rows[0].get("parentId")
    return False
def _ensureFeatureInstanceFolder(self, featureInstanceId: str, mandateId: str = "") -> Optional[str]:
"""Return the folder ID for a feature instance, creating it on first use.
The folder is named after the feature instance label."""
existing = self.db.getRecordset(
FileFolder,
recordFilter={
"featureInstanceId": featureInstanceId,
"sysCreatedBy": self.userId or "",
},
)
if existing:
return existing[0].get("id")
# Resolve the instance label for the folder name
folderName = featureInstanceId[:8]
def _ensureFeatureInstanceGroup(self, featureInstanceId: str, contextKey: str = "files/list") -> Optional[str]:
"""Return the groupId of the default group for a feature instance.
Creates the group if it doesn't exist yet."""
try:
from modules.datamodels.datamodelFeatures import FeatureInstance
from modules.security.rootAccess import getRootDbAppConnector
dbApp = getRootDbAppConnector()
instances = dbApp.getRecordset(FeatureInstance, recordFilter={"id": featureInstanceId})
if instances:
folderName = instances[0].get("label") or folderName
import modules.interfaces.interfaceDbApp as _appIface
appInterface = _appIface.getInterface(self._currentUser)
existing = appInterface.getTableGrouping(contextKey)
nodes = [n.model_dump() if hasattr(n, 'model_dump') else (n if isinstance(n, dict) else vars(n)) for n in (existing.rootGroups if existing else [])]
# Look for group with name matching featureInstanceId
def _find(nds):
for nd in nds:
nid = nd.get("id") if isinstance(nd, dict) else getattr(nd, "id", None)
nmeta = nd.get("meta", {}) if isinstance(nd, dict) else getattr(nd, "meta", {})
if (nmeta or {}).get("featureInstanceId") == featureInstanceId:
return nid
subs = nd.get("subGroups", []) if isinstance(nd, dict) else getattr(nd, "subGroups", [])
result = _find(subs)
if result:
return result
return None
found = _find(nodes)
if found:
return found
# Create new group
import uuid
newId = str(uuid.uuid4())
newGroup = {
"id": newId,
"name": featureInstanceId,
"itemIds": [],
"subGroups": [],
"meta": {"featureInstanceId": featureInstanceId},
}
nodes.append(newGroup)
appInterface.upsertTableGrouping(contextKey, nodes)
return newId
except Exception as e:
logger.warning(f"Could not resolve feature instance label: {e}")
logger.error(f"_ensureFeatureInstanceGroup failed: {e}")
return None
folder = FileFolder(
name=folderName,
parentId=None,
mandateId=mandateId,
featureInstanceId=featureInstanceId,
)
created = self.db.recordCreate(FileFolder, folder)
return created.get("id") if isinstance(created, dict) else getattr(created, "id", None)
def getFolder(self, folderId: str) -> Optional[Dict[str, Any]]:
    """Fetch a folder by id, restricted to folders created by the current user; None otherwise."""
    ownerId = self.userId or ""
    matches = self.db.getRecordset(
        FileFolder,
        recordFilter={"id": folderId, "sysCreatedBy": ownerId},
    )
    if matches:
        return matches[0]
    return None
def listFolders(self, parentId: Optional[str] = None) -> List[Dict[str, Any]]:
    """List folders visible to the current user.

    Folders created by the current user are always returned. Folders of other
    users are returned only when they contain at least one file that passes
    the file scope filter. Every folder dict is enriched with ``fileCount``.

    Args:
        parentId: When not None, only folders with this parentId are read.

    A failure while counting files is logged; all counts then stay at 0.
    """
    parentFilter = {"parentId": parentId} if parentId is not None else None
    folders = self.db.getRecordset(FileFolder, recordFilter=parentFilter)
    if not folders:
        return folders

    folderIds = [entry["id"] for entry in folders if entry.get("id")]
    fileCounts: Dict[str, int] = {}
    try:
        from modules.interfaces.interfaceRbac import buildFilesScopeWhereClause

        scopeClause = buildFilesScopeWhereClause(
            self.currentUser, "FileItem", self.db,
            self.mandateId, self.featureInstanceId,
            [], [],
        )
        self.db._ensure_connection()
        with self.db.connection.cursor() as cursor:
            sqlParts = [
                'SELECT "folderId", COUNT(*) AS cnt '
                'FROM "FileItem" '
                'WHERE "folderId" = ANY(%s)'
            ]
            sqlValues: list = [folderIds]
            if scopeClause:
                # Restrict the count to files the caller may see.
                sqlParts.append(' AND (' + scopeClause["condition"] + ')')
                sqlValues.extend(scopeClause["values"])
            sqlParts.append(' GROUP BY "folderId"')
            cursor.execute("".join(sqlParts), sqlValues)
            for countRow in cursor.fetchall():
                fileCounts[countRow["folderId"]] = countRow["cnt"]
    except Exception as e:
        logger.warning(f"Could not count files per folder: {e}")

    ownerId = self.userId or ""
    for entry in folders:
        entry["fileCount"] = fileCounts.get(entry.get("id", ""), 0)
    return [
        entry for entry in folders
        if entry.get("sysCreatedBy") == ownerId or entry["fileCount"] > 0
    ]
def createFolder(self, name: str, parentId: Optional[str] = None) -> Dict[str, Any]:
    """Validate the name against its future siblings, then store a new folder.

    Returns whatever ``recordCreate`` returns for the new FileFolder.

    Raises:
        ValueError: the name is reserved, empty, or already used under parentId.
    """
    self._validateFolderName(name, parentId)
    attributes = {
        "name": name,
        "parentId": parentId,
        "mandateId": self.mandateId or "",
        "featureInstanceId": self.featureInstanceId or "",
    }
    newFolder = FileFolder(**attributes)
    return self.db.recordCreate(FileFolder, newFolder)
def renameFolder(self, folderId: str, newName: str) -> bool:
    """Give a folder a new name after checking it is valid and unique among its siblings.

    Raises:
        FileNotFoundError: ``getFolder`` finds no folder for folderId.
        ValueError: the new name is reserved, empty, or already used.
    """
    target = self.getFolder(folderId)
    if target:
        self._validateFolderName(newName, target.get("parentId"), excludeFolderId=folderId)
        return self.db.recordModify(FileFolder, folderId, {"name": newName})
    raise FileNotFoundError(f"Folder {folderId} not found")
def updateFolder(self, folderId: str, updateData: Dict[str, Any]) -> bool:
    """
    Update folder metadata (e.g. ``scope``, ``neutralize``). Owner-only,
    same access model as renameFolder/moveFolder.

    ``name`` and ``parentId`` changes passed through here get the same checks
    as ``renameFolder`` / ``moveFolder``: the resulting name must be valid and
    unique under the resulting parent, and a new parent must not lie inside
    the folder's own subtree. System-managed keys (``id``, ``sysCreatedBy``,
    ``sysCreatedAt``, ``sysUpdatedAt``) are dropped from the update.

    Returns:
        True when there is nothing to write; otherwise the result of
        ``recordModify``.

    Raises:
        FileNotFoundError: ``getFolder`` finds no folder for folderId.
        ValueError: the name is invalid/taken or the move would create a cycle.
    """
    if not updateData:
        return True
    folder = self.getFolder(folderId)
    if not folder:
        raise FileNotFoundError(f"Folder {folderId} not found")

    forbiddenKeys = {"id", "sysCreatedBy", "sysCreatedAt", "sysUpdatedAt"}
    cleaned: Dict[str, Any] = {k: v for k, v in updateData.items() if k not in forbiddenKeys}
    if not cleaned:
        # Only system-managed keys were supplied: nothing left to write.
        return True

    movesFolder = "parentId" in cleaned
    targetParentId = cleaned["parentId"] if movesFolder else folder.get("parentId")
    if movesFolder and targetParentId and self._isDescendantOf(targetParentId, folderId):
        raise ValueError("Cannot move folder into its own subtree")
    if movesFolder or "name" in cleaned:
        # Validate the name the folder will carry against its future siblings.
        effectiveName = cleaned["name"] if "name" in cleaned else folder.get("name", "")
        self._validateFolderName(effectiveName, targetParentId, excludeFolderId=folderId)

    return self.db.recordModify(FileFolder, folderId, cleaned)
def moveFolder(self, folderId: str, targetParentId: Optional[str] = None) -> bool:
    """Re-parent a folder after the circular-reference and unique-name checks.

    Raises:
        FileNotFoundError: ``getFolder`` finds no folder for folderId.
        ValueError: the target lies inside the folder's own subtree, or the
            folder's name is not valid/unique under the target parent.
    """
    moving = self.getFolder(folderId)
    if not moving:
        raise FileNotFoundError(f"Folder {folderId} not found")

    if targetParentId:
        createsCycle = self._isDescendantOf(targetParentId, folderId)
        if createsCycle:
            raise ValueError("Cannot move folder into its own subtree")

    currentName = moving.get("name", "")
    self._validateFolderName(currentName, targetParentId, excludeFolderId=folderId)
    return self.db.recordModify(FileFolder, folderId, {"parentId": targetParentId})
def moveFilesBatch(self, fileIds: List[str], targetFolderId: Optional[str] = None) -> Dict[str, Any]:
    """Move several files into one folder with a single SQL UPDATE.

    Write access is checked per file through ``_requireFileWriteAccess``.
    Duplicate and falsy ids are dropped first; an empty batch returns
    ``{"movedFiles": 0}`` without touching the database.

    Returns:
        ``{"movedFiles": <rows updated>}``.

    Raises:
        FileNotFoundError: the target folder is not found.
        FileError: anything fails inside the transaction (including missing
            files or a refused write); the transaction is rolled back.
    """
    dedupedIds = []
    for rawId in dict.fromkeys(fileIds or []):
        if rawId:
            dedupedIds.append(str(rawId))
    if not dedupedIds:
        return {"movedFiles": 0}

    if targetFolderId:
        if not self.getFolder(targetFolderId):
            raise FileNotFoundError(f"Target folder {targetFolderId} not found")

    try:
        self.db._ensure_connection()
        with self.db.connection.cursor() as cursor:
            cursor.execute(
                'SELECT "id", "sysCreatedBy" FROM "FileItem" WHERE "id" = ANY(%s)',
                (dedupedIds,),
            )
            fileRows = cursor.fetchall()
            knownIds = {fileRow["id"] for fileRow in fileRows}
            missing = sorted(set(dedupedIds) - knownIds)
            if missing:
                raise FileNotFoundError(f"Files not found: {missing}")

            writableIds = []
            for fileRow in fileRows:
                self._requireFileWriteAccess(fileRow, fileRow["id"], "update")
            for fileRow in fileRows:
                writableIds.append(fileRow["id"])

            cursor.execute(
                'UPDATE "FileItem" SET "folderId" = %s, "sysModifiedAt" = %s, "sysModifiedBy" = %s '
                'WHERE "id" = ANY(%s)',
                (targetFolderId, getUtcTimestamp(), self.userId or "", writableIds),
            )
            movedFiles = cursor.rowcount
        self.db.connection.commit()
        return {"movedFiles": movedFiles}
    except Exception as e:
        logger.error(f"Error moving files in batch: {e}")
        self.db.connection.rollback()
        raise FileError(f"Error moving files in batch: {str(e)}")
def moveFoldersBatch(self, folderIds: List[str], targetParentId: Optional[str] = None) -> Dict[str, Any]:
    """Move multiple folders under one parent with a single SQL UPDATE after validation.

    Validation happens before any write: every folder must load, the target
    must exist and must not be one of the moved folders or inside their
    subtrees, and folder names must be unique both within the batch and
    against the folders already in the target.

    Args:
        folderIds: IDs of the folders to move; duplicates and falsy entries are dropped.
        targetParentId: New parent folder ID. A falsy value skips the target
            checks and is written to "parentId" as-is.

    Returns:
        ``{"movedFolders": <rowcount of the UPDATE>}``. The UPDATE is limited to
        folders whose "sysCreatedBy" equals the current user.

    Raises:
        FileNotFoundError: If a folder or the target parent cannot be loaded.
        ValueError: On a circular move or a folder-name collision.
        FileError: If the UPDATE transaction fails; the cause is chained.
    """
    uniqueIds = [str(fid) for fid in dict.fromkeys(folderIds or []) if fid]
    if not uniqueIds:
        return {"movedFolders": 0}
    # Same existence check moveFilesBatch applies to its target folder
    if targetParentId and not self.getFolder(targetParentId):
        raise FileNotFoundError(f"Target folder {targetParentId} not found")
    foldersToMove: List[Dict[str, Any]] = []
    for folderId in uniqueIds:
        folder = self.getFolder(folderId)
        if not folder:
            raise FileNotFoundError(f"Folder {folderId} not found")
        # Explicit self check: do not rely on _isDescendantOf treating a
        # folder as a descendant of itself.
        if targetParentId and (
            targetParentId == folderId or self._isDescendantOf(targetParentId, folderId)
        ):
            raise ValueError("Cannot move folder into its own subtree")
        foldersToMove.append(folder)
    # NOTE(review): a missing target is looked up as "" here while the UPDATE
    # below writes the raw targetParentId (None) - confirm how getRecordset
    # matches root folders stored with NULL parentId.
    existingInTarget = self.db.getRecordset(
        FileFolder,
        recordFilter={"parentId": targetParentId or "", "sysCreatedBy": self.userId or ""},
    )
    existingNames = {f.get("name"): f.get("id") for f in existingInTarget}
    movingNames: Dict[str, str] = {}
    movingIds = set(uniqueIds)
    for folder in foldersToMove:
        name = folder.get("name", "")
        folderId = folder.get("id")
        if name in movingNames and movingNames[name] != folderId:
            raise ValueError(f"Folder '{name}' already exists in this move batch")
        movingNames[name] = folderId
        existingId = existingNames.get(name)
        # A same-named folder already in the target only clashes when it is not part of this move
        if existingId and existingId not in movingIds:
            raise ValueError(f"Folder '{name}' already exists in target directory")
    try:
        self.db._ensure_connection()
        with self.db.connection.cursor() as cursor:
            cursor.execute(
                'UPDATE "FileFolder" SET "parentId" = %s, "sysModifiedAt" = %s, "sysModifiedBy" = %s '
                'WHERE "id" = ANY(%s) AND "sysCreatedBy" = %s',
                (targetParentId, getUtcTimestamp(), self.userId or "", uniqueIds, self.userId or ""),
            )
            movedFolders = cursor.rowcount
        self.db.connection.commit()
        return {"movedFolders": movedFolders}
    except Exception as e:
        logger.error(f"Error moving folders in batch: {e}")
        # A failing rollback must not replace the original error
        try:
            self.db.connection.rollback()
        except Exception as rollbackError:
            logger.warning(f"Rollback after failed folder move also failed: {rollbackError}")
        raise FileError(f"Error moving folders in batch: {str(e)}") from e
def deleteFolder(self, folderId: str, recursive: bool = False) -> Dict[str, Any]:
    """Delete one folder, optionally together with everything inside it.

    Args:
        folderId: ID of the folder to delete.
        recursive: When True, subfolders (recursively) and contained files are
            deleted first. When False, a non-empty folder is rejected.

    Returns:
        ``{"deletedFiles": int, "deletedFolders": int}`` counting this folder
        and everything removed beneath it.

    Raises:
        FileNotFoundError: If the folder cannot be loaded.
        ValueError: If the folder has content and ``recursive`` is False.
    """
    target = self.getFolder(folderId)
    if not target:
        raise FileNotFoundError(f"Folder {folderId} not found")

    # Only subfolders created by the current user are considered
    subFolderFilter = {"parentId": folderId, "sysCreatedBy": self.userId or ""}
    subFolders = self.db.getRecordset(FileFolder, recordFilter=subFolderFilter)
    directFiles = self._getFilesByCurrentUser(recordFilter={"folderId": folderId})

    if (subFolders or directFiles) and not recursive:
        raise ValueError(
            f"Folder '{target.get('name')}' is not empty "
            f"({len(directFiles)} files, {len(subFolders)} subfolders). "
            f"Use recursive=true to delete contents."
        )

    fileCount = 0
    folderCount = 0
    if recursive:
        # Depth-first: empty the subtree before removing this folder
        for sub in subFolders:
            nested = self.deleteFolder(sub["id"], recursive=True)
            fileCount += nested.get("deletedFiles", 0)
            folderCount += nested.get("deletedFolders", 0)
        for item in directFiles:
            # Best effort: a file that cannot be deleted is logged and skipped
            try:
                self.deleteFile(item["id"])
                fileCount += 1
            except Exception as e:
                logger.warning(f"Failed to delete file {item['id']} during folder deletion: {e}")

    self.db.recordDelete(FileFolder, folderId)
    folderCount += 1
    return {"deletedFiles": fileCount, "deletedFolders": folderCount}
def deleteFoldersBatch(self, folderIds: List[str], recursive: bool = True) -> Dict[str, Any]:
    """Delete multiple folders and their content in batched SQL calls.

    With ``recursive=False`` each folder is handed to ``deleteFolder``
    individually. With ``recursive=True`` the whole subtree of every root is
    resolved with one recursive query, then file data, files and folders are
    deleted in one transaction. Only rows whose "sysCreatedBy" equals the
    current user are selected and deleted.

    Args:
        folderIds: IDs of the root folders; duplicates and falsy entries are dropped.
        recursive: Whether to delete folder contents as well.

    Returns:
        ``{"deletedFiles": int, "deletedFolders": int}``.

    Raises:
        FileDeletionError: For any failure in the recursive path, including
            root folders that are missing or not owned by the current user.
            The original exception is chained as the cause.
    """
    uniqueIds = [str(fid) for fid in dict.fromkeys(folderIds or []) if fid]
    if not uniqueIds:
        return {"deletedFiles": 0, "deletedFolders": 0}
    if not recursive:
        deletedFiles = 0
        deletedFolders = 0
        for folderId in uniqueIds:
            result = self.deleteFolder(folderId, recursive=False)
            deletedFiles += result.get("deletedFiles", 0)
            deletedFolders += result.get("deletedFolders", 0)
        return {"deletedFiles": deletedFiles, "deletedFolders": deletedFolders}
    try:
        self.db._ensure_connection()
        with self.db.connection.cursor() as cursor:
            cursor.execute(
                'SELECT "id" FROM "FileFolder" WHERE "id" = ANY(%s) AND "sysCreatedBy" = %s',
                (uniqueIds, self.userId or ""),
            )
            rootAccessibleIds = [row["id"] for row in cursor.fetchall()]
            if len(rootAccessibleIds) != len(uniqueIds):
                missingIds = sorted(set(uniqueIds) - set(rootAccessibleIds))
                raise FileNotFoundError(f"Folders not found or not accessible: {missingIds}")
            # UNION (not UNION ALL) discards rows already produced, so the
            # recursion terminates even if the parentId chain contains a cycle.
            cursor.execute(
                """
                WITH RECURSIVE folder_tree AS (
                    SELECT "id"
                    FROM "FileFolder"
                    WHERE "id" = ANY(%s) AND "sysCreatedBy" = %s
                    UNION
                    SELECT child."id"
                    FROM "FileFolder" child
                    INNER JOIN folder_tree ft ON child."parentId" = ft."id"
                    WHERE child."sysCreatedBy" = %s
                )
                SELECT DISTINCT "id" FROM folder_tree
                """,
                (rootAccessibleIds, self.userId or "", self.userId or ""),
            )
            allFolderIds = [row["id"] for row in cursor.fetchall()]
            cursor.execute(
                'SELECT "id" FROM "FileItem" WHERE "folderId" = ANY(%s) AND "sysCreatedBy" = %s',
                (allFolderIds, self.userId or ""),
            )
            allFileIds = [row["id"] for row in cursor.fetchall()]
            if allFileIds:
                # File payload first, then the file rows themselves
                cursor.execute('DELETE FROM "FileData" WHERE "id" = ANY(%s)', (allFileIds,))
                cursor.execute(
                    'DELETE FROM "FileItem" WHERE "id" = ANY(%s) AND "sysCreatedBy" = %s',
                    (allFileIds, self.userId or ""),
                )
                deletedFiles = cursor.rowcount
            else:
                deletedFiles = 0
            cursor.execute(
                'DELETE FROM "FileFolder" WHERE "id" = ANY(%s) AND "sysCreatedBy" = %s',
                (allFolderIds, self.userId or ""),
            )
            deletedFolders = cursor.rowcount
        self.db.connection.commit()
        return {"deletedFiles": deletedFiles, "deletedFolders": deletedFolders}
    except Exception as e:
        logger.error(f"Error deleting folders in batch: {e}")
        # A failing rollback must not replace the original error
        try:
            self.db.connection.rollback()
        except Exception as rollbackError:
            logger.warning(f"Rollback after failed folder deletion also failed: {rollbackError}")
        raise FileDeletionError(f"Error deleting folders in batch: {str(e)}") from e
def copyFile(self, sourceFileId: str, targetFolderId: Optional[str] = None, newFileName: Optional[str] = None) -> FileItem:
def copyFile(self, sourceFileId: str, newFileName: Optional[str] = None) -> FileItem:
"""Create a full duplicate of a file (FileItem + FileData)."""
sourceFile = self.getFile(sourceFileId)
if not sourceFile:
@ -1665,11 +1555,6 @@ class ComponentObjects:
fileName = newFileName or sourceFile.fileName
copiedFile = self.createFile(fileName, sourceFile.mimeType, sourceData)
if targetFolderId:
self.updateFile(copiedFile.id, {"folderId": targetFolderId})
elif sourceFile.folderId:
self.updateFile(copiedFile.id, {"folderId": sourceFile.folderId})
self.createFileData(copiedFile.id, sourceData)
return copiedFile
@ -1884,18 +1769,14 @@ class ComponentObjects:
logger.error(f"Error getting file content: {str(e)}")
return None
def saveUploadedFile(self, fileContent: bytes, fileName: str, folderId: Optional[str] = None) -> tuple[FileItem, str]:
"""Saves an uploaded file if user has permission.
Args:
folderId: Optional parent folder ID. None means root folder.
"""
def saveUploadedFile(self, fileContent: bytes, fileName: str) -> tuple[FileItem, str]:
"""Saves an uploaded file if user has permission."""
try:
# Check file creation permission
if not self.checkRbacPermission(FileItem, "create"):
raise PermissionError("No permission to upload files")
logger.debug(f"Starting upload process for file: {fileName} (folderId={folderId!r})")
logger.debug(f"Starting upload process for file: {fileName}")
if not isinstance(fileContent, bytes):
logger.error(f"Invalid fileContent type: {type(fileContent)}")
@ -1921,7 +1802,6 @@ class ComponentObjects:
name=fileName,
mimeType=mimeType,
content=fileContent,
folderId=folderId,
)
# Save binary data

View file

@ -347,6 +347,7 @@ class FeatureInterface:
"templateSourceId": templateId,
"templateScope": "instance",
"active": True,
"targetFeatureInstanceId": instanceId,
})
copied += 1
except Exception as e:

View file

@ -529,8 +529,7 @@ def getRecordsetPaginatedWithRBAC(
if val is None:
# val=None in pagination.filters means "match empty/null"
# (same convention as connectorDbPostgre._buildPaginationClauses).
# Covers both historical empty-string values and true NULLs
# e.g. root-folder files where folderId may be "" or NULL.
# Covers both historical empty-string values and true NULLs.
whereConditions.append(f'("{key}" IS NULL OR "{key}"::TEXT = \'\')')
continue
if isinstance(val, dict):
@ -689,8 +688,7 @@ def getDistinctColumnValuesWithRBAC(
if val is None:
# val=None in pagination.filters means "match empty/null"
# (same convention as connectorDbPostgre._buildPaginationClauses).
# Covers both historical empty-string values and true NULLs
# e.g. root-folder files where folderId may be "" or NULL.
# Covers both historical empty-string values and true NULLs.
whereConditions.append(f'("{key}" IS NULL OR "{key}"::TEXT = \'\')')
continue
if isinstance(val, dict):

View file

View file

@ -0,0 +1,11 @@
# Archived one-off migrations
`migrate_folders_to_groups.py` copies `FileFolder` + `FileItem.folderId` into `TableGrouping` (`files/list`). It was used during an experimental UI path; **product choice** is to keep physical folders (`FileFolder`, `folderId`) and recover `FormGeneratorTree` (see `wiki/c-work/1-plan/2026-05-formgenerator-tree-and-folder-recovery.md`).
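Run only if you need a historical data rescue. The first command is a dry-run preview (the default); the second, with `--execute`, writes to the database: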
```bash
cd gateway
python -m modules.migrations._archive.migrate_folders_to_groups --verbose
python -m modules.migrations._archive.migrate_folders_to_groups --execute --verbose
```

View file

@ -0,0 +1 @@
# Subpackage for archived one-off migration scripts (not part of normal app startup).

View file

@ -0,0 +1,261 @@
"""
One-time migration: Convert FileFolder tree + FileItem.folderId to table_groupings.
Archived per wiki plan 2026-05-formgenerator-tree-and-folder-recovery (Stage 1.A).
Product direction: keep FileFolder + folderId; do not run DROP migrations.
This script remains for audit / one-off data rescue only.
Run this BEFORE dropping the physical FileFolder table and FileItem.folderId column
from the database (those would be separate Alembic/SQL steps -- not part of current product path).
Usage (from gateway working directory):
python -m modules.migrations._archive.migrate_folders_to_groups [--dry-run] [--verbose]
python -m modules.migrations._archive.migrate_folders_to_groups --execute --verbose
Steps:
1. For each distinct (userId, mandateId) combination that has FileFolder records:
a. Build the full folder tree (recursive)
b. Write it as a TableGroupNode tree into table_groupings (contextKey='files/list')
merges with any existing groups rather than overwriting
c. For each FileItem with a folderId that maps into this tree,
add its id to the matching group's itemIds
2. Print a summary (rows migrated, groups created, files assigned)
3. If not --dry-run: commits the inserts/updates
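NOTE: Schema changes (ALTER TABLE DROP COLUMN, DROP TABLE) are intentionally
NOT performed by this script. The original plan was to run the corresponding
Alembic migration (migrations/versions/xxxx_drop_folder_columns.py) afterwards;
under the current product direction stated above (keep FileFolder + folderId)
that DROP migration must not be run.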
"""
import argparse
import json
import logging
import uuid
from typing import Optional
logger = logging.getLogger(__name__)
def _scalarRow(row):
if row is None:
return None
if isinstance(row, dict):
return next(iter(row.values()))
return row[0]
# ── Helpers ──────────────────────────────────────────────────────────────────
def _build_tree(folders: list, parent_id: Optional[str]) -> list:
"""Recursively build TableGroupNode-compatible dicts from a flat folder list."""
children = [f for f in folders if f.get("parentId") == parent_id]
result = []
for folder in children:
node = {
"id": str(uuid.uuid4()),
"name": folder["name"],
"itemIds": [],
"subGroups": _build_tree(folders, folder["id"]),
"meta": {"migratedFromFolderId": folder["id"]},
}
result.append(node)
return result
def _assign_files_to_nodes(nodes: list, files_by_folder: dict) -> list:
"""Recursively assign file IDs to group nodes based on folder mapping."""
for node in nodes:
folder_id = (node.get("meta") or {}).get("migratedFromFolderId")
if folder_id and folder_id in files_by_folder:
node["itemIds"] = list(files_by_folder[folder_id])
node["subGroups"] = _assign_files_to_nodes(node.get("subGroups", []), files_by_folder)
return nodes
def _count_items(nodes: list) -> int:
total = 0
for node in nodes:
total += len(node.get("itemIds", []))
total += _count_items(node.get("subGroups", []))
return total
def _now_ts() -> str:
    """Return the current timestamp from ``modules.shared.timeUtils.getUtcTimestamp``.

    The project module is imported inside the function rather than at module load.
    """
    from modules.shared import timeUtils

    return timeUtils.getUtcTimestamp()
# ── Main migration ────────────────────────────────────────────────────────────
def _rowValues(row, columns):
    """Return the values of a fetched row in *columns* order.

    Mapping-style rows are read by column name (iterating a dict would yield
    its keys, not its values); sequence-style rows are returned positionally.
    """
    if isinstance(row, dict):
        return tuple(row[column] for column in columns)
    return tuple(row)


def run_migration(dry_run: bool = True, verbose: bool = False):
    """Copy the FileFolder tree and FileItem.folderId links into TableGrouping.

    For every (sysCreatedBy, mandateId) combination the folder tree is turned
    into group nodes, files are attached to the group of their folder, and the
    result is merged into (or inserted as) the user's ``files/list`` grouping.
    Files whose folder no longer exists end up in a separate "Orphaned" group.

    Args:
        dry_run: When True (default) nothing is written; only the summary is logged.
        verbose: Enables DEBUG logging with per-user details.

    Returns:
        None. The function also returns early when no DB connection is
        available or when neither source table nor source column exists.

    Raises:
        Any database error is re-raised after the transaction was rolled back.
    """
    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)
    logger.info(f"Starting folder to group migration (dry_run={dry_run})")

    from modules.connectors.connectorDbPostgre import getCachedConnector
    from modules.shared.configuration import APP_CONFIG

    connector = getCachedConnector(
        dbHost=APP_CONFIG.get("DB_HOST", "_no_config_default_data"),
        dbDatabase="poweron_management",
        dbUser=APP_CONFIG.get("DB_USER"),
        dbPassword=APP_CONFIG.get("DB_PASSWORD_SECRET"),
        dbPort=int(APP_CONFIG.get("DB_PORT", 5432)),
        userId=None,
    )
    if not connector or not connector.connection:
        logger.error("Could not obtain a DB connection. Aborting.")
        return

    conn = connector.connection
    cur = conn.cursor()
    try:
        # ── 1. Check that the source tables still exist ───────────────────────
        cur.execute("""
            SELECT EXISTS (
                SELECT 1 FROM information_schema.tables
                WHERE table_name = 'FileFolder'
            ) AS ok
        """)
        folder_table_exists = bool(_scalarRow(cur.fetchone()))

        cur.execute("""
            SELECT EXISTS (
                SELECT 1 FROM information_schema.columns
                WHERE table_name = 'FileItem' AND column_name = 'folderId'
            ) AS ok
        """)
        folder_column_exists = bool(_scalarRow(cur.fetchone()))

        if not folder_table_exists and not folder_column_exists:
            logger.info("FileFolder table and FileItem.folderId column not found — migration already applied or not needed.")
            return
        if not folder_table_exists:
            logger.warning("FileFolder table missing but FileItem.folderId column still present. Only file assignments will be migrated.")
        if not folder_column_exists:
            logger.warning("FileItem.folderId column missing but FileFolder table still present. Only group tree structure will be migrated.")

        # ── 2. Load all folders ───────────────────────────────────────────────
        folders_by_user: dict = {}
        if folder_table_exists:
            folder_columns = ("id", "name", "parentId", "sysCreatedBy", "mandateId")
            cur.execute('SELECT "id", "name", "parentId", "sysCreatedBy", "mandateId" FROM "FileFolder"')
            for row in cur.fetchall():
                # Rows may be dict-style (see _scalarRow), so read them by column name
                fid, fname, parent_id, user_id, mandate_id = _rowValues(row, folder_columns)
                key = (str(user_id), str(mandate_id) if mandate_id else "")
                folders_by_user.setdefault(key, []).append({
                    "id": fid, "name": fname, "parentId": parent_id,
                })
        logger.info(f"Loaded folders for {len(folders_by_user)} (user, mandate) combinations")

        # ── 3. Load file to folder assignments ────────────────────────────────
        files_by_key: dict = {}
        if folder_column_exists:
            file_columns = ("id", "folderId", "sysCreatedBy", "mandateId")
            cur.execute(
                'SELECT "id", "folderId", "sysCreatedBy", "mandateId" FROM "FileItem" WHERE "folderId" IS NOT NULL AND "folderId" != \'\''
            )
            for row in cur.fetchall():
                file_id, folder_id, user_id, mandate_id = _rowValues(row, file_columns)
                key = (str(user_id), str(mandate_id) if mandate_id else "")
                files_by_key.setdefault(key, {}).setdefault(folder_id, []).append(file_id)

        total_files = sum(
            sum(len(v) for v in d.values()) for d in files_by_key.values()
        )
        logger.info(f"Found {total_files} file to folder assignments across {len(files_by_key)} (user, mandate) combos")

        # ── 4. Combine and upsert groupings ───────────────────────────────────
        all_keys = set(folders_by_user.keys()) | set(files_by_key.keys())
        stats = {"groups_created": 0, "groupings_upserted": 0, "files_assigned": 0}

        for key in all_keys:
            user_id, mandate_id = key
            folders = folders_by_user.get(key, [])
            files_by_folder = files_by_key.get(key, {})

            # Build tree
            roots = _build_tree(folders, None)
            roots = _assign_files_to_nodes(roots, files_by_folder)

            # Handle files in unknown folders (folder no longer in tree)
            known_folder_ids = {f["id"] for f in folders}
            for folder_id, file_ids in files_by_folder.items():
                if folder_id not in known_folder_ids:
                    # Orphaned files: put them in an "Orphaned" group
                    roots.append({
                        "id": str(uuid.uuid4()),
                        "name": f"Orphaned (folder {folder_id[:8]}…)",
                        "itemIds": file_ids,
                        "subGroups": [],
                        "meta": {"migratedFromFolderId": folder_id, "orphaned": True},
                    })

            if not roots:
                continue

            n_items = _count_items(roots)
            # Counts top-level groups only, not nested subgroups
            stats["groups_created"] += len(roots)
            stats["files_assigned"] += n_items
            context_key = "files/list"

            if verbose:
                logger.debug(f"  user={user_id} mandate={mandate_id}: {len(roots)} root groups, {n_items} files")

            if not dry_run:
                # Check for existing grouping (looked up per user, not per mandate)
                cur.execute(
                    'SELECT "id", "rootGroups" FROM "TableGrouping" WHERE "userId" = %s AND "contextKey" = %s',
                    (user_id, context_key),
                )
                existing_row = cur.fetchone()
                if existing_row:
                    existing_id, existing_raw = _rowValues(existing_row, ("id", "rootGroups"))
                    existing_roots = json.loads(existing_raw) if isinstance(existing_raw, str) else (existing_raw or [])
                    # Merge: append migrated groups (avoid duplicates by migratedFromFolderId)
                    existing_meta_ids = {
                        (n.get("meta") or {}).get("migratedFromFolderId")
                        for n in existing_roots
                        if (n.get("meta") or {}).get("migratedFromFolderId")
                    }
                    new_roots = existing_roots + [
                        r for r in roots
                        if (r.get("meta") or {}).get("migratedFromFolderId") not in existing_meta_ids
                    ]
                    cur.execute(
                        'UPDATE "TableGrouping" SET "rootGroups" = %s, "updatedAt" = %s WHERE "id" = %s',
                        (json.dumps(new_roots), _now_ts(), existing_id),
                    )
                else:
                    new_id = str(uuid.uuid4())
                    cur.execute(
                        'INSERT INTO "TableGrouping" ("id", "userId", "contextKey", "rootGroups", "updatedAt") VALUES (%s, %s, %s, %s, %s)',
                        (new_id, user_id, context_key, json.dumps(roots), _now_ts()),
                    )
                stats["groupings_upserted"] += 1

        # ── 5. Summary ────────────────────────────────────────────────────────
        if not dry_run:
            conn.commit()
            logger.info("Migration committed.")
        else:
            logger.info("DRY RUN — no changes written.")

        logger.info(
            f"Summary: groupings_upserted={stats['groupings_upserted']}, "
            f"groups_created={stats['groups_created']}, "
            f"files_assigned={stats['files_assigned']}"
        )
        # NOTE(review): this hint predates the decision to keep FileFolder and
        # folderId (see module docstring) - confirm before following it.
        logger.info(
            "Next steps (run after verifying data):\n"
            "  1. Run Alembic migration to DROP COLUMN FileItem.folderId\n"
            "  2. Run Alembic migration to DROP TABLE FileFolder"
        )
    except Exception:
        # The connector is cached; do not leave a half-written or aborted
        # transaction behind on its connection.
        logger.exception("Folder to group migration failed; rolling back.")
        conn.rollback()
        raise
    finally:
        cur.close()
if __name__ == "__main__":
    # Command-line entry point: preview by default, write only with --execute.
    cli = argparse.ArgumentParser(description="Migrate FileFolder tree to table_groupings (archived script)")
    cli.add_argument("--dry-run", action="store_true", default=True, help="Preview only, no DB writes (default)")
    cli.add_argument("--execute", action="store_true", help="Actually write to DB (disables dry-run)")
    cli.add_argument("--verbose", action="store_true", help="Show per-user details")
    options = cli.parse_args()
    # --dry-run is always True, so --execute alone decides whether anything is written
    run_migration(dry_run=not options.execute, verbose=options.verbose)

View file

@ -0,0 +1,305 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
User-facing Automation Workspace API.
Lists workflow runs the user can access (via FeatureAccess on
targetFeatureInstanceId) and provides detail views with step logs
and linked files. Designed for the "Workspace" tab under
Nutzung > Automation.
"""
import logging
import math
from typing import Optional
from fastapi import APIRouter, Depends, Request, Query, Path, HTTPException
from slowapi import Limiter
from slowapi.util import get_remote_address
from modules.auth.authentication import getRequestContext, RequestContext
from modules.connectors.connectorDbPostgre import DatabaseConnector
from modules.shared.configuration import APP_CONFIG
from modules.features.graphicalEditor.datamodelFeatureGraphicalEditor import (
AutoRun,
AutoStepLog,
AutoWorkflow,
)
from modules.features.graphicalEditor.interfaceFeatureGraphicalEditor import graphicalEditorDatabase
from modules.shared.i18nRegistry import apiRouteContext
routeApiMsg = apiRouteContext("routeAutomationWorkspace")
logger = logging.getLogger(__name__)
limiter = Limiter(key_func=get_remote_address)
router = APIRouter(prefix="/api/automations/runs", tags=["AutomationWorkspace"])
def _getDb() -> DatabaseConnector:
    """Create a DatabaseConnector for the graphical-editor database.

    Connection settings are read from APP_CONFIG. The password falls back
    from DB_PASSWORD_SECRET to DB_PASSWORD, and no user ID is bound.
    """
    settings = {
        "dbHost": APP_CONFIG.get("DB_HOST", "localhost"),
        "dbDatabase": graphicalEditorDatabase,
        "dbUser": APP_CONFIG.get("DB_USER"),
        "dbPassword": APP_CONFIG.get("DB_PASSWORD_SECRET") or APP_CONFIG.get("DB_PASSWORD"),
        "dbPort": int(APP_CONFIG.get("DB_PORT", 5432)),
        "userId": None,
    }
    return DatabaseConnector(**settings)
def _getUserAccessibleInstanceIds(userId: str) -> list[str]:
    """Return the featureInstanceId of every enabled FeatureAccess the user has.

    Access records are fetched through the root interface. Records without a
    featureInstanceId or with ``enabled`` falsy are left out.
    """
    from modules.interfaces.interfaceDbApp import getRootInterface

    accessRecords = getRootInterface().getFeatureAccessesForUser(userId) or []
    instanceIds: list[str] = []
    for access in accessRecords:
        if access.featureInstanceId and access.enabled:
            instanceIds.append(access.featureInstanceId)
    return instanceIds
_FILE_REF_KEYS = ("fileId", "documentId", "fileIds", "documents")
def _extractFileIdsFromValue(value, accumulator: set[str]) -> None:
"""Recursively scan a value (dict/list/str) for file id references."""
if isinstance(value, dict):
for key, sub in value.items():
if key in _FILE_REF_KEYS:
_collectFileIdsFromRef(sub, accumulator)
else:
_extractFileIdsFromValue(sub, accumulator)
elif isinstance(value, list):
for item in value:
_extractFileIdsFromValue(item, accumulator)
def _collectFileIdsFromRef(val, accumulator: set[str]) -> None:
"""Add file ids from a value located under a known file-reference key."""
if isinstance(val, str) and val:
accumulator.add(val)
elif isinstance(val, list):
for v in val:
if isinstance(v, str) and v:
accumulator.add(v)
elif isinstance(v, dict) and v.get("id"):
accumulator.add(v["id"])
elif isinstance(val, dict) and val.get("id"):
accumulator.add(val["id"])
@router.get("")
@limiter.limit("60/minute")
def listWorkspaceRuns(
    request: Request,
    scope: str = Query("mine", description="mine = own runs, mandate = all accessible"),
    status: Optional[str] = Query(None, description="Filter by run status"),
    targetInstanceId: Optional[str] = Query(None, description="Filter by targetFeatureInstanceId"),
    workflowId: Optional[str] = Query(None, description="Filter by workflow"),
    limit: int = Query(50, ge=1, le=200),
    offset: int = Query(0, ge=0),
    context: RequestContext = Depends(getRequestContext),
) -> dict:
    """List workflow runs visible to the user.

    scope=mine: only runs owned by the user.
    scope=mandate: all runs where the user has FeatureAccess on the
    workflow's targetFeatureInstanceId. Any scope value other than "mine"
    is handled like "mandate".

    Returns:
        ``{"runs": [...], "total": int, "limit": int, "offset": int}`` where
        ``runs`` is the requested page, newest first, enriched with workflow,
        target-instance and mandate labels.

    Raises:
        HTTPException 401: No authenticated user in the request context.
        HTTPException 403: ``targetInstanceId`` is not accessible to the user.
    """
    # Authenticate before anything else so an anonymous caller never gets a
    # 200 response, not even when the run table does not exist yet.
    userId = str(context.user.id) if context.user else None
    if not userId:
        raise HTTPException(status_code=401, detail=routeApiMsg("Authentication required"))

    def _emptyResult() -> dict:
        return {"runs": [], "total": 0, "limit": limit, "offset": offset}

    db = _getDb()
    if not db._ensureTableExists(AutoRun):
        return _emptyResult()

    # Set for O(1) membership tests in the loops below
    accessibleInstanceIds = set(_getUserAccessibleInstanceIds(userId))
    if not accessibleInstanceIds:
        return _emptyResult()

    if not db._ensureTableExists(AutoWorkflow):
        return _emptyResult()

    wfFilter: dict = {}
    if targetInstanceId:
        if targetInstanceId not in accessibleInstanceIds:
            raise HTTPException(status_code=403, detail=routeApiMsg("Access denied to target instance"))
        wfFilter["targetFeatureInstanceId"] = targetInstanceId

    workflows = db.getRecordset(AutoWorkflow, recordFilter=wfFilter or None) or []
    visibleWfIds: set[str] = set()
    wfMap: dict = {}
    for wf in workflows:
        wfDict = dict(wf)
        # Fall back to the workflow's own instance when no target instance is set
        tid = wfDict.get("targetFeatureInstanceId") or wfDict.get("featureInstanceId")
        if tid and tid in accessibleInstanceIds:
            wfId = wfDict.get("id")
            if wfId:
                visibleWfIds.add(wfId)
                wfMap[wfId] = wfDict

    if workflowId:
        if workflowId not in visibleWfIds:
            return _emptyResult()
        visibleWfIds = {workflowId}

    if not visibleWfIds:
        return _emptyResult()

    # NOTE(review): all runs are loaded and filtered in memory - consider
    # pushing the workflow/owner/status filter into the query if this grows.
    allRuns = db.getRecordset(AutoRun, recordFilter={}) or []
    filtered = []
    for r in allRuns:
        row = dict(r)
        if row.get("workflowId") not in visibleWfIds:
            continue
        if scope == "mine" and row.get("ownerId") != userId:
            continue
        if status and row.get("status") != status:
            continue
        filtered.append(row)

    filtered.sort(
        key=lambda x: x.get("startedAt") or x.get("sysCreatedAt") or 0,
        reverse=True,
    )
    total = len(filtered)
    page = filtered[offset: offset + limit]

    from modules.routes.routeHelpers import enrichRowsWithFkLabels, resolveMandateLabels, resolveInstanceLabels

    for row in page:
        wf = wfMap.get(row.get("workflowId"), {})
        row["workflowLabel"] = row.get("label") or wf.get("label") or row.get("workflowId", "")
        row["targetFeatureInstanceId"] = wf.get("targetFeatureInstanceId") or wf.get("featureInstanceId")

    enrichRowsWithFkLabels(
        page,
        labelResolvers={
            "mandateId": resolveMandateLabels,
            "targetFeatureInstanceId": resolveInstanceLabels,
        },
    )
    # Rename the generated "<field>Label" keys to the names used in the response
    for row in page:
        row["targetInstanceLabel"] = row.pop("targetFeatureInstanceIdLabel", None)
        row["mandateLabel"] = row.pop("mandateIdLabel", None)

    return {"runs": page, "total": total, "limit": limit, "offset": offset}
@router.get("/{runId}/detail")
@limiter.limit("60/minute")
def getWorkspaceRunDetail(
    request: Request,
    runId: str = Path(..., description="Run ID"),
    context: RequestContext = Depends(getRequestContext),
) -> dict:
    """Get full detail for a single run: metadata, step logs, linked files.

    Access is granted to the run owner, to users with FeatureAccess on the
    workflow's target (or own) feature instance, and to platform admins.

    Returns:
        A dict with "run", "workflow" (None when the workflow is not found),
        "steps" (sorted by startedAt, each with "inputFiles"/"outputFiles"),
        "files" (all resolvable files) and "unassignedFiles" (files referenced
        only by the run's nodeOutputs). File lists are ordered by file ID.

    Raises:
        HTTPException 401: No authenticated user in the request context.
        HTTPException 403: The user may not see this run.
        HTTPException 404: The run (or the run table) does not exist.
    """
    db = _getDb()
    userId = str(context.user.id) if context.user else None
    if not userId:
        raise HTTPException(status_code=401, detail=routeApiMsg("Authentication required"))

    if not db._ensureTableExists(AutoRun):
        raise HTTPException(status_code=404, detail=routeApiMsg("Run not found"))

    runs = db.getRecordset(AutoRun, recordFilter={"id": runId})
    if not runs:
        raise HTTPException(status_code=404, detail=routeApiMsg("Run not found"))
    run = dict(runs[0])

    wfId = run.get("workflowId")
    workflow: dict = {}
    if wfId and db._ensureTableExists(AutoWorkflow):
        wfs = db.getRecordset(AutoWorkflow, recordFilter={"id": wfId})
        if wfs:
            workflow = dict(wfs[0])

    tid = workflow.get("targetFeatureInstanceId") or workflow.get("featureInstanceId")
    accessibleIds = _getUserAccessibleInstanceIds(userId)
    isOwner = run.get("ownerId") == userId
    if not isOwner and (not tid or tid not in accessibleIds) and not context.isPlatformAdmin:
        raise HTTPException(status_code=403, detail=routeApiMsg("Access denied"))

    steps: list = []
    if db._ensureTableExists(AutoStepLog):
        stepRecords = db.getRecordset(AutoStepLog, recordFilter={"runId": runId}) or []
        steps = [dict(s) for s in stepRecords]
        steps.sort(key=lambda s: s.get("startedAt") or 0)

    # Collect file references per step (input and output separately) ...
    allFileIds: set[str] = set()
    perStepFileIds: list[tuple[set[str], set[str]]] = []
    for step in steps:
        inputIds: set[str] = set()
        outputIds: set[str] = set()
        _extractFileIdsFromValue(step.get("inputSnapshot") or {}, inputIds)
        _extractFileIdsFromValue(step.get("output") or {}, outputIds)
        perStepFileIds.append((inputIds, outputIds))
        allFileIds.update(inputIds)
        allFileIds.update(outputIds)

    # ... and from the run-level node outputs
    nodeOutputs = run.get("nodeOutputs") or {}
    runLevelIds: set[str] = set()
    _extractFileIdsFromValue(nodeOutputs, runLevelIds)
    allFileIds.update(runLevelIds)

    # Best-effort metadata lookup: IDs that cannot be resolved are left out
    fileMetaById: dict[str, dict] = {}
    try:
        from modules.datamodels.datamodelFiles import FileItem
        from modules.interfaces.interfaceDbManagement import ComponentObjects
        mgmtDb = ComponentObjects().db
        if mgmtDb._ensureTableExists(FileItem):
            for fid in allFileIds:
                try:
                    rec = mgmtDb.getRecord(FileItem, fid)
                    if rec:
                        recDict = dict(rec)
                        fileMetaById[fid] = {
                            "id": fid,
                            "fileName": recDict.get("fileName") or recDict.get("name"),
                        }
                except Exception as e:
                    # Still non-fatal, but leave a trace instead of failing silently
                    logger.debug("getWorkspaceRunDetail: lookup of file %s failed: %s", fid, e)
    except Exception as e:
        logger.warning("getWorkspaceRunDetail: file lookup failed: %s", e)

    def _resolveFileList(ids: set[str]) -> list[dict]:
        # Sort (by string form, IDs are not guaranteed to be str) so the
        # response order does not depend on set iteration order.
        return [fileMetaById[fid] for fid in sorted(ids, key=str) if fid in fileMetaById]

    assignedFileIds: set[str] = set()
    for step, (inputIds, outputIds) in zip(steps, perStepFileIds):
        step["inputFiles"] = _resolveFileList(inputIds)
        step["outputFiles"] = _resolveFileList(outputIds)
        assignedFileIds.update(inputIds)
        assignedFileIds.update(outputIds)

    unassignedFiles = _resolveFileList(allFileIds - assignedFileIds)
    allFiles = _resolveFileList(allFileIds)

    run["workflowLabel"] = run.get("label") or workflow.get("label") or wfId
    run["targetFeatureInstanceId"] = tid

    targetInstanceLabel = None
    if tid:
        try:
            from modules.routes.routeHelpers import resolveInstanceLabels
            labelMap = resolveInstanceLabels([tid])
            targetInstanceLabel = labelMap.get(tid)
        except Exception as e:
            # The label is optional; keep None but record why it is missing
            logger.debug("getWorkspaceRunDetail: instance label lookup failed for %s: %s", tid, e)
    run["targetInstanceLabel"] = targetInstanceLabel

    return {
        "run": run,
        "workflow": {
            "id": workflow.get("id"),
            "label": workflow.get("label"),
            "targetFeatureInstanceId": tid,
            "featureInstanceId": workflow.get("featureInstanceId"),
            "tags": workflow.get("tags", []),
        } if workflow else None,
        "steps": steps,
        "files": allFiles,
        "unassignedFiles": unassignedFiles,
    }

View file

@ -57,8 +57,8 @@ def _svc_for_connection(current_user: User, connection: UserConnection):
services = getServices(current_user, None)
if not services.clickup.setAccessTokenFromConnection(connection):
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail=routeApiMsg("Failed to set ClickUp access token"),
status_code=status.HTTP_502_BAD_GATEWAY,
detail=routeApiMsg("Failed to set ClickUp access token. Connection may be expired or invalid."),
)
return services.clickup

View file

@ -152,10 +152,28 @@ async def get_connections(
- GET /api/connections/?mode=filterValues&column=status
- GET /api/connections/?mode=ids
"""
from modules.routes.routeHelpers import handleFilterValuesInMemory, handleIdsInMemory, enrichRowsWithFkLabels
from modules.routes.routeHelpers import (
handleFilterValuesInMemory, handleIdsInMemory, enrichRowsWithFkLabels,
handleGroupingInRequest, applyGroupScopeFilter,
)
CONTEXT_KEY = "connections"
# Parse pagination params early — needed for grouping in all modes
paginationParams = None
if pagination:
try:
paginationDict = json.loads(pagination)
if paginationDict:
paginationDict = normalize_pagination_dict(paginationDict)
paginationParams = PaginationParams(**paginationDict)
except (json.JSONDecodeError, ValueError) as e:
raise HTTPException(status_code=400, detail=f"Invalid pagination parameter: {str(e)}")
interface = getInterface(currentUser)
groupCtx = handleGroupingInRequest(paginationParams, interface, CONTEXT_KEY)
def _buildEnhancedItems():
interface = getInterface(currentUser)
connections = interface.getUserConnections(currentUser.id)
items = []
for connection in connections:
@ -182,6 +200,7 @@ async def get_connections(
try:
items = _buildEnhancedItems()
enrichRowsWithFkLabels(items, UserConnection)
items = applyGroupScopeFilter(items, groupCtx.itemIds)
return handleFilterValuesInMemory(items, column, pagination)
except Exception as e:
logger.error(f"Error getting filter values for connections: {str(e)}")
@ -189,63 +208,40 @@ async def get_connections(
if mode == "ids":
try:
return handleIdsInMemory(_buildEnhancedItems(), pagination)
items = applyGroupScopeFilter(_buildEnhancedItems(), groupCtx.itemIds)
return handleIdsInMemory(items, pagination)
except Exception as e:
logger.error(f"Error getting IDs for connections: {str(e)}")
raise HTTPException(status_code=500, detail=str(e))
try:
interface = getInterface(currentUser)
# NOTE: Cannot use db.getRecordsetPaginated() here because each connection
# is enriched with computed tokenStatus/tokenExpiresAt (requires per-row DB lookup).
# Token refresh also may trigger re-fetch. Connections per user are typically < 10,
# so in-memory pagination is acceptable.
# Parse pagination parameter
paginationParams = None
if pagination:
try:
paginationDict = json.loads(pagination)
if paginationDict:
# Normalize pagination dict (handles top-level "search" field)
paginationDict = normalize_pagination_dict(paginationDict)
paginationParams = PaginationParams(**paginationDict)
except (json.JSONDecodeError, ValueError) as e:
raise HTTPException(
status_code=400,
detail=f"Invalid pagination parameter: {str(e)}"
)
# SECURITY FIX: All users (including admins) can only see their own connections
# This prevents admin from seeing other users' connections and causing confusion
connections = interface.getUserConnections(currentUser.id)
# Perform silent token refresh for expired OAuth connections
try:
refresh_result = await token_refresh_service.refresh_expired_tokens(currentUser.id)
if refresh_result.get("refreshed", 0) > 0:
logger.info(f"Silently refreshed {refresh_result['refreshed']} tokens for user {currentUser.id}")
# Re-fetch connections to get updated token status
connections = interface.getUserConnections(currentUser.id)
except Exception as e:
logger.warning(f"Silent token refresh failed for user {currentUser.id}: {str(e)}")
# Continue with original connections even if refresh fails
# Enhance each connection with token status information and convert to dict
enhanced_connections_dict = []
for connection in connections:
# Get token status for this connection
tokenStatus, tokenExpiresAt = getTokenStatusForConnection(interface, connection.id)
# Convert to dict for filtering/sorting
connection_dict = {
"id": connection.id,
"userId": connection.userId,
"authority": connection.authority.value if hasattr(connection.authority, 'value') else str(connection.authority),
"externalId": connection.externalId,
"externalUsername": connection.externalUsername or "",
"externalEmail": connection.externalEmail, # Keep None instead of converting to empty string
"externalEmail": connection.externalEmail,
"status": connection.status.value if hasattr(connection.status, 'value') else str(connection.status),
"connectedAt": connection.connectedAt,
"lastChecked": connection.lastChecked,
@ -254,24 +250,26 @@ async def get_connections(
"tokenExpiresAt": tokenExpiresAt
}
enhanced_connections_dict.append(connection_dict)
enrichRowsWithFkLabels(enhanced_connections_dict, UserConnection)
enhanced_connections_dict = applyGroupScopeFilter(enhanced_connections_dict, groupCtx.itemIds)
if paginationParams is None:
return {
"items": enhanced_connections_dict,
"pagination": None,
"groupTree": groupCtx.groupTree,
}
# Apply filtering if provided
if paginationParams.filters:
component_interface = ComponentObjects()
component_interface.setUserContext(currentUser)
enhanced_connections_dict = component_interface._applyFilters(
enhanced_connections_dict,
enhanced_connections_dict,
paginationParams.filters
)
# Apply sorting if provided
if paginationParams.sort:
component_interface = ComponentObjects()
@ -280,14 +278,14 @@ async def get_connections(
enhanced_connections_dict,
paginationParams.sort
)
totalItems = len(enhanced_connections_dict)
totalPages = math.ceil(totalItems / paginationParams.pageSize) if totalItems > 0 else 0
startIdx = (paginationParams.page - 1) * paginationParams.pageSize
endIdx = startIdx + paginationParams.pageSize
paged_connections = enhanced_connections_dict[startIdx:endIdx]
return {
"items": paged_connections,
"pagination": PaginationMetadata(
@ -298,6 +296,7 @@ async def get_connections(
sort=paginationParams.sort,
filters=paginationParams.filters
).model_dump(),
"groupTree": groupCtx.groupTree,
}
except HTTPException:
@ -351,11 +350,18 @@ def create_connection(
externalUsername="", # Will be set after OAuth
status=ConnectionStatus.PENDING # Start with PENDING status
)
# Apply knowledge consent + preferences from request body before persisting
knowledge_enabled = connection_data.get("knowledgeIngestionEnabled")
if isinstance(knowledge_enabled, bool):
connection.knowledgeIngestionEnabled = knowledge_enabled
knowledge_prefs = connection_data.get("knowledgePreferences")
if isinstance(knowledge_prefs, dict):
connection.knowledgePreferences = knowledge_prefs
# Save connection record - models now handle timestamp serialization automatically
interface.db.recordModify(UserConnection, connection.id, connection.model_dump())
return connection
except HTTPException:
@ -586,8 +592,25 @@ def disconnect_service(
detail=routeApiMsg("Connection not found")
)
# Update connection status
connection.status = ConnectionStatus.INACTIVE
# Fire revoked event BEFORE DB status change so knowledge purge and
# status mutation form one logical step; subscribers see the
# connection as it was. INACTIVE does not exist on the enum — REVOKED
# is the correct terminal-but-retained state (deleted rows are
# handled in DELETE /{id}).
try:
from modules.shared.callbackRegistry import callbackRegistry
callbackRegistry.trigger(
"connection.revoked",
connectionId=connectionId,
authority=str(getattr(connection.authority, "value", connection.authority) or ""),
userId=str(currentUser.id),
reason="disconnected",
)
except Exception as _cbErr:
logger.warning("connection.revoked callback failed for %s: %s", connectionId, _cbErr)
connection.status = ConnectionStatus.REVOKED
connection.lastChecked = getUtcTimestamp()
# Update connection record - models now handle timestamp serialization automatically
@ -636,6 +659,23 @@ def delete_connection(
detail=routeApiMsg("Connection not found")
)
# Fire revoked event BEFORE the row disappears so consumers still
# have authority/connection context for observability; purge itself
# targets FileContentIndex rows by connectionId which are unaffected
# by the UserConnection delete.
try:
from modules.shared.callbackRegistry import callbackRegistry
callbackRegistry.trigger(
"connection.revoked",
connectionId=connectionId,
authority=str(getattr(connection.authority, "value", connection.authority) or ""),
userId=str(currentUser.id),
reason="deleted",
)
except Exception as _cbErr:
logger.warning("connection.revoked callback failed for %s: %s", connectionId, _cbErr)
# Remove the connection - only need connectionId since permissions are verified
interface.removeUserConnection(connectionId)

File diff suppressed because it is too large Load diff

View file

@ -112,8 +112,8 @@ def get_mandates(
status_code=status.HTTP_403_FORBIDDEN,
detail=routeApiMsg("Admin role required")
)
# Parse pagination parameter
# Parse pagination parameter early — needed for grouping in all modes
paginationParams = None
if pagination:
try:
@ -126,14 +126,24 @@ def get_mandates(
status_code=400,
detail=f"Invalid pagination parameter: {str(e)}"
)
from modules.routes.routeHelpers import (
handleFilterValuesInMemory, handleIdsInMemory,
handleFilterValuesMode, handleIdsMode,
parseCrossFilterPagination,
handleGroupingInRequest, applyGroupScopeFilter,
)
appInterface = interfaceDbApp.getRootInterface()
groupCtx = handleGroupingInRequest(paginationParams, appInterface, "mandates")
def _mandateItemsForAdmin():
    """Return the admin's enabled mandates as plain dicts.

    Looks up every id in the enclosing ``adminMandateIds`` through
    ``appInterface.getMandate`` and keeps the hits whose ``enabled``
    attribute is truthy (a missing attribute counts as enabled).
    """
    def _asDict(mandate):
        # Prefer model_dump(); otherwise pass dicts through and fall back to vars().
        if hasattr(mandate, 'model_dump'):
            return mandate.model_dump()
        return mandate if isinstance(mandate, dict) else vars(mandate)

    # Lazy generator keeps the original per-id order: fetch, check, convert.
    fetched = (appInterface.getMandate(mid) for mid in adminMandateIds)
    return [_asDict(m) for m in fetched if m and getattr(m, "enabled", True)]
if mode == "filterValues":
if not column:
@ -144,54 +154,42 @@ def get_mandates(
values = appInterface.db.getDistinctColumnValues(Mandate, column, crossPagination)
return JSONResponse(content=sorted(values, key=lambda v: str(v).lower()))
else:
mandateItems = []
for mid in adminMandateIds:
m = appInterface.getMandate(mid)
if m and getattr(m, "enabled", True):
mandateItems.append(m.model_dump() if hasattr(m, 'model_dump') else m if isinstance(m, dict) else vars(m))
mandateItems = applyGroupScopeFilter(_mandateItemsForAdmin(), groupCtx.itemIds)
return handleFilterValuesInMemory(mandateItems, column, pagination)
if mode == "ids":
if isPlatformAdmin:
return handleIdsMode(appInterface.db, Mandate, pagination)
else:
mandateItems = []
for mid in adminMandateIds:
m = appInterface.getMandate(mid)
if m and getattr(m, "enabled", True):
mandateItems.append(m.model_dump() if hasattr(m, 'model_dump') else m if isinstance(m, dict) else vars(m))
mandateItems = applyGroupScopeFilter(_mandateItemsForAdmin(), groupCtx.itemIds)
return handleIdsInMemory(mandateItems, pagination)
if isPlatformAdmin:
result = appInterface.getAllMandates(pagination=paginationParams)
else:
allMandates = []
for mandateId in adminMandateIds:
mandate = appInterface.getMandate(mandateId)
if mandate and getattr(mandate, "enabled", True):
mandateDict = mandate if isinstance(mandate, dict) else mandate.model_dump() if hasattr(mandate, 'model_dump') else vars(mandate)
allMandates.append(mandateDict)
result = allMandates
paginationParams = None
if paginationParams and hasattr(result, 'items'):
return PaginatedResponse(
items=result.items,
pagination=PaginationMetadata(
currentPage=paginationParams.page,
pageSize=paginationParams.pageSize,
totalItems=result.totalItems,
totalPages=result.totalPages,
sort=paginationParams.sort,
filters=paginationParams.filters
items = result.items if hasattr(result, 'items') else (result if isinstance(result, list) else [])
items = applyGroupScopeFilter(
[i.model_dump() if hasattr(i, 'model_dump') else (i if isinstance(i, dict) else vars(i)) for i in items],
groupCtx.itemIds,
)
if paginationParams and hasattr(result, 'items'):
return PaginatedResponse(
items=items,
pagination=PaginationMetadata(
currentPage=paginationParams.page,
pageSize=paginationParams.pageSize,
totalItems=result.totalItems,
totalPages=result.totalPages,
sort=paginationParams.sort,
filters=paginationParams.filters
),
groupTree=groupCtx.groupTree,
)
)
else:
return PaginatedResponse(items=items, pagination=None, groupTree=groupCtx.groupTree)
else:
items = result if isinstance(result, list) else (result.items if hasattr(result, 'items') else result)
return PaginatedResponse(
items=items,
pagination=None
)
mandateItems = applyGroupScopeFilter(_mandateItemsForAdmin(), groupCtx.itemIds)
return PaginatedResponse(items=mandateItems, pagination=None, groupTree=groupCtx.groupTree)
except HTTPException:
raise
except Exception as e:

View file

@ -44,27 +44,15 @@ def get_prompts(
- filterValues: distinct values for a column (cross-filtered)
- ids: all IDs matching current filters
"""
from modules.routes.routeHelpers import handleFilterValuesInMemory, handleIdsInMemory, enrichRowsWithFkLabels
from modules.routes.routeHelpers import (
handleFilterValuesInMemory, handleIdsInMemory, enrichRowsWithFkLabels,
handleGroupingInRequest, applyGroupScopeFilter,
)
from modules.interfaces.interfaceDbApp import getInterface as getAppInterface
def _promptsToEnrichedDicts(promptItems):
    """Convert prompt records to dicts and run them through enrichRowsWithFkLabels.

    Objects exposing ``model_dump`` are dumped, dicts are passed through
    unchanged, and anything else is converted with ``dict(...)``.
    """
    rows = []
    for record in promptItems:
        if hasattr(record, 'model_dump'):
            rows.append(record.model_dump())
        elif isinstance(record, dict):
            rows.append(record)
        else:
            rows.append(dict(record))
    # Return value is ignored here — presumably the rows are enriched in place.
    enrichRowsWithFkLabels(rows, Prompt)
    return rows
if mode == "filterValues":
if not column:
raise HTTPException(status_code=400, detail="column parameter required for mode=filterValues")
managementInterface = interfaceDbManagement.getInterface(currentUser)
result = managementInterface.getAllPrompts(pagination=None)
items = _promptsToEnrichedDicts(result)
return handleFilterValuesInMemory(items, column, pagination)
if mode == "ids":
managementInterface = interfaceDbManagement.getInterface(currentUser)
result = managementInterface.getAllPrompts(pagination=None)
items = _promptsToEnrichedDicts(result)
return handleIdsInMemory(items, pagination)
CONTEXT_KEY = "prompts"
# Parse pagination params early — needed for grouping in all modes
paginationParams = None
if pagination:
try:
@ -74,12 +62,35 @@ def get_prompts(
paginationParams = PaginationParams(**paginationDict)
except (json.JSONDecodeError, ValueError) as e:
raise HTTPException(status_code=400, detail=f"Invalid pagination parameter: {str(e)}")
appInterface = getAppInterface(currentUser)
groupCtx = handleGroupingInRequest(paginationParams, appInterface, CONTEXT_KEY)
def _promptsToEnrichedDicts(promptItems):
    """Convert prompt records to dicts and run them through enrichRowsWithFkLabels.

    Objects exposing ``model_dump`` are dumped, dicts are passed through
    unchanged, and anything else is converted with ``dict(...)``.
    """
    rows = []
    for record in promptItems:
        if hasattr(record, 'model_dump'):
            rows.append(record.model_dump())
        elif isinstance(record, dict):
            rows.append(record)
        else:
            rows.append(dict(record))
    # Return value is ignored here — presumably the rows are enriched in place.
    enrichRowsWithFkLabels(rows, Prompt)
    return rows
managementInterface = interfaceDbManagement.getInterface(currentUser)
if mode == "filterValues":
if not column:
raise HTTPException(status_code=400, detail="column parameter required for mode=filterValues")
result = managementInterface.getAllPrompts(pagination=None)
items = _promptsToEnrichedDicts(result)
items = applyGroupScopeFilter(items, groupCtx.itemIds)
return handleFilterValuesInMemory(items, column, pagination)
if mode == "ids":
result = managementInterface.getAllPrompts(pagination=None)
items = _promptsToEnrichedDicts(result)
items = applyGroupScopeFilter(items, groupCtx.itemIds)
return handleIdsInMemory(items, pagination)
result = managementInterface.getAllPrompts(pagination=paginationParams)
if paginationParams:
items = _promptsToEnrichedDicts(result.items)
items = applyGroupScopeFilter(_promptsToEnrichedDicts(result.items), groupCtx.itemIds)
return {
"items": items,
"pagination": PaginationMetadata(
@ -90,12 +101,14 @@ def get_prompts(
sort=paginationParams.sort,
filters=paginationParams.filters
).model_dump(),
"groupTree": groupCtx.groupTree,
}
else:
items = _promptsToEnrichedDicts(result)
items = applyGroupScopeFilter(_promptsToEnrichedDicts(result), groupCtx.itemIds)
return {
"items": items,
"pagination": None,
"groupTree": groupCtx.groupTree,
}

View file

@ -208,6 +208,21 @@ def get_users(
- GET /api/users/ (no pagination - returns all users in mandate)
- GET /api/users/?pagination={"page":1,"pageSize":10,"sort":[]}
"""
# Parse pagination early — needed for grouping in all modes
_paginationParams = None
if pagination:
try:
_pd = json.loads(pagination)
if _pd:
_pd = normalize_pagination_dict(_pd)
_paginationParams = PaginationParams(**_pd)
except (json.JSONDecodeError, ValueError) as e:
raise HTTPException(status_code=400, detail=f"Invalid pagination parameter: {str(e)}")
from modules.routes.routeHelpers import handleGroupingInRequest as _handleGrouping, applyGroupScopeFilter as _applyGroupScope
_appInterfaceForGrouping = interfaceDbApp.getInterface(context.user, mandateId=context.mandateId)
_groupCtx = _handleGrouping(_paginationParams, _appInterfaceForGrouping, "users")
if mode == "filterValues":
if not column:
raise HTTPException(status_code=400, detail="column parameter required for mode=filterValues")
@ -217,27 +232,15 @@ def get_users(
return _getUserFilterOrIds(context, pagination, idsMode=True)
try:
paginationParams = None
if pagination:
try:
paginationDict = json.loads(pagination)
if paginationDict:
paginationDict = normalize_pagination_dict(paginationDict)
paginationParams = PaginationParams(**paginationDict)
except (json.JSONDecodeError, ValueError) as e:
raise HTTPException(
status_code=400,
detail=f"Invalid pagination parameter: {str(e)}"
)
appInterface = interfaceDbApp.getInterface(context.user, mandateId=context.mandateId)
paginationParams = _paginationParams
appInterface = _appInterfaceForGrouping
if context.mandateId:
# Get users for specific mandate using getUsersByMandate
result = appInterface.getUsersByMandate(str(context.mandateId), paginationParams)
if paginationParams and hasattr(result, 'items'):
enriched = enrichRowsWithFkLabels(_usersToDicts(result.items), User)
enriched = _applyGroupScope(enrichRowsWithFkLabels(_usersToDicts(result.items), User), _groupCtx.itemIds)
return {
"items": enriched,
"pagination": PaginationMetadata(
@ -248,17 +251,18 @@ def get_users(
sort=paginationParams.sort,
filters=paginationParams.filters
).model_dump(),
"groupTree": _groupCtx.groupTree,
}
else:
users = result if isinstance(result, list) else result.items if hasattr(result, 'items') else []
enriched = enrichRowsWithFkLabels(_usersToDicts(users), User)
return {"items": enriched, "pagination": None}
enriched = _applyGroupScope(enrichRowsWithFkLabels(_usersToDicts(users), User), _groupCtx.itemIds)
return {"items": enriched, "pagination": None, "groupTree": _groupCtx.groupTree}
elif context.isPlatformAdmin:
# PlatformAdmin without mandateId — DB-level pagination via interface
result = appInterface.getAllUsers(paginationParams)
if paginationParams and hasattr(result, 'items'):
enriched = enrichRowsWithFkLabels(_usersToDicts(result.items), User)
enriched = _applyGroupScope(enrichRowsWithFkLabels(_usersToDicts(result.items), User), _groupCtx.itemIds)
return {
"items": enriched,
"pagination": PaginationMetadata(
@ -269,11 +273,12 @@ def get_users(
sort=paginationParams.sort,
filters=paginationParams.filters
).model_dump(),
"groupTree": _groupCtx.groupTree,
}
else:
users = result if isinstance(result, list) else (result.items if hasattr(result, 'items') else [])
enriched = enrichRowsWithFkLabels(_usersToDicts(users), User)
return {"items": enriched, "pagination": None}
enriched = _applyGroupScope(enrichRowsWithFkLabels(_usersToDicts(users), User), _groupCtx.itemIds)
return {"items": enriched, "pagination": None, "groupTree": _groupCtx.groupTree}
else:
# Non-SysAdmin without mandateId: aggregate users across all admin mandates
rootInterface = getRootInterface()
@ -313,16 +318,16 @@ def get_users(
]
from modules.routes.routeHelpers import applyFiltersAndSort as _applyFiltersAndSortHelper
filteredUsers = _applyFiltersAndSortHelper(allUsers, paginationParams)
filteredUsers = _applyGroupScope(_applyFiltersAndSortHelper(allUsers, paginationParams), _groupCtx.itemIds)
enriched = enrichRowsWithFkLabels(filteredUsers, User)
if paginationParams:
import math
totalItems = len(enriched)
totalPages = math.ceil(totalItems / paginationParams.pageSize) if totalItems > 0 else 0
startIdx = (paginationParams.page - 1) * paginationParams.pageSize
endIdx = startIdx + paginationParams.pageSize
return {
"items": enriched[startIdx:endIdx],
"pagination": PaginationMetadata(
@ -333,9 +338,10 @@ def get_users(
sort=paginationParams.sort,
filters=paginationParams.filters
).model_dump(),
"groupTree": _groupCtx.groupTree,
}
else:
return {"items": enriched, "pagination": None}
return {"items": enriched, "pagination": None, "groupTree": _groupCtx.groupTree}
except HTTPException:
raise
except Exception as e:

View file

@ -701,3 +701,157 @@ def paginateInMemory(
offset = (paginationParams.page - 1) * paginationParams.pageSize
pageItems = items[offset:offset + paginationParams.pageSize]
return pageItems, totalItems
# ---------------------------------------------------------------------------
# Table Grouping helpers
# ---------------------------------------------------------------------------
from dataclasses import dataclass, field as dc_field
@dataclass
class GroupingContext:
    """Outcome of ``handleGroupingInRequest`` for a single list request.

    Bundles the group tree that is echoed back in the response with the
    resolved item-ID set used for group-scope filtering.
    """

    # Group tree as a list of plain dicts (serialised TableGroupNode objects),
    # embedded in the response.
    groupTree: Optional[list]
    # Set of item-ID strings when a groupId was requested; None means no
    # group scope is active.
    itemIds: Optional[set]
def _collectItemIds(nodes: list, groupId: str) -> Optional[set]:
"""
Recursively search *nodes* for a node whose id == groupId and collect
all itemIds from it and all its descendant subGroups.
Returns None if the group is not found.
"""
for node in nodes:
nodeId = node.get("id") if isinstance(node, dict) else getattr(node, "id", None)
if nodeId == groupId:
ids: set = set()
_collectAllIds(node, ids)
return ids
subGroups = node.get("subGroups", []) if isinstance(node, dict) else getattr(node, "subGroups", [])
result = _collectItemIds(subGroups, groupId)
if result is not None:
return result
return None
def _collectAllIds(node, ids: set) -> None:
"""Collect itemIds from a node and all its descendants into ids."""
nodeItemIds = node.get("itemIds", []) if isinstance(node, dict) else getattr(node, "itemIds", [])
for iid in nodeItemIds:
ids.add(str(iid))
subGroups = node.get("subGroups", []) if isinstance(node, dict) else getattr(node, "subGroups", [])
for child in subGroups:
_collectAllIds(child, ids)
def _removeGroupFromTree(nodes: list, groupId: str) -> list:
"""Remove a group node (and all descendants) from the tree by id."""
result = []
for node in nodes:
nodeId = node.get("id") if isinstance(node, dict) else getattr(node, "id", None)
if nodeId == groupId:
continue # skip this node (remove it)
subGroups = node.get("subGroups", []) if isinstance(node, dict) else getattr(node, "subGroups", [])
filtered_sub = _removeGroupFromTree(subGroups, groupId)
if isinstance(node, dict):
node = {**node, "subGroups": filtered_sub}
result.append(node)
return result
def _dumpGroupNodes(nodes) -> list:
    """Serialise group nodes: call model_dump() where available, else keep as-is."""
    return [n.model_dump() if hasattr(n, "model_dump") else n for n in nodes]


def _loadGroupTree(interface, contextKey: str) -> Optional[list]:
    """Load the stored group tree for *contextKey*; None if absent or on error."""
    try:
        existing = interface.getTableGrouping(contextKey)
        if existing:
            return _dumpGroupNodes(existing.rootGroups)
    except Exception as e:
        # Best effort: a failing lookup must not break the list route.
        logger.warning(f"handleGroupingInRequest: getTableGrouping failed: {e}")
    return None


def handleGroupingInRequest(
    paginationParams: Optional[PaginationParams],
    interface,
    contextKey: str,
) -> GroupingContext:
    """
    Central grouping handler; call it at the start of every list route that
    supports table grouping.

    Steps (in order):
    1. If paginationParams.saveGroupTree is set:
       persist the new tree via interface.upsertTableGrouping, then clear
       saveGroupTree on paginationParams so it is not treated as a filter.
    2. Load the current group tree via interface.getTableGrouping unless
       step 1 already produced it (used in step 3 and in the response).
    3. If paginationParams.groupId is set:
       resolve it to a set of item ids (including all sub-groups), then
       clear groupId on paginationParams so it is not treated as a normal
       filter field. An unknown group, or a missing tree, resolves to an
       empty set so that nothing is shown rather than everything.
    4. Return a GroupingContext with groupTree (for the response) and itemIds
       (for applyGroupScopeFilter).

    When paginationParams is None only the stored tree is loaded and no
    group scope is applied.

    The caller does not need to handle any grouping logic itself; it only
    calls applyGroupScopeFilter(items, groupCtx.itemIds) and embeds
    groupCtx.groupTree in the response dict.

    Note: paginationParams is mutated (saveGroupTree / groupId are reset).
    Errors from the interface are logged and swallowed.
    """
    if paginationParams is None:
        return GroupingContext(groupTree=_loadGroupTree(interface, contextKey), itemIds=None)

    groupTree = None
    itemIds = None

    # Step 1: persist saveGroupTree if present
    if paginationParams.saveGroupTree is not None:
        try:
            saved = interface.upsertTableGrouping(contextKey, paginationParams.saveGroupTree)
            groupTree = _dumpGroupNodes(saved.rootGroups)
        except Exception as e:
            logger.error(f"handleGroupingInRequest: upsertTableGrouping failed: {e}")
        paginationParams.saveGroupTree = None

    # Step 2: load current tree (only if not already set from save above)
    if groupTree is None:
        groupTree = _loadGroupTree(interface, contextKey)

    # Step 3: resolve groupId to itemIds set
    if paginationParams.groupId is not None:
        targetGroupId = paginationParams.groupId
        paginationParams.groupId = None  # remove so it is not treated as a normal filter
        if groupTree:
            itemIds = _collectItemIds(groupTree, targetGroupId)
            if itemIds is None:
                logger.warning(
                    f"handleGroupingInRequest: groupId={targetGroupId!r} not found in tree "
                    f"for contextKey={contextKey!r} — returning empty set"
                )
                itemIds = set()  # unknown group → show nothing rather than everything
        else:
            # groupId sent but no tree saved yet → return empty (nothing belongs to any group)
            logger.warning(
                f"handleGroupingInRequest: groupId={targetGroupId!r} set but no tree exists "
                f"for contextKey={contextKey!r} — returning empty set"
            )
            itemIds = set()

    return GroupingContext(groupTree=groupTree, itemIds=itemIds)
def applyGroupScopeFilter(items: List[Dict[str, Any]], itemIds: Optional[set]) -> List[Dict[str, Any]]:
    """
    Restrict *items* to the rows whose "id" (compared as a string) is in *itemIds*.

    When *itemIds* is None no group scope is active and the very same list
    object is handed back untouched. Suitable for plain list responses as
    well as for the mode=ids / mode=filterValues flows; apply it before
    handleIdsInMemory / handleFilterValuesInMemory.
    """
    if itemIds is not None:
        scoped = []
        for row in items:
            # Rows without an "id" key are matched as the empty string.
            if str(row.get("id", "")) in itemIds:
                scoped.append(row)
        return scoped
    return items

View file

@ -241,6 +241,29 @@ async def auth_connect_callback(
)
interface.saveConnectionToken(token)
try:
from modules.shared.callbackRegistry import callbackRegistry
if connection.knowledgeIngestionEnabled:
callbackRegistry.trigger(
"connection.established",
connectionId=connection.id,
authority=str(getattr(connection.authority, "value", connection.authority) or "clickup"),
userId=str(user.id),
)
else:
logger.info(
"ingestion.connection.bootstrap.skipped — knowledge ingestion disabled by user",
extra={
"event": "ingestion.connection.bootstrap.skipped",
"connectionId": connection.id,
"authority": "clickup",
"reason": "consent_disabled",
},
)
except Exception as _cbErr:
logger.warning("connection.established callback failed for %s: %s", connection.id, _cbErr)
return HTMLResponse(
content=f"""
<html>

View file

@ -479,6 +479,29 @@ async def auth_connect_callback(
)
interface.saveConnectionToken(token)
try:
from modules.shared.callbackRegistry import callbackRegistry
if connection.knowledgeIngestionEnabled:
callbackRegistry.trigger(
"connection.established",
connectionId=connection.id,
authority=str(getattr(connection.authority, "value", connection.authority) or "google"),
userId=str(user.id),
)
else:
logger.info(
"ingestion.connection.bootstrap.skipped — knowledge ingestion disabled by user",
extra={
"event": "ingestion.connection.bootstrap.skipped",
"connectionId": connection.id,
"authority": "google",
"reason": "consent_disabled",
},
)
except Exception as _cbErr:
logger.warning("connection.established callback failed for %s: %s", connection.id, _cbErr)
return HTMLResponse(
content=f"""
<html>

View file

@ -420,6 +420,29 @@ async def auth_connect_callback(
)
interface.saveConnectionToken(token)
try:
from modules.shared.callbackRegistry import callbackRegistry
if connection.knowledgeIngestionEnabled:
callbackRegistry.trigger(
"connection.established",
connectionId=connection.id,
authority=str(getattr(connection.authority, "value", connection.authority) or "msft"),
userId=str(user.id),
)
else:
logger.info(
"ingestion.connection.bootstrap.skipped — knowledge ingestion disabled by user",
extra={
"event": "ingestion.connection.bootstrap.skipped",
"connectionId": connection.id,
"authority": "msft",
"reason": "consent_disabled",
},
)
except Exception as _cbErr:
logger.warning("connection.established callback failed for %s: %s", connection.id, _cbErr)
return HTMLResponse(
content=f"""
<html>

View file

@ -128,7 +128,7 @@ async def getSharepointFolderOptionsByReference(
# Set access token on SharePoint service
if not services.sharepoint.setAccessTokenFromConnection(connection):
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
status_code=status.HTTP_502_BAD_GATEWAY,
detail=routeApiMsg("Failed to set SharePoint access token. Connection may be expired or invalid.")
)

View file

@ -3,7 +3,7 @@
"""ActionToolAdapter: wraps existing workflow actions (dynamicMode=True) as agent tools."""
import logging
from typing import Dict, Any, List
from typing import Dict, Any, List, Optional
from modules.serviceCenter.services.serviceAgent.datamodelAgent import (
ToolDefinition, ToolResult
@ -44,7 +44,7 @@ class ActionToolAdapter:
compoundName = f"{shortName}_{actionName}"
toolDef = _buildToolDefinition(compoundName, actionDef, actionInfo)
handler = _createDispatchHandler(self._actionExecutor, shortName, actionName)
handler = _createDispatchHandler(self._actionExecutor, shortName, actionName, self._actionExecutor.services)
toolRegistry.registerFromDefinition(toolDef, handler)
self._registeredTools.append(compoundName)
registered += 1
@ -186,7 +186,7 @@ def _catalogTypeToJsonSchema(typeStr: str, _depth: int = 0) -> Dict[str, Any]:
return {"type": "string", "description": f"unknown type '{typeStr}' (defaulted to string)"}
def _createDispatchHandler(actionExecutor, methodName: str, actionName: str):
def _createDispatchHandler(actionExecutor, methodName: str, actionName: str, services=None):
"""Create an async handler that dispatches to the ActionExecutor.
Parameter validation and Ref-payload normalization (collapsing
@ -204,7 +204,7 @@ def _createDispatchHandler(actionExecutor, methodName: str, actionName: str):
if "mandateId" not in args and context.get("mandateId"):
args["mandateId"] = context["mandateId"]
result = await actionExecutor.executeAction(methodName, actionName, args)
data = _formatActionResult(result)
data = _formatActionResult(result, services, context)
return ToolResult(
toolCallId="",
toolName=f"{methodName}_{actionName}",
@ -223,9 +223,65 @@ def _createDispatchHandler(actionExecutor, methodName: str, actionName: str):
return _handler
def _formatActionResult(result) -> str:
"""Format an ActionResult into a text representation for the agent."""
_INLINE_CONTENT_LIMIT = 2000
def _persistLargeDocument(doc, services, context: Dict[str, Any]) -> Optional[str]:
"""Save an ActionDocument with large content as a workspace file.
Returns a formatted result line (with file id + docItem ref) or None
if persistence is not possible.
"""
if not services:
return None
chatService = getattr(services, "chat", None)
if not chatService:
return None
docData = getattr(doc, "documentData", None)
if not docData or not isinstance(docData, str):
return None
docName = getattr(doc, "documentName", "unnamed")
docBytes = docData.encode("utf-8")
try:
fileItem, _ = chatService.interfaceDbComponent.saveUploadedFile(docBytes, docName)
fiId = context.get("featureInstanceId") or getattr(services, "featureInstanceId", "")
if fiId:
chatService.interfaceDbComponent.updateFile(fileItem.id, {"featureInstanceId": fiId})
from modules.serviceCenter.services.serviceAgent.coreTools._helpers import (
_attachFileAsChatDocument,
_formatToolFileResult,
_getOrCreateTempFolder,
)
tempFolderId = _getOrCreateTempFolder(chatService)
if tempFolderId:
chatService.interfaceDbComponent.updateFile(fileItem.id, {"folderId": tempFolderId})
chatDocId = _attachFileAsChatDocument(
services, fileItem,
label=f"action_doc:{docName}",
userMessage=f"Action document: {docName}",
)
return _formatToolFileResult(
fileItem=fileItem,
chatDocId=chatDocId,
actionLabel="Produced",
extraInfo="Use readFile to read the content.",
)
except Exception as e:
logger.warning(f"_persistLargeDocument failed for {docName}: {e}")
return None
def _formatActionResult(result, services=None, context: Optional[Dict[str, Any]] = None) -> str:
"""Format an ActionResult into a text representation for the agent.
Documents whose content exceeds the inline limit are persisted as
workspace files so the agent can access them via readFile /
ai_process / searchInFileContent.
"""
parts = []
ctx = context or {}
if result.resultLabel:
parts.append(f"Result: {result.resultLabel}")
@ -238,10 +294,19 @@ def _formatActionResult(result) -> str:
for doc in result.documents:
docName = getattr(doc, "documentName", "unnamed")
docType = getattr(doc, "mimeType", "unknown")
parts.append(f" - {docName} ({docType})")
docData = getattr(doc, "documentData", None)
if docData and isinstance(docData, str) and len(docData) < 2000:
parts.append(f" Content: {docData[:2000]}")
isLarge = docData and isinstance(docData, str) and len(docData) >= _INLINE_CONTENT_LIMIT
if isLarge:
persistedLine = _persistLargeDocument(doc, services, ctx)
if persistedLine:
parts.append(f" - {docName} ({docType})")
parts.append(f" {persistedLine}")
continue
parts.append(f" - {docName} ({docType})")
if docData and isinstance(docData, str) and len(docData) < _INLINE_CONTENT_LIMIT:
parts.append(f" Content: {docData[:_INLINE_CONTENT_LIMIT]}")
if not parts:
parts.append("Action completed successfully." if result.success else "Action failed.")

View file

@ -198,7 +198,10 @@ def _registerDataSourceTools(registry: ToolRegistry, services):
if isinstance(result, _DR):
fileBytes = result.data
fileName = result.fileName or fileName
resolvedName = result.fileName or fileName
if resolvedName != fileName:
logger.debug(f"downloadFromDataSource: connector fileName={result.fileName!r} overrides arg fileName={fileName!r}")
fileName = resolvedName
else:
fileBytes = result

View file

@ -11,8 +11,6 @@ from modules.serviceCenter.services.serviceAgent.toolRegistry import ToolRegistr
from modules.serviceCenter.services.serviceAgent.coreTools._helpers import (
_getOrCreateTempFolder,
_looksLikeBinary,
_resolveFileScope,
_MAX_TOOL_RESULT_CHARS,
)
@ -392,65 +390,7 @@ def _registerDocumentTools(registry: ToolRegistry, services):
if chunkMime:
mimeType = chunkMime
# 2) File not yet indexed -> trigger extraction via ExtractionService, then retry
if not imageData and knowledgeService and not knowledgeService.isFileIndexed(fileId):
try:
chatService = services.chat
fileInfo = chatService.getFileInfo(fileId)
fileContent = chatService.getFileContent(fileId)
if fileContent and fileInfo:
rawData = fileContent.get("data", "")
if isinstance(rawData, str) and len(rawData) > 100:
rawBytes = _b64.b64decode(rawData)
elif isinstance(rawData, bytes):
rawBytes = rawData
else:
rawBytes = None
if rawBytes:
from modules.serviceCenter.services.serviceExtraction.subRegistry import ExtractorRegistry
from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
from modules.datamodels.datamodelExtraction import ExtractionOptions
fileMime = fileInfo.get("mimeType", "application/octet-stream")
fileName = fileInfo.get("fileName", fileId)
extracted = runExtraction(
ExtractorRegistry(), None,
rawBytes, fileName, fileMime, ExtractionOptions(),
)
contentObjects = []
for part in extracted.parts:
tg = (part.typeGroup or "").lower()
ct = "image" if tg == "image" else "text"
if not part.data or not part.data.strip():
continue
contentObjects.append({
"contentObjectId": part.id,
"contentType": ct,
"data": part.data,
"contextRef": {"containerPath": fileName, "location": part.label, **(part.metadata or {})},
})
if contentObjects:
_diFiId, _diMId = _resolveFileScope(fileId, context)
await knowledgeService.indexFile(
fileId=fileId, fileName=fileName, mimeType=fileMime,
userId=context.get("userId", ""), contentObjects=contentObjects,
featureInstanceId=_diFiId,
mandateId=_diMId,
)
chunks = knowledgeService._knowledgeDb.getContentChunks(fileId)
imageChunks = [c for c in (chunks or []) if c.get("contentType") == "image"]
if pageIndex is not None:
imageChunks = [c for c in imageChunks if c.get("contextRef", {}).get("pageIndex") == pageIndex]
if imageChunks:
imageData = imageChunks[0].get("data", "")
except Exception as extractErr:
logger.warning(f"describeImage: on-demand extraction failed: {extractErr}")
# 3) Direct image file (not a container) - use raw file data
# 2) Direct image file (not a container) - use raw file data
if not imageData:
chatService = services.chat
fileContent = chatService.getFileContent(fileId)
@ -460,7 +400,7 @@ def _registerDocumentTools(registry: ToolRegistry, services):
imageData = fileContent.get("data", "")
mimeType = fileMimeType
# 4) PDF page rendering: render the requested page as an image via PyMuPDF
# 3) PDF page rendering: render the requested page as an image via PyMuPDF
if not imageData:
chatService = services.chat
fileInfo = chatService.getFileInfo(fileId) if hasattr(chatService, "getFileInfo") else None

View file

@ -1,6 +1,6 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Shared helpers for core agent tools (file scope, binary detection, temp folder)."""
"""Shared helpers for core agent tools (file scope, binary detection, group helpers)."""
import logging
import uuid
@ -46,39 +46,60 @@ def _looksLikeBinary(data: bytes, sampleSize: int = 1024) -> bool:
return nonPrintable / len(sample) > 0.10
def _getOrCreateInstanceFolder(chatService, featureInstanceId: str, mandateId: str = "") -> Optional[str]:
    """Look up (or lazily create) the folder that belongs to a feature instance.

    The work is delegated to ``_ensureFeatureInstanceFolder`` on
    ``chatService.interfaceDbComponent``. Any exception is logged as a
    warning and turned into ``None`` so callers can simply skip the folder
    assignment.

    Args:
        chatService: Object exposing ``interfaceDbComponent``.
        featureInstanceId: Feature instance whose folder is requested.
        mandateId: Passed through to the DB layer; defaults to "".

    Returns:
        Whatever ``_ensureFeatureInstanceFolder`` returns, or None on error.
    """
    try:
        folderId = chatService.interfaceDbComponent._ensureFeatureInstanceFolder(
            featureInstanceId, mandateId
        )
    except Exception as err:
        logger.warning(f"Could not get/create instance folder for {featureInstanceId}: {err}")
        folderId = None
    return folderId
def _getOrCreateTempFolder(chatService) -> Optional[str]:
    """Deprecated stub: folder-based organisation has been replaced by grouping.

    Returns None unconditionally so callers skip the (now removed) folderId
    assignment. Remove callers incrementally and delete this stub afterwards.

    Args:
        chatService: Unused; kept so existing call sites keep working.

    Returns:
        Always None.
    """
    logger.debug("_getOrCreateTempFolder called folder support removed, returning None")
    return None
async def _getOrCreateInstanceGroup(
    appInterface,
    featureInstanceId: str,
    contextKey: str = "files/list",
) -> Optional[str]:
    """Return groupId of the default group for a feature instance; create if needed.

    The grouping tree stored under ``contextKey`` is searched depth-first for
    a node whose ``meta["featureInstanceId"]`` equals ``featureInstanceId``.
    When no such node exists, a new root group (named after the feature
    instance, with a fresh UUID as id) is appended and the whole tree is
    written back via ``appInterface.upsertTableGrouping``.

    Args:
        appInterface: Object providing ``getTableGrouping(contextKey)`` and
            ``upsertTableGrouping(contextKey, nodes)``.
        featureInstanceId: Identifier the group is tagged with.
        contextKey: Grouping context to read and write.

    Returns:
        The id of the existing or newly created group, or None if anything
        raised (the error is logged).
    """

    def _field(node, key, default):
        # Nodes can be plain dicts or attribute-style objects, depending on
        # how the stored grouping was normalised.
        if isinstance(node, dict):
            return node.get(key, default)
        return getattr(node, key, default)

    def _find(nds):
        for nd in nds:
            meta = _field(nd, "meta", {})
            if (meta or {}).get("featureInstanceId") == featureInstanceId:
                return _field(nd, "id", None)
            # subGroups may be stored as None; treat that as "no children".
            found = _find(_field(nd, "subGroups", []) or [])
            if found:
                return found
        return None

    try:
        existing = appInterface.getTableGrouping(contextKey)
        nodes = [
            n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n))
            for n in (existing.rootGroups if existing else [])
        ]
        found = _find(nodes)
        if found:
            return found
        newId = str(uuid.uuid4())
        nodes.append({"id": newId, "name": featureInstanceId, "itemIds": [], "subGroups": [], "meta": {"featureInstanceId": featureInstanceId}})
        appInterface.upsertTableGrouping(contextKey, nodes)
        return newId
    except Exception as e:
        logger.error(f"_getOrCreateInstanceGroup: {e}")
        return None
async def _getOrCreateTempGroup(
    appInterface,
    sessionId: str,
    contextKey: str = "files/list",
) -> Optional[str]:
    """Fetch (or create) the temporary group that belongs to a session.

    The session is mapped onto the instance-group mechanism by using
    ``_temp_<sessionId>`` as the feature instance identifier.

    Returns:
        The value returned by ``_getOrCreateInstanceGroup`` for that key.
    """
    tempGroupKey = f"_temp_{sessionId}"
    groupId = await _getOrCreateInstanceGroup(appInterface, tempGroupKey, contextKey)
    return groupId
def _attachFileAsChatDocument(
services: Any,
fileItem: Any,

View file

@ -25,142 +25,11 @@ def _registerMediaTools(registry: ToolRegistry, services):
# ---- Document rendering tool ----
def _markdownToDocumentJson(markdown: str, title: str, language: str = "de") -> Dict[str, Any]:
    """Convert markdown to the document JSON structure used by the renderers.

    Delegates to the consolidated parser ``markdownToDocumentJson`` in
    subDocumentUtility instead of keeping a second, diverging copy of the
    parser here. The only local adjustment is that
    ``metadata["extraction_method"]`` is set to ``"agent_rendering"``.

    Args:
        markdown: Markdown source produced by the agent.
        title: Document title passed through to the parser.
        language: Document language passed through to the parser.

    Returns:
        The dict returned by the shared parser, with the extraction method
        overwritten as described above.
    """
    # Imported locally, as elsewhere in this module, so the dependency is only
    # loaded when the tool is actually used.
    from modules.serviceCenter.services.serviceGeneration.subDocumentUtility import markdownToDocumentJson
    result = markdownToDocumentJson(markdown, title, language)
    result["metadata"]["extraction_method"] = "agent_rendering"
    return result
async def _renderDocument(args: Dict[str, Any], context: Dict[str, Any]):
"""Render agent-produced markdown content into any document format via the RendererRegistry."""
@ -245,35 +114,75 @@ def _registerMediaTools(registry: ToolRegistry, services):
except Exception as e:
logger.warning(f"renderDocument: knowledge service unavailable: {e}")
resolvedImages = 0
def _resolveImageRef(targetObj, fileRefKey="_fileRef", fileIdKey="fileId"):
"""Resolve a single image reference dict to base64Data in-place."""
nonlocal resolvedImages
fileRef = targetObj.get(fileRefKey, "") or targetObj.get(fileIdKey, "")
if not fileRef or targetObj.get("base64Data"):
return
if knowledgeService:
chunks = knowledgeService._knowledgeDb.getContentChunks(fileRef)
imageChunks = [c for c in (chunks or []) if c.get("contentType") == "image"]
if imageChunks:
targetObj["base64Data"] = imageChunks[0].get("data", "")
chunkMime = imageChunks[0].get("contextRef", {}).get("mimeType", "image/png")
targetObj["mimeType"] = chunkMime
resolvedImages += 1
if not targetObj.get("base64Data"):
try:
rawBytes = services.chat.getFileData(fileRef)
if rawBytes:
import base64 as _b64
targetObj["base64Data"] = _b64.b64encode(rawBytes).decode("ascii")
targetObj["mimeType"] = "image/png"
resolvedImages += 1
except Exception as e:
logger.warning(f"renderDocument: image resolve failed for fileRef={fileRef}: {e}")
targetObj.pop("_fileRef", None)
targetObj.pop("_srcUrl", None)
def _resolveInlineRuns(runsList):
    """Resolve every unresolved image run in a list of inline runs.

    A run is handled when its ``type`` is ``"image"``, it carries a
    ``fileId`` and has no ``base64Data`` yet; such runs are passed to
    ``_resolveImageRef`` with ``fileId`` as the reference key.
    """
    # Lazy generator: each run is checked right before it is resolved,
    # exactly like a plain loop with an inline condition.
    pendingImageRuns = (
        inlineRun for inlineRun in runsList
        if inlineRun.get("type") == "image"
        and inlineRun.get("fileId")
        and not inlineRun.get("base64Data")
    )
    for imageRun in pendingImageRuns:
        _resolveImageRef(imageRun, fileRefKey="fileId", fileIdKey="fileId")
for doc in structuredContent.get("documents", []):
for section in doc.get("sections", []):
if section.get("content_type") != "image":
cType = section.get("content_type")
# Block-level image sections
if cType == "image":
for element in section.get("elements", []):
contentObj = element.get("content", {})
_resolveImageRef(contentObj)
continue
for element in section.get("elements", []):
contentObj = element.get("content", {})
fileRef = contentObj.get("_fileRef", "")
if not fileRef or contentObj.get("base64Data"):
continue
if knowledgeService:
chunks = knowledgeService._knowledgeDb.getContentChunks(fileRef)
imageChunks = [c for c in (chunks or []) if c.get("contentType") == "image"]
if imageChunks:
contentObj["base64Data"] = imageChunks[0].get("data", "")
chunkMime = imageChunks[0].get("contextRef", {}).get("mimeType", "image/png")
contentObj["mimeType"] = chunkMime
resolvedImages += 1
if not contentObj.get("base64Data"):
try:
rawBytes = services.chat.getFileData(fileRef)
if rawBytes:
import base64 as _b64
contentObj["base64Data"] = _b64.b64encode(rawBytes).decode("ascii")
contentObj["mimeType"] = "image/png"
resolvedImages += 1
except Exception as e:
logger.warning(f"renderDocument: image resolve failed for fileRef={fileRef}: {e}")
contentObj.pop("_fileRef", None)
contentObj.pop("_srcUrl", None)
# Paragraphs with inlineRuns
if cType == "paragraph":
for element in section.get("elements", []):
runs = element.get("content", {}).get("inlineRuns")
if runs:
_resolveInlineRuns(runs)
continue
# Bullet lists - items are List[List[InlineRun]]
if cType == "bullet_list":
for element in section.get("elements", []):
items = element.get("content", {}).get("items", [])
for item in items:
if isinstance(item, list):
_resolveInlineRuns(item)
continue
# Tables - headers and row cells are List[InlineRun]
if cType == "table":
for element in section.get("elements", []):
contentObj = element.get("content", {})
for cell in contentObj.get("headers", []):
if isinstance(cell, list):
_resolveInlineRuns(cell)
for row in contentObj.get("rows", []):
for cell in row:
if isinstance(cell, list):
_resolveInlineRuns(cell)
sectionCount = len(structuredContent.get("documents", [{}])[0].get("sections", []))
logger.info(f"renderDocument: parsed {sectionCount} sections from markdown ({len(content)} chars), resolved {resolvedImages} image(s), format={outputFormat}")
@ -285,6 +194,7 @@ def _registerMediaTools(registry: ToolRegistry, services):
language=language,
title=title,
userPrompt=content,
style=args.get("style"),
)
if not documents:
@ -367,6 +277,20 @@ def _registerMediaTools(registry: ToolRegistry, services):
"outputFormat": {"type": "string", "description": "Target format: pdf, docx, xlsx, pptx, csv, html, md, json, txt", "default": "pdf"},
"title": {"type": "string", "description": "Document title", "default": "Document"},
"language": {"type": "string", "description": "Document language (ISO 639-1)", "default": "de"},
"style": {
"type": "object",
"description": (
"Optional style overrides for the rendered document. Supports nested keys: "
"fonts (primary, monospace), colors (primary, secondary, accent, background), "
"headings (h1-h4 with sizePt, weight, color, spaceBeforePt, spaceAfterPt), "
"paragraph (sizePt, lineSpacing, color), table (headerBg, headerFg, headerSizePt, "
"bodySizePt, rowBandingEven, rowBandingOdd, borderColor, borderWidthPt), "
"list (bulletChar, indentPt, sizePt), image (defaultWidthPt, maxWidthPt, alignment), "
"codeBlock (fontSizePt, background, borderColor), "
"page (format, marginsPt, showPageNumbers, headerHeight, footerHeight, headerLogo, headerText, footerText). "
"Only provided keys override defaults; omitted keys keep their default values."
),
},
},
},
readOnly=False,
@ -912,7 +836,7 @@ def _registerMediaTools(registry: ToolRegistry, services):
return ToolResult(toolCallId="", toolName="executeCode", success=False, error=f"Language '{language}' not supported. Only 'python' is available.")
try:
from modules.serviceCenter.services.serviceAgent.sandboxExecutor import executePython
result = await executePython(code)
result = await executePython(code, services=services)
if result.get("success"):
output = result.get("output", "(no output)")
return ToolResult(toolCallId="", toolName="executeCode", success=True, data=output)
@ -962,12 +886,17 @@ def _registerMediaTools(registry: ToolRegistry, services):
readOnly=True
)
from modules.serviceCenter.services.serviceAgent.sandboxExecutor import SANDBOX_ALLOWED_MODULES
moduleList = ", ".join(sorted(SANDBOX_ALLOWED_MODULES | {"io"}))
registry.register(
"executeCode", _executeCode,
description=(
"Execute Python code in a sandboxed environment for calculations and data analysis. "
"Available modules: math, statistics, json, csv, re, datetime, collections, itertools, functools, decimal, fractions, random. "
"No file system, network, or OS access. Max 30s execution time. "
f"Execute Python code in a sandboxed environment for calculations and data analysis. "
f"Available modules: {moduleList}. "
"io is restricted to StringIO and BytesIO only (no file access). "
"Built-in readFile(fileId) returns UTF-8 content of a workspace file by its file ID "
"(use the 'file id' from tool outputs, e.g. data = readFile('019af...')). "
"No other file system, network, or OS access. Max 30s execution time. "
"Use print() to produce output."
),
parameters={

View file

@ -11,10 +11,9 @@ from modules.serviceCenter.services.serviceAgent.toolRegistry import ToolRegistr
from modules.serviceCenter.services.serviceAgent.coreTools._helpers import (
_attachFileAsChatDocument,
_formatToolFileResult,
_getOrCreateInstanceFolder,
_getOrCreateTempFolder,
_getOrCreateInstanceGroup,
_getOrCreateTempGroup,
_looksLikeBinary,
_resolveFileScope,
_MAX_TOOL_RESULT_CHARS,
)
@ -50,6 +49,7 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
return ToolResult(toolCallId="", toolName="readFile", success=False, error="fileId is required")
try:
knowledgeService = services.getService("knowledge") if hasattr(services, "getService") else None
fileStatus = None
# 1) Knowledge Store: return already-extracted text chunks
if knowledgeService:
@ -77,7 +77,8 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
data=f"[File {fileId} is currently being processed (status: {fileStatus}). Try again shortly.]",
)
# 2) Not indexed yet: try on-demand extraction
# 2) Not indexed yet: inspect file type to decide how to serve the agent
# (binary -> instruct agent to wait / re-upload; text -> decode raw bytes inline)
chatService = services.chat
fileInfo = chatService.getFileInfo(fileId)
if not fileInfo:
@ -100,83 +101,14 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
isBinary = _looksLikeBinary(rawBytes)
if isBinary:
try:
from modules.serviceCenter.services.serviceExtraction.subRegistry import ExtractorRegistry, ChunkerRegistry
from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
from modules.datamodels.datamodelExtraction import ExtractionOptions
extracted = runExtraction(
ExtractorRegistry(), ChunkerRegistry(),
rawBytes, fileName, mimeType, ExtractionOptions(),
)
contentObjects = []
for part in extracted.parts:
tg = (part.typeGroup or "").lower()
ct = "image" if tg == "image" else "text"
if not part.data or not part.data.strip():
continue
contentObjects.append({
"contentObjectId": part.id,
"contentType": ct,
"data": part.data,
"contextRef": {
"containerPath": fileName,
"location": part.label or "file",
**(part.metadata or {}),
},
})
if contentObjects:
if knowledgeService:
try:
userId = context.get("userId", "")
_fiId, _mId = _resolveFileScope(fileId, context)
await knowledgeService.indexFile(
fileId=fileId, fileName=fileName, mimeType=mimeType,
userId=userId, contentObjects=contentObjects,
featureInstanceId=_fiId,
mandateId=_mId,
)
except Exception as e:
logger.warning(f"readFile: knowledge indexing failed for {fileId}: {e}")
joined = ""
if knowledgeService:
_chunks = knowledgeService._knowledgeDb.getContentChunks(fileId)
_textChunks = [
c for c in (_chunks or [])
if c.get("contentType") != "image" and c.get("data")
]
if _textChunks:
joined = "\n\n".join(c["data"] for c in _textChunks)
if not joined:
textParts = [o["data"] for o in contentObjects if o["contentType"] != "image"]
joined = "\n\n".join(textParts) if textParts else ""
if joined:
chunked = _applyOffsetLimit(joined, offset, limit)
if chunked is not None:
return ToolResult(toolCallId="", toolName="readFile", success=True, data=chunked)
if len(joined) > _MAX_TOOL_RESULT_CHARS:
joined = joined[:_MAX_TOOL_RESULT_CHARS] + f"\n\n[Truncated showing first {_MAX_TOOL_RESULT_CHARS} chars of {len(joined)}. Use offset/limit to read specific sections.]"
return ToolResult(
toolCallId="", toolName="readFile", success=True,
data=joined,
)
imgCount = sum(1 for o in contentObjects if o["contentType"] == "image")
return ToolResult(
toolCallId="", toolName="readFile", success=True,
data=f"[Extracted {len(contentObjects)} content objects from '{fileName}' "
f"({imgCount} images, no readable text). "
f"Use describeImage(fileId='{fileId}') to analyze visual content.]",
)
except Exception as extractErr:
logger.warning(f"readFile extraction failed for {fileId} ({fileName}): {extractErr}")
return ToolResult(
toolCallId="", toolName="readFile", success=True,
data=f"[Binary file: '{fileName}', type={mimeType}, size={len(rawBytes)} bytes. "
f"Text extraction not available. Use describeImage for images.]",
data=(
f"[File '{fileName}' ({mimeType}) is not yet indexed "
f"(status: {fileStatus or 'unknown'}). Indexing runs automatically "
f"on upload. Please wait a few seconds and retry, or re-upload the file. "
f"For visual content use describeImage(fileId='{fileId}').]"
),
)
# 3) Text file: decode raw bytes
@ -237,7 +169,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
try:
chatService = services.chat
files = chatService.listFiles(
folderId=args.get("folderId"),
tags=args.get("tags"),
search=args.get("search"),
)
@ -290,18 +221,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
except Exception as e:
return ToolResult(toolCallId="", toolName="searchInFileContent", success=False, error=str(e))
async def _listFolders(args: Dict[str, Any], context: Dict[str, Any]):
    """Tool handler: list workspace folders below an optional parent.

    Reads ``parentId`` from ``args`` and returns a ToolResult whose data is
    one "- name (id: ...)" line per folder, or "No folders found." when the
    chat service returns nothing. Any exception becomes a failed ToolResult
    carrying the error text.
    """
    try:
        parentId = args.get("parentId")
        folders = services.chat.listFolders(parentId=parentId)
        if folders:
            folderLines = [
                f"- {f.get('name', 'unnamed')} (id: {f.get('id', '?')})"
                for f in folders
            ]
            folderList = "\n".join(folderLines)
        else:
            folderList = "No folders found."
        return ToolResult(toolCallId="", toolName="listFolders", success=True, data=folderList)
    except Exception as err:
        return ToolResult(toolCallId="", toolName="listFolders", success=False, error=str(err))
async def _webSearch(args: Dict[str, Any], context: Dict[str, Any]):
query = args.get("query", "")
if not query:
@ -339,35 +258,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
except Exception as e:
return ToolResult(toolCallId="", toolName="tagFile", success=False, error=str(e))
async def _moveFile(args: Dict[str, Any], context: Dict[str, Any]):
    """Tool handler: move a file by rewriting its ``folderId``.

    ``fileId`` is mandatory; ``targetFolderId`` may be missing/None, which is
    reported as a move to 'root'. Exceptions from the DB layer are returned
    as a failed ToolResult rather than raised.
    """
    targetFolderId = args.get("targetFolderId")
    fileId = args.get("fileId", "")
    if fileId:
        try:
            dbComponent = services.chat.interfaceDbComponent
            dbComponent.updateFile(fileId, {"folderId": targetFolderId})
            return ToolResult(
                toolCallId="", toolName="moveFile", success=True,
                data=f"File {fileId} moved to folder {targetFolderId or 'root'}"
            )
        except Exception as err:
            return ToolResult(toolCallId="", toolName="moveFile", success=False, error=str(err))
    return ToolResult(toolCallId="", toolName="moveFile", success=False, error="fileId is required")
async def _createFolder(args: Dict[str, Any], context: Dict[str, Any]):
    """Tool handler: create a workspace folder.

    Requires ``name`` in ``args``; ``parentId`` is optional and forwarded
    as-is to the chat service. The success message contains the id reported
    by the service ('?' when the returned dict has none). Exceptions are
    converted into a failed ToolResult.
    """
    name = args.get("name", "")
    if name:
        try:
            parentId = args.get("parentId")
            folder = services.chat.createFolder(name=name, parentId=parentId)
            return ToolResult(
                toolCallId="", toolName="createFolder", success=True,
                data=f"Folder '{name}' created (id: {folder.get('id', '?')})"
            )
        except Exception as err:
            return ToolResult(toolCallId="", toolName="createFolder", success=False, error=str(err))
    return ToolResult(toolCallId="", toolName="createFolder", success=False, error="name is required")
async def _writeFile(args: Dict[str, Any], context: Dict[str, Any]):
content = args.get("content", "")
mode = args.get("mode", "create")
@ -422,12 +312,52 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
fiId = context.get("featureInstanceId") or (services.featureInstanceId if services else "")
if fiId:
dbMgmt.updateFile(fileItem.id, {"featureInstanceId": fiId})
if args.get("folderId"):
dbMgmt.updateFile(fileItem.id, {"folderId": args["folderId"]})
if args.get("groupId"):
try:
appIface = chatService.interfaceDbApp
existing = appIface.getTableGrouping("files/list")
nodes = [n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n)) for n in (existing.rootGroups if existing else [])]
def _addToGroup(nds, gid, fid):
    """Add ``fid`` to the ``itemIds`` of the group with id ``gid``.

    The tree is searched depth-first through ``subGroups``. Nodes may be
    dicts or attribute-style objects.

    Args:
        nds: List of group nodes (None is treated as empty).
        gid: Id of the group to add to.
        fid: Item (file) id to add; not duplicated if already present.

    Returns:
        True as soon as the group is found, False if no node has that id.
    """
    for nd in nds or []:
        isDict = isinstance(nd, dict)
        nid = nd.get("id") if isDict else getattr(nd, "id", None)
        if nid == gid:
            # itemIds may be missing or None; work on a copy either way.
            ids = list((nd.get("itemIds") if isDict else getattr(nd, "itemIds", None)) or [])
            if fid not in ids:
                ids.append(fid)
            # Write the list back for both node shapes. Previously object
            # nodes reported success but silently lost the new id.
            if isDict:
                nd["itemIds"] = ids
            else:
                setattr(nd, "itemIds", ids)
            return True
        children = (nd.get("subGroups") if isDict else getattr(nd, "subGroups", None)) or []
        if _addToGroup(children, gid, fid):
            return True
    return False
_addToGroup(nodes, args["groupId"], fileItem.id)
appIface.upsertTableGrouping("files/list", nodes)
except Exception as _ge:
logger.warning(f"writeFile: failed to add file to group {args['groupId']}: {_ge}")
elif fiId:
instanceFolderId = _getOrCreateInstanceFolder(chatService, fiId, context.get("mandateId", ""))
if instanceFolderId:
dbMgmt.updateFile(fileItem.id, {"folderId": instanceFolderId})
try:
appIface = chatService.interfaceDbApp
instanceGroupId = await _getOrCreateInstanceGroup(appIface, fiId)
if instanceGroupId:
existing = appIface.getTableGrouping("files/list")
nodes = [n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n)) for n in (existing.rootGroups if existing else [])]
def _addToGroup2(nds, gid, fid):
    """Add ``fid`` to the ``itemIds`` of the group with id ``gid``.

    The tree is searched depth-first through ``subGroups``. Nodes may be
    dicts or attribute-style objects.

    Args:
        nds: List of group nodes (None is treated as empty).
        gid: Id of the group to add to.
        fid: Item (file) id to add; not duplicated if already present.

    Returns:
        True as soon as the group is found, False if no node has that id.
    """
    for nd in nds or []:
        isDict = isinstance(nd, dict)
        nid = nd.get("id") if isDict else getattr(nd, "id", None)
        if nid == gid:
            # itemIds may be missing or None; work on a copy either way.
            ids = list((nd.get("itemIds") if isDict else getattr(nd, "itemIds", None)) or [])
            if fid not in ids:
                ids.append(fid)
            # Write the list back for both node shapes. Previously object
            # nodes reported success but silently lost the new id.
            if isDict:
                nd["itemIds"] = ids
            else:
                setattr(nd, "itemIds", ids)
            return True
        children = (nd.get("subGroups") if isDict else getattr(nd, "subGroups", None)) or []
        if _addToGroup2(children, gid, fid):
            return True
    return False
_addToGroup2(nodes, instanceGroupId, fileItem.id)
appIface.upsertTableGrouping("files/list", nodes)
except Exception as _ge:
logger.warning(f"writeFile: failed to add file to instance group for {fiId}: {_ge}")
if args.get("tags"):
dbMgmt.updateFile(fileItem.id, {"tags": args["tags"]})
@ -480,13 +410,13 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
registry.register(
"listFiles", _listFiles,
description=(
"List files in the local workspace. Filter by folder, tags, or search term. "
"List files in the local workspace. Filter by tags or search term. "
"To filter by group, use listItemsInGroup. "
"For external data sources, use browseDataSource instead."
),
parameters={
"type": "object",
"properties": {
"folderId": {"type": "string", "description": "Filter by folder ID"},
"tags": {"type": "array", "items": {"type": "string"}, "description": "Filter by tags (any match)"},
"search": {"type": "string", "description": "Search in file names and descriptions"},
}
@ -513,18 +443,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
readOnly=True
)
registry.register(
"listFolders", _listFolders,
description="List folders in the local workspace. For external data sources, use browseDataSource instead.",
parameters={
"type": "object",
"properties": {
"parentId": {"type": "string", "description": "Parent folder ID (omit for root)"},
}
},
readOnly=True
)
registry.register(
"webSearch", _webSearch,
description="Search the web for general information. Use readUrl to fetch content from a known URL instead.",
@ -550,34 +468,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
readOnly=False
)
registry.register(
"moveFile", _moveFile,
description="Move a file to a different folder in the local workspace.",
parameters={
"type": "object",
"properties": {
"fileId": {"type": "string", "description": "The file ID to move"},
"targetFolderId": {"type": "string", "description": "Target folder ID (null for root)"},
},
"required": ["fileId"]
},
readOnly=False
)
registry.register(
"createFolder", _createFolder,
description="Create a new folder in the local workspace.",
parameters={
"type": "object",
"properties": {
"name": {"type": "string", "description": "Folder name"},
"parentId": {"type": "string", "description": "Parent folder ID (omit for root)"},
},
"required": ["name"]
},
readOnly=False
)
registry.register(
"writeFile", _writeFile,
description=(
@ -598,7 +488,7 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
"content": {"type": "string", "description": "Content to write/append"},
"mode": {"type": "string", "enum": ["create", "append", "overwrite"], "description": "Write mode (default: create)"},
"fileId": {"type": "string", "description": "File ID (required for mode=append/overwrite)"},
"folderId": {"type": "string", "description": "Target folder ID (mode=create only)"},
"groupId": {"type": "string", "description": "Group ID to place the file in (mode=create only). Omit to use the instance default group."},
"tags": {"type": "array", "items": {"type": "string"}, "description": "Tags (mode=create only)"},
},
"required": ["content"]
@ -758,55 +648,7 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
readOnly=True
)
# ---- Phase 2: deleteFolder, renameFolder, moveFolder, copyFile, editFile ----
async def _deleteFolder(args: Dict[str, Any], context: Dict[str, Any]):
    """Tool handler: delete a workspace folder, optionally with its contents.

    ``folderId`` is required; ``recursive`` defaults to False and is passed
    to the DB layer unchanged. On success the result summarises the counts
    reported by the DB layer and emits a ``folderDeleted`` side event that
    carries the folder id merged with that report. Exceptions are returned
    as a failed ToolResult.
    """
    recursive = args.get("recursive", False)
    folderId = args.get("folderId", "")
    if folderId:
        try:
            dbComponent = services.chat.interfaceDbComponent
            result = dbComponent.deleteFolder(folderId, recursive=recursive)
            summary = f"Deleted {result.get('deletedFolders', 1)} folder(s) and {result.get('deletedFiles', 0)} file(s)"
            deletedEvent = {"type": "folderDeleted", "data": {"folderId": folderId, **result}}
            return ToolResult(
                toolCallId="", toolName="deleteFolder", success=True, data=summary,
                sideEvents=[deletedEvent],
            )
        except Exception as err:
            return ToolResult(toolCallId="", toolName="deleteFolder", success=False, error=str(err))
    return ToolResult(toolCallId="", toolName="deleteFolder", success=False, error="folderId is required")
async def _renameFolder(args: Dict[str, Any], context: Dict[str, Any]):
    """Tool handler: rename a workspace folder.

    Both ``folderId`` and ``newName`` must be non-empty. On success a
    ``folderUpdated`` side event with the new name is attached to the
    result. Exceptions are returned as a failed ToolResult.
    """
    newName = args.get("newName", "")
    folderId = args.get("folderId", "")
    if folderId and newName:
        try:
            services.chat.interfaceDbComponent.renameFolder(folderId, newName)
            updatedEvent = {"type": "folderUpdated", "data": {"folderId": folderId, "name": newName}}
            return ToolResult(
                toolCallId="", toolName="renameFolder", success=True,
                data=f"Folder {folderId} renamed to '{newName}'",
                sideEvents=[updatedEvent],
            )
        except Exception as err:
            return ToolResult(toolCallId="", toolName="renameFolder", success=False, error=str(err))
    return ToolResult(toolCallId="", toolName="renameFolder", success=False, error="folderId and newName are required")
async def _moveFolder(args: Dict[str, Any], context: Dict[str, Any]):
    """Tool handler: move a workspace folder under another parent.

    ``folderId`` is required; a missing/None ``targetParentId`` is reported
    as a move to 'root'. On success a ``folderUpdated`` side event with the
    new parent id is attached. Exceptions are returned as a failed
    ToolResult.
    """
    targetParentId = args.get("targetParentId")
    folderId = args.get("folderId", "")
    if folderId:
        try:
            services.chat.interfaceDbComponent.moveFolder(folderId, targetParentId)
            updatedEvent = {"type": "folderUpdated", "data": {"folderId": folderId, "parentId": targetParentId}}
            return ToolResult(
                toolCallId="", toolName="moveFolder", success=True,
                data=f"Folder {folderId} moved to {targetParentId or 'root'}",
                sideEvents=[updatedEvent],
            )
        except Exception as err:
            return ToolResult(toolCallId="", toolName="moveFolder", success=False, error=str(err))
    return ToolResult(toolCallId="", toolName="moveFolder", success=False, error="folderId is required")
# ---- Phase 2: copyFile, editFile ----
async def _copyFile(args: Dict[str, Any], context: Dict[str, Any]):
fileId = args.get("fileId", "")
@ -816,7 +658,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
chatService = services.chat
copiedFile = chatService.interfaceDbComponent.copyFile(
fileId,
targetFolderId=args.get("targetFolderId"),
newFileName=args.get("newFileName"),
)
return ToolResult(
@ -891,48 +732,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
except Exception as e:
return ToolResult(toolCallId="", toolName="replaceInFile", success=False, error=str(e))
registry.register(
"deleteFolder", _deleteFolder,
description="Delete a folder from the local workspace. Set recursive=true to delete all contents.",
parameters={
"type": "object",
"properties": {
"folderId": {"type": "string", "description": "The folder ID to delete"},
"recursive": {"type": "boolean", "description": "If true, delete folder and all contents (files and subfolders). Default: false"},
},
"required": ["folderId"]
},
readOnly=False
)
registry.register(
"renameFolder", _renameFolder,
description="Rename a folder in the local workspace.",
parameters={
"type": "object",
"properties": {
"folderId": {"type": "string", "description": "The folder ID to rename"},
"newName": {"type": "string", "description": "New folder name"},
},
"required": ["folderId", "newName"]
},
readOnly=False
)
registry.register(
"moveFolder", _moveFolder,
description="Move a folder to a different parent in the local workspace.",
parameters={
"type": "object",
"properties": {
"folderId": {"type": "string", "description": "The folder ID to move"},
"targetParentId": {"type": "string", "description": "Target parent folder ID (null/omit for root)"},
},
"required": ["folderId"]
},
readOnly=False
)
registry.register(
"copyFile", _copyFile,
description="Create an independent copy of a file in the local workspace.",
@ -940,7 +739,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
"type": "object",
"properties": {
"fileId": {"type": "string", "description": "The file ID to copy"},
"targetFolderId": {"type": "string", "description": "Target folder for the copy (default: same folder)"},
"newFileName": {"type": "string", "description": "New file name (default: same name, auto-numbered if duplicate)"},
},
"required": ["fileId"]
@ -948,6 +746,137 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
readOnly=False
)
# ---- Group tools (replaces folder-based tools) ----
async def _listGroups(args: Dict[str, Any], context: Dict[str, Any]):
    """Render the grouping tree of a context as an indented bullet list.

    Reads the optional ``contextKey`` argument (default ``"files/list"``),
    loads the stored grouping for it and returns one line per group with
    its name, id and number of directly assigned items. Sub-groups are
    listed depth-first below their parent, prefixed by one space per level.
    Any exception is reported as a failed ToolResult carrying its message.
    """
    toolName = "listGroups"
    emptyMessage = "No groups found."
    contextKey = args.get("contextKey", "files/list")

    def _asDict(node):
        # Nodes may be pydantic models, plain dicts or simple objects.
        if hasattr(node, "model_dump"):
            return node.model_dump()
        return node if isinstance(node, dict) else vars(node)

    def _renderLines(nodes, depth=0):
        # Depth-first walk: a group's line, then the lines of its sub-groups.
        for node in nodes:
            fields = _asDict(node)
            count = len(fields.get("itemIds", []))
            yield f"{' ' * depth}- {fields.get('name')} (id: {fields.get('id')}, items: {count})"
            yield from _renderLines(fields.get("subGroups", []), depth + 1)

    try:
        grouping = services.chat.interfaceDbApp.getTableGrouping(contextKey)
        if not grouping:
            return ToolResult(toolCallId="", toolName=toolName, success=True, data=emptyMessage)
        rendered = list(_renderLines(grouping.rootGroups))
        text = "\n".join(rendered) if rendered else emptyMessage
        return ToolResult(toolCallId="", toolName=toolName, success=True, data=text)
    except Exception as e:
        return ToolResult(toolCallId="", toolName=toolName, success=False, error=str(e))
async def _listItemsInGroup(args: Dict[str, Any], context: Dict[str, Any]):
    """Return the item IDs collected for one group as a bullet list.

    Requires ``groupId``; ``contextKey`` defaults to ``"files/list"``.
    The stored root groups are normalised to plain dicts and handed to
    ``_collectItemIds`` together with the group ID. Any exception is
    reported as a failed ToolResult carrying its message.
    """
    toolName = "listItemsInGroup"
    groupId = args.get("groupId", "")
    contextKey = args.get("contextKey", "files/list")
    if not groupId:
        return ToolResult(toolCallId="", toolName=toolName, success=False, error="groupId is required")
    try:
        from modules.routes.routeHelpers import _collectItemIds
        grouping = services.chat.interfaceDbApp.getTableGrouping(contextKey)
        if not grouping:
            return ToolResult(toolCallId="", toolName=toolName, success=True, data="No groups found.")

        # Nodes may be pydantic models, plain dicts or simple objects.
        plainNodes = []
        for node in grouping.rootGroups:
            if hasattr(node, "model_dump"):
                plainNodes.append(node.model_dump())
            elif isinstance(node, dict):
                plainNodes.append(node)
            else:
                plainNodes.append(vars(node))

        collected = _collectItemIds(plainNodes, groupId)
        itemList = list(collected) if collected else []
        if itemList:
            text = "\n".join(f"- {fid}" for fid in itemList)
        else:
            text = "No items in group."
        return ToolResult(toolCallId="", toolName=toolName, success=True, data=text)
    except Exception as e:
        return ToolResult(toolCallId="", toolName=toolName, success=False, error=str(e))
async def _addItemsToGroup(args: Dict[str, Any], context: Dict[str, Any]):
    """Add item IDs to an existing group and persist the grouping tree.

    Args (taken from ``args``):
        groupId: ID of the group to extend (required).
        itemIds: List of item IDs to add (required). A single string is
            treated as a one-element list instead of being iterated
            character by character.
        contextKey: Grouping context key, default ``"files/list"``.

    Returns:
        A ToolResult. On success ``data`` states how many IDs were actually
        added (IDs already present in the group are skipped and reported
        separately). Fails when an argument is missing, the group does not
        exist, or any exception is raised while loading/saving the grouping.
    """
    toolName = "addItemsToGroup"
    groupId = args.get("groupId", "")
    itemIds = args.get("itemIds", [])
    contextKey = args.get("contextKey", "files/list")
    if not groupId:
        return ToolResult(toolCallId="", toolName=toolName, success=False, error="groupId is required")
    if not itemIds:
        return ToolResult(toolCallId="", toolName=toolName, success=False, error="itemIds is required")
    if isinstance(itemIds, str):
        # Tool callers occasionally pass a bare ID; do not split it into characters.
        itemIds = [itemIds]
    try:
        chatService = services.chat
        appInterface = chatService.interfaceDbApp
        existing = appInterface.getTableGrouping(contextKey)
        nodes = [
            n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n))
            for n in (existing.rootGroups if existing else [])
        ]
        addedIds = []

        def _read(node, key):
            # Nested nodes can be dicts or attribute-style objects.
            return node.get(key) if isinstance(node, dict) else getattr(node, key, None)

        def _add(nds):
            for nd in nds or []:
                if _read(nd, "id") == groupId:
                    merged = list(_read(nd, "itemIds") or [])
                    for fid in itemIds:
                        if fid not in merged:
                            merged.append(fid)
                            addedIds.append(fid)
                    # Write back for both node shapes; previously object nodes
                    # were matched but silently left unchanged.
                    if isinstance(nd, dict):
                        nd["itemIds"] = merged
                    else:
                        setattr(nd, "itemIds", merged)
                    return True
                if _add(_read(nd, "subGroups")):
                    return True
            return False

        found = _add(nodes)
        if not found:
            return ToolResult(toolCallId="", toolName=toolName, success=False, error=f"Group {groupId} not found")
        appInterface.upsertTableGrouping(contextKey, nodes)
        message = f"Added {len(addedIds)} item(s) to group {groupId}"
        skipped = len(itemIds) - len(addedIds)
        if skipped:
            message += f" ({skipped} already present)"
        return ToolResult(
            toolCallId="", toolName=toolName, success=True,
            data=message,
        )
    except Exception as e:
        return ToolResult(toolCallId="", toolName=toolName, success=False, error=str(e))
# Register the group tools defined above with the tool registry.
# listGroups: read-only; takes only an optional contextKey.
registry.register(
"listGroups", _listGroups,
description="List all groups in the file grouping tree. Groups replace folders for organising files.",
parameters={
"type": "object",
"properties": {
"contextKey": {"type": "string", "description": "Grouping context key (default: 'files/list')"},
}
},
readOnly=True
)
# listItemsInGroup: read-only; groupId is the only required argument.
registry.register(
"listItemsInGroup", _listItemsInGroup,
description="List all file IDs assigned to a specific group (includes sub-groups recursively).",
parameters={
"type": "object",
"properties": {
"groupId": {"type": "string", "description": "The group ID to inspect"},
"contextKey": {"type": "string", "description": "Grouping context key (default: 'files/list')"},
},
"required": ["groupId"]
},
readOnly=True
)
# addItemsToGroup: registered with readOnly=False because the handler
# persists the modified grouping via upsertTableGrouping.
registry.register(
"addItemsToGroup", _addItemsToGroup,
description="Add one or more file IDs to an existing group.",
parameters={
"type": "object",
"properties": {
"groupId": {"type": "string", "description": "The group ID to add files to"},
"itemIds": {"type": "array", "items": {"type": "string"}, "description": "List of file IDs to add"},
"contextKey": {"type": "string", "description": "Grouping context key (default: 'files/list')"},
},
"required": ["groupId", "itemIds"]
},
readOnly=False
)
registry.register(
"replaceInFile", _replaceInFile,
description=(

View file

@ -69,7 +69,15 @@ class _ServicesAdapter:
@property
def workflow(self):
return self._context.workflow
return getattr(self, "_workflow_override", None) or self._context.workflow
# Setter for the ``workflow`` property.
# The value is always remembered on the adapter itself as ``_workflow_override``.
@workflow.setter
def workflow(self, value):
self._workflow_override = value
# Best effort: also push the value onto the wrapped context.
# AttributeError/TypeError from that assignment are deliberately ignored
# (presumably for contexts that do not allow it — TODO confirm); the
# override stored above is kept either way.
try:
self._context.workflow = value
except (AttributeError, TypeError):
pass
@property
def ai(self):
@ -95,6 +103,13 @@ class _ServicesAdapter:
def extraction(self):
return self._getService("extraction")
@property
def interfaceDbComponent(self):
    """Return the chat service's ``interfaceDbComponent``, or None on any error.

    Both resolving ``self.chat`` and reading its attribute are guarded, so
    the property never raises an ``Exception`` subclass to its caller.
    """
    try:
        chatService = self.chat
        component = chatService.interfaceDbComponent
    except Exception:
        component = None
    return component
@property
def rbac(self):
"""Same RbacClass as workflow hub (MethodBase permission checks during discoverMethods)."""
@ -268,24 +283,19 @@ class AgentService:
info = chatService.getFileInfo(fid)
if not info:
folderInfo = chatService.interfaceDbComponent.getFolder(fid)
if folderInfo:
folderName = folderInfo.get("name", fid)
folderFiles = chatService.listFiles(folderId=fid)
desc = f"### Folder: {folderName}\n - id: {fid}\n - type: folder\n - contains: {len(folderFiles)} file(s)"
if folderFiles:
desc += "\n - files:"
for ff in folderFiles[:30]:
ffName = ff.get("fileName", "?")
ffId = ff.get("id", "?")
ffMime = ff.get("mimeType", "?")
ffSize = ff.get("fileSize", ff.get("size", "?"))
desc += f"\n * {ffName} (id: {ffId}, type: {ffMime}, size: {ffSize} bytes)"
if len(folderFiles) > 30:
desc += f"\n ... and {len(folderFiles) - 30} more files"
desc += f'\nUse `listFiles(folderId="{fid}")` to get the full file list, then `readFile(fileId)` to read individual files.'
fileDescriptions.append(desc)
continue
# Check if fid is a group ID
try:
groupFileIds = chatService.listFilesInGroup(fid)
if groupFileIds:
allGroups = chatService.listGroups()
groupInfo = next((g for g in allGroups if g.get("id") == fid), None)
groupName = groupInfo.get("name", fid) if groupInfo else fid
desc = f"### Group: {groupName}\n - id: {fid}\n - type: group\n - contains: {len(groupFileIds)} file(s)"
desc += f'\nUse `listItemsInGroup(groupId="{fid}")` to get file IDs, then `readFile(fileId)` to read each.'
fileDescriptions.append(desc)
continue
except Exception:
pass
fileDescriptions.append(f"### File id: {fid}")
continue
@ -333,7 +343,7 @@ class AgentService:
"These files/folders have been uploaded and processed through the extraction pipeline.\n"
"Use `readFile(fileId)` to read text content, `readContentObjects(fileId)` for structured access, "
"or `describeImage(fileId)` for image analysis.\n"
"For folders, use `listFiles(folderId)` to get the files inside, then `readFile(fileId)` for each.\n"
"For groups, use `listItemsInGroup(groupId)` to get the file IDs inside, then `readFile(fileId)` for each.\n"
"For large PDFs/DOCX, avoid huge `renderDocument` tool JSON: build markdown with "
"`writeFile` (create + append), then `renderDocument(sourceFileId=that file id, outputFormat=...)`.\n"
"For small docs you may pass `content` inline. Embed images with `![alt](file:fileId)` in markdown.\n\n"

View file

@ -10,8 +10,8 @@ from typing import Dict, Any
logger = logging.getLogger(__name__)
_PYTHON_ALLOWED_MODULES = {
"math", "statistics", "json", "csv", "re", "datetime",
SANDBOX_ALLOWED_MODULES = {
"math", "statistics", "json", "csv", "re", "datetime", "time",
"collections", "itertools", "functools", "decimal", "fractions",
"random", "string", "textwrap", "operator", "copy",
}
@ -19,17 +19,33 @@ _PYTHON_ALLOWED_MODULES = {
_PYTHON_BLOCKED_BUILTINS = {
"open", "exec", "eval", "compile", "__import__", "globals", "locals",
"getattr", "setattr", "delattr", "breakpoint", "exit", "quit",
"input", "memoryview", "type",
"input", "memoryview",
}
_MAX_EXECUTION_TIME_S = 30
_MAX_OUTPUT_CHARS = 50000
_RESTRICTED_IO = None
def _getRestrictedIo():
"""Return a restricted ``io`` module exposing only StringIO/BytesIO."""
global _RESTRICTED_IO
if _RESTRICTED_IO is None:
import types
m = types.ModuleType("io")
m.StringIO = io.StringIO
m.BytesIO = io.BytesIO
_RESTRICTED_IO = m
return _RESTRICTED_IO
def _safeImport(name, *args, **kwargs):
"""Restricted import that only allows whitelisted modules."""
if name not in _PYTHON_ALLOWED_MODULES:
raise ImportError(f"Module '{name}' is not allowed. Permitted: {', '.join(sorted(_PYTHON_ALLOWED_MODULES))}")
if name == "io":
return _getRestrictedIo()
if name not in SANDBOX_ALLOWED_MODULES:
raise ImportError(f"Module '{name}' is not allowed. Permitted: io (StringIO/BytesIO only), {', '.join(sorted(SANDBOX_ALLOWED_MODULES))}")
return __builtins__["__import__"](name, *args, **kwargs) if isinstance(__builtins__, dict) else __import__(name, *args, **kwargs)
@ -48,7 +64,7 @@ def _buildRestrictedGlobals() -> Dict[str, Any]:
safeBuiltins["__name__"] = "__sandbox__"
safeBuiltins["__builtins__"] = safeBuiltins
for modName in _PYTHON_ALLOWED_MODULES:
for modName in SANDBOX_ALLOWED_MODULES:
try:
safeBuiltins[modName] = __import__(modName)
except ImportError:
@ -57,12 +73,27 @@ def _buildRestrictedGlobals() -> Dict[str, Any]:
return {"__builtins__": safeBuiltins}
async def executePython(code: str) -> Dict[str, Any]:
def _makeReadFile(services):
"""Create a readFile(fileId) closure bound to the current services context."""
def readFile(fileId: str) -> str:
mgmt = getattr(services, 'interfaceDbComponent', None) if services else None
if not mgmt:
raise RuntimeError("readFile: no file store available in this session")
data = mgmt.getFileData(str(fileId))
if data is None:
raise FileNotFoundError(f"File '{fileId}' not found in workspace")
return data.decode("utf-8")
return readFile
async def executePython(code: str, *, services=None) -> Dict[str, Any]:
"""Execute Python code in a restricted sandbox. Returns {success, output, error}."""
import asyncio
def _run():
restrictedGlobals = _buildRestrictedGlobals()
if services:
restrictedGlobals["__builtins__"]["readFile"] = _makeReadFile(services)
capturedOutput = io.StringIO()
oldStdout = sys.stdout
oldStderr = sys.stderr

View file

@ -51,6 +51,10 @@ class _ServicesAdapter:
def workflow(self):
return self._context.workflow
# Setter for the ``workflow`` property: writes the value straight through
# to the wrapped context, with no local copy and no error handling.
@workflow.setter
def workflow(self, value):
self._context.workflow = value
@property
def chat(self):
return self._get_service("chat")
@ -86,7 +90,7 @@ class _ServicesAdapter:
return getattr(w, "featureCode", None) if w else None
def __getattr__(self, name: str):
if name in ("allowedProviders", "preferredProviders", "currentUserLanguage"):
if name in ("allowedProviders", "allowedModels", "preferredProviders", "currentUserLanguage"):
return getattr(self.workflow, name, None) if self.workflow else None
raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
@ -164,12 +168,29 @@ class AiService:
# SPEECH_TEAMS: Dedicated pipeline, bypasses standard model selection
if request.options and request.options.operationType == OperationTypeEnum.SPEECH_TEAMS:
return await self._handleSpeechTeams(request)
# FAIL-SAFE: Pre-flight billing validation (like 0 CHF credit card check)
self._preflightBillingCheck()
# Balance & provider permission checks
await self._checkBillingBeforeAiCall()
_opType = request.options.operationType if request.options else None
_isNeutralizationCall = _opType in (
OperationTypeEnum.NEUTRALIZATION_TEXT,
OperationTypeEnum.NEUTRALIZATION_IMAGE,
)
if not _isNeutralizationCall:
# FAIL-SAFE: Pre-flight billing validation (like 0 CHF credit card check)
self._preflightBillingCheck()
# Balance & provider permission checks
await self._checkBillingBeforeAiCall()
else:
# Neutralization calls are system-level operations (connector anonymization).
# They run without a mandate context (e.g. personal-scope connections) and
# are billed the same way as embedding calls: best-effort, skipped when no
# billing settings exist for an empty mandate.
logger.debug(
"callAi: skipping billing preflight for neutralization call "
"(operationType=%s, user=%s)",
_opType,
getattr(getattr(self.services, 'user', None), 'id', 'unknown'),
)
# Calculate effective allowedProviders: RBAC ∩ Workflow
effectiveProviders = self._calculateEffectiveProviders()
@ -177,6 +198,11 @@ class AiService:
request.options = request.options.model_copy(update={'allowedProviders': effectiveProviders})
logger.debug(f"Effective allowedProviders for AI request: {effectiveProviders}")
# Calculate effective allowedModels: Workflow ∩ Request (node-level)
effectiveModels = self._calculateEffectiveModels(request)
if effectiveModels and request.options:
request.options = request.options.model_copy(update={'allowedModels': effectiveModels})
# Neutralize prompt if enabled (before AI call)
_wasNeutralized = False
_excludedDocs: List[str] = []
@ -218,13 +244,25 @@ class AiService:
Rehydration happens on the final AiCallResponse (not on individual str deltas).
"""
await self.ensureAiObjectsInitialized()
self._preflightBillingCheck()
await self._checkBillingBeforeAiCall()
_streamOpType = request.options.operationType if request.options else None
_isNeutralizationStream = _streamOpType in (
OperationTypeEnum.NEUTRALIZATION_TEXT,
OperationTypeEnum.NEUTRALIZATION_IMAGE,
)
if not _isNeutralizationStream:
self._preflightBillingCheck()
await self._checkBillingBeforeAiCall()
effectiveProviders = self._calculateEffectiveProviders()
if effectiveProviders and request.options:
request.options = request.options.model_copy(update={'allowedProviders': effectiveProviders})
# Calculate effective allowedModels: Workflow ∩ Request (node-level)
effectiveModels = self._calculateEffectiveModels(request)
if effectiveModels and request.options:
request.options = request.options.model_copy(update={'allowedModels': effectiveModels})
# Neutralize prompt if enabled (before streaming)
_wasNeutralized = False
_excludedDocs: List[str] = []
@ -1240,6 +1278,43 @@ detectedIntent-Werte:
logger.warning(f"Error calculating effective providers: {e}")
return None
def _calculateEffectiveModels(self, request: AiCallRequest = None) -> Optional[List[str]]:
    """
    Calculate effective allowed models: Workflow.allowedModels ∩ request.options.allowedModels.

    AND-logic intersection:
    - If workflow specifies allowedModels, start with those.
    - If request (node-level) also specifies allowedModels, intersect
      (workflow order is preserved).
    - Returns None if no model filtering is needed.

    Args:
        request: The AI call request whose ``options.allowedModels`` is the
            node-level allow-list. May be None.

    Returns:
        The non-empty list of effective models, or None. None is also
        returned when both lists are set but share no model, and when any
        error occurs; both cases are logged as warnings.
    """
    try:
        # Workflow-level allowedModels (from automation config)
        workflowModels = getattr(self.services, 'allowedModels', None)

        # Request-level (node-level) allowedModels
        requestModels = None
        if request and request.options and request.options.allowedModels:
            requestModels = request.options.allowedModels

        if workflowModels and requestModels:
            effectiveModels = [m for m in list(workflowModels) if m in requestModels]
            if not effectiveModels:
                # Callers treat None as "no filter", so make the conflict visible
                # instead of dropping it silently.
                logger.warning(
                    f"Model filter conflict: Workflow={workflowModels} and Request={requestModels} "
                    f"have no model in common; no effective model list returned"
                )
        elif workflowModels:
            effectiveModels = list(workflowModels)
        elif requestModels:
            effectiveModels = list(requestModels)
        else:
            effectiveModels = None

        if effectiveModels:
            logger.debug(f"Model filter: Workflow={workflowModels}, Request={requestModels}, Effective={effectiveModels}")
        return effectiveModels if effectiveModels else None
    except Exception as e:
        logger.warning(f"Error calculating effective models: {e}")
        return None
async def ensureAiObjectsInitialized(self):
"""Ensure aiObjects is initialized and submodules are ready."""
if self.aiObjects is None:

View file

@ -199,13 +199,8 @@ class ChatService:
label = parts[1]
messageFound = None
for message in workflow.messages:
# Validate message belongs to this workflow
msgWorkflowId = getattr(message, 'workflowId', None)
if not msgWorkflowId or msgWorkflowId != workflowId:
if msgWorkflowId:
logger.warning(f"Message {message.id} has workflowId {msgWorkflowId} but belongs to workflow {workflowId}. Skipping.")
else:
logger.warning(f"Message {message.id} has no workflowId. Skipping.")
continue
msgLabel = getattr(message, 'documentsLabel', None)
@ -213,7 +208,6 @@ class ChatService:
messageFound = message
break
# If found, add documents
if messageFound and messageFound.documents:
allDocuments.extend(messageFound.documents)
else:
@ -419,7 +413,7 @@ class ChatService:
return None
def getFileInfo(self, fileId: str) -> Dict[str, Any]:
"""Get file information including new fields (tags, folderId, description, status)."""
"""Get file information including new fields (tags, description, status)."""
fileItem = self.interfaceDbComponent.getFile(fileId)
if fileItem:
return {
@ -430,7 +424,6 @@ class ChatService:
"fileHash": fileItem.fileHash,
"creationDate": fileItem.sysCreatedAt,
"tags": getattr(fileItem, "tags", None),
"folderId": getattr(fileItem, "folderId", None),
"description": getattr(fileItem, "description", None),
"status": getattr(fileItem, "status", None),
}
@ -449,14 +442,12 @@ class ChatService:
def listFiles(
self,
folderId: str = None,
tags: List[str] = None,
search: str = None,
) -> List[Dict[str, Any]]:
"""List files for the current user with optional filters.
Args:
folderId: Filter by folder (None = root / all).
tags: Filter by tags (any match).
search: Search in fileName and description.
@ -469,10 +460,6 @@ class ChatService:
allFiles = self.interfaceDbComponent.getAllFiles()
results = []
for fileItem in allFiles:
if folderId is not None:
if fileItem.get("folderId") != folderId:
continue
if tags:
itemTags = fileItem.get("tags") or []
if not any(t in itemTags for t in tags):
@ -492,27 +479,40 @@ class ChatService:
"fileSize": fileItem.get("fileSize"),
"creationDate": fileItem.get("sysCreatedAt"),
"tags": fileItem.get("tags"),
"folderId": fileItem.get("folderId"),
"description": fileItem.get("description"),
"status": fileItem.get("status"),
})
return results
def listFolders(self, parentId: str = None) -> List[Dict[str, Any]]:
"""List file folders for the current user.
def listGroups(self, contextKey: str = "files/list") -> list:
    """List all groups in the groupTree for the current context.

    Args:
        contextKey: Grouping context key passed to ``getTableGrouping``.

    Returns:
        A flat, depth-first list of dicts with ``id``, ``name``, ``depth``
        and ``itemCount`` (number of directly assigned items). An empty list
        is returned when no grouping exists or when loading fails; failures
        are logged as warnings instead of being swallowed silently.
    """
    def _asDict(node):
        # Nodes may be pydantic models, plain dicts or simple objects.
        if hasattr(node, "model_dump"):
            return node.model_dump()
        return node if isinstance(node, dict) else vars(node)

    def _flatten(nodes, depth=0):
        flat = []
        for node in nodes or []:
            nd = _asDict(node)
            flat.append({
                "id": nd.get("id"),
                "name": nd.get("name"),
                "depth": depth,
                # itemIds/subGroups may be present but None; treat as empty.
                "itemCount": len(nd.get("itemIds") or []),
            })
            flat.extend(_flatten(nd.get("subGroups"), depth + 1))
        return flat

    try:
        existing = self.interfaceDbApp.getTableGrouping(contextKey)
        if not existing:
            return []
        return _flatten(existing.rootGroups)
    except Exception as e:
        logger.warning(f"listGroups failed for context '{contextKey}': {e}")
        return []
Args:
parentId: Optional parent folder ID to filter by.
None = return ALL folders (for tree building).
Returns:
List of folder dicts.
"""
return self.interfaceDbComponent.listFolders(parentId=parentId)
def createFolder(self, name: str, parentId: str = None) -> Dict[str, Any]:
"""Create a new file folder with unique name validation."""
return self.interfaceDbComponent.createFolder(name=name, parentId=parentId)
def listFilesInGroup(self, groupId: str, contextKey: str = "files/list") -> list:
    """List file IDs in a specific group (recursive).

    Args:
        groupId: ID of the group whose items are collected.
        contextKey: Grouping context key passed to ``getTableGrouping``.

    Returns:
        The IDs returned by ``_collectItemIds`` as a list. An empty list is
        returned when no grouping exists, nothing is collected, or loading
        fails; failures are logged as warnings instead of being swallowed
        silently.
    """
    try:
        from modules.routes.routeHelpers import _collectItemIds
        existing = self.interfaceDbApp.getTableGrouping(contextKey)
        if not existing:
            return []
        # Nodes may be pydantic models, plain dicts or simple objects.
        nodes = [
            n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n))
            for n in existing.rootGroups
        ]
        ids = _collectItemIds(nodes, groupId)
        return list(ids) if ids else []
    except Exception as e:
        logger.warning(f"listFilesInGroup failed for group '{groupId}' in context '{contextKey}': {e}")
        return []
# ---- DataSource CRUD ----

View file

@ -166,12 +166,28 @@ class ClickupService:
page: int = 0,
include_closed: bool = False,
subtasks: bool = True,
dateCreatedGt: Optional[int] = None,
dateCreatedLt: Optional[int] = None,
dateUpdatedGt: Optional[int] = None,
dateUpdatedLt: Optional[int] = None,
customFields: Optional[List[Dict[str, Any]]] = None,
) -> Dict[str, Any]:
params: Dict[str, Any] = {
"page": page,
"subtasks": str(subtasks).lower(),
"include_closed": str(include_closed).lower(),
}
if dateCreatedGt is not None:
params["date_created_gt"] = dateCreatedGt
if dateCreatedLt is not None:
params["date_created_lt"] = dateCreatedLt
if dateUpdatedGt is not None:
params["date_updated_gt"] = dateUpdatedGt
if dateUpdatedLt is not None:
params["date_updated_lt"] = dateUpdatedLt
if customFields:
import json as _json
params["custom_fields"] = _json.dumps(customFields)
return await self._request("GET", f"/list/{list_id}/task", params=params)
async def getTask(self, task_id: str, *, include_subtasks: bool = True) -> Dict[str, Any]:

View file

@ -14,6 +14,7 @@ from .subDocumentUtility import (
detectMimeTypeFromData,
convertDocumentDataToString
)
from .styleDefaults import resolveStyle
logger = logging.getLogger(__name__)
@ -382,7 +383,7 @@ class GenerationService:
'workflowId': 'unknown'
}
async def renderReport(self, extractedContent: Dict[str, Any], outputFormat: str, language: str, title: str, userPrompt: str = None, aiService=None, parentOperationId: Optional[str] = None) -> List[RenderedDocument]:
async def renderReport(self, extractedContent: Dict[str, Any], outputFormat: str, language: str, title: str, userPrompt: str = None, aiService=None, parentOperationId: Optional[str] = None, style: Optional[Dict[str, Any]] = None) -> List[RenderedDocument]:
"""
Render extracted JSON content to the specified output format.
Processes EACH document separately and calls renderer for each.
@ -399,12 +400,14 @@ class GenerationService:
userPrompt: User's original prompt for report generation
aiService: AI service instance for generation prompt creation
parentOperationId: Optional parent operation ID for hierarchical logging
style: Optional style overrides (deep-merged with DEFAULT_STYLE)
Returns:
List of RenderedDocument objects.
Each RenderedDocument represents one rendered file (main document or supporting file)
"""
try:
resolvedStyle = resolveStyle(style)
# Validate JSON input
if not isinstance(extractedContent, dict):
raise ValueError("extractedContent must be a JSON dictionary")
@ -469,7 +472,7 @@ class GenerationService:
docTitle = doc.get("title", title)
# Render this document (can return multiple files, e.g., HTML + images)
renderedDocs = await renderer.render(singleDocContent, docTitle, userPrompt, aiService)
renderedDocs = await renderer.render(singleDocContent, docTitle, userPrompt, aiService, style=resolvedStyle)
allRenderedDocuments.extend(renderedDocs)
logger.info(f"Rendered {len(documents)} document(s) into {len(allRenderedDocuments)} file(s)")

View file

@ -84,7 +84,7 @@ class BaseRenderer(ABC):
return list(supportedSectionTypes)
@abstractmethod
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]:
"""
Render extracted JSON content to multiple documents.
Each renderer must implement this method.
@ -95,6 +95,9 @@ class BaseRenderer(ABC):
title: Report title
userPrompt: Original user prompt for context
aiService: AI service instance for additional processing
style: Fully-resolved unified style dict from styleDefaults.resolveStyle().
When provided, renderers use these values instead of their
own defaults / AI-generated styles.
Returns:
List of RenderedDocument objects.
@ -102,6 +105,112 @@ class BaseRenderer(ABC):
Even if only one document is returned, it must be wrapped in a list.
"""
pass
def _convertUnifiedStyleToInternal(self, style: Dict[str, Any]) -> Dict[str, Any]:
"""Convert the unified resolvedStyle dict (from styleDefaults) into
the renderer-internal style-set format that all rendering methods already
consume. Override in subclasses for format-specific tweaks."""
h1 = style["headings"]["h1"]
h2 = style["headings"]["h2"]
h3 = style["headings"].get("h3", h2)
h4 = style["headings"].get("h4", h3)
tbl = style["table"]
para = style["paragraph"]
lst = style["list"]
cb = style["codeBlock"]
return {
"title": {
"font_size": h1["sizePt"], "color": h1["color"],
"bold": h1.get("weight") == "bold", "align": "left",
},
"heading1": {
"font_size": h1["sizePt"], "color": h1["color"],
"bold": h1.get("weight") == "bold", "align": "left",
},
"heading2": {
"font_size": h2["sizePt"], "color": h2["color"],
"bold": h2.get("weight") == "bold", "align": "left",
},
"heading3": {
"font_size": h3["sizePt"], "color": h3["color"],
"bold": h3.get("weight") == "bold", "align": "left",
},
"heading4": {
"font_size": h4["sizePt"], "color": h4["color"],
"bold": h4.get("weight") == "bold", "align": "left",
},
"paragraph": {
"font_size": para["sizePt"], "color": para["color"],
"bold": False, "align": "left",
},
"table_header": {
"background": tbl["headerBg"], "text_color": tbl["headerFg"],
"bold": True, "align": "center",
},
"table_cell": {
"background": tbl["rowBandingOdd"], "text_color": para["color"],
"bold": False, "align": "left",
},
"table_border": {
"style": "grid", "color": tbl["borderColor"],
},
"bullet_list": {
"font_size": lst["sizePt"], "color": para["color"],
"indent": lst["indentPt"],
},
"code_block": {
"font": style["fonts"]["monospace"],
"font_size": cb["fontSizePt"], "color": para["color"],
"background": cb["background"],
},
}
@staticmethod
def _inlineRunsFromContent(content: Dict[str, Any], *, itemsKey: str = None) -> Any:
"""Extract inline runs from new-format content, falling back to old format.
For paragraphs (itemsKey=None):
new: content["inlineRuns"] -> List[InlineRun]
old: content["text"] -> wrapped in [{"type":"text","value":text}]
For list items (itemsKey="items"):
new: content["items"] is List[List[InlineRun]]
old: content["items"] is List[str] or List[{"text":}]
Returns the items list (caller decides per-item conversion).
For table headers/cells:
new: each header/cell is List[InlineRun]
old: each header/cell is a plain str
Caller handles per-cell.
"""
if itemsKey:
return content.get(itemsKey, [])
inlineRuns = content.get("inlineRuns")
if inlineRuns:
return inlineRuns
text = content.get("text", "")
if text:
return [{"type": "text", "value": text}]
return []
@staticmethod
def _inlineRunsForCell(cell) -> list:
"""Normalize a single table header or cell value to List[InlineRun].
Accepts either a plain string or an already-correct list of run dicts."""
if isinstance(cell, list):
return cell
return [{"type": "text", "value": str(cell) if cell is not None else ""}]
@staticmethod
def _inlineRunsForListItem(item) -> list:
"""Normalize a single list item to List[InlineRun].
Accepts a plain string, a dict with 'text', or an already-correct list of run dicts."""
if isinstance(item, list):
return item
if isinstance(item, dict):
text = item.get("text", "")
return [{"type": "text", "value": text}]
return [{"type": "text", "value": str(item)}]
def _determineFilename(self, title: str, mimeType: str) -> str:
"""Determine filename from title and mimeType."""

View file

@ -79,7 +79,7 @@ class RendererCodeCsv(BaseCodeRenderer):
return renderedDocs
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]:
"""
Render method for document generation compatibility.
Delegates to document renderer if needed, or handles code files directly.

View file

@ -91,7 +91,7 @@ class RendererCodeJson(BaseCodeRenderer):
return renderedDocs
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]:
"""
Render method for document generation compatibility.
Delegates to document renderer if needed, or handles code files directly.

View file

@ -78,7 +78,7 @@ class RendererCodeXml(BaseCodeRenderer):
return renderedDocs
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]:
"""
Render method for document generation compatibility.
For XML, we only support code generation (no document renderer exists yet).

View file

@ -39,7 +39,7 @@ class RendererCsv(BaseRenderer):
"""
return ["table", "code_block"]
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]:
"""Render extracted JSON content to CSV format. Produces one CSV file per table section."""
try:
# Validate JSON structure

View file

@ -53,18 +53,17 @@ class RendererDocx(BaseRenderer):
from modules.datamodels.datamodelJson import supportedSectionTypes
return list(supportedSectionTypes)
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]:
"""Render extracted JSON content to DOCX format using AI-analyzed styling."""
self.services.utils.debugLogToFile(f"DOCX RENDER CALLED: title={title}, user_prompt={userPrompt[:50] if userPrompt else 'None'}...", "DOCX_RENDERER")
try:
if not DOCX_AVAILABLE:
# Fallback to HTML if python-docx not available
from .rendererHtml import RendererHtml
htmlRenderer = RendererHtml()
return await htmlRenderer.render(extractedContent, title, userPrompt, aiService)
return await htmlRenderer.render(extractedContent, title, userPrompt, aiService, style=style)
# Generate DOCX using AI-analyzed styling
docx_content = await self._generateDocxFromJson(extractedContent, title, userPrompt, aiService)
docx_content = await self._generateDocxFromJson(extractedContent, title, userPrompt, aiService, unifiedStyle=style)
# Extract metadata for document type and other info
metadata = extractedContent.get("metadata", {}) if extractedContent else {}
@ -114,23 +113,27 @@ class RendererDocx(BaseRenderer):
)
]
async def _generateDocxFromJson(self, json_content: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> str:
async def _generateDocxFromJson(self, json_content: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, unifiedStyle: Dict[str, Any] = None) -> str:
"""Generate DOCX content from structured JSON document."""
import time
start_time = time.time()
try:
self.logger.debug("_generateDocxFromJson: Starting document generation")
# Create new document
doc = Document()
self.logger.debug(f"_generateDocxFromJson: Document created in {time.time() - start_time:.2f}s")
# Get style set: use styles from metadata if available, otherwise enhance with AI
template_from_metadata = None
if json_content and isinstance(json_content.get("metadata"), dict):
template_from_metadata = json_content["metadata"].get("templateName")
# Phase 3: prefer unified style when provided
style_start = time.time()
self.logger.debug("_generateDocxFromJson: About to get style set")
styleSet = await self._getStyleSet(json_content, userPrompt, aiService, templateName=template_from_metadata)
if unifiedStyle:
styleSet = self._convertUnifiedStyleToInternal(unifiedStyle)
self._unifiedStyle = unifiedStyle
else:
template_from_metadata = None
if json_content and isinstance(json_content.get("metadata"), dict):
template_from_metadata = json_content["metadata"].get("templateName")
styleSet = await self._getStyleSet(json_content, userPrompt, aiService, templateName=template_from_metadata)
self._unifiedStyle = None
self.logger.debug(f"_generateDocxFromJson: Style set retrieved in {time.time() - style_start:.2f}s")
# Setup basic document styles and create all styles from style set
@ -298,11 +301,11 @@ class RendererDocx(BaseRenderer):
def _setupBasicDocumentStyles(self, doc: Document) -> None:
    """Apply the document-wide default font to the 'Normal' style.

    When a unified style dict was stored on the renderer
    (``self._unifiedStyle``), its ``fonts.primary`` and ``paragraph.sizePt``
    values are used. Any value that is missing falls back to Calibri / 11pt,
    so an incomplete unified style never leaves the font half-configured.

    Args:
        doc: The python-docx document whose 'Normal' style is updated.

    Failures are logged as warnings and never raised.
    """
    try:
        font = doc.styles['Normal'].font
        us = getattr(self, '_unifiedStyle', None) or {}
        # Resolve both values before touching the font so the assignment is
        # all-or-nothing with respect to lookup problems.
        fontName = (us.get("fonts") or {}).get("primary") or 'Calibri'
        fontSizePt = (us.get("paragraph") or {}).get("sizePt") or 11
        font.name = fontName
        font.size = Pt(fontSizePt)
    except Exception as e:
        self.logger.warning(f"Could not set up basic document styles: {str(e)}")
@ -421,6 +424,8 @@ class RendererDocx(BaseRenderer):
def _addMarkdownInlineRuns(self, paragraph, text: str) -> None:
"""Parse markdown inline formatting and add corresponding Runs to a python-docx paragraph."""
pos = 0
us = getattr(self, '_unifiedStyle', None)
monoFont = us["fonts"]["monospace"] if us else "Courier New"
for m in self._MD_INLINE_RE.finditer(text):
if m.start() > pos:
paragraph.add_run(text[pos:m.start()])
@ -434,12 +439,45 @@ class RendererDocx(BaseRenderer):
paragraph.add_run(m.group(6)).italic = True
elif m.group(7):
run = paragraph.add_run(m.group(7))
run.font.name = "Courier New"
run.font.name = monoFont
run.font.size = Pt(9)
pos = m.end()
if pos < len(text):
paragraph.add_run(text[pos:])
def _renderInlineRuns(self, runs: list, paragraph, styleSet: Dict[str, Any]) -> None:
"""Process a list of InlineRun dicts into python-docx Runs on a paragraph."""
us = getattr(self, '_unifiedStyle', None)
monoFont = us["fonts"]["monospace"] if us else "Courier New"
for run in runs:
runType = run.get("type", "text")
value = run.get("value", "")
if runType == "text":
paragraph.add_run(value)
elif runType == "bold":
paragraph.add_run(value).bold = True
elif runType == "italic":
paragraph.add_run(value).italic = True
elif runType == "code":
r = paragraph.add_run(value)
r.font.name = monoFont
r.font.size = Pt(9)
elif runType == "link":
r = paragraph.add_run(value)
r.font.underline = True
r.font.color.rgb = RGBColor(0x29, 0x80, 0xB9)
elif runType == "image":
b64 = run.get("base64Data", "")
if b64:
try:
imgBytes = base64.b64decode(b64)
imgStream = io.BytesIO(imgBytes)
paragraph.add_run().add_picture(imgStream, width=Inches(2))
except Exception:
paragraph.add_run(f"[Image: {run.get('altText', '')}]")
else:
paragraph.add_run(value)
def _renderJsonTable(self, doc: Document, table_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
"""
Render a JSON table to DOCX using AI-generated styles.
@ -485,7 +523,7 @@ class RendererDocx(BaseRenderer):
except Exception as e:
self.logger.error(f"Error rendering table: {str(e)}", exc_info=True)
def _renderTableFastXml(self, doc: Document, headers: List[str], rows: List[List[Any]], styles: Dict[str, Any]) -> None:
def _renderTableFastXml(self, doc: Document, headers: list, rows: list, styles: Dict[str, Any]) -> None:
"""
High-performance table rendering using direct XML manipulation.
@ -546,24 +584,34 @@ class RendererDocx(BaseRenderer):
# Build all rows using fast XML
rows_start = time.time()
# Header row
headerRow = self._createTableRowXml(headers, isHeader=True)
# Resolve header style colors
tableStyle = styles.get("table_header", {})
headerBg = tableStyle.get("background", "")
headerFg = tableStyle.get("text_color", "")
# Flatten inline-run headers to plain strings for fast XML path
flatHeaders = []
for h in headers:
runs = self._inlineRunsForCell(h)
flatHeaders.append("".join(r.get("value", "") for r in runs))
headerRow = self._createTableRowXml(flatHeaders, isHeader=True, headerBgHex=headerBg or None, headerFgHex=headerFg or None)
tbl.append(headerRow)
header_time = time.time() - rows_start
self.logger.debug(f"_renderTableFastXml: Header row created in {header_time:.3f}s")
# Data rows - batch process for performance
data_start = time.time()
rowCount = len(rows)
for idx, rowData in enumerate(rows):
# Convert all cells to strings
cellTexts = [str(cell) if cell is not None else '' for cell in rowData]
# Pad if needed
while len(cellTexts) < len(headers):
cellTexts = []
for cell in rowData:
runs = self._inlineRunsForCell(cell)
cellTexts.append("".join(r.get("value", "") for r in runs))
while len(cellTexts) < len(flatHeaders):
cellTexts.append('')
row = self._createTableRowXml(cellTexts, isHeader=False)
tbl.append(row)
@ -641,74 +689,64 @@ class RendererDocx(BaseRenderer):
return tblBorders
def _createTableRowXml(self, cells: list, isHeader: bool = False, headerBgHex: str = None, headerFgHex: str = None) -> Any:
    """Build a ``w:tr`` element directly as XML (fast path for large tables).

    Args:
        cells: Cell texts for the row, one string per column.
        isHeader: When True the row is marked as a repeating table header and
            each cell gets a background fill plus bold, coloured text.
        headerBgHex: Header fill colour as hex, with or without a leading
            ``#``. Defaults to the unified style's ``table.headerBg`` when a
            unified style is set, otherwise ``1F3864``.
        headerFgHex: Header text colour as hex, resolved like ``headerBgHex``
            from ``table.headerFg``, otherwise ``FFFFFF``.

    Returns:
        The populated ``w:tr`` element.
    """
    from docx.oxml.shared import OxmlElement, qn

    def resolveHex(explicit, styleKey, fallback):
        # An explicit colour wins; otherwise use the unified style, then the default.
        if explicit is not None:
            return explicit.lstrip('#')
        us = getattr(self, '_unifiedStyle', None)
        return us["table"][styleKey].lstrip('#') if us else fallback

    tr = OxmlElement('w:tr')

    # Header colours are only consumed by header rows, so data rows (the hot
    # path) skip the style lookups entirely.
    if isHeader:
        headerBgHex = resolveHex(headerBgHex, "headerBg", '1F3864')
        headerFgHex = resolveHex(headerFgHex, "headerFg", 'FFFFFF')
        trPr = OxmlElement('w:trPr')
        trPr.append(OxmlElement('w:tblHeader'))
        tr.append(trPr)

    for cellText in cells:
        tc = OxmlElement('w:tc')

        # Auto-width cell properties.
        tcPr = OxmlElement('w:tcPr')
        tcW = OxmlElement('w:tcW')
        tcW.set(qn('w:type'), 'auto')
        tcW.set(qn('w:w'), '0')
        tcPr.append(tcW)
        if isHeader:
            shd = OxmlElement('w:shd')
            shd.set(qn('w:val'), 'clear')
            shd.set(qn('w:color'), 'auto')
            shd.set(qn('w:fill'), headerBgHex)
            tcPr.append(shd)
        tc.append(tcPr)

        # One paragraph holding one run with the cell text.
        p = OxmlElement('w:p')
        r = OxmlElement('w:r')
        if isHeader:
            rPr = OxmlElement('w:rPr')
            rPr.append(OxmlElement('w:b'))
            color = OxmlElement('w:color')
            color.set(qn('w:val'), headerFgHex)
            rPr.append(color)
            r.append(rPr)

        t = OxmlElement('w:t')
        # Without xml:space="preserve" leading/trailing whitespace is dropped.
        if cellText and (cellText[0].isspace() or cellText[-1].isspace()):
            t.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
        t.text = cellText
        r.append(t)
        p.append(r)
        tc.append(p)
        tr.append(tc)

    return tr
def _applyHorizontalBordersOnly(self, table) -> None:
@ -836,47 +874,37 @@ class RendererDocx(BaseRenderer):
def _renderJsonBulletList(self, doc: Document, list_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
    """Render a JSON bullet-list element as 'List Bullet' paragraphs.

    Args:
        doc: Target python-docx document.
        list_data: Element dict; the items are read from
            ``list_data["content"]["items"]``. Nothing is rendered when
            ``content`` is not a dict.
        styles: Style set; ``styles["bullet_list"]`` may provide
            ``font_size`` (points) and ``color`` (hex string).

    Items given as a list are rendered run-by-run via ``_renderInlineRuns``;
    any other item is flattened to text and parsed as inline markdown.
    Font size and colour are applied to the first run of each paragraph only.
    Errors are logged as warnings and never raised.
    """
    try:
        content = list_data.get("content", {})
        if not isinstance(content, dict):
            return
        items = content.get("items", [])
        bullet_style = styles.get("bullet_list", {})

        # Parse style values once, outside the per-item loop.
        font_size_pt = Pt(bullet_style["font_size"]) if bullet_style.get("font_size") else None
        text_color_rgb = None
        if bullet_style.get("color"):
            color_hex = bullet_style["color"].lstrip('#')
            text_color_rgb = RGBColor(int(color_hex[0:2], 16), int(color_hex[2:4], 16), int(color_hex[4:6], 16))

        for item in items:
            # presumably normalises str / dict / run-list items to a list of
            # inline-run dicts — verify against _inlineRunsForListItem
            itemRuns = self._inlineRunsForListItem(item)
            if not itemRuns or not any(r.get("value") for r in itemRuns):
                continue  # skip items without any visible content

            para = doc.add_paragraph(style='List Bullet')
            if isinstance(item, list):
                self._renderInlineRuns(itemRuns, para, styles)
            else:
                # `or ""` keeps a run whose value is None from breaking the join.
                itemText = "".join(r.get("value") or "" for r in itemRuns)
                self._addMarkdownInlineRuns(para, itemText)

            if bullet_style and para.runs:
                run = para.runs[0]
                if font_size_pt:
                    run.font.size = font_size_pt
                if text_color_rgb:
                    run.font.color.rgb = text_color_rgb
    except Exception as e:
        self.logger.warning(f"Error rendering bullet list: {str(e)}")
@ -905,90 +933,79 @@ class RendererDocx(BaseRenderer):
def _renderJsonParagraph(self, doc: Document, paragraph_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
    """Render a JSON paragraph element into the DOCX document.

    Args:
        doc: Target python-docx document.
        paragraph_data: Element dict; ``content`` may be a dict (runs are
            obtained through ``_inlineRunsFromContent``) or a plain string.
        styles: Style set; ``styles["paragraph"]`` may provide ``font_size``,
            ``color``, ``bold`` and ``align``.

    Text that looks like raw base64 image data is replaced by a red error
    line instead of being dumped into the document. Content carrying an
    ``inlineRuns`` entry is rendered run-by-run; everything else is parsed
    as inline markdown. Size, bold and colour are applied to the first run
    only. Errors are logged as warnings and never raised.
    """
    try:
        content = paragraph_data.get("content", {})
        if isinstance(content, dict):
            inlineRuns = self._inlineRunsFromContent(content)
        elif isinstance(content, str):
            inlineRuns = [{"type": "text", "value": content}]
        else:
            inlineRuns = []
        if not inlineRuns:
            return

        # `or ""` keeps a run whose value is None from breaking the join.
        plainText = "".join(r.get("value") or "" for r in inlineRuns)

        # Guard: JPEG ("/9j/") and PNG ("iVBORw0KGgo") base64 prefixes, or a
        # long prefix made only of base64 alphabet characters.
        if plainText and (plainText.startswith("/9j/") or plainText.startswith("iVBORw0KGgo") or
                          (len(plainText) > 100 and all(c in "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=" for c in plainText[:100]))):
            self.logger.warning(f"Skipping rendering of what appears to be base64 data in paragraph (length: {len(plainText)})")
            para = doc.add_paragraph("[Error: Image data found in text content - image embedding may have failed]")
            if para.runs:
                para.runs[0].font.color.rgb = RGBColor(255, 0, 0)
            return

        para = doc.add_paragraph()
        hasNewRuns = content.get("inlineRuns") if isinstance(content, dict) else None
        if hasNewRuns:
            self._renderInlineRuns(inlineRuns, para, styles)
        else:
            self._addMarkdownInlineRuns(para, plainText)

        paragraph_style = styles.get("paragraph", {})
        if paragraph_style:
            font_size_pt = Pt(paragraph_style["font_size"]) if "font_size" in paragraph_style else None
            text_color_rgb = None
            if "color" in paragraph_style:
                color_hex = paragraph_style["color"].lstrip('#')
                text_color_rgb = RGBColor(int(color_hex[0:2], 16), int(color_hex[2:4], 16), int(color_hex[4:6], 16))
            bold = paragraph_style.get("bold", False)

            if len(para.runs) > 0:
                run = para.runs[0]
                if font_size_pt:
                    run.font.size = font_size_pt
                run.font.bold = bold
                if text_color_rgb:
                    run.font.color.rgb = text_color_rgb

            if "align" in paragraph_style:
                align = paragraph_style["align"]
                if align == "center":
                    para.alignment = WD_ALIGN_PARAGRAPH.CENTER
                elif align == "right":
                    para.alignment = WD_ALIGN_PARAGRAPH.RIGHT
                else:
                    para.alignment = WD_ALIGN_PARAGRAPH.LEFT
    except Exception as e:
        self.logger.warning(f"Error rendering paragraph: {str(e)}")
def _renderJsonCodeBlock(self, doc: Document, code_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
"""Render a JSON code block to DOCX using AI-generated styles."""
try:
# Extract from nested content structure
content = code_data.get("content", {})
if not isinstance(content, dict):
return
code = content.get("code", "")
language = content.get("language", "")
code_style = styles.get("code_block", {})
us = getattr(self, '_unifiedStyle', None)
if code:
if language:
lang_para = doc.add_paragraph(f"Code ({language}):")
if len(lang_para.runs) > 0:
lang_para.runs[0].bold = True
# Pre-calculate and cache style objects
code_font_name = code_style.get("font", "Courier New")
code_font_size_pt = Pt(code_style.get("font_size", 9))
code_font_name = code_style.get("font", us["fonts"]["monospace"] if us else "Courier New")
code_font_size_pt = Pt(code_style.get("font_size", us["codeBlock"]["fontSizePt"] if us else 9))
code_text_color_rgb = None
if "color" in code_style:
color_hex = code_style["color"].lstrip('#')

View file

@ -40,7 +40,7 @@ class RendererHtml(BaseRenderer):
from modules.datamodels.datamodelJson import supportedSectionTypes
return list(supportedSectionTypes)
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]:
"""
Render HTML document with images as separate files.
Returns list of documents: [HTML document, image1, image2, ...]
@ -54,7 +54,7 @@ class RendererHtml(BaseRenderer):
self._renderedImages = images
# Generate HTML using AI-analyzed styling
htmlContent = await self._generateHtmlFromJson(extractedContent, title, userPrompt, aiService)
htmlContent = await self._generateHtmlFromJson(extractedContent, title, userPrompt, aiService, style=style)
# Replace base64 data URIs with relative file paths if images exist
if images:
@ -107,11 +107,16 @@ class RendererHtml(BaseRenderer):
return resultDocuments
async def _generateHtmlFromJson(self, jsonContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> str:
async def _generateHtmlFromJson(self, jsonContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> str:
"""Generate HTML content from structured JSON document using AI-generated styling."""
try:
# Get style set: use styles from metadata if available, otherwise enhance with AI
styles = await self._getStyleSet(jsonContent, userPrompt, aiService)
# Use unified style when provided, otherwise fall back to existing flow
if style:
styles = self._convertUnifiedStyleToInternal(style)
self._unifiedStyle = style
else:
styles = await self._getStyleSet(jsonContent, userPrompt, aiService)
self._unifiedStyle = None
# Validate JSON structure
if not self._validateJsonStructure(jsonContent):
@ -272,6 +277,10 @@ class RendererHtml(BaseRenderer):
def _generateCssStyles(self, styles: Dict[str, Any]) -> str:
"""Generate CSS from style definitions."""
# When unified style is available, generate CSS directly from it
if getattr(self, "_unifiedStyle", None):
return self._generateCssFromUnifiedStyle(self._unifiedStyle)
css_parts = []
# Body styles
@ -368,6 +377,164 @@ class RendererHtml(BaseRenderer):
return '\n'.join(css_parts)
def _generateCssFromUnifiedStyle(self, style: Dict[str, Any]) -> str:
"""Generate CSS directly from unified style dict."""
fonts = style.get("fonts", {})
colors = style.get("colors", {})
headings = style.get("headings", {})
para = style.get("paragraph", {})
tbl = style.get("table", {})
lst = style.get("list", {})
cb = style.get("codeBlock", {})
page = style.get("page", {})
primaryFont = fonts.get("primary", "Arial, sans-serif")
monoFont = fonts.get("monospace", "Courier New, monospace")
bgColor = colors.get("background", "#FFFFFF")
primaryColor = colors.get("primary", "#1F3864")
paraColor = para.get("color", "#333333")
paraSizePt = para.get("sizePt", 11)
lineSpacing = para.get("lineSpacing", 1.15)
css_parts = []
# Body
css_parts.append("body {")
css_parts.append(f" font-family: {primaryFont};")
css_parts.append(f" background: {bgColor};")
css_parts.append(f" color: {paraColor};")
css_parts.append(f" font-size: {paraSizePt}pt;")
css_parts.append(f" line-height: {lineSpacing};")
margins = page.get("marginsPt", {})
if margins:
css_parts.append(f" margin: {margins.get('top', 60)}pt {margins.get('right', 60)}pt {margins.get('bottom', 60)}pt {margins.get('left', 60)}pt;")
else:
css_parts.append(" margin: 0; padding: 20px;")
css_parts.append("}")
# Document title (uses h1 style)
h1 = headings.get("h1", {})
css_parts.append(".document-title {")
css_parts.append(f" font-size: {h1.get('sizePt', 24)}pt;")
css_parts.append(f" color: {h1.get('color', primaryColor)};")
css_parts.append(f" font-weight: {h1.get('weight', 'bold')};")
css_parts.append(" margin: 0 0 1em 0;")
css_parts.append("}")
# Headings h1-h4
for level in range(1, 5):
key = f"h{level}"
h = headings.get(key, h1 if level == 1 else headings.get(f"h{level-1}", {}))
css_parts.append(f"h{level} {{")
css_parts.append(f" font-size: {h.get('sizePt', max(24 - (level-1)*4, 12))}pt;")
css_parts.append(f" color: {h.get('color', primaryColor)};")
css_parts.append(f" font-weight: {h.get('weight', 'bold')};")
css_parts.append(f" margin: 1.2em 0 0.4em 0;")
css_parts.append("}")
# Paragraphs
css_parts.append("p {")
css_parts.append(f" font-size: {paraSizePt}pt;")
css_parts.append(f" color: {paraColor};")
css_parts.append(f" line-height: {lineSpacing};")
css_parts.append(" margin: 0 0 1em 0;")
css_parts.append("}")
# Tables
borderColor = tbl.get("borderColor", "#DEE2E6")
css_parts.append("table {")
css_parts.append(f" border-collapse: collapse;")
css_parts.append(f" width: 100%;")
css_parts.append(f" margin: 1em 0;")
css_parts.append(f" border: 1px solid {borderColor};")
css_parts.append("}")
# Table headers
css_parts.append("th {")
css_parts.append(f" background: {tbl.get('headerBg', '#1F3864')};")
css_parts.append(f" color: {tbl.get('headerFg', '#FFFFFF')};")
css_parts.append(" font-weight: bold;")
css_parts.append(" text-align: center;")
css_parts.append(f" padding: 10px;")
css_parts.append(f" border: 1px solid {borderColor};")
css_parts.append("}")
# Table cells
css_parts.append("td {")
css_parts.append(f" color: {paraColor};")
css_parts.append(" padding: 8px;")
css_parts.append(f" border: 1px solid {borderColor};")
css_parts.append("}")
# Lists
css_parts.append("ul {")
css_parts.append(f" font-size: {lst.get('sizePt', paraSizePt)}pt;")
css_parts.append(f" color: {paraColor};")
css_parts.append(f" padding-left: {lst.get('indentPt', 18)}pt;")
css_parts.append(" margin: 0 0 1em 0;")
css_parts.append("}")
# Code blocks
css_parts.append("pre {")
css_parts.append(f" font-family: {monoFont};")
css_parts.append(f" font-size: {cb.get('fontSizePt', 9)}pt;")
css_parts.append(f" color: {paraColor};")
css_parts.append(f" background: {cb.get('background', '#F8F9FA')};")
css_parts.append(f" border: 1px solid {cb.get('borderColor', '#E2E8F0')};")
css_parts.append(" border-radius: 4px;")
css_parts.append(" padding: 1em;")
css_parts.append(" margin: 1em 0;")
css_parts.append(" overflow-x: auto;")
css_parts.append("}")
# Images
css_parts.append("img {")
css_parts.append(" max-width: 100%;")
css_parts.append(" height: auto;")
css_parts.append(" margin: 1em 0;")
css_parts.append(" border-radius: 4px;")
css_parts.append("}")
# Generated info
css_parts.append(".generated-info {")
css_parts.append(" font-size: 0.9em;")
css_parts.append(" color: #666;")
css_parts.append(" text-align: center;")
css_parts.append(" margin-top: 2em;")
css_parts.append(" padding-top: 1em;")
css_parts.append(" border-top: 1px solid #ddd;")
css_parts.append("}")
return '\n'.join(css_parts)
def _renderInlineRuns(self, runs: list) -> str:
"""Convert inline runs to HTML markup."""
import html as htmlLib
parts = []
for run in runs:
runType = run.get("type", "text")
value = htmlLib.escape(run.get("value", ""))
if runType == "text":
parts.append(value)
elif runType == "bold":
parts.append(f"<strong>{value}</strong>")
elif runType == "italic":
parts.append(f"<em>{value}</em>")
elif runType == "code":
parts.append(f"<code>{value}</code>")
elif runType == "link":
href = htmlLib.escape(run.get("href", ""))
parts.append(f'<a href="{href}">{value}</a>')
elif runType == "image":
b64 = run.get("base64Data", "")
mime = run.get("mimeType", "image/png")
alt = value
if b64:
parts.append(f'<img src="data:{mime};base64,{b64}" alt="{alt}" style="max-width:100%;height:auto;">')
else:
parts.append(value)
return "".join(parts)
def _renderJsonSection(self, section: Dict[str, Any], styles: Dict[str, Any]) -> str:
"""Render a single JSON section to HTML using AI-generated styles.
Supports three content formats: reference, object (base64), extracted_text.
@ -419,6 +586,11 @@ class RendererHtml(BaseRenderer):
# Regular paragraph element - extract from nested content structure (standard JSON format)
content = element.get("content", {})
if isinstance(content, dict):
# New format: inlineRuns
inlineRuns = content.get("inlineRuns")
if inlineRuns and isinstance(inlineRuns, list):
htmlParts.append(f'<p>{self._renderInlineRuns(inlineRuns)}</p>')
continue
text = content.get("text", "")
elif isinstance(content, str):
text = content
@ -495,7 +667,8 @@ class RendererHtml(BaseRenderer):
# Table header
htmlParts.append('<thead><tr>')
for header in headers:
htmlParts.append(f'<th>{header}</th>')
runs = self._inlineRunsForCell(header)
htmlParts.append(f'<th>{self._renderInlineRuns(runs)}</th>')
htmlParts.append('</tr></thead>')
# Table body
@ -503,7 +676,8 @@ class RendererHtml(BaseRenderer):
for row in rows:
htmlParts.append('<tr>')
for cellData in row:
htmlParts.append(f'<td>{cellData}</td>')
runs = self._inlineRunsForCell(cellData)
htmlParts.append(f'<td>{self._renderInlineRuns(runs)}</td>')
htmlParts.append('</tr>')
htmlParts.append('</tbody>')
@ -528,10 +702,8 @@ class RendererHtml(BaseRenderer):
htmlParts = ['<ul>']
for item in items:
if isinstance(item, str):
htmlParts.append(f'<li>{item}</li>')
elif isinstance(item, dict) and "text" in item:
htmlParts.append(f'<li>{item["text"]}</li>')
runs = self._inlineRunsForListItem(item)
htmlParts.append(f'<li>{self._renderInlineRuns(runs)}</li>')
htmlParts.append('</ul>')
return '\n'.join(htmlParts)
@ -571,6 +743,11 @@ class RendererHtml(BaseRenderer):
if isinstance(el, dict):
content = el.get("content", {})
if isinstance(content, dict):
# New format: inlineRuns
inlineRuns = content.get("inlineRuns")
if inlineRuns and isinstance(inlineRuns, list):
texts.append(self._renderInlineRuns(inlineRuns))
continue
text = content.get("text", "")
elif isinstance(content, str):
text = content
@ -581,16 +758,18 @@ class RendererHtml(BaseRenderer):
elif isinstance(el, str):
texts.append(el)
if texts:
# Join multiple paragraphs with <p> tags
return '\n'.join(f'<p>{text}</p>' for text in texts)
return ""
elif isinstance(paragraphData, str):
return f'<p>{paragraphData}</p>'
elif isinstance(paragraphData, dict):
# Handle nested content structure: element.content vs element.text
# Extract from nested content structure
content = paragraphData.get("content", {})
if isinstance(content, dict):
# New format: inlineRuns
inlineRuns = content.get("inlineRuns")
if inlineRuns and isinstance(inlineRuns, list):
return f'<p>{self._renderInlineRuns(inlineRuns)}</p>'
text = content.get("text", "")
elif isinstance(content, str):
text = content

View file

@ -43,7 +43,7 @@ class RendererImage(BaseRenderer):
"""
return ["image"]
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]:
"""Render extracted JSON content to image format using AI image generation."""
try:
# Generate AI image from content

View file

@ -42,7 +42,7 @@ class RendererJson(BaseRenderer):
# Return all types except image
return [st for st in supportedSectionTypes if st != "image"]
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]:
"""Render extracted JSON content to JSON format."""
try:
# The extracted content should already be JSON from the AI

View file

@ -40,7 +40,7 @@ class RendererMarkdown(BaseRenderer):
from modules.datamodels.datamodelJson import supportedSectionTypes
return [st for st in supportedSectionTypes if st != "image"]
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]:
"""Render extracted JSON content to Markdown format."""
try:
# Generate markdown from JSON structure

View file

@ -106,17 +106,17 @@ class RendererPdf(BaseRenderer):
from modules.datamodels.datamodelJson import supportedSectionTypes
return list(supportedSectionTypes)
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]:
"""Render extracted JSON content to PDF format using AI-analyzed styling."""
try:
if not REPORTLAB_AVAILABLE:
# Fallback to HTML if reportlab not available
from .rendererHtml import RendererHtml
html_renderer = RendererHtml()
return await html_renderer.render(extractedContent, title, userPrompt, aiService)
return await html_renderer.render(extractedContent, title, userPrompt, aiService, style=style)
# Generate PDF using AI-analyzed styling
pdf_content = await self._generatePdfFromJson(extractedContent, title, userPrompt, aiService)
pdf_content = await self._generatePdfFromJson(extractedContent, title, userPrompt, aiService, unifiedStyle=style)
# Extract metadata for document type and other info
metadata = extractedContent.get("metadata", {}) if extractedContent else {}
@ -163,11 +163,28 @@ class RendererPdf(BaseRenderer):
)
]
async def _generatePdfFromJson(self, json_content: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> str:
async def _generatePdfFromJson(self, json_content: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, unifiedStyle: Dict[str, Any] = None) -> str:
"""Generate PDF content from structured JSON document using AI-generated styling."""
try:
# Get style set: use styles from metadata if available, otherwise enhance with AI
styles = await self._getStyleSet(json_content, userPrompt, aiService)
# Get style set from unified style or legacy approach
if unifiedStyle:
styles = self._convertUnifiedStyleToInternal(unifiedStyle)
self._unifiedStyle = unifiedStyle
for level in range(1, 7):
hKey = f"heading{level}"
if hKey not in styles:
styles[hKey] = self._defaultHeadingStyleDef(level)
else:
styles[hKey].setdefault("space_after", 12)
styles[hKey].setdefault("space_before", 12)
styles["paragraph"].setdefault("space_after", 6)
styles["paragraph"].setdefault("line_height", unifiedStyle["paragraph"].get("lineSpacing", 1.2))
styles["bullet_list"].setdefault("space_after", 3)
styles["code_block"].setdefault("space_after", 6)
styles["code_block"].setdefault("align", "left")
else:
styles = await self._getStyleSet(json_content, userPrompt, aiService)
self._unifiedStyle = None
# Validate JSON structure
if not self._validateJsonStructure(json_content):
@ -179,15 +196,13 @@ class RendererPdf(BaseRenderer):
# Create a buffer to hold the PDF
buffer = io.BytesIO()
# Create PDF document
doc = SimpleDocTemplate(
buffer,
pagesize=A4,
rightMargin=72,
leftMargin=72,
topMargin=72,
bottomMargin=18
)
# Create PDF document with unified page margins or defaults
pageCfg = unifiedStyle["page"] if unifiedStyle else None
if pageCfg:
m = pageCfg["marginsPt"]
doc = SimpleDocTemplate(buffer, pagesize=A4, rightMargin=m["right"], leftMargin=m["left"], topMargin=m["top"], bottomMargin=m["bottom"])
else:
doc = SimpleDocTemplate(buffer, pagesize=A4, rightMargin=72, leftMargin=72, topMargin=72, bottomMargin=18)
# Build PDF content (no cover page — body starts on page 1; filename still uses `title`)
story = []
@ -232,13 +247,28 @@ class RendererPdf(BaseRenderer):
removed = False
for idx, flowable in enumerate(story):
fRepr = repr(flowable)
if "Image" in fRepr and hasattr(flowable, 'drawWidth') and hasattr(flowable, 'drawHeight'):
from reportlab.platypus import Image as ReportLabImage
if isinstance(flowable, ReportLabImage):
frameH = 650.0
frameW = 450.0
if flowable.drawHeight > frameH or flowable.drawWidth > frameW:
scaleW = frameW / flowable.drawWidth if flowable.drawWidth > frameW else 1.0
scaleH = frameH / flowable.drawHeight if flowable.drawHeight > frameH else 1.0
s = min(scaleW, scaleH) * 0.9
flowable.drawWidth = flowable.drawWidth * s
flowable.drawHeight = flowable.drawHeight * s
flowable._width = flowable.drawWidth
flowable._height = flowable.drawHeight
removed = True
break
if "Table" in fRepr and hasattr(flowable, '_cellvalues'):
try:
nRows = len(flowable._cellvalues)
nCols = len(flowable._cellvalues[0]) if flowable._cellvalues else 0
if nRows == 1 and nCols == 1:
errPara = Paragraph(
"[Code block omitted — content too large for PDF page]",
"[Code block omitted - content too large for PDF page]",
self._createNormalStyle({}),
)
story[idx] = errPara
@ -609,6 +639,31 @@ class RendererPdf(BaseRenderer):
.replace(">", "&gt;")
)
def _renderInlineRunsToPdfXml(self, runs: list) -> str:
"""Convert inline runs to ReportLab Paragraph XML."""
parts = []
us = getattr(self, '_unifiedStyle', None)
monoFont = us["fonts"]["monospace"] if us else "Courier"
for run in runs:
runType = run.get("type", "text")
value = self._escapeReportlabXml(run.get("value", ""))
if runType == "text":
parts.append(value)
elif runType == "bold":
parts.append(f"<b>{value}</b>")
elif runType == "italic":
parts.append(f"<i>{value}</i>")
elif runType == "code":
parts.append(f'<font name="{monoFont}">{value}</font>')
elif runType == "link":
href = self._escapeReportlabXml(run.get("href", ""))
parts.append(f'<a href="{href}">{value}</a>')
elif runType == "image":
parts.append(f"[Image: {value}]")
else:
parts.append(value)
return "".join(parts)
def _applyInlineMarkdownToEscapedPlain(self, text: str) -> str:
"""Escape XML then apply bold/italic to a segment with no `code` spans (code is handled separately)."""
if not text:
@ -744,10 +799,10 @@ class RendererPdf(BaseRenderer):
return []
headers = content.get("headers", [])
rows = content.get("rows", [])
if not headers or not rows:
return []
numCols = len(headers)
colWidth = _PDF_CONTENT_WIDTH_PT / max(numCols, 1)
colWidths = [colWidth] * numCols
@ -755,8 +810,12 @@ class RendererPdf(BaseRenderer):
hdrPs = self._createTableCellParagraphStyle(styles, header=True, tableStyleKey="table_header")
cellPs = self._createTableCellParagraphStyle(styles, header=False, tableStyleKey="table_cell")
def _cellPara(val, ps):
return self._paragraphFromInlineMarkdown(str(val) if val is not None else "", ps)
def _cellPara(cell, ps):
runs = self._inlineRunsForCell(cell)
if isinstance(cell, list):
xml = self._renderInlineRunsToPdfXml(runs)
return Paragraph(_wrapEmojiSpansInXml(xml), ps)
return self._paragraphFromInlineMarkdown(str(cell) if cell is not None else "", ps)
headerRow = [_cellPara(h, hdrPs) for h in headers]
bodyRows = []
@ -786,7 +845,7 @@ class RendererPdf(BaseRenderer):
]
table.setStyle(TableStyle(table_style))
return [table, Spacer(1, 12)]
except Exception as e:
self.logger.warning(f"Error rendering table: {str(e)}")
return []
@ -794,32 +853,29 @@ class RendererPdf(BaseRenderer):
def _renderJsonBulletList(self, list_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
"""Render a JSON bullet list to PDF elements using AI-generated styles."""
try:
# Extract from nested content structure
content = list_data.get("content", {})
if not isinstance(content, dict):
return []
items = content.get("items", [])
bullet_style_def = styles.get("bullet_list", {})
bulletStyleDef = styles.get("bullet_list", {})
normalStyle = self._createNormalStyle(styles)
elements = []
for item in items:
if isinstance(item, str):
elements.append(
Paragraph(f"{self._markdownInlineToReportlabXml(item)}", self._createNormalStyle(styles))
)
runs = self._inlineRunsForListItem(item)
if isinstance(item, list):
xml = self._renderInlineRunsToPdfXml(runs)
elements.append(Paragraph(f"\u2022 {_wrapEmojiSpansInXml(xml)}", normalStyle))
elif isinstance(item, str):
elements.append(Paragraph(f"\u2022 {self._markdownInlineToReportlabXml(item)}", normalStyle))
elif isinstance(item, dict) and "text" in item:
elements.append(
Paragraph(
f"{self._markdownInlineToReportlabXml(item['text'])}",
self._createNormalStyle(styles),
)
)
elements.append(Paragraph(f"\u2022 {self._markdownInlineToReportlabXml(item['text'])}", normalStyle))
if elements:
elements.append(Spacer(1, bullet_style_def.get("space_after", 3)))
elements.append(Spacer(1, bulletStyleDef.get("space_after", 3)))
return elements
except Exception as e:
self.logger.warning(f"Error rendering bullet list: {str(e)}")
return []
@ -848,20 +904,27 @@ class RendererPdf(BaseRenderer):
def _renderJsonParagraph(self, paragraph_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
"""Render a JSON paragraph to PDF elements using AI-generated styles."""
try:
# Extract from nested content structure
content = paragraph_data.get("content", {})
if isinstance(content, dict):
text = content.get("text", "")
elif isinstance(content, str):
text = content
else:
text = ""
if isinstance(content, str):
content = {"text": content}
if not isinstance(content, dict):
return []
normalStyle = self._createNormalStyle(styles)
if "inlineRuns" in content:
runs = self._inlineRunsFromContent(content)
xml = self._renderInlineRunsToPdfXml(runs)
if xml:
return [Paragraph(_wrapEmojiSpansInXml(xml), normalStyle)]
return []
text = content.get("text", "")
if text:
return [self._paragraphFromInlineMarkdown(text, self._createNormalStyle(styles))]
return [self._paragraphFromInlineMarkdown(text, normalStyle)]
return []
except Exception as e:
self.logger.warning(f"Error rendering paragraph: {str(e)}")
return []
@ -1030,20 +1093,18 @@ class RendererPdf(BaseRenderer):
pilImage = PILImage.open(imageStream)
originalWidth, originalHeight = pilImage.size
# Calculate available page dimensions (A4 with margins: 72pt left/right, 72pt top, 18pt bottom)
pageWidth = A4[0] # 595.27 points
pageHeight = A4[1] # 841.89 points
leftMargin = 72
rightMargin = 72
topMargin = 72
bottomMargin = 18
# Use actual frame dimensions from SimpleDocTemplate
# Frame is smaller than page minus margins due to internal spacing
# From error message: frame is 439.27559055118115 x 739.8897637795277
# Use conservative values with safety margin
availableWidth = 430.0 # Slightly smaller than frame width for safety
availableHeight = 730.0 # Slightly smaller than frame height for safety
# Use page dimensions minus margins with generous safety buffer
# A4 = 595.27 x 841.89 pt; frame = page - margins - internal padding
_us = getattr(self, '_unifiedStyle', None) or {}
_pageMgn = (_us.get('page') or {}).get('marginsPt') or {}
marginTop = _pageMgn.get('top', 60)
marginBottom = _pageMgn.get('bottom', 60)
marginLeft = _pageMgn.get('left', 60)
marginRight = _pageMgn.get('right', 60)
availableWidth = pageWidth - marginLeft - marginRight - 20 # 20pt safety
availableHeight = pageHeight - marginTop - marginBottom - 80 # 80pt safety for header/footer
# Convert original image size from pixels to points
# PIL provides size in pixels, need to convert to points

View file

@ -59,7 +59,7 @@ class RendererPptx(BaseRenderer):
from modules.datamodels.datamodelJson import supportedSectionTypes
return list(supportedSectionTypes)
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]:
"""
Render content as PowerPoint presentation from JSON data.
@ -68,7 +68,7 @@ class RendererPptx(BaseRenderer):
title: Title for the presentation
userPrompt: User prompt for AI styling
aiService: AI service for styling
**kwargs: Additional rendering options
style: Unified style dict from pipeline (preferred over AI-generated styles)
Returns:
Base64-encoded PowerPoint presentation as string
@ -81,8 +81,19 @@ class RendererPptx(BaseRenderer):
from pptx.dml.color import RGBColor
import re
# Get style set: use styles from metadata if available, otherwise enhance with AI
styles = await self._getStyleSet(extractedContent, userPrompt, aiService)
# Get style set: prefer unified style, then metadata, then AI-enhanced
if style:
internalStyle = self._convertUnifiedStyleToInternal(style)
defaultPptx = self._getDefaultStyleSet()
for key in ("slide_size", "content_per_slide", "design_theme", "color_scheme", "background_style", "accent_colors", "professional_grade", "executive_ready"):
internalStyle[key] = defaultPptx.get(key)
internalStyle["heading"] = internalStyle["heading1"]
internalStyle["subheading"] = internalStyle["heading2"]
styles = internalStyle
self._unifiedStyle = style
else:
styles = await self._getStyleSet(extractedContent, userPrompt, aiService)
self._unifiedStyle = None
# Create new presentation
prs = Presentation()
@ -910,15 +921,17 @@ JSON ONLY. NO OTHER TEXT."""
# Extract from nested content structure
content = paragraph_data.get("content", {})
if isinstance(content, dict):
text = content.get("text", "")
if content.get("inlineRuns"):
text = "".join(r.get("value", "") for r in content["inlineRuns"])
else:
text = content.get("text", "")
elif isinstance(content, str):
text = content
else:
text = ""
if text:
# Limit paragraph length based on content density
max_length = 200 # Default limit
max_length = 200
if len(text) > max_length:
text = text[:max_length] + "..."
@ -1303,6 +1316,32 @@ JSON ONLY. NO OTHER TEXT."""
r.text = text[pos:]
_applyBase(r)
def _renderInlineRunsPptx(self, runs, paragraph, fontSize=None, fontColor=None):
    """Process InlineRun dicts into pptx text runs.

    The paragraph's existing text is cleared, then one pptx run is added per
    inline run.

    Args:
        runs: Inline-run dicts with "type", "value" and (for links) "href".
            ``None`` is treated as an empty list.
        paragraph: Target python-pptx paragraph; its text is replaced.
        fontSize: Optional font size applied to every run. Code runs are
            scaled to 85% of it (minimum 8pt) when it exposes ``.pt``.
        fontColor: Optional RGB colour applied to every run.
    """
    from pptx.util import Pt
    paragraph.text = ""
    us = getattr(self, '_unifiedStyle', None)
    monoFont = us["fonts"]["monospace"] if us else "Courier New"
    for run in runs or []:
        runType = run.get("type", "text")
        rawValue = run.get("value", "")
        r = paragraph.add_run()
        # A None value renders as empty text instead of raising inside python-pptx.
        r.text = "" if rawValue is None else str(rawValue)
        if fontSize:
            r.font.size = fontSize
        if fontColor:
            r.font.color.rgb = fontColor
        if runType == "bold":
            r.font.bold = True
        elif runType == "italic":
            r.font.italic = True
        elif runType == "code":
            r.font.name = monoFont
            if fontSize and hasattr(fontSize, 'pt'):
                r.font.size = Pt(max(8, int(fontSize.pt * 0.85)))
        elif runType == "link":
            r.font.underline = True
            href = run.get("href")
            if href:
                # Keep the link target so the text is clickable, not just underlined.
                # Best-effort: a rejected URL must not drop the whole paragraph.
                try:
                    r.hyperlink.address = str(href)
                except Exception as e:
                    logger.warning(f"Could not attach hyperlink to pptx run: {str(e)}")
def _addTableToSlide(self, slide, element: Dict[str, Any], styles: Dict[str, Any], top: float = None, max_width: float = None) -> None:
"""Add a PowerPoint table to slide."""
try:
@ -1374,7 +1413,8 @@ JSON ONLY. NO OTHER TEXT."""
cell = table.cell(0, col_idx)
# Clear existing text and set new text
cell.text_frame.clear()
header_text = str(header) if header else ""
cellRuns = self._inlineRunsForCell(header)
header_text = "".join(r.get("value", "") for r in cellRuns)
cell.text = header_text
# Ensure paragraph exists
@ -1420,7 +1460,8 @@ JSON ONLY. NO OTHER TEXT."""
cell = table.cell(row_idx, col_idx)
# Clear existing text and set new text
cell.text_frame.clear()
cell_text = str(cell_data) if cell_data is not None else ""
cellRuns = self._inlineRunsForCell(cell_data)
cell_text = "".join(r.get("value", "") for r in cellRuns)
cell.text = cell_text
# Ensure paragraph exists
@ -1462,9 +1503,8 @@ JSON ONLY. NO OTHER TEXT."""
fontColor = RGBColor(*self._getSafeColor(listStyle.get("color", (47, 47, 47))))
for item in items:
itemText = item.get("text", "") if isinstance(item, dict) else str(item)
if not itemText or not itemText.strip():
continue
runs = self._inlineRunsForListItem(item)
isNewFormat = isinstance(item, list)
p = text_frame.add_paragraph()
p.level = 0
@ -1472,21 +1512,33 @@ JSON ONLY. NO OTHER TEXT."""
p.space_before = Pt(2)
p.space_after = Pt(2)
# Consistent bullet prefix
self._addMarkdownInlineRuns(p, f"{itemText}", fontSize=fontSize, fontColor=fontColor, fontBold=False)
if isNewFormat:
bulletRuns = [{"type": "text", "value": " \u2022 "}] + runs
self._renderInlineRunsPptx(bulletRuns, p, fontSize=fontSize, fontColor=fontColor)
else:
itemText = item.get("text", "") if isinstance(item, dict) else str(item)
if not itemText or not itemText.strip():
continue
self._addMarkdownInlineRuns(p, f" \u2022 {itemText}", fontSize=fontSize, fontColor=fontColor, fontBold=False)
# Subitems
# Subitems (only for dict-style items)
if isinstance(item, dict):
for sub in item.get("subitems", []):
subText = sub.get("text", "") if isinstance(sub, dict) else str(sub)
if not subText:
continue
subRuns = self._inlineRunsForListItem(sub)
isSubNew = isinstance(sub, list)
sp = text_frame.add_paragraph()
sp.level = 0
sp.alignment = PP_ALIGN.LEFT
sp.space_before = Pt(1)
sp.space_after = Pt(1)
self._addMarkdownInlineRuns(sp, f" {subText}", fontSize=fontSize, fontColor=fontColor, fontBold=False)
if isSubNew:
subBulletRuns = [{"type": "text", "value": " \u2013 "}] + subRuns
self._renderInlineRunsPptx(subBulletRuns, sp, fontSize=fontSize, fontColor=fontColor)
else:
subText = sub.get("text", "") if isinstance(sub, dict) else str(sub)
if not subText:
continue
self._addMarkdownInlineRuns(sp, f" \u2013 {subText}", fontSize=fontSize, fontColor=fontColor, fontBold=False)
except Exception as e:
logger.warning(f"Error adding bullet list to slide: {str(e)}")
@ -1540,42 +1592,53 @@ JSON ONLY. NO OTHER TEXT."""
# Extract from nested content structure
content = element.get("content", {})
if isinstance(content, dict):
inlineRuns = self._inlineRunsFromContent(content)
hasInlineRuns = content.get("inlineRuns") is not None
text = content.get("text", "")
elif isinstance(content, str):
text = content
inlineRuns = [{"type": "text", "value": text}] if text else []
hasInlineRuns = False
else:
text = ""
inlineRuns = []
hasInlineRuns = False
if text:
p = text_frame.add_paragraph()
p.level = 0
try:
if hasattr(p, 'paragraph_format'):
p.paragraph_format.bullet.type = None
except (AttributeError, TypeError):
pass
paragraph_style = styles.get("paragraph", {})
base_font_size = paragraph_style.get("font_size", 14)
calculated_size = max(10, int(base_font_size * font_size_multiplier))
fSize = Pt(calculated_size)
fColor = RGBColor(*self._getSafeColor(paragraph_style.get("color", (47, 47, 47))))
fBold = paragraph_style.get("bold", False)
if not inlineRuns and not text:
return
p = text_frame.add_paragraph()
p.level = 0
try:
if hasattr(p, 'paragraph_format'):
p.paragraph_format.bullet.type = None
except (AttributeError, TypeError):
pass
paragraph_style = styles.get("paragraph", {})
base_font_size = paragraph_style.get("font_size", 14)
calculated_size = max(10, int(base_font_size * font_size_multiplier))
fSize = Pt(calculated_size)
fColor = RGBColor(*self._getSafeColor(paragraph_style.get("color", (47, 47, 47))))
fBold = paragraph_style.get("bold", False)
if hasInlineRuns:
self._renderInlineRunsPptx(inlineRuns, p, fontSize=fSize, fontColor=fColor)
else:
self._addMarkdownInlineRuns(p, text, fontSize=fSize, fontColor=fColor, fontBold=fBold)
# Add proper spacing
p.space_before = Pt(6) # Space before paragraph
p.space_after = Pt(6) # Space after paragraph
p.line_spacing = 1.2 # Line spacing for readability
align = paragraph_style.get("align", "left")
if align == "center":
p.alignment = PP_ALIGN.CENTER
elif align == "right":
p.alignment = PP_ALIGN.RIGHT
else:
p.alignment = PP_ALIGN.LEFT
p.space_before = Pt(6)
p.space_after = Pt(6)
p.line_spacing = 1.2
align = paragraph_style.get("align", "left")
if align == "center":
p.alignment = PP_ALIGN.CENTER
elif align == "right":
p.alignment = PP_ALIGN.RIGHT
else:
p.alignment = PP_ALIGN.LEFT
except Exception as e:
logger.warning(f"Error adding paragraph to slide: {str(e)}")

View file

@ -76,7 +76,7 @@ class RendererText(BaseRenderer):
# Text renderer accepts all types except images
return [st for st in supportedSectionTypes if st != "image"]
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]:
"""Render extracted JSON content to plain text format."""
try:
# Generate text from JSON structure

View file

@ -68,17 +68,17 @@ class RendererXlsx(BaseRenderer):
from modules.datamodels.datamodelJson import supportedSectionTypes
return list(supportedSectionTypes)
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]:
"""Render extracted JSON content to Excel format using AI-analyzed styling."""
try:
if not OPENPYXL_AVAILABLE:
# Fallback to CSV if openpyxl not available
from .rendererCsv import RendererCsv
csvRenderer = RendererCsv()
return await csvRenderer.render(extractedContent, title, userPrompt, aiService)
return await csvRenderer.render(extractedContent, title, userPrompt, aiService, style=style)
# Generate Excel using AI-analyzed styling
excelContent = await self._generateExcelFromJson(extractedContent, title, userPrompt, aiService)
excelContent = await self._generateExcelFromJson(extractedContent, title, userPrompt, aiService, style=style)
# Extract metadata for document type and other info
metadata = extractedContent.get("metadata", {}) if extractedContent else {}
@ -298,15 +298,22 @@ class RendererXlsx(BaseRenderer):
except Exception as e:
self.logger.warning(f"Could not populate analysis sheet: {str(e)}")
async def _generateExcelFromJson(self, jsonContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> str:
async def _generateExcelFromJson(self, jsonContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> str:
"""Generate Excel content from structured JSON document using AI-generated styling."""
try:
# Debug output
self.services.utils.debugLogToFile(f"EXCEL JSON CONTENT TYPE: {type(jsonContent)}", "EXCEL_RENDERER")
self.services.utils.debugLogToFile(f"EXCEL JSON CONTENT KEYS: {list(jsonContent.keys()) if isinstance(jsonContent, dict) else 'Not a dict'}", "EXCEL_RENDERER")
# Get style set: use styles from metadata if available, otherwise enhance with AI
styles = await self._getStyleSet(jsonContent, userPrompt, aiService)
# Store unified style for use by inline-run helpers
self._unifiedStyle = style
# Get style set: prefer unified style, fall back to legacy approach
if style:
styles = self._convertUnifiedStyleToInternal(style)
styles = self._convertColorsFormat(styles)
else:
styles = await self._getStyleSet(jsonContent, userPrompt, aiService)
# Validate JSON structure (standardized schema: {metadata: {...}, documents: [{sections: [...]}]})
if not self._validateJsonStructure(jsonContent):
@ -511,6 +518,10 @@ class RendererXlsx(BaseRenderer):
"code_block": {"font": "Courier New", "font_size": 10, "color": "FF2F2F2F", "background": "FFF5F5F5"}
}
def _renderInlineRuns(self, runs: list) -> str:
"""Flatten inline runs to plain text for Excel cells."""
return "".join(r.get("value", "") for r in runs)
async def _getAiStylesWithExcelColors(self, aiService, styleTemplate: str, defaultStyles: Dict[str, Any]) -> Dict[str, Any]:
"""Get AI styles with proper Excel color conversion."""
if not aiService:
@ -1206,7 +1217,9 @@ class RendererXlsx(BaseRenderer):
# Add headers with formatting - OPTIMIZED: use cached style objects
for col, header in enumerate(headers, 1):
sanitized_header = self._sanitizeCellValue(header)
runs = self._inlineRunsForCell(header)
headerText = self._renderInlineRuns(runs)
sanitized_header = self._sanitizeCellValue(headerText)
cell = sheet.cell(row=headerRow, column=col, value=sanitized_header)
# Apply styling with fallbacks - use pre-calculated objects
@ -1272,7 +1285,9 @@ class RendererXlsx(BaseRenderer):
cell_values = cell_values[:header_count]
for col, cell_value in enumerate(cell_values, 1):
sanitized_value = self._sanitizeCellValue(cell_value)
runs = self._inlineRunsForCell(cell_value)
cellText = self._renderInlineRuns(runs)
sanitized_value = self._sanitizeCellValue(cellText)
cell = sheet.cell(row=startRow, column=col, value=sanitized_value)
# Apply styling with fallbacks - use pre-calculated objects
@ -1311,20 +1326,20 @@ class RendererXlsx(BaseRenderer):
def _addListToExcel(self, sheet, element: Dict[str, Any], styles: Dict[str, Any], startRow: int) -> int:
"""Add a list element to Excel sheet. Expects nested content structure."""
try:
# Extract from nested content structure
content = element.get("content", {})
if not isinstance(content, dict):
return startRow
list_items = content.get("items") or []
# Ensure list_items is a list
if not isinstance(list_items, list):
list_items = []
listItems = content.get("items") or []
if not isinstance(listItems, list):
listItems = []
list_style = styles.get("bullet_list", {})
for item in list_items:
sheet.cell(row=startRow, column=1, value=f"{item}")
if list_style.get("color"):
sheet.cell(row=startRow, column=1).font = Font(color=self._getSafeColor(list_style["color"]))
listStyle = styles.get("bullet_list", {})
for item in listItems:
runs = self._inlineRunsForListItem(item)
text = self._renderInlineRuns(runs)
sheet.cell(row=startRow, column=1, value=f"\u2022 {text}")
if listStyle.get("color"):
sheet.cell(row=startRow, column=1).font = Font(color=self._getSafeColor(listStyle["color"]))
startRow += 1
return startRow
@ -1336,10 +1351,10 @@ class RendererXlsx(BaseRenderer):
def _addParagraphToExcel(self, sheet, element: Dict[str, Any], styles: Dict[str, Any], startRow: int) -> int:
"""Add a paragraph element to Excel sheet. Expects nested content structure."""
try:
# Extract from nested content structure
content = element.get("content", {})
if isinstance(content, dict):
text = content.get("text", "")
runs = self._inlineRunsFromContent(content)
text = self._renderInlineRuns(runs)
elif isinstance(content, str):
text = content
else:

View file

@ -0,0 +1,75 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Default style definitions and style resolution for document rendering."""
from typing import Any, Dict
# Baseline style used when no overrides are supplied. Sizes, margins and
# spacing keys carry a "Pt" suffix; colours are "#RRGGBB" hex strings.
DEFAULT_STYLE: Dict[str, Any] = {
    # Font families for body text and code.
    "fonts": {
        "primary": "Calibri",
        "monospace": "Consolas",
    },
    # Document-wide colour palette.
    "colors": {
        "primary": "#1F3864",
        "secondary": "#2C3E50",
        "accent": "#2980B9",
        "background": "#FFFFFF",
    },
    # Per-level heading typography.
    "headings": {
        "h1": {
            "sizePt": 24,
            "weight": "bold",
            "color": "#1F3864",
            "spaceBeforePt": 12,
            "spaceAfterPt": 6,
        },
        "h2": {
            "sizePt": 18,
            "weight": "bold",
            "color": "#1F3864",
            "spaceBeforePt": 10,
            "spaceAfterPt": 4,
        },
        "h3": {
            "sizePt": 14,
            "weight": "bold",
            "color": "#2C3E50",
            "spaceBeforePt": 8,
            "spaceAfterPt": 3,
        },
        "h4": {
            "sizePt": 12,
            "weight": "bold",
            "color": "#2C3E50",
            "spaceBeforePt": 6,
            "spaceAfterPt": 2,
        },
    },
    "paragraph": {
        "sizePt": 11,
        "lineSpacing": 1.15,
        "color": "#333333",
    },
    # Table header, body, banding and border settings.
    "table": {
        "headerBg": "#1F3864",
        "headerFg": "#FFFFFF",
        "headerSizePt": 10,
        "bodySizePt": 10,
        "rowBandingEven": "#F2F6FC",
        "rowBandingOdd": "#FFFFFF",
        "borderColor": "#CBD5E1",
        "borderWidthPt": 0.5,
    },
    "list": {
        "bulletChar": "\u2022",
        "indentPt": 18,
        "sizePt": 11,
    },
    "image": {
        "defaultWidthPt": 480,
        "maxWidthPt": 800,
        "alignment": "center",
    },
    "codeBlock": {
        "fontSizePt": 9,
        "background": "#F8F9FA",
        "borderColor": "#E2E8F0",
    },
    # Page geometry plus header/footer content.
    "page": {
        "format": "A4",
        "marginsPt": {
            "top": 60,
            "bottom": 60,
            "left": 60,
            "right": 60,
        },
        "showPageNumbers": True,
        "headerHeight": 30,
        "footerHeight": 30,
        "headerLogo": None,
        "headerText": "",
        "footerText": "",
    },
}
def _deepMerge(base: Dict[str, Any], override: Dict[str, Any]) -> Dict[str, Any]:
"""Recursively merge override into base. Both dicts left unchanged; returns new dict."""
result = {}
for key in base:
if key in override:
baseVal = base[key]
overVal = override[key]
if isinstance(baseVal, dict) and isinstance(overVal, dict):
result[key] = _deepMerge(baseVal, overVal)
else:
result[key] = overVal
else:
result[key] = base[key]
for key in override:
if key not in base:
result[key] = override[key]
return result
def resolveStyle(agentStyle: dict | None) -> Dict[str, Any]:
    """Deep-merge DEFAULT_STYLE <- agentStyle. Returns fully resolved style dict.

    Args:
        agentStyle: Partial style overrides; ``None`` or an empty dict selects
            the defaults unchanged.

    Returns:
        A style dict that shares no nested objects with ``DEFAULT_STYLE`` or
        ``agentStyle``, so callers may mutate it freely.
    """
    import copy

    if not agentStyle:
        # dict(DEFAULT_STYLE) would only copy the top level and leave nested
        # sections shared with the module-level default.
        return copy.deepcopy(DEFAULT_STYLE)
    return copy.deepcopy(_deepMerge(DEFAULT_STYLE, agentStyle))

View file

@ -9,11 +9,70 @@ from typing import Any, Dict
logger = logging.getLogger(__name__)
def _parseInlineRuns(text: str) -> list:
"""
Parse inline markdown formatting into a list of InlineRun dicts.
Handles: images, links, bold, italic, inline code, plain text.
Uses a regex-based tokenizer that processes tokens left-to-right.
"""
if not text:
return [{"type": "text", "value": ""}]
# Pattern order matters: images before links, bold before italic
_TOKEN_RE = re.compile(
r'!\[(?P<imgAlt>[^\]]*)\]\((?P<imgSrc>[^)"]+)(?:\s+"(?P<imgWidth>\d+)pt")?\)' # image
r'|\[(?P<linkText>[^\]]+)\]\((?P<linkHref>[^)]+)\)' # link
r'|`(?P<code>[^`]+)`' # inline code
r'|\*\*(?P<bold>.+?)\*\*' # bold
r'|(?<!\w)\*(?P<italic1>.+?)\*(?!\w)' # italic *x*
r'|(?<!\w)_(?P<italic2>.+?)_(?!\w)' # italic _x_
)
runs = []
lastEnd = 0
for m in _TOKEN_RE.finditer(text):
# Plain text before this match
if m.start() > lastEnd:
runs.append({"type": "text", "value": text[lastEnd:m.start()]})
if m.group("imgAlt") is not None or m.group("imgSrc") is not None:
alt = (m.group("imgAlt") or "").strip() or "Image"
src = (m.group("imgSrc") or "").strip()
widthStr = m.group("imgWidth")
run = {"type": "image", "value": alt}
if src.startswith("file:"):
run["fileId"] = src[5:]
else:
run["href"] = src
if widthStr:
run["widthPt"] = int(widthStr)
runs.append(run)
elif m.group("linkText") is not None:
runs.append({"type": "link", "value": m.group("linkText"), "href": m.group("linkHref")})
elif m.group("code") is not None:
runs.append({"type": "code", "value": m.group("code")})
elif m.group("bold") is not None:
runs.append({"type": "bold", "value": m.group("bold")})
elif m.group("italic1") is not None:
runs.append({"type": "italic", "value": m.group("italic1")})
elif m.group("italic2") is not None:
runs.append({"type": "italic", "value": m.group("italic2")})
lastEnd = m.end()
# Trailing plain text
if lastEnd < len(text):
runs.append({"type": "text", "value": text[lastEnd:]})
return runs if runs else [{"type": "text", "value": text}]
def markdownToDocumentJson(markdown: str, title: str, language: str = "de") -> Dict[str, Any]:
"""
Convert markdown content to the standard document JSON format expected by renderReport.
Supports headings, code blocks, tables, lists, images (file: refs), paragraphs.
For plain text: wraps entire content in a single paragraph section.
Convert markdown content to the standard document JSON format with Inline-Run model.
Sections use inlineRuns (list of run dicts) instead of plain text strings.
Supports headings, code blocks, tables, lists, images, paragraphs.
"""
if not isinstance(markdown, str):
markdown = str(markdown) if markdown else ""
@ -31,7 +90,7 @@ def markdownToDocumentJson(markdown: str, title: str, language: str = "de") -> D
while i < len(lines):
line = lines[i]
# Headings
# Headings (plain text, no inline formatting)
headingMatch = re.match(r"^(#{1,6})\s+(.+)", line)
if headingMatch:
level = len(headingMatch.group(1))
@ -43,7 +102,7 @@ def markdownToDocumentJson(markdown: str, title: str, language: str = "de") -> D
i += 1
continue
# Fenced code blocks
# Fenced code blocks (no inline formatting)
codeMatch = re.match(r"^```(\w*)", line)
if codeMatch:
lang = codeMatch.group(1) or "text"
@ -59,14 +118,14 @@ def markdownToDocumentJson(markdown: str, title: str, language: str = "de") -> D
})
continue
# Tables
# Tables - cells are List[InlineRun]
tableMatch = re.match(r"^\|(.+)\|$", line)
if tableMatch and (i + 1) < len(lines) and re.match(r"^\|[\s\-:|]+\|$", lines[i + 1]):
headerCells = [c.strip() for c in tableMatch.group(1).split("|")]
headerCells = [_parseInlineRuns(c.strip()) for c in tableMatch.group(1).split("|")]
i += 2
rows = []
while i < len(lines) and re.match(r"^\|(.+)\|$", lines[i]):
rowCells = [c.strip() for c in lines[i][1:-1].split("|")]
rowCells = [_parseInlineRuns(c.strip()) for c in lines[i][1:-1].split("|")]
rows.append(rowCells)
i += 1
sections.append({
@ -75,14 +134,14 @@ def markdownToDocumentJson(markdown: str, title: str, language: str = "de") -> D
})
continue
# Bullet / numbered lists
# Bullet / numbered lists - items are List[List[InlineRun]]
listMatch = re.match(r"^(\s*)([-*+]|\d+[.)]) (.+)", line)
if listMatch:
isNumbered = bool(re.match(r"\d+[.)]", listMatch.group(2)))
items = []
while i < len(lines) and re.match(r"^(\s*)([-*+]|\d+[.)]) (.+)", lines[i]):
m = re.match(r"^(\s*)([-*+]|\d+[.)]) (.+)", lines[i])
items.append({"text": m.group(3).strip()})
items.append(_parseInlineRuns(m.group(3).strip()))
i += 1
sections.append({
"id": _nextId(), "content_type": "bullet_list", "order": order,
@ -95,46 +154,50 @@ def markdownToDocumentJson(markdown: str, title: str, language: str = "de") -> D
i += 1
continue
# Images (simplified: store as paragraph with ref for now - full resolution needs Knowledge Store)
imgMatch = re.match(r"^!\[([^\]]*)\]\(([^)]+)\)", line)
# Standalone image on its own line -> block-level image section
imgMatch = re.match(r"^!\[([^\]]*)\]\(([^)\"]+)(?:\s+\"(\d+)pt\")?\)\s*$", line)
if imgMatch:
altText = imgMatch.group(1).strip() or "Image"
src = imgMatch.group(2).strip()
widthStr = imgMatch.group(3)
fileId = src[5:] if src.startswith("file:") else ""
content = {
"altText": altText,
"base64Data": "",
"_fileRef": fileId,
"_srcUrl": src if not fileId else "",
}
if widthStr:
content["widthPt"] = int(widthStr)
sections.append({
"id": _nextId(), "content_type": "image", "order": order,
"elements": [{
"content": {
"altText": altText,
"base64Data": "",
"_fileRef": fileId,
"_srcUrl": src if not fileId else "",
}
}],
"elements": [{"content": content}],
})
i += 1
continue
# Paragraph
# Paragraph - produces inlineRuns
paraLines = []
while i < len(lines) and lines[i].strip() and not re.match(
r"^(#{1,6}\s|```|\|.+\||!\[|(\s*)([-*+]|\d+[.)]) )", lines[i]
r"^(#{1,6}\s|```|\|.+\||!\[[^\]]*\]\([^)]+\)\s*$|(\s*)([-*+]|\d+[.)]) )", lines[i]
):
paraLines.append(lines[i])
i += 1
if paraLines:
combinedText = " ".join(paraLines)
sections.append({
"id": _nextId(), "content_type": "paragraph", "order": order,
"elements": [{"content": {"text": " ".join(paraLines)}}],
"elements": [{"content": {"inlineRuns": _parseInlineRuns(combinedText)}}],
})
continue
i += 1
if not sections:
fallbackText = markdown.strip() or "(empty)"
sections.append({
"id": _nextId(), "content_type": "paragraph", "order": order,
"elements": [{"content": {"text": markdown.strip() or "(empty)"}}],
"elements": [{"content": {"inlineRuns": _parseInlineRuns(fallbackText)}}],
})
return {

View file

@ -2,9 +2,13 @@
# All rights reserved.
"""Knowledge service: 3-tier RAG with indexing, semantic search, and context building."""
import hashlib
import json
import logging
import re
from typing import Any, Callable, Dict, List, Optional
import time
from dataclasses import dataclass, field
from typing import Any, Callable, Dict, List, Optional, Union
from modules.datamodels.datamodelKnowledge import (
FileContentIndex, ContentChunk, WorkflowMemory,
@ -20,6 +24,68 @@ DEFAULT_CHUNK_TOKENS = 400
DEFAULT_CONTEXT_BUDGET = 12000
# =============================================================================
# Ingestion façade (P0 of unified-knowledge-indexing concept)
# =============================================================================
@dataclass
class IngestionJob:
    """One request to add or refresh content in the unified knowledge store.

    Callers from any lane (routes, feature hooks, agent tools, connector sync)
    describe the work they want done via this object; idempotency, scope
    resolution, and embedding are handled by KnowledgeService.requestIngestion.
    """
    # sourceKind and sourceId together form the job id ("<sourceKind>:<sourceId>")
    # in requestIngestion; sourceId is also the key used there to look up an
    # existing FileContentIndex row.
    sourceKind: str
    sourceId: str
    fileName: str
    mimeType: str
    # NOTE(review): presumably the owner of the ingested content - confirm against callers.
    userId: str
    # Extracted parts; hashed in order via their "contentType" and "data" entries
    # when no contentVersion is supplied.
    contentObjects: List[Dict[str, Any]] = field(default_factory=list)
    # NOTE(review): featureInstanceId / mandateId look like scope identifiers - confirm.
    featureInstanceId: str = ""
    mandateId: str = ""
    structure: Optional[Dict[str, Any]] = None
    containerPath: Optional[str] = None
    # When set, requestIngestion uses this value instead of the computed content hash.
    contentVersion: Optional[str] = None
    provenance: Optional[Dict[str, Any]] = None
    # Connector-driven neutralization: True when the user opted in via §2.6 preferences.
    # For sourceKind == "file", _indexFileInternal resolves this from FileItem.neutralize instead.
    neutralize: bool = False
@dataclass
class IngestionHandle:
    """Result of requestIngestion. Stable across in-process and future queue impls."""
    # NOTE(review): presumably the "<sourceKind>:<sourceId>" id built in requestIngestion - confirm.
    jobId: str
    # NOTE(review): set of possible status values is not visible here; "indexed" is
    # mentioned by requestIngestion's docstring - confirm the full list.
    status: str
    # NOTE(review): presumably the hash (or caller-supplied contentVersion) used for
    # the idempotency check - confirm.
    contentHash: str
    fileId: str
    # Optional index record and error text; both default to None.
    index: Optional[FileContentIndex] = None
    error: Optional[str] = None
def _computeIngestionHash(contentObjects: List[Dict[str, Any]]) -> str:
"""Deterministic SHA256 over (contentType, data) tuples in extractor order.
`contentObjectId` is intentionally excluded because extractors generate
fresh UUIDs per run (`uuid.uuid4()`), which would make the hash unstable
across re-extractions of the same source defeating idempotency.
Order is preserved (no sort) because two different documents can share the
same multiset of parts but differ in arrangement (e.g. swapped pages).
Text whitespace is preserved intentionally because chunk boundaries
depend on it.
"""
normalized = [
(
str(o.get("contentType", "text") or "text"),
o.get("data", "") or "",
)
for o in (contentObjects or [])
]
payload = json.dumps(normalized, ensure_ascii=False, separators=(",", ":"))
return hashlib.sha256(payload.encode("utf-8")).hexdigest()
class KnowledgeService:
"""Service for Knowledge Store operations: indexing, retrieval, and context building."""
@ -46,6 +112,224 @@ class KnowledgeService:
results = await self._embed([text])
return results[0] if results else []
# =========================================================================
# Ingestion façade (single entry point for all lanes)
# =========================================================================
async def requestIngestion(self, job: IngestionJob) -> IngestionHandle:
    """Unified entry point for filling the knowledge corpus.

    Applies idempotency based on a content hash (or caller-supplied
    `contentVersion`) persisted in `FileContentIndex.structure._ingestion`.
    Indexing is re-run only when the hash differs or the previous run did
    not reach `indexed` state. Indexing is awaited inline (callers already
    schedule background tasks where needed).

    Args:
        job: Description of the content to ingest.

    Returns:
        An IngestionHandle with status
        - "duplicate": an index row with the same hash and status "indexed"
          already exists; nothing was re-indexed and `index` is None,
        - "indexed": indexing ran; `index` holds the result,
        - "failed": indexing raised; `error` holds `str(exc)`.
        Indexing errors are reported through the handle, not raised.
    """
    jobId = f"{job.sourceKind}:{job.sourceId}"
    # Monotonic clock: durations must not be distorted by wall-clock jumps.
    startedAt = time.monotonic()

    def _elapsedMs() -> int:
        return int((time.monotonic() - startedAt) * 1000)

    contentObjects = job.contentObjects or []
    contentHash = job.contentVersion or _computeIngestionHash(contentObjects)
    baseExtra = {
        "jobId": jobId,
        "sourceKind": job.sourceKind,
        "sourceId": job.sourceId,
        "hash": contentHash,
    }

    # 1. Check for duplicate via existing FileContentIndex row.
    try:
        existing = self._knowledgeDb.getFileContentIndex(job.sourceId)
    except Exception as lookupErr:
        # Best effort: a failed lookup only costs a re-index, but leave a trace.
        logger.debug(
            "ingestion.lookup_failed sourceKind=%s sourceId=%s error=%s",
            job.sourceKind, job.sourceId, lookupErr,
        )
        existing = None

    if existing:
        # The DB layer may hand back a dict or a model object.
        existingStructure = (
            existing.get("structure") if isinstance(existing, dict)
            else getattr(existing, "structure", {})
        ) or {}
        existingMeta = existingStructure.get("_ingestion", {}) or {}
        existingStatus = (
            existing.get("status") if isinstance(existing, dict)
            else getattr(existing, "status", "")
        ) or ""
        if existingMeta.get("hash") == contentHash and existingStatus == "indexed":
            logger.info(
                "ingestion.skipped.duplicate sourceKind=%s sourceId=%s hash=%s",
                job.sourceKind, job.sourceId, contentHash[:12],
                extra={
                    **baseExtra,
                    "event": "ingestion.skipped.duplicate",
                    "durationMs": _elapsedMs(),
                },
            )
            return IngestionHandle(
                jobId=jobId,
                status="duplicate",
                contentHash=contentHash,
                fileId=job.sourceId,
                index=None,
            )

    # 2. Prepare ingestion metadata; stays in structure._ingestion so
    #    later connector revoke/purge can filter chunks by sourceKind /
    #    provenance.connectionId without a schema migration.
    structure = dict(job.structure or {})
    structure["_ingestion"] = {
        "hash": contentHash,
        "sourceKind": job.sourceKind,
        "sourceId": job.sourceId,
        "contentVersion": job.contentVersion,
        "indexedAt": getUtcTimestamp(),
        "provenance": dict(job.provenance or {}),
    }

    objectCount = len(contentObjects)
    logger.info(
        "ingestion.queued sourceKind=%s sourceId=%s objects=%d hash=%s",
        job.sourceKind, job.sourceId, objectCount, contentHash[:12],
        extra={
            **baseExtra,
            "event": "ingestion.queued",
            "objectCount": objectCount,
        },
    )

    # 3. Run real indexing.
    try:
        index = await self._indexFileInternal(
            fileId=job.sourceId,
            fileName=job.fileName,
            mimeType=job.mimeType,
            userId=job.userId,
            featureInstanceId=job.featureInstanceId,
            mandateId=job.mandateId,
            contentObjects=contentObjects,
            structure=structure,
            containerPath=job.containerPath,
            sourceKind=job.sourceKind,
            connectionId=(job.provenance or {}).get("connectionId"),
            neutralize=job.neutralize,
        )
    except Exception as exc:
        logger.error(
            "ingestion.failed sourceKind=%s sourceId=%s error=%s",
            job.sourceKind, job.sourceId, exc,
            exc_info=True,
            extra={
                **baseExtra,
                "event": "ingestion.failed",
                "error": str(exc),
                "durationMs": _elapsedMs(),
            },
        )
        try:
            self._knowledgeDb.updateFileStatus(job.sourceId, "failed")
        except Exception as statusErr:
            # Still best effort, but a row stuck in a stale status is worth a log line.
            logger.warning(
                "ingestion.failed.status_update_failed sourceId=%s error=%s",
                job.sourceId, statusErr,
            )
        return IngestionHandle(
            jobId=jobId,
            status="failed",
            contentHash=contentHash,
            fileId=job.sourceId,
            index=None,
            error=str(exc),
        )

    # Measure once so the message and the structured field agree.
    durationMs = _elapsedMs()
    logger.info(
        "ingestion.indexed sourceKind=%s sourceId=%s objects=%d durationMs=%d",
        job.sourceKind, job.sourceId, objectCount, durationMs,
        extra={
            **baseExtra,
            "event": "ingestion.indexed",
            "objectCount": objectCount,
            "durationMs": durationMs,
        },
    )
    return IngestionHandle(
        jobId=jobId,
        status="indexed",
        contentHash=contentHash,
        fileId=job.sourceId,
        index=index,
    )
def purgeConnection(self, connectionId: str) -> Dict[str, int]:
    """Delete every FileContentIndex + ContentChunk linked to a UserConnection.

    Called on `connection.revoked` events so the knowledge corpus never
    holds chunks the user has withdrawn access to.

    Args:
        connectionId: Id of the revoked connection. Empty/None is a no-op.

    Returns:
        The deletion counts reported by the DB layer, always containing the
        keys "indexRows" and "chunks" (0 when the DB layer did not report
        them), for observability.
    """
    if not connectionId:
        return {"indexRows": 0, "chunks": 0}

    startedAt = time.monotonic()
    raw = self._knowledgeDb.deleteFileContentIndexByConnectionId(connectionId)

    # The delete has already happened at this point: read the counts
    # defensively (like the connection.revoked consumer does) so a missing
    # key cannot turn a successful purge into an exception.
    counts = raw if isinstance(raw, dict) else {}
    indexRows = counts.get("indexRows", 0) or 0
    chunks = counts.get("chunks", 0) or 0
    durationMs = int((time.monotonic() - startedAt) * 1000)

    logger.info(
        "ingestion.connection.purged connectionId=%s rows=%d chunks=%d durationMs=%d",
        connectionId, indexRows, chunks, durationMs,
        extra={
            "event": "ingestion.connection.purged",
            "connectionId": connectionId,
            "indexRows": indexRows,
            "chunks": chunks,
            "durationMs": durationMs,
        },
    )
    return {**counts, "indexRows": indexRows, "chunks": chunks}
def getIngestionStatus(
    self, handleOrJobId: Union[IngestionHandle, str]
) -> Dict[str, Any]:
    """Map a handle or `sourceKind:sourceId` jobId to a status snapshot.

    Accepted inputs:
    - an IngestionHandle (its `fileId` is looked up),
    - a jobId string `sourceKind:sourceId` (everything after the first ":"
      is looked up),
    - a bare sourceId string.

    Source ids may themselves contain ":" (connector ids such as
    `cu:<conn>:<token>`), so a string that does not resolve as a jobId is
    retried verbatim as a sourceId before "unknown" is reported.

    Returns:
        Dict with "jobId", "sourceId", "status" and "contentHash"; when a row
        was found also "sourceKind" and "indexedAt". Status is "unknown"
        when no row exists or the row carries no status.
    """
    if isinstance(handleOrJobId, IngestionHandle):
        jobId = handleOrJobId.jobId
        candidates = [handleOrJobId.fileId]
    else:
        jobId = handleOrJobId if isinstance(handleOrJobId, str) else str(handleOrJobId)
        candidates = []
        if isinstance(handleOrJobId, str) and ":" in handleOrJobId:
            candidates.append(handleOrJobId.split(":", 1)[1])
        if jobId not in candidates:
            # Fallback: the whole string may be a raw sourceId.
            candidates.append(jobId)

    # Reported when nothing is found: the primary interpretation.
    sourceId = candidates[0]
    row = None
    for candidate in candidates:
        try:
            found = self._knowledgeDb.getFileContentIndex(candidate)
        except Exception as lookupErr:
            logger.debug(
                "getIngestionStatus lookup failed sourceId=%s error=%s",
                candidate, lookupErr,
            )
            found = None
        if found:
            row, sourceId = found, candidate
            break

    if not row:
        return {
            "jobId": jobId,
            "sourceId": sourceId,
            "status": "unknown",
            "contentHash": None,
        }

    # The DB layer may hand back a dict or a model object.
    structure = (
        row.get("structure") if isinstance(row, dict)
        else getattr(row, "structure", {})
    ) or {}
    meta = structure.get("_ingestion", {}) or {}
    status = (
        row.get("status") if isinstance(row, dict)
        else getattr(row, "status", "")
    ) or "unknown"
    return {
        "jobId": jobId,
        "sourceId": sourceId,
        "status": status,
        "contentHash": meta.get("hash"),
        "sourceKind": meta.get("sourceKind"),
        "indexedAt": meta.get("indexedAt"),
    }
# =========================================================================
# File Indexing (called after extraction, before embedding)
# =========================================================================
@ -61,6 +345,57 @@ class KnowledgeService:
contentObjects: List[Dict[str, Any]] = None,
structure: Dict[str, Any] = None,
containerPath: str = None,
) -> Optional[FileContentIndex]:
"""Backward-compatible wrapper delegating to requestIngestion.
Existing callers that still invoke `indexFile` directly automatically
participate in the idempotency/metrics layer. New callers should
prefer `requestIngestion` so they can pass `sourceKind` and
`provenance` for connector revoke/purge later.
"""
job = IngestionJob(
sourceKind="file",
sourceId=fileId,
fileName=fileName,
mimeType=mimeType,
userId=userId,
featureInstanceId=featureInstanceId,
mandateId=mandateId,
contentObjects=list(contentObjects or []),
structure=structure,
containerPath=containerPath,
)
handle = await self.requestIngestion(job)
if handle.index is not None:
return handle.index
if handle.status == "duplicate":
row = None
try:
row = self._knowledgeDb.getFileContentIndex(fileId)
except Exception:
row = None
if isinstance(row, dict):
try:
return FileContentIndex(**row)
except Exception:
return None
return row
return None
async def _indexFileInternal(
self,
fileId: str,
fileName: str,
mimeType: str,
userId: str,
featureInstanceId: str = "",
mandateId: str = "",
contentObjects: List[Dict[str, Any]] = None,
structure: Dict[str, Any] = None,
containerPath: str = None,
sourceKind: str = "file",
connectionId: Optional[str] = None,
neutralize: bool = False,
) -> FileContentIndex:
"""Index a file's content objects and create embeddings for text chunks.
@ -83,39 +418,41 @@ class KnowledgeService:
"""
contentObjects = contentObjects or []
# 1. Resolve scope fields from FileItem (Single Source of Truth)
# FileItem lives in poweron_management; its scope/mandateId/featureInstanceId
# are authoritative and must be mirrored onto the FileContentIndex.
# 1. Resolve scope fields from FileItem (Single Source of Truth) for
# uploaded files. Connector-sourced ingestion (sharepoint_item,
# outlook_message, ...) has no FileItem row — trust the caller's
# scope + ids directly.
resolvedScope = "personal"
resolvedMandateId = mandateId
resolvedFeatureInstanceId = featureInstanceId
resolvedUserId = userId
_shouldNeutralize = False
try:
from modules.datamodels.datamodelFiles import FileItem as _FileItem
_dbComponent = getattr(self._context, "interfaceDbComponent", None)
_fileRecords = _dbComponent.getRecordset(_FileItem, recordFilter={"id": fileId}) if _dbComponent else []
if not _fileRecords:
from modules.interfaces.interfaceDbManagement import ComponentObjects
_row = ComponentObjects().db._loadRecord(_FileItem, fileId)
if _row:
_fileRecords = [_row]
if _fileRecords:
_fileRecord = _fileRecords[0]
_get = (lambda k, d=None: _fileRecord.get(k, d)) if isinstance(_fileRecord, dict) else (lambda k, d=None: getattr(_fileRecord, k, d))
_shouldNeutralize = bool(_get("neutralize", False))
_fileScope = _get("scope")
if _fileScope:
resolvedScope = _fileScope
if not resolvedMandateId:
resolvedMandateId = str(_get("mandateId", "") or "")
if not resolvedFeatureInstanceId:
resolvedFeatureInstanceId = str(_get("featureInstanceId", "") or "")
_fileCreatedBy = _get("sysCreatedBy")
if _fileCreatedBy:
resolvedUserId = str(_fileCreatedBy)
except Exception:
pass
_shouldNeutralize = neutralize # caller-supplied flag (connector prefs / IngestionJob)
if sourceKind == "file":
try:
from modules.datamodels.datamodelFiles import FileItem as _FileItem
_dbComponent = getattr(self._context, "interfaceDbComponent", None)
_fileRecords = _dbComponent.getRecordset(_FileItem, recordFilter={"id": fileId}) if _dbComponent else []
if not _fileRecords:
from modules.interfaces.interfaceDbManagement import ComponentObjects
_row = ComponentObjects().db._loadRecord(_FileItem, fileId)
if _row:
_fileRecords = [_row]
if _fileRecords:
_fileRecord = _fileRecords[0]
_get = (lambda k, d=None: _fileRecord.get(k, d)) if isinstance(_fileRecord, dict) else (lambda k, d=None: getattr(_fileRecord, k, d))
_shouldNeutralize = bool(_get("neutralize", False)) # FileItem is authoritative for uploads
_fileScope = _get("scope")
if _fileScope:
resolvedScope = _fileScope
if not resolvedMandateId:
resolvedMandateId = str(_get("mandateId", "") or "")
if not resolvedFeatureInstanceId:
resolvedFeatureInstanceId = str(_get("featureInstanceId", "") or "")
_fileCreatedBy = _get("sysCreatedBy")
if _fileCreatedBy:
resolvedUserId = str(_fileCreatedBy)
except Exception:
pass
# 2. Create FileContentIndex with correct scope from the start
index = FileContentIndex(
@ -124,6 +461,8 @@ class KnowledgeService:
featureInstanceId=resolvedFeatureInstanceId,
mandateId=resolvedMandateId,
scope=resolvedScope,
sourceKind=sourceKind,
connectionId=connectionId,
fileName=fileName,
mimeType=mimeType,
containerPath=containerPath,
@ -300,7 +639,12 @@ class KnowledgeService:
Formatted context string for injection into the agent's system prompt.
"""
queryVector = await self._embedSingle(currentPrompt)
logger.debug(
"buildAgentContext.start userId=%s featureInstanceId=%s mandateId=%s isSysAdmin=%s prompt=%r",
userId, featureInstanceId, mandateId, isSysAdmin, (currentPrompt or "")[:120],
)
if not queryVector:
logger.debug("buildAgentContext.abort reason=no_query_vector")
return ""
builder = _ContextBuilder(budget=contextBudget)
@ -327,9 +671,14 @@ class KnowledgeService:
featureInstanceId=featureInstanceId,
mandateId=mandateId,
limit=15,
minScore=0.65,
minScore=0.35,
isSysAdmin=isSysAdmin,
)
logger.debug(
"buildAgentContext.layer1 instanceChunks=%d top_scores=%s",
len(instanceChunks),
[round(float(c.get("_score", 0) or 0), 3) for c in (instanceChunks or [])[:3]],
)
if instanceChunks:
builder.add(priority=1, label="Relevant Documents", items=instanceChunks, maxChars=4000)
@ -338,7 +687,7 @@ class KnowledgeService:
queryVector=queryVector,
workflowId=workflowId,
limit=10,
minScore=0.55,
minScore=0.35,
)
if roundMemories:
memItems = []
@ -376,7 +725,7 @@ class KnowledgeService:
scope="mandate",
mandateId=mandateId,
limit=10,
minScore=0.7,
minScore=0.35,
isSysAdmin=isSysAdmin,
)
if mandateChunks:
@ -392,7 +741,12 @@ class KnowledgeService:
maxChars=500,
)
return builder.build()
_result = builder.build()
logger.debug(
"buildAgentContext.done totalChars=%d userId=%s",
len(_result), userId,
)
return _result
# =========================================================================
# Workflow Memory

View file

@ -0,0 +1,334 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Connection-lifecycle consumer bridging OAuth events to ingestion jobs.
Subscribes to `connection.established` and `connection.revoked` callbacks
emitted by the OAuth callbacks / connection management routes and dispatches:
- `connection.established` -> enqueue a `connection.bootstrap` BackgroundJob
that walks the connector and ingests all reachable items via
KnowledgeService.requestIngestion (file-like or virtual documents).
- `connection.revoked` -> run `KnowledgeService.purgeConnection` synchronously
so the knowledge corpus releases the data before the UI confirms the revoke.
The consumer is registered once at process boot (see `app.py` lifespan).
It intentionally does NOT hold a per-user service context; each callback
creates whatever context it needs from the UserConnection row itself.
"""
from __future__ import annotations
import asyncio
import logging
from typing import Any, Dict, Optional
from modules.interfaces.interfaceDbKnowledge import getInterface as getKnowledgeInterface
from modules.shared.callbackRegistry import callbackRegistry
from modules.serviceCenter.services.serviceBackgroundJobs import (
registerJobHandler,
startJob,
)
logger = logging.getLogger(__name__)
BOOTSTRAP_JOB_TYPE = "connection.bootstrap"
_registered = False
# Strong references to in-flight enqueue tasks. The event loop only keeps weak
# references to tasks, so a fire-and-forget task that nobody references may be
# garbage-collected before it finishes.
_pendingBootstrapEnqueues = set()


def _onConnectionEstablished(
    *,
    connectionId: str,
    authority: str,
    userId: Optional[str] = None,
    **kwargs: Any,
) -> None:
    """Fire-and-forget bootstrap enqueue for a freshly connected UserConnection.

    Args:
        connectionId: Id of the new connection; the event is ignored (with a
            warning) when it is empty.
        authority: Provider key; lower-cased before it goes into the job payload.
        userId: Owner of the connection; also recorded as `triggeredBy`.
        **kwargs: Extra event fields, ignored.

    Enqueue failures are logged, never raised: the callback must not break the
    OAuth flow that emitted the event.
    """
    if not connectionId:
        logger.warning("connection.established without connectionId; ignoring")
        return

    payload: Dict[str, Any] = {
        "connectionId": connectionId,
        "authority": (authority or "").lower(),
        "userId": userId,
    }
    logger.info(
        "ingestion.connection.bootstrap.queued connectionId=%s authority=%s",
        connectionId, authority,
        extra={
            "event": "ingestion.connection.bootstrap.queued",
            "connectionId": connectionId,
            "authority": authority,
        },
    )

    async def _enqueue() -> None:
        try:
            await startJob(
                BOOTSTRAP_JOB_TYPE,
                payload,
                triggeredBy=userId,
            )
        except Exception as exc:
            logger.error(
                "ingestion.connection.bootstrap.enqueue_failed connectionId=%s error=%s",
                connectionId, exc, exc_info=True,
            )

    try:
        # get_running_loop() instead of the deprecated get_event_loop():
        # it raises RuntimeError when this thread has no running loop.
        loop = asyncio.get_running_loop()
    except RuntimeError:
        # Called from a sync context (e.g. a worker thread): run to completion here.
        asyncio.run(_enqueue())
        return

    task = loop.create_task(_enqueue())
    _pendingBootstrapEnqueues.add(task)
    task.add_done_callback(_pendingBootstrapEnqueues.discard)
def _onConnectionRevoked(
    *,
    connectionId: str,
    authority: Optional[str] = None,
    userId: Optional[str] = None,
    reason: Optional[str] = None,
    **kwargs: Any,
) -> None:
    """Purge all knowledge data of a revoked connection, synchronously.

    The purge runs inline (not as a background job) so that whatever the UI
    reports after the revoke reflects the actual state of the corpus.
    A failing purge is logged and swallowed; an empty `connectionId` is
    ignored with a warning. `userId` and extra event fields are unused.
    """
    if not connectionId:
        logger.warning("connection.revoked without connectionId; ignoring")
        return

    try:
        # Go through the DB interface directly: no ServiceCenter / user-context
        # plumbing is needed here, and the service method is only a thin
        # wrapper around this call.
        knowledgeDb = getKnowledgeInterface(None)
        purgeCounts = knowledgeDb.deleteFileContentIndexByConnectionId(connectionId)
    except Exception as exc:
        logger.error(
            "ingestion.connection.purged.failed connectionId=%s error=%s",
            connectionId, exc, exc_info=True,
        )
        return

    indexRows = purgeCounts.get("indexRows", 0)
    chunks = purgeCounts.get("chunks", 0)
    logger.info(
        "ingestion.connection.purged connectionId=%s authority=%s reason=%s rows=%d chunks=%d",
        connectionId, authority, reason, indexRows, chunks,
        extra={
            "event": "ingestion.connection.purged",
            "connectionId": connectionId,
            "authority": authority,
            "reason": reason,
            "indexRows": indexRows,
            "chunks": chunks,
        },
    )
async def _bootstrapJobHandler(
    job: Dict[str, Any],
    progressCb,
) -> Dict[str, Any]:
    """Dispatch bootstrap by authority. Each authority runs its own sub-bootstraps.

    Args:
        job: Background job record; `job["payload"]` must carry `connectionId`
            and should carry `authority` ("msft", "google" or "clickup").
        progressCb: Called as `progressCb(percent, message)`.

    Returns:
        Dict with `connectionId`, `authority` and one entry per sub-bootstrap
        (its result dict, or `{"error": ...}` when that part failed), or
        `{"skipped": True, "reason": ...}` when consent is disabled or the
        authority is not supported.

    Raises:
        ValueError: when the payload has no `connectionId`.
    """
    payload = job.get("payload") or {}
    connectionId = payload.get("connectionId")
    authority = (payload.get("authority") or "").lower()
    if not connectionId:
        raise ValueError("connection.bootstrap requires payload.connectionId")

    progressCb(5, f"resolving {authority} connection")

    # Defensive consent check: if the connection has since disabled knowledge ingestion
    # (e.g. user toggled setting after the job was enqueued), skip all walkers.
    try:
        from modules.interfaces.interfaceDbApp import getRootInterface
        _root = getRootInterface()
        _conn = _root.getUserConnectionById(connectionId)
        if _conn and not getattr(_conn, "knowledgeIngestionEnabled", True):
            logger.info(
                "ingestion.connection.bootstrap.skipped — consent disabled connectionId=%s",
                connectionId,
                extra={
                    "event": "ingestion.connection.bootstrap.skipped",
                    "connectionId": connectionId,
                    "authority": authority,
                    "reason": "consent_disabled",
                },
            )
            return {"connectionId": connectionId, "authority": authority, "skipped": True, "reason": "consent_disabled"}
    except Exception as _guardErr:
        # The guard is advisory: if the connection cannot be loaded, proceed.
        logger.debug("Could not load connection for consent guard: %s", _guardErr)

    def _normalize(res: Any, label: str) -> Dict[str, Any]:
        # gather(return_exceptions=True) also hands back CancelledError for a
        # cancelled child, and that is a BaseException, not an Exception.
        # Checking BaseException keeps raw exception objects out of the
        # job result.
        if isinstance(res, BaseException):
            logger.error(
                "ingestion.connection.bootstrap.failed part=%s connectionId=%s error=%s",
                label, connectionId, res, exc_info=res,
            )
            return {"error": str(res)}
        return res or {}

    if authority == "msft":
        from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncSharepoint import (
            bootstrapSharepoint,
        )
        from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncOutlook import (
            bootstrapOutlook,
        )

        progressCb(10, "sharepoint + outlook")
        # Run both walkers concurrently; one failing must not abort the other.
        spResult, olResult = await asyncio.gather(
            bootstrapSharepoint(connectionId=connectionId, progressCb=progressCb),
            bootstrapOutlook(connectionId=connectionId, progressCb=progressCb),
            return_exceptions=True,
        )
        return {
            "connectionId": connectionId,
            "authority": authority,
            "sharepoint": _normalize(spResult, "sharepoint"),
            "outlook": _normalize(olResult, "outlook"),
        }

    if authority == "google":
        from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGdrive import (
            bootstrapGdrive,
        )
        from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGmail import (
            bootstrapGmail,
        )

        progressCb(10, "drive + gmail")
        gdResult, gmResult = await asyncio.gather(
            bootstrapGdrive(connectionId=connectionId, progressCb=progressCb),
            bootstrapGmail(connectionId=connectionId, progressCb=progressCb),
            return_exceptions=True,
        )
        return {
            "connectionId": connectionId,
            "authority": authority,
            "drive": _normalize(gdResult, "gdrive"),
            "gmail": _normalize(gmResult, "gmail"),
        }

    if authority == "clickup":
        from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncClickup import (
            bootstrapClickup,
        )

        progressCb(10, "clickup tasks")
        # Single walker: an exception here propagates and fails the job.
        cuResult = await bootstrapClickup(connectionId=connectionId, progressCb=progressCb)
        return {
            "connectionId": connectionId,
            "authority": authority,
            "clickup": _normalize(cuResult, "clickup"),
        }

    logger.info(
        "ingestion.connection.bootstrap.skipped reason=unsupported_authority authority=%s connectionId=%s",
        authority, connectionId,
        extra={
            "event": "ingestion.connection.bootstrap.skipped",
            "authority": authority,
            "connectionId": connectionId,
            "reason": "unsupported_authority",
        },
    )
    return {
        "connectionId": connectionId,
        "authority": authority,
        "skipped": True,
        "reason": "unsupported_authority",
    }
async def _scheduledDailyResync() -> None:
    """Queue one `connection.bootstrap` job per active knowledge connection.

    Intended to be driven by the daily cron registered in
    `registerDailyResyncScheduler`. Each queued job re-walks its connector;
    per the ingestion façade, items whose content hash is unchanged are
    skipped there, so only new or changed items are re-indexed.

    Failures are contained: if the connection list cannot be loaded the run
    is abandoned with an error log; if a single enqueue fails it is counted
    as skipped and the loop continues.
    """
    try:
        from modules.interfaces.interfaceDbApp import getRootInterface
        connections = getRootInterface().getActiveKnowledgeConnections()
    except Exception as exc:
        logger.error("knowledge.daily_resync: could not load connections: %s", exc, exc_info=True)
        return

    if not connections:
        logger.info("knowledge.daily_resync: no active knowledge connections — nothing to do")
        return

    total = len(connections)
    logger.info(
        "knowledge.daily_resync: enqueuing bootstrap for %d connection(s)",
        total,
        extra={"event": "knowledge.daily_resync.started", "count": total},
    )

    enqueued = 0
    skipped = 0
    for conn in connections:
        connectionId = str(conn.id)
        # authority may be an enum (use its value) or already a plain string.
        rawAuthority = conn.authority
        if hasattr(rawAuthority, "value"):
            authority = rawAuthority.value
        else:
            authority = str(rawAuthority)
        userId = str(conn.userId)

        payload: Dict[str, Any] = {
            "connectionId": connectionId,
            "authority": authority.lower(),
            "userId": userId,
        }
        try:
            await startJob(
                BOOTSTRAP_JOB_TYPE,
                payload,
                triggeredBy="scheduler.daily_resync",
            )
        except Exception as exc:
            skipped += 1
            logger.error(
                "knowledge.daily_resync: failed to enqueue connectionId=%s: %s",
                connectionId, exc,
            )
        else:
            enqueued += 1
            logger.debug(
                "knowledge.daily_resync: queued connectionId=%s authority=%s",
                connectionId, authority,
            )

    logger.info(
        "knowledge.daily_resync: done — enqueued=%d skipped=%d",
        enqueued, skipped,
        extra={"event": "knowledge.daily_resync.done", "enqueued": enqueued, "skipped": skipped},
    )
def registerDailyResyncScheduler(*, hour: int = 2, minute: int = 0) -> None:
    """Register the daily knowledge re-sync cron job under id "knowledge.daily_resync".

    Registration problems are logged as a warning and never raised, so a
    missing scheduler cannot block application start-up.

    Args:
        hour: Hour of day to run (0-23, default 2, i.e. 02:00).
        minute: Minute within the hour (default 0).

    NOTE(review): the log line states Europe/Zurich, but no timezone is passed
    here — the effective timezone, and whether a repeated registration is
    idempotent, depend on `eventManager.registerCron`; confirm there.
    """
    try:
        from modules.shared.eventManagement import eventManager

        schedule = {"hour": str(hour), "minute": str(minute)}
        eventManager.registerCron(
            jobId="knowledge.daily_resync",
            func=_scheduledDailyResync,
            cronKwargs=schedule,
        )
    except Exception as exc:
        logger.warning("knowledge.daily_resync scheduler registration failed (non-critical): %s", exc)
        return

    logger.info(
        "knowledge.daily_resync scheduler registered (daily %02d:%02d Europe/Zurich)",
        hour, minute,
    )
def registerKnowledgeIngestionConsumer() -> None:
    """Wire the consumer into the process: callbacks, job handler, daily resync.

    Safe to call more than once — a module-level flag makes every call after
    the first successful one a no-op.
    """
    global _registered
    if not _registered:
        subscriptions = (
            ("connection.established", _onConnectionEstablished),
            ("connection.revoked", _onConnectionRevoked),
        )
        for eventName, handler in subscriptions:
            callbackRegistry.register(eventName, handler)
        registerJobHandler(BOOTSTRAP_JOB_TYPE, _bootstrapJobHandler)
        registerDailyResyncScheduler()
        # Set the flag only after everything above succeeded.
        _registered = True
        logger.info("KnowledgeIngestionConsumer registered (established/revoked + %s handler + daily resync)", BOOTSTRAP_JOB_TYPE)

View file

@ -0,0 +1,101 @@
"""Per-connection knowledge ingestion preference helpers.
Walkers call `loadConnectionPrefs(connectionId)` once at bootstrap start and
receive a `ConnectionIngestionPrefs` dataclass they can pass down into their
inner loops. All fields have safe defaults so walkers stay backward-compatible
with connections that predate the §2.6 preference schema (knowledgePreferences
is None).
"""
from __future__ import annotations
import logging
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)
_DEFAULT_MAX_AGE_DAYS = 90
_DEFAULT_MAIL_DEPTH = "full"
_DEFAULT_CLICKUP_SCOPE = "title_description"
@dataclass
class ConnectionIngestionPrefs:
    """Parsed per-connection preferences for knowledge ingestion walkers.

    Every field has a default, so `ConnectionIngestionPrefs()` is what
    `loadConnectionPrefs` returns for connections without stored preferences.
    """
    # PII: neutralize content before it is embedded.
    neutralizeBeforeEmbed: bool = False
    # Mail (Outlook + Gmail)
    mailContentDepth: str = _DEFAULT_MAIL_DEPTH  # "metadata" | "snippet" | "full"
    mailIndexAttachments: bool = False
    # Files (Drive / SharePoint / OneDrive)
    filesIndexBinaries: bool = True
    mimeAllowlist: List[str] = field(default_factory=list)  # empty = all allowed
    # ClickUp
    clickupScope: str = _DEFAULT_CLICKUP_SCOPE  # "titles" | "title_description" | "with_comments"
    clickupIndexAttachments: bool = False
    # Per-authority surface toggles (default everything on)
    gmailEnabled: bool = True
    driveEnabled: bool = True
    sharepointEnabled: bool = True
    outlookEnabled: bool = True
    # Time window
    maxAgeDays: int = _DEFAULT_MAX_AGE_DAYS  # 0 = no limit
def loadConnectionPrefs(connectionId: str) -> ConnectionIngestionPrefs:
    """Load and parse per-connection preferences from the database.

    Each value is validated on its own: a missing or malformed entry falls
    back to that field's default without discarding the other preferences
    (in particular `neutralizeBeforeEmbed`). If the connection cannot be
    loaded at all, plain defaults are returned, so walkers never fail due to
    preference data.

    Args:
        connectionId: Id of the UserConnection whose `knowledgePreferences`
            should be read.

    Returns:
        The parsed preferences, or `ConnectionIngestionPrefs()` when the
        connection is unknown, has no preferences, or loading failed.
    """
    try:
        from modules.interfaces.interfaceDbApp import getRootInterface
        root = getRootInterface()
        conn = root.getUserConnectionById(connectionId)
        if not conn:
            logger.debug("loadConnectionPrefs: connection %s not found, using defaults", connectionId)
            return ConnectionIngestionPrefs()

        raw: Optional[Dict[str, Any]] = getattr(conn, "knowledgePreferences", None)
        if not raw or not isinstance(raw, dict):
            return ConnectionIngestionPrefs()

        def _bool(key: str, default: bool) -> bool:
            v = raw.get(key)
            return v if isinstance(v, bool) else default

        def _str(key: str, allowed: List[str], default: str) -> str:
            v = raw.get(key)
            return v if v in allowed else default

        def _int(key: str, default: int) -> int:
            v = raw.get(key)
            # bool is a subclass of int; True must not be read as "1 day".
            if isinstance(v, int) and not isinstance(v, bool):
                return int(v)
            return default

        def _section(container: Dict[str, Any], key: str) -> Dict[str, Any]:
            # A malformed (non-dict) section counts as absent instead of
            # raising and thereby wiping every other preference.
            v = container.get(key)
            return v if isinstance(v, dict) else {}

        def _mimeAllowlist() -> List[str]:
            v = raw.get("mimeAllowlist")
            if isinstance(v, str):
                # list("application/pdf") would yield single characters.
                return [v] if v else []
            if isinstance(v, (list, tuple, set)):
                return [m for m in v if isinstance(m, str)]
            return []

        surface = _section(raw, "surfaceToggles")
        google_surf = _section(surface, "google")
        msft_surf = _section(surface, "msft")

        return ConnectionIngestionPrefs(
            neutralizeBeforeEmbed=_bool("neutralizeBeforeEmbed", False),
            mailContentDepth=_str("mailContentDepth", ["metadata", "snippet", "full"], _DEFAULT_MAIL_DEPTH),
            mailIndexAttachments=_bool("mailIndexAttachments", False),
            filesIndexBinaries=_bool("filesIndexBinaries", True),
            mimeAllowlist=_mimeAllowlist(),
            clickupScope=_str("clickupScope", ["titles", "title_description", "with_comments"], _DEFAULT_CLICKUP_SCOPE),
            clickupIndexAttachments=_bool("clickupIndexAttachments", False),
            gmailEnabled=bool(google_surf.get("gmail", True)),
            driveEnabled=bool(google_surf.get("drive", True)),
            sharepointEnabled=bool(msft_surf.get("sharepoint", True)),
            outlookEnabled=bool(msft_surf.get("outlook", True)),
            maxAgeDays=_int("maxAgeDays", _DEFAULT_MAX_AGE_DAYS),
        )
    except Exception as exc:
        logger.warning("loadConnectionPrefs failed for %s, using defaults: %s", connectionId, exc)
        return ConnectionIngestionPrefs()

View file

@ -0,0 +1,512 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""ClickUp bootstrap for the unified knowledge ingestion lane.
ClickUp tasks are ingested as *virtual documents* — we never download file
bytes. Each task becomes a `sourceKind="clickup_task"` IngestionJob whose
`contentObjects` carry a summary header (name + status + metadata) and the
task description / text content so retrieval finds them without a live API
call.
Hierarchy traversal: workspace (team) → spaces → folders / folderless lists →
tasks. We cap the fan-out with `maxWorkspaces` / `maxListsPerWorkspace` /
`maxTasks` and skip tasks older than `maxAgeDays` (default 180 d).
Idempotency: `date_updated` from the ClickUp task payload is a millisecond
timestamp and strictly monotonic per revision — used as `contentVersion`.
"""
from __future__ import annotations
import hashlib
import logging
import time
from dataclasses import dataclass, field
from datetime import datetime, timedelta, timezone
from typing import Any, Callable, Dict, List, Optional
logger = logging.getLogger(__name__)
MAX_TASKS_DEFAULT = 500
MAX_WORKSPACES_DEFAULT = 3
MAX_LISTS_PER_WORKSPACE_DEFAULT = 20
MAX_DESCRIPTION_CHARS_DEFAULT = 8000
MAX_AGE_DAYS_DEFAULT = 180
@dataclass
class ClickupBootstrapLimits:
    """Fan-out caps and content options for one ClickUp bootstrap run."""
    # The workspace loop stops once indexed + duplicate-skipped tasks reach this.
    maxTasks: int = MAX_TASKS_DEFAULT
    # Only the first N authorized workspaces (teams) are walked.
    maxWorkspaces: int = MAX_WORKSPACES_DEFAULT
    # Presumably caps the lists visited per workspace — applied outside this view; verify.
    maxListsPerWorkspace: int = MAX_LISTS_PER_WORKSPACE_DEFAULT
    # Description and text_content are each truncated to this many characters.
    maxDescriptionChars: int = MAX_DESCRIPTION_CHARS_DEFAULT
    # Only ingest tasks updated within the last N days. None disables filter.
    maxAgeDays: Optional[int] = MAX_AGE_DAYS_DEFAULT
    # Include closed/archived tasks if they still meet the recency filter.
    # ClickUp `closed` tasks often carry the most useful RAG context
    # ("why was this shipped the way it was?").
    includeClosed: bool = True
    # Pass-through to IngestionJob.neutralize
    neutralize: bool = False
    # Content scope: "titles" | "title_description" | "with_comments"
    clickupScope: str = "title_description"
@dataclass
class ClickupBootstrapResult:
    """Counters and error messages collected during one ClickUp bootstrap run."""
    connectionId: str
    # Per-task outcome counters; indexed + skippedDuplicate is what the
    # workspace loop compares against the maxTasks cap.
    indexed: int = 0
    skippedDuplicate: int = 0
    # Presumably tasks dropped by recency/scope policy — incremented outside this view; verify.
    skippedPolicy: int = 0
    failed: int = 0
    # Workspaces (teams) with a usable id that were visited.
    workspaces: int = 0
    lists: int = 0
    # Human-readable error strings (e.g. "teams: <exc>").
    errors: List[str] = field(default_factory=list)
def _syntheticTaskId(connectionId: str, taskId: str) -> str:
token = hashlib.sha256(f"{connectionId}:{taskId}".encode("utf-8")).hexdigest()[:16]
return f"cu:{connectionId[:8]}:{token}"
def _truncate(value: Any, limit: int) -> str:
text = str(value or "").strip()
if not text:
return ""
if len(text) <= limit:
return text
return text[:limit].rstrip() + "\n[truncated]"
def _isRecent(dateUpdatedMs: Any, maxAgeDays: Optional[int]) -> bool:
if not maxAgeDays:
return True
if not dateUpdatedMs:
return True
try:
ts = datetime.fromtimestamp(int(dateUpdatedMs) / 1000.0, tz=timezone.utc)
except Exception:
return True
cutoff = datetime.now(timezone.utc) - timedelta(days=maxAgeDays)
return ts >= cutoff
def _buildContentObjects(task: Dict[str, Any], limits: ClickupBootstrapLimits) -> List[Dict[str, Any]]:
"""Header (name/status/metadata) + optional description + text_content.
`limits.clickupScope` controls how much is embedded:
- "titles": task name + status metadata only
- "title_description": header + description / text_content (default)
- "with_comments": header + description + text_content
(comments themselves are not yet fetched in v1)
"""
name = task.get("name") or f"Task {task.get('id', '')}"
status = ((task.get("status") or {}).get("status")) or ""
assignees = ", ".join(
filter(None, [
(a.get("username") or a.get("email") or "")
for a in (task.get("assignees") or [])
])
)
tags = ", ".join(filter(None, [t.get("name", "") for t in (task.get("tags") or [])]))
listInfo = task.get("list") or {}
folderInfo = task.get("folder") or {}
spaceInfo = task.get("space") or {}
dueMs = task.get("due_date")
dueIso = ""
if dueMs:
try:
dueIso = datetime.fromtimestamp(int(dueMs) / 1000.0, tz=timezone.utc).strftime("%Y-%m-%d")
except Exception:
dueIso = ""
headerLines = [
f"Task: {name}",
f"Status: {status}" if status else "",
f"List: {listInfo.get('name', '')}" if listInfo else "",
f"Folder: {folderInfo.get('name', '')}" if folderInfo else "",
f"Space: {spaceInfo.get('name', '')}" if spaceInfo else "",
f"Assignees: {assignees}" if assignees else "",
f"Tags: {tags}" if tags else "",
f"Due: {dueIso}" if dueIso else "",
f"Url: {task.get('url', '')}" if task.get("url") else "",
]
header = "\n".join(line for line in headerLines if line)
parts: List[Dict[str, Any]] = [{
"contentObjectId": "header",
"contentType": "text",
"data": header,
"contextRef": {"part": "header"},
}]
scope = getattr(limits, "clickupScope", "title_description")
if scope in ("title_description", "with_comments"):
description = _truncate(task.get("description"), limits.maxDescriptionChars)
if description:
parts.append({
"contentObjectId": "description",
"contentType": "text",
"data": description,
"contextRef": {"part": "description"},
})
# text_content is ClickUp's rendered-markdown version; include if it adds
# something beyond the plain description (common for bullet lists, checklists).
textContent = _truncate(task.get("text_content"), limits.maxDescriptionChars)
if textContent and textContent != description:
parts.append({
"contentObjectId": "text_content",
"contentType": "text",
"data": textContent,
"contextRef": {"part": "text_content"},
})
return parts
async def bootstrapClickup(
    connectionId: str,
    *,
    progressCb: Optional[Callable[[int, Optional[str]], None]] = None,
    adapter: Any = None,
    connection: Any = None,
    knowledgeService: Any = None,
    limits: Optional[ClickupBootstrapLimits] = None,
) -> Dict[str, Any]:
    """Walk workspaces → lists → tasks and ingest each task as a virtual doc.

    Args:
        connectionId: Id of the user connection to bootstrap.
        progressCb: Optional callback, handed down to the per-task ingestion
            step, which calls it with ``(percent, message)``.
        adapter: Optional pre-built adapter; its ``_svc`` attribute is used
            as the ClickUp client.
        connection: Optional connection object providing ``mandateId`` and
            ``userId``.
        knowledgeService: Optional service exposing ``requestIngestion``.
        limits: Optional limits; when omitted they are derived from the
            stored connection preferences.

    Returns:
        The summary dict produced by ``_finalizeResult``.

    Note:
        When any of ``adapter``, ``connection`` or ``knowledgeService`` is
        missing, all three are re-resolved via ``_resolveDependencies``;
        exceptions raised there propagate to the caller.
    """
    from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs

    prefs = loadConnectionPrefs(connectionId)
    if not limits:
        limits = ClickupBootstrapLimits(
            maxAgeDays=prefs.maxAgeDays if prefs.maxAgeDays > 0 else None,
            neutralize=prefs.neutralizeBeforeEmbed,
            clickupScope=prefs.clickupScope,
        )
    startMs = time.time()
    result = ClickupBootstrapResult(connectionId=connectionId)
    logger.info(
        "ingestion.connection.bootstrap.started part=clickup connectionId=%s",
        connectionId,
        extra={
            "event": "ingestion.connection.bootstrap.started",
            "part": "clickup",
            "connectionId": connectionId,
        },
    )
    if adapter is None or knowledgeService is None or connection is None:
        adapter, connection, knowledgeService = await _resolveDependencies(connectionId)
    mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else ""
    userId = str(getattr(connection, "userId", "") or "") if connection is not None else ""
    svc = getattr(adapter, "_svc", None)
    if svc is None:
        result.errors.append("adapter missing _svc instance")
        return _finalizeResult(connectionId, result, startMs)
    try:
        teamsResp = await svc.getAuthorizedTeams()
    except Exception as exc:
        logger.error("clickup team discovery failed for %s: %s", connectionId, exc, exc_info=True)
        result.errors.append(f"teams: {exc}")
        return _finalizeResult(connectionId, result, startMs)
    # The ClickUp service reports API failures as {"error": ...} payloads
    # (the task-list walker checks for the same shape). Record them instead
    # of treating the reply as "no workspaces".
    if isinstance(teamsResp, dict) and teamsResp.get("error"):
        logger.warning(
            "clickup team discovery error for %s: %s", connectionId, teamsResp.get("error")
        )
        result.errors.append(f"teams: {teamsResp.get('error')}")
        return _finalizeResult(connectionId, result, startMs)
    # Anything that is not a dict cannot carry a "teams" list.
    teams = (teamsResp.get("teams") or []) if isinstance(teamsResp, dict) else []
    for team in teams[: limits.maxWorkspaces]:
        if result.indexed + result.skippedDuplicate >= limits.maxTasks:
            break
        teamId = str(team.get("id", "") or "")
        if not teamId:
            continue
        result.workspaces += 1
        try:
            await _walkTeam(
                svc=svc,
                knowledgeService=knowledgeService,
                connectionId=connectionId,
                mandateId=mandateId,
                userId=userId,
                team=team,
                limits=limits,
                result=result,
                progressCb=progressCb,
            )
        except Exception as exc:
            # One broken workspace must not abort the remaining ones.
            logger.error("clickup team %s walk failed: %s", teamId, exc, exc_info=True)
            result.errors.append(f"team({teamId}): {exc}")
    return _finalizeResult(connectionId, result, startMs)
async def _resolveDependencies(connectionId: str):
    """Build the ClickUp adapter, connection and knowledge service for a connection.

    Returns:
        Tuple ``(adapter, connection, knowledgeService)``.

    Raises:
        ValueError: If the connection does not exist or no access token is
            available for it.
    """
    from modules.interfaces.interfaceDbApp import getRootInterface
    from modules.auth import TokenManager
    from modules.connectors.providerClickup.connectorClickup import ClickupConnector
    from modules.serviceCenter import getService
    from modules.serviceCenter.context import ServiceCenterContext
    from modules.security.rootAccess import getRootUser

    userConnection = getRootInterface().getUserConnectionById(connectionId)
    if userConnection is None:
        raise ValueError(f"UserConnection not found: {connectionId}")

    freshToken = TokenManager().getFreshToken(connectionId)
    if not (freshToken and freshToken.tokenAccess):
        raise ValueError(f"No valid token for connection {connectionId}")

    clickupAdapter = ClickupConnector(userConnection, freshToken.tokenAccess).getServiceAdapter("clickup")

    # The knowledge service is obtained under the root user, bound to the
    # mandate of the connection.
    serviceContext = ServiceCenterContext(
        user=getRootUser(),
        mandate_id=str(getattr(userConnection, "mandateId", "") or ""),
    )
    return clickupAdapter, userConnection, getService("knowledge", serviceContext)
async def _walkTeam(
*,
svc,
knowledgeService,
connectionId: str,
mandateId: str,
userId: str,
team: Dict[str, Any],
limits: ClickupBootstrapLimits,
result: ClickupBootstrapResult,
progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
teamId = str(team.get("id", "") or "")
spacesResp = await svc.getSpaces(teamId)
spaces = (spacesResp or {}).get("spaces") or []
listsCollected: List[Dict[str, Any]] = []
for space in spaces:
if len(listsCollected) >= limits.maxListsPerWorkspace:
break
spaceId = str(space.get("id", "") or "")
if not spaceId:
continue
# Folderless lists directly under the space
folderless = await svc.getFolderlessLists(spaceId)
for lst in (folderless or {}).get("lists") or []:
if len(listsCollected) >= limits.maxListsPerWorkspace:
break
listsCollected.append({**lst, "_space": space})
# Lists inside folders
foldersResp = await svc.getFolders(spaceId)
for folder in (foldersResp or {}).get("folders") or []:
if len(listsCollected) >= limits.maxListsPerWorkspace:
break
folderId = str(folder.get("id", "") or "")
if not folderId:
continue
folderLists = await svc.getListsInFolder(folderId)
for lst in (folderLists or {}).get("lists") or []:
if len(listsCollected) >= limits.maxListsPerWorkspace:
break
listsCollected.append({**lst, "_space": space, "_folder": folder})
for lst in listsCollected:
if result.indexed + result.skippedDuplicate >= limits.maxTasks:
return
result.lists += 1
await _walkList(
svc=svc,
knowledgeService=knowledgeService,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
teamId=teamId,
lst=lst,
limits=limits,
result=result,
progressCb=progressCb,
)
async def _walkList(
*,
svc,
knowledgeService,
connectionId: str,
mandateId: str,
userId: str,
teamId: str,
lst: Dict[str, Any],
limits: ClickupBootstrapLimits,
result: ClickupBootstrapResult,
progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
listId = str(lst.get("id", "") or "")
if not listId:
return
page = 0
while result.indexed + result.skippedDuplicate < limits.maxTasks:
resp = await svc.getTasksInList(
listId,
page=page,
include_closed=limits.includeClosed,
subtasks=True,
)
if isinstance(resp, dict) and resp.get("error"):
logger.warning("clickup tasks list=%s page=%d error: %s", listId, page, resp.get("error"))
result.errors.append(f"list({listId}): {resp.get('error')}")
return
tasks = (resp or {}).get("tasks") or []
if not tasks:
return
for task in tasks:
if result.indexed + result.skippedDuplicate >= limits.maxTasks:
return
if not _isRecent(task.get("date_updated"), limits.maxAgeDays):
result.skippedPolicy += 1
continue
# Inject the list/folder/space metadata we already loaded.
task["list"] = task.get("list") or {"id": listId, "name": lst.get("name")}
task["folder"] = task.get("folder") or lst.get("_folder") or {}
task["space"] = task.get("space") or lst.get("_space") or {}
await _ingestTask(
knowledgeService=knowledgeService,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
teamId=teamId,
task=task,
limits=limits,
result=result,
progressCb=progressCb,
)
if len(tasks) < 100: # ClickUp page-size hint: fewer than 100 => last page
return
page += 1
async def _ingestTask(
    *,
    knowledgeService,
    connectionId: str,
    mandateId: str,
    userId: str,
    teamId: str,
    task: Dict[str, Any],
    limits: ClickupBootstrapLimits,
    result: ClickupBootstrapResult,
    progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
    """Submit one ClickUp task as a ``clickup_task`` ingestion job.

    Counters on ``result`` are updated from the returned handle's status:
    ``"duplicate"`` -> ``skippedDuplicate``, ``"indexed"`` -> ``indexed``,
    anything else -> ``failed``. Tasks without an id count as
    ``skippedPolicy``. Exceptions from ``requestIngestion`` are logged,
    recorded in ``result.errors`` and counted as ``failed``.

    Progress is reported only right after the processed counter
    (``indexed + skippedDuplicate``) reaches a multiple of 50, so items that
    did not advance the counter never trigger a repeated (or ``processed=0``)
    tick.
    """
    from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob

    taskId = str(task.get("id", "") or "")
    if not taskId:
        result.skippedPolicy += 1
        return
    # date_updated (falling back to date_created) is passed as the content version.
    revision = str(task.get("date_updated") or task.get("date_created") or "")
    name = task.get("name") or f"Task {taskId}"
    syntheticId = _syntheticTaskId(connectionId, taskId)
    fileName = f"{name[:80].strip() or taskId}.task.json"
    contentObjects = _buildContentObjects(task, limits)
    try:
        handle = await knowledgeService.requestIngestion(
            IngestionJob(
                sourceKind="clickup_task",
                sourceId=syntheticId,
                fileName=fileName,
                mimeType="application/vnd.clickup.task+json",
                userId=userId,
                mandateId=mandateId,
                contentObjects=contentObjects,
                contentVersion=revision or None,
                neutralize=limits.neutralize,
                provenance={
                    "connectionId": connectionId,
                    "authority": "clickup",
                    "service": "clickup",
                    "externalItemId": taskId,
                    "teamId": teamId,
                    "listId": ((task.get("list") or {}).get("id")),
                    "spaceId": ((task.get("space") or {}).get("id")),
                    "url": task.get("url"),
                    "status": ((task.get("status") or {}).get("status")),
                    "tier": limits.clickupScope,
                },
            )
        )
    except Exception as exc:
        logger.error("clickup ingestion %s failed: %s", taskId, exc, exc_info=True)
        result.failed += 1
        result.errors.append(f"ingest({taskId}): {exc}")
        return
    if handle.status == "duplicate":
        result.skippedDuplicate += 1
    elif handle.status == "indexed":
        result.indexed += 1
    else:
        result.failed += 1
        # The processed counter did not move, so there is nothing new to report.
        return
    processed = result.indexed + result.skippedDuplicate
    if progressCb is None or processed % 50 != 0:
        return
    try:
        progressCb(
            min(90, 10 + int(80 * processed / max(1, limits.maxTasks))),
            f"clickup processed={processed}",
        )
    except Exception as exc:
        # Progress reporting is best-effort and must never abort ingestion.
        logger.debug("clickup progress callback failed: %s", exc)
    logger.info(
        "ingestion.connection.bootstrap.progress part=clickup processed=%d skippedDup=%d failed=%d",
        processed, result.skippedDuplicate, result.failed,
        extra={
            "event": "ingestion.connection.bootstrap.progress",
            "part": "clickup",
            "connectionId": connectionId,
            "processed": processed,
            "skippedDup": result.skippedDuplicate,
            "failed": result.failed,
        },
    )
def _finalizeResult(connectionId: str, result: ClickupBootstrapResult, startMs: float) -> Dict[str, Any]:
durationMs = int((time.time() - startMs) * 1000)
logger.info(
"ingestion.connection.bootstrap.done part=clickup connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d failed=%d workspaces=%d lists=%d durationMs=%d",
connectionId,
result.indexed, result.skippedDuplicate, result.skippedPolicy,
result.failed, result.workspaces, result.lists, durationMs,
extra={
"event": "ingestion.connection.bootstrap.done",
"part": "clickup",
"connectionId": connectionId,
"indexed": result.indexed,
"skippedDup": result.skippedDuplicate,
"skippedPolicy": result.skippedPolicy,
"failed": result.failed,
"workspaces": result.workspaces,
"lists": result.lists,
"durationMs": durationMs,
},
)
return {
"connectionId": result.connectionId,
"indexed": result.indexed,
"skippedDuplicate": result.skippedDuplicate,
"skippedPolicy": result.skippedPolicy,
"failed": result.failed,
"workspaces": result.workspaces,
"lists": result.lists,
"durationMs": durationMs,
"errors": result.errors[:20],
}

View file

@ -0,0 +1,443 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Google Drive bootstrap for the unified knowledge ingestion lane.
Mirrors the SharePoint pilot (see subConnectorSyncSharepoint.py). Walks the
user's *My Drive* tree from the virtual `root` folder, downloads each
file-like item via `DriveAdapter.download` (which handles native Google docs
via export), runs the standard extraction pipeline and routes results through
`KnowledgeService.requestIngestion` with `sourceKind="gdrive_item"` and
`contentVersion = modifiedTime` (monotonic per-revision).
"""
from __future__ import annotations
import hashlib
import logging
import time
from dataclasses import dataclass, field
from datetime import datetime, timedelta, timezone
from typing import Any, Callable, Dict, List, Optional
from modules.datamodels.datamodelExtraction import ExtractionOptions
logger = logging.getLogger(__name__)
# Defaults for the matching GdriveBootstrapLimits fields.
MAX_ITEMS_DEFAULT = 500  # cap on items counted as indexed or duplicate per run
MAX_BYTES_DEFAULT = 200 * 1024 * 1024  # 200 MiB of downloaded data per run
MAX_FILE_SIZE_DEFAULT = 25 * 1024 * 1024  # 25 MiB per single file
SKIP_MIME_PREFIXES_DEFAULT = ("video/", "audio/")  # mime-type prefixes skipped by policy
MAX_DEPTH_DEFAULT = 4  # folders nested deeper than this are not browsed
MAX_AGE_DAYS_DEFAULT = 365  # files modified longer ago than this are skipped
# Google Drive uses virtual mime-types for folders and non-downloadable assets.
FOLDER_MIME = "application/vnd.google-apps.folder"
@dataclass
class GdriveBootstrapLimits:
    """Caps and switches applied during one Google Drive bootstrap run."""

    # Stop once indexed + duplicate items reach this count.
    maxItems: int = MAX_ITEMS_DEFAULT
    # Stop once this many downloaded bytes have been accounted for.
    maxBytes: int = MAX_BYTES_DEFAULT
    # Files larger than this (by listed size or by downloaded size) are skipped.
    maxFileSize: int = MAX_FILE_SIZE_DEFAULT
    # Items whose mime type starts with one of these prefixes are skipped.
    skipMimePrefixes: tuple = SKIP_MIME_PREFIXES_DEFAULT
    # Folders nested deeper than this (start folder = depth 0) are not browsed.
    maxDepth: int = MAX_DEPTH_DEFAULT
    # Only ingest files modified within the last N days. None disables filter.
    maxAgeDays: Optional[int] = MAX_AGE_DAYS_DEFAULT
    # Pass-through to IngestionJob.neutralize
    neutralize: bool = False
    # Filled from the connection preference of the same name; the name suggests
    # True means binary/non-text files are indexed.
    # NOTE(review): no code visible in this module reads this flag — confirm
    # where (or whether) it is enforced.
    filesIndexBinaries: bool = True
@dataclass
class GdriveBootstrapResult:
    """Mutable counters collected while walking a Google Drive connection."""

    # Connection the counters belong to.
    connectionId: str
    # Items whose ingestion handle reported status "indexed".
    indexed: int = 0
    # Items whose ingestion handle reported status "duplicate".
    skippedDuplicate: int = 0
    # Items skipped by a limit/filter or because extraction yielded no content.
    skippedPolicy: int = 0
    # Items that failed to download, extract or ingest.
    failed: int = 0
    # Sum of the sizes of all downloaded files that passed the size check.
    bytesProcessed: int = 0
    # Human-readable error strings; the final summary reports the first 20.
    errors: List[str] = field(default_factory=list)
def _syntheticFileId(connectionId: str, externalItemId: str) -> str:
token = hashlib.sha256(f"{connectionId}:{externalItemId}".encode("utf-8")).hexdigest()[:16]
return f"gd:{connectionId[:8]}:{token}"
def _toContentObjects(extracted, fileName: str) -> List[Dict[str, Any]]:
parts = getattr(extracted, "parts", None) or []
out: List[Dict[str, Any]] = []
for part in parts:
data = getattr(part, "data", None) or ""
if not data or not str(data).strip():
continue
typeGroup = getattr(part, "typeGroup", "text") or "text"
contentType = "text"
if typeGroup == "image":
contentType = "image"
elif typeGroup in ("binary", "container"):
contentType = "other"
out.append({
"contentObjectId": getattr(part, "id", ""),
"contentType": contentType,
"data": data,
"contextRef": {
"containerPath": fileName,
"location": getattr(part, "label", None) or "file",
**(getattr(part, "metadata", None) or {}),
},
})
return out
def _isRecent(modifiedIso: Optional[str], maxAgeDays: Optional[int]) -> bool:
if not maxAgeDays:
return True
if not modifiedIso:
# No timestamp -> be permissive (Drive native docs sometimes omit it on export).
return True
try:
# Google returns RFC 3339 with `Z` or offset; python 3.11+ parses both.
ts = datetime.fromisoformat(modifiedIso.replace("Z", "+00:00"))
except Exception:
return True
cutoff = datetime.now(timezone.utc) - timedelta(days=maxAgeDays)
if ts.tzinfo is None:
ts = ts.replace(tzinfo=timezone.utc)
return ts >= cutoff
async def bootstrapGdrive(
    connectionId: str,
    *,
    progressCb: Optional[Callable[[int, Optional[str]], None]] = None,
    adapter: Any = None,
    connection: Any = None,
    knowledgeService: Any = None,
    limits: Optional[GdriveBootstrapLimits] = None,
    runExtractionFn: Optional[Callable[..., Any]] = None,
) -> Dict[str, Any]:
    """Walk the drive from the folder path ``"/"`` and ingest every eligible file.

    Missing collaborators are resolved lazily: limits come from the stored
    connection preferences, adapter / connection / knowledge service from
    ``_resolveDependencies`` (all three are re-resolved if any one is
    missing), and the extraction function defaults to the standard
    extraction pipeline. Errors raised by the walk are logged and recorded;
    the summary dict from ``_finalizeResult`` is returned either way.
    """
    from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs

    storedPrefs = loadConnectionPrefs(connectionId)
    if not limits:
        ageLimit = storedPrefs.maxAgeDays if storedPrefs.maxAgeDays > 0 else None
        limits = GdriveBootstrapLimits(
            maxAgeDays=ageLimit,
            neutralize=storedPrefs.neutralizeBeforeEmbed,
            filesIndexBinaries=storedPrefs.filesIndexBinaries,
        )

    startedAt = time.time()
    outcome = GdriveBootstrapResult(connectionId=connectionId)
    logger.info(
        "ingestion.connection.bootstrap.started part=gdrive connectionId=%s",
        connectionId,
        extra={
            "event": "ingestion.connection.bootstrap.started",
            "part": "gdrive",
            "connectionId": connectionId,
        },
    )

    if any(dep is None for dep in (adapter, knowledgeService, connection)):
        adapter, connection, knowledgeService = await _resolveDependencies(connectionId)

    if runExtractionFn is None:
        from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
        from modules.serviceCenter.services.serviceExtraction.subRegistry import (
            ExtractorRegistry, ChunkerRegistry,
        )

        extractors = ExtractorRegistry()
        chunkers = ChunkerRegistry()

        def _defaultExtraction(bytesData, name, mime, options):
            # One registry pair is shared by all files of this run.
            return runExtraction(extractors, chunkers, bytesData, name, mime, options)

        runExtractionFn = _defaultExtraction

    # getattr with a default also covers a connection that is None.
    mandateId = str(getattr(connection, "mandateId", "") or "")
    userId = str(getattr(connection, "userId", "") or "")

    try:
        await _walkFolder(
            adapter=adapter,
            knowledgeService=knowledgeService,
            runExtractionFn=runExtractionFn,
            connectionId=connectionId,
            mandateId=mandateId,
            userId=userId,
            folderPath="/",  # DriveAdapter.browse maps "" / "/" -> "root"
            depth=0,
            limits=limits,
            result=outcome,
            progressCb=progressCb,
        )
    except Exception as exc:
        logger.error("gdrive walk failed for %s: %s", connectionId, exc, exc_info=True)
        outcome.errors.append(f"walk: {exc}")
    return _finalizeResult(connectionId, outcome, startedAt)
async def _resolveDependencies(connectionId: str):
    """Build the Drive adapter, connection and knowledge service for a connection.

    Returns:
        Tuple ``(adapter, connection, knowledgeService)``.

    Raises:
        ValueError: If the connection does not exist or no access token is
            available for it.
    """
    from modules.interfaces.interfaceDbApp import getRootInterface
    from modules.auth import TokenManager
    from modules.connectors.providerGoogle.connectorGoogle import GoogleConnector
    from modules.serviceCenter import getService
    from modules.serviceCenter.context import ServiceCenterContext
    from modules.security.rootAccess import getRootUser

    userConnection = getRootInterface().getUserConnectionById(connectionId)
    if userConnection is None:
        raise ValueError(f"UserConnection not found: {connectionId}")

    freshToken = TokenManager().getFreshToken(connectionId)
    if not (freshToken and freshToken.tokenAccess):
        raise ValueError(f"No valid token for connection {connectionId}")

    driveAdapter = GoogleConnector(userConnection, freshToken.tokenAccess).getServiceAdapter("drive")

    # The knowledge service is obtained under the root user, bound to the
    # mandate of the connection.
    serviceContext = ServiceCenterContext(
        user=getRootUser(),
        mandate_id=str(getattr(userConnection, "mandateId", "") or ""),
    )
    return driveAdapter, userConnection, getService("knowledge", serviceContext)
async def _walkFolder(
*,
adapter,
knowledgeService,
runExtractionFn,
connectionId: str,
mandateId: str,
userId: str,
folderPath: str,
depth: int,
limits: GdriveBootstrapLimits,
result: GdriveBootstrapResult,
progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
if depth > limits.maxDepth:
return
try:
entries = await adapter.browse(folderPath)
except Exception as exc:
logger.warning("gdrive browse %s failed: %s", folderPath, exc)
result.errors.append(f"browse({folderPath}): {exc}")
return
for entry in entries:
if result.indexed + result.skippedDuplicate >= limits.maxItems:
return
if result.bytesProcessed >= limits.maxBytes:
return
entryPath = getattr(entry, "path", "") or ""
metadata = getattr(entry, "metadata", {}) or {}
mimeType = getattr(entry, "mimeType", None) or metadata.get("mimeType")
if getattr(entry, "isFolder", False) or mimeType == FOLDER_MIME:
await _walkFolder(
adapter=adapter,
knowledgeService=knowledgeService,
runExtractionFn=runExtractionFn,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
folderPath=entryPath,
depth=depth + 1,
limits=limits,
result=result,
progressCb=progressCb,
)
continue
effectiveMime = mimeType or "application/octet-stream"
if any(effectiveMime.startswith(prefix) for prefix in limits.skipMimePrefixes):
result.skippedPolicy += 1
continue
size = int(getattr(entry, "size", 0) or 0)
if size and size > limits.maxFileSize:
result.skippedPolicy += 1
continue
modifiedTime = metadata.get("modifiedTime")
if not _isRecent(modifiedTime, limits.maxAgeDays):
result.skippedPolicy += 1
continue
externalItemId = metadata.get("id") or entryPath
revision = modifiedTime
await _ingestOne(
adapter=adapter,
knowledgeService=knowledgeService,
runExtractionFn=runExtractionFn,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
entry=entry,
entryPath=entryPath,
mimeType=effectiveMime,
externalItemId=externalItemId,
revision=revision,
limits=limits,
result=result,
progressCb=progressCb,
)
async def _ingestOne(
    *,
    adapter,
    knowledgeService,
    runExtractionFn,
    connectionId: str,
    mandateId: str,
    userId: str,
    entry,
    entryPath: str,
    mimeType: str,
    externalItemId: str,
    revision: Optional[str],
    limits: GdriveBootstrapLimits,
    result: GdriveBootstrapResult,
    progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
    """Download, extract and ingest a single Drive item.

    Steps: download via ``adapter.download`` -> size check -> extraction ->
    conversion to content objects -> ``knowledgeService.requestIngestion``
    with ``sourceKind="gdrive_item"``. Every failure is counted on
    ``result`` and described in ``result.errors``; nothing is raised to the
    caller.

    Progress is reported only right after the processed counter
    (``indexed + skippedDuplicate``) reaches a multiple of 50, so items that
    did not advance the counter never trigger a repeated (or ``processed=0``)
    tick.
    """
    from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob

    syntheticFileId = _syntheticFileId(connectionId, externalItemId)
    fileName = getattr(entry, "name", "") or externalItemId
    try:
        downloaded = await adapter.download(entryPath)
    except Exception as exc:
        logger.warning("gdrive download %s failed: %s", entryPath, exc)
        result.failed += 1
        result.errors.append(f"download({entryPath}): {exc}")
        return
    # Adapter.download returns raw bytes today; guard DownloadResult shape too.
    fileBytes: bytes
    if isinstance(downloaded, (bytes, bytearray)):
        fileBytes = bytes(downloaded)
    else:
        fileBytes = bytes(getattr(downloaded, "data", b"") or b"")
        if getattr(downloaded, "mimeType", None):
            mimeType = downloaded.mimeType  # export may have changed the type
    if not fileBytes:
        # Keep the failure traceable like every other failure path.
        result.failed += 1
        result.errors.append(f"download({entryPath}): empty content")
        return
    # The listed size may be missing, so re-check against the real payload.
    if len(fileBytes) > limits.maxFileSize:
        result.skippedPolicy += 1
        return
    result.bytesProcessed += len(fileBytes)
    try:
        extracted = runExtractionFn(
            fileBytes, fileName, mimeType,
            ExtractionOptions(mergeStrategy=None),
        )
    except Exception as exc:
        logger.warning("gdrive extraction %s failed: %s", entryPath, exc)
        result.failed += 1
        result.errors.append(f"extract({entryPath}): {exc}")
        return
    contentObjects = _toContentObjects(extracted, fileName)
    if not contentObjects:
        # Nothing usable was extracted.
        result.skippedPolicy += 1
        return
    try:
        handle = await knowledgeService.requestIngestion(
            IngestionJob(
                sourceKind="gdrive_item",
                sourceId=syntheticFileId,
                fileName=fileName,
                mimeType=mimeType,
                userId=userId,
                mandateId=mandateId,
                contentObjects=contentObjects,
                contentVersion=revision,
                neutralize=limits.neutralize,
                provenance={
                    "connectionId": connectionId,
                    "authority": "google",
                    "service": "drive",
                    "externalItemId": externalItemId,
                    "entryPath": entryPath,
                    "tier": "body",
                },
            )
        )
    except Exception as exc:
        logger.error("gdrive ingestion %s failed: %s", entryPath, exc, exc_info=True)
        result.failed += 1
        result.errors.append(f"ingest({entryPath}): {exc}")
        return
    if handle.status == "duplicate":
        result.skippedDuplicate += 1
    elif handle.status == "indexed":
        result.indexed += 1
    else:
        result.failed += 1
        # The processed counter did not move, so there is nothing new to report.
        return
    processed = result.indexed + result.skippedDuplicate
    if progressCb is None or processed % 50 != 0:
        return
    try:
        progressCb(
            min(90, 10 + int(80 * processed / max(1, limits.maxItems))),
            f"gdrive processed={processed}",
        )
    except Exception as exc:
        # Progress reporting is best-effort and must never abort ingestion.
        logger.debug("gdrive progress callback failed: %s", exc)
    logger.info(
        "ingestion.connection.bootstrap.progress part=gdrive processed=%d skippedDup=%d failed=%d",
        processed, result.skippedDuplicate, result.failed,
        extra={
            "event": "ingestion.connection.bootstrap.progress",
            "part": "gdrive",
            "connectionId": connectionId,
            "processed": processed,
            "skippedDup": result.skippedDuplicate,
            "failed": result.failed,
        },
    )
def _finalizeResult(connectionId: str, result: GdriveBootstrapResult, startMs: float) -> Dict[str, Any]:
durationMs = int((time.time() - startMs) * 1000)
logger.info(
"ingestion.connection.bootstrap.done part=gdrive connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d failed=%d bytes=%d durationMs=%d",
connectionId,
result.indexed, result.skippedDuplicate, result.skippedPolicy,
result.failed, result.bytesProcessed, durationMs,
extra={
"event": "ingestion.connection.bootstrap.done",
"part": "gdrive",
"connectionId": connectionId,
"indexed": result.indexed,
"skippedDup": result.skippedDuplicate,
"skippedPolicy": result.skippedPolicy,
"failed": result.failed,
"bytes": result.bytesProcessed,
"durationMs": durationMs,
},
)
return {
"connectionId": result.connectionId,
"indexed": result.indexed,
"skippedDuplicate": result.skippedDuplicate,
"skippedPolicy": result.skippedPolicy,
"failed": result.failed,
"bytesProcessed": result.bytesProcessed,
"durationMs": durationMs,
"errors": result.errors[:20],
}

View file

@ -0,0 +1,606 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Gmail bootstrap for the unified knowledge ingestion lane.
Mirrors the Outlook pilot (see subConnectorSyncOutlook.py) but talks to Google
Mail's REST API. Messages become `sourceKind="gmail_message"` virtual documents
with header / snippet / cleaned body content-objects; attachments are optional
child jobs with `sourceKind="gmail_attachment"`.
Idempotency: Gmail's stable `historyId` (or `internalDate` as fallback) is
passed as `contentVersion`, so rerunning the bootstrap yields
`ingestion.skipped.duplicate` for unchanged messages.
"""
from __future__ import annotations
import asyncio
import base64
import hashlib
import logging
import time
from dataclasses import dataclass, field
from datetime import datetime, timedelta, timezone
from typing import Any, Callable, Dict, List, Optional
from modules.serviceCenter.services.serviceKnowledge.subTextClean import cleanEmailBody
logger = logging.getLogger(__name__)
# Defaults for the matching GmailBootstrapLimits fields.
MAX_MESSAGES_DEFAULT = 500  # cap on messages counted as indexed or duplicate per run
MAX_BODY_CHARS_DEFAULT = 8000  # maxChars handed to cleanEmailBody for the message body
MAX_ATTACHMENT_BYTES_DEFAULT = 10 * 1024 * 1024  # 10 MiB
DEFAULT_LABELS = ("INBOX", "SENT")  # label ids enumerated, in this order
@dataclass
class GmailBootstrapLimits:
    """Caps and switches applied during one Gmail bootstrap run."""

    # Stop once indexed + duplicate messages reach this count (across all labels).
    maxMessages: int = MAX_MESSAGES_DEFAULT
    # Gmail label ids to enumerate, in order.
    labels: tuple = DEFAULT_LABELS
    # Passed to cleanEmailBody as maxChars when building the body content object.
    maxBodyChars: int = MAX_BODY_CHARS_DEFAULT
    # When True, attachments are ingested after their parent message.
    includeAttachments: bool = False
    # NOTE(review): presumably the per-attachment size cap; its use is outside
    # the code visible here — confirm.
    maxAttachmentBytes: int = MAX_ATTACHMENT_BYTES_DEFAULT
    # Only fetch messages newer than N days. None disables filter.
    maxAgeDays: Optional[int] = 90
    # Content depth: "metadata" | "snippet" | "full"
    mailContentDepth: str = "full"
    # Pass-through to IngestionJob.neutralize
    neutralize: bool = False
@dataclass
class GmailBootstrapResult:
    """Mutable counters collected while ingesting a Gmail connection."""

    # Connection the counters belong to.
    connectionId: str
    # Messages whose ingestion handle reported status "indexed".
    indexed: int = 0
    # Messages whose ingestion handle reported status "duplicate".
    skippedDuplicate: int = 0
    # Messages skipped by policy (e.g. a message payload without an id).
    skippedPolicy: int = 0
    # Messages whose detail fetch or ingestion failed.
    failed: int = 0
    # NOTE(review): presumably incremented by the attachment ingestion step,
    # which is outside the code visible here — confirm.
    attachmentsIndexed: int = 0
    # Human-readable error strings collected during the run.
    errors: List[str] = field(default_factory=list)
def _syntheticMessageId(connectionId: str, messageId: str) -> str:
token = hashlib.sha256(f"{connectionId}:{messageId}".encode("utf-8")).hexdigest()[:16]
return f"gm:{connectionId[:8]}:{token}"
def _syntheticAttachmentId(connectionId: str, messageId: str, attachmentId: str) -> str:
token = hashlib.sha256(
f"{connectionId}:{messageId}:{attachmentId}".encode("utf-8")
).hexdigest()[:16]
return f"ga:{connectionId[:8]}:{token}"
def _decodeBase64Url(data: str) -> bytes:
if not data:
return b""
# Gmail uses URL-safe base64 without padding.
padding = 4 - (len(data) % 4)
if padding != 4:
data = data + ("=" * padding)
try:
return base64.urlsafe_b64decode(data)
except Exception:
return b""
def _walkPayloadForBody(payload: Dict[str, Any]) -> Dict[str, str]:
"""Return {"text": ..., "html": ...} by walking MIME parts.
Gmail `payload` is a tree of parts. We prefer `text/plain` for the cleaned
body, but capture `text/html` as a fallback so `cleanEmailBody` can strip
markup if plain is missing.
"""
found: Dict[str, str] = {"text": "", "html": ""}
def _walk(part: Dict[str, Any]) -> None:
mime = (part.get("mimeType") or "").lower()
body = part.get("body") or {}
raw = body.get("data") or ""
if raw and mime.startswith("text/"):
decoded = _decodeBase64Url(raw).decode("utf-8", errors="replace")
key = "text" if mime == "text/plain" else ("html" if mime == "text/html" else "")
if key and not found[key]:
found[key] = decoded
for sub in part.get("parts") or []:
_walk(sub)
_walk(payload or {})
return found
def _headerMap(payload: Dict[str, Any]) -> Dict[str, str]:
return {
(h.get("name") or "").lower(): (h.get("value") or "")
for h in (payload.get("headers") or [])
}
def _buildContentObjects(
    message: Dict[str, Any],
    maxBodyChars: int,
    mailContentDepth: str = "full",
) -> List[Dict[str, Any]]:
    """Build the content objects for one Gmail message.

    `mailContentDepth` selects how much of the message is included:
    - "metadata": header block only (subject, from, to, optional cc, date)
    - "snippet": header block plus the message's `snippet` field
    - "full": header block, snippet and the cleaned body (default)

    The body prefers the `text/plain` part and falls back to `text/html`;
    it is passed through `cleanEmailBody` with `maxChars=maxBodyChars`.
    Empty snippets and empty cleaned bodies are omitted.
    """
    payload = message.get("payload") or {}
    headerValues = _headerMap(payload)

    def _textObject(objectId: str, text: str) -> Dict[str, Any]:
        # Every object produced here is plain text tagged with its part name.
        return {
            "contentObjectId": objectId,
            "contentType": "text",
            "data": text,
            "contextRef": {"part": objectId},
        }

    headerLines = [
        f"Subject: {headerValues.get('subject') or '(no subject)'}",
        f"From: {headerValues.get('from') or ''}",
        f"To: {headerValues.get('to') or ''}",
    ]
    ccValue = headerValues.get("cc") or ""
    if ccValue:
        headerLines.append(f"Cc: {ccValue}")
    headerLines.append(f"Date: {headerValues.get('date') or ''}")
    objects: List[Dict[str, Any]] = [_textObject("header", "\n".join(headerLines))]

    snippetText = message.get("snippet") or ""
    if snippetText and mailContentDepth in ("snippet", "full"):
        objects.append(_textObject("snippet", snippetText))

    if mailContentDepth != "full":
        return objects

    bodies = _walkPayloadForBody(payload)
    rawBody = bodies["text"] or bodies["html"]
    if rawBody:
        cleanedBody = cleanEmailBody(rawBody, maxChars=maxBodyChars)
        if cleanedBody:
            objects.append(_textObject("body", cleanedBody))
    return objects
async def bootstrapGmail(
    connectionId: str,
    *,
    progressCb: Optional[Callable[[int, Optional[str]], None]] = None,
    adapter: Any = None,
    connection: Any = None,
    knowledgeService: Any = None,
    limits: Optional[GmailBootstrapLimits] = None,
    googleGetFn: Optional[Callable[..., Any]] = None,
) -> Dict[str, Any]:
    """Enumerate the configured Gmail labels and ingest their messages.

    Missing collaborators are resolved lazily: limits come from the stored
    connection preferences, adapter / connection / knowledge service from
    ``_resolveDependencies`` (all three are re-resolved if any one is
    missing), and ``googleGetFn`` defaults to a GET helper bound to the
    adapter's ``_token``. A failing label is logged and recorded; the
    remaining labels are still processed. Returns the summary dict from
    ``_finalizeResult``.
    """
    from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs

    storedPrefs = loadConnectionPrefs(connectionId)
    if not limits:
        limits = GmailBootstrapLimits(
            includeAttachments=storedPrefs.mailIndexAttachments,
            maxAgeDays=storedPrefs.maxAgeDays if storedPrefs.maxAgeDays > 0 else None,
            mailContentDepth=storedPrefs.mailContentDepth,
            neutralize=storedPrefs.neutralizeBeforeEmbed,
        )

    startedAt = time.time()
    outcome = GmailBootstrapResult(connectionId=connectionId)
    logger.info(
        "ingestion.connection.bootstrap.started part=gmail connectionId=%s",
        connectionId,
        extra={
            "event": "ingestion.connection.bootstrap.started",
            "part": "gmail",
            "connectionId": connectionId,
        },
    )

    if any(dep is None for dep in (adapter, knowledgeService, connection)):
        adapter, connection, knowledgeService = await _resolveDependencies(connectionId)

    if googleGetFn is None:
        from modules.connectors.providerGoogle.connectorGoogle import _googleGet as _defaultGet

        accessToken = getattr(adapter, "_token", "")

        async def _defaultFetch(url: str) -> Dict[str, Any]:
            return await _defaultGet(accessToken, url)

        googleGetFn = _defaultFetch

    # getattr with a default also covers a connection that is None.
    mandateId = str(getattr(connection, "mandateId", "") or "")
    userId = str(getattr(connection, "userId", "") or "")

    for labelId in limits.labels:
        # The message budget is shared by all labels.
        if outcome.indexed + outcome.skippedDuplicate >= limits.maxMessages:
            break
        try:
            await _ingestLabel(
                googleGetFn=googleGetFn,
                knowledgeService=knowledgeService,
                connectionId=connectionId,
                mandateId=mandateId,
                userId=userId,
                labelId=labelId,
                limits=limits,
                result=outcome,
                progressCb=progressCb,
            )
        except Exception as exc:
            logger.error("gmail ingestion label %s failed: %s", labelId, exc, exc_info=True)
            outcome.errors.append(f"label({labelId}): {exc}")
    return _finalizeResult(connectionId, outcome, startedAt)
async def _resolveDependencies(connectionId: str):
    """Build the Gmail adapter, connection and knowledge service for a connection.

    Returns:
        Tuple ``(adapter, connection, knowledgeService)``.

    Raises:
        ValueError: If the connection does not exist or no access token is
            available for it.
    """
    from modules.interfaces.interfaceDbApp import getRootInterface
    from modules.auth import TokenManager
    from modules.connectors.providerGoogle.connectorGoogle import GoogleConnector
    from modules.serviceCenter import getService
    from modules.serviceCenter.context import ServiceCenterContext
    from modules.security.rootAccess import getRootUser

    userConnection = getRootInterface().getUserConnectionById(connectionId)
    if userConnection is None:
        raise ValueError(f"UserConnection not found: {connectionId}")

    freshToken = TokenManager().getFreshToken(connectionId)
    if not (freshToken and freshToken.tokenAccess):
        raise ValueError(f"No valid token for connection {connectionId}")

    gmailAdapter = GoogleConnector(userConnection, freshToken.tokenAccess).getServiceAdapter("gmail")

    # The knowledge service is obtained under the root user, bound to the
    # mandate of the connection.
    serviceContext = ServiceCenterContext(
        user=getRootUser(),
        mandate_id=str(getattr(userConnection, "mandateId", "") or ""),
    )
    return gmailAdapter, userConnection, getService("knowledge", serviceContext)
async def _ingestLabel(
*,
googleGetFn,
knowledgeService,
connectionId: str,
mandateId: str,
userId: str,
labelId: str,
limits: GmailBootstrapLimits,
result: GmailBootstrapResult,
progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
remaining = limits.maxMessages - (result.indexed + result.skippedDuplicate)
if remaining <= 0:
return
pageSize = min(100, remaining)
query = ""
if limits.maxAgeDays:
cutoff = datetime.now(timezone.utc) - timedelta(days=limits.maxAgeDays)
# Gmail uses YYYY/MM/DD.
query = f"after:{cutoff.strftime('%Y/%m/%d')}"
baseUrl = (
"https://gmail.googleapis.com/gmail/v1/users/me/messages"
f"?labelIds={labelId}&maxResults={pageSize}"
)
if query:
baseUrl = f"{baseUrl}&q={query}"
nextPageToken: Optional[str] = None
while (result.indexed + result.skippedDuplicate) < limits.maxMessages:
url = baseUrl if not nextPageToken else f"{baseUrl}&pageToken={nextPageToken}"
page = await googleGetFn(url)
if not isinstance(page, dict) or "error" in page:
err = (page or {}).get("error") if isinstance(page, dict) else "unknown"
logger.warning("gmail list page error for label %s: %s", labelId, err)
result.errors.append(f"list({labelId}): {err}")
return
messageStubs = page.get("messages") or []
for stub in messageStubs:
if result.indexed + result.skippedDuplicate >= limits.maxMessages:
break
msgId = stub.get("id")
if not msgId:
continue
detailUrl = (
f"https://gmail.googleapis.com/gmail/v1/users/me/messages/{msgId}?format=full"
)
detail = await googleGetFn(detailUrl)
if not isinstance(detail, dict) or "error" in detail:
result.failed += 1
continue
await _ingestMessage(
googleGetFn=googleGetFn,
knowledgeService=knowledgeService,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
labelId=labelId,
message=detail,
limits=limits,
result=result,
progressCb=progressCb,
)
nextPageToken = page.get("nextPageToken")
if not nextPageToken:
break
async def _ingestMessage(
    *,
    googleGetFn,
    knowledgeService,
    connectionId: str,
    mandateId: str,
    userId: str,
    labelId: str,
    message: Dict[str, Any],
    limits: GmailBootstrapLimits,
    result: GmailBootstrapResult,
    progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
    """Submit one Gmail message to the knowledge service and update counters.

    A message without an ``id`` is counted in ``result.skippedPolicy``. The
    status of the returned ingestion handle selects the counter that is
    bumped: ``"duplicate"`` -> ``skippedDuplicate``, ``"indexed"`` ->
    ``indexed``, anything else -> ``failed``. An exception raised by
    ``requestIngestion`` is logged, appended to ``result.errors``, counted as
    failed, and ends processing of this message.

    When ``limits.includeAttachments`` is set, attachments are ingested after
    the message; an attachment exception is recorded in ``result.errors`` but
    does not change the message's own counter. ``progressCb`` (if given) is
    called whenever indexed + duplicates is a multiple of 50.
    """
    from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob

    messageId = message.get("id")
    if not messageId:
        result.skippedPolicy += 1
        return

    # Used as contentVersion so an unchanged message is recognised on re-runs.
    revision = message.get("historyId") or message.get("internalDate")
    headers = _headerMap(message.get("payload") or {})
    subject = headers.get("subject") or "(no subject)"
    syntheticId = _syntheticMessageId(connectionId, messageId)
    # `subject` is never empty here, but its first 80 characters may be pure
    # whitespace; fall back to the message id instead of emitting ".eml".
    fileName = f"{subject[:80].strip() or messageId}.eml"
    contentObjects = _buildContentObjects(
        message, limits.maxBodyChars, mailContentDepth=limits.mailContentDepth
    )

    try:
        handle = await knowledgeService.requestIngestion(
            IngestionJob(
                sourceKind="gmail_message",
                sourceId=syntheticId,
                fileName=fileName,
                mimeType="message/rfc822",
                userId=userId,
                mandateId=mandateId,
                contentObjects=contentObjects,
                contentVersion=str(revision) if revision else None,
                neutralize=limits.neutralize,
                provenance={
                    "connectionId": connectionId,
                    "authority": "google",
                    "service": "gmail",
                    "externalItemId": messageId,
                    "label": labelId,
                    "threadId": message.get("threadId"),
                    "tier": limits.mailContentDepth,
                },
            )
        )
    except Exception as exc:
        logger.error("gmail ingestion %s failed: %s", messageId, exc, exc_info=True)
        result.failed += 1
        result.errors.append(f"ingest({messageId}): {exc}")
        return

    if handle.status == "duplicate":
        result.skippedDuplicate += 1
    elif handle.status == "indexed":
        result.indexed += 1
    else:
        result.failed += 1

    if limits.includeAttachments:
        try:
            await _ingestAttachments(
                googleGetFn=googleGetFn,
                knowledgeService=knowledgeService,
                connectionId=connectionId,
                mandateId=mandateId,
                userId=userId,
                message=message,
                parentSyntheticId=syntheticId,
                limits=limits,
                result=result,
            )
        except Exception as exc:
            logger.warning("gmail attachments %s failed: %s", messageId, exc)
            result.errors.append(f"attachments({messageId}): {exc}")

    if progressCb is not None and (result.indexed + result.skippedDuplicate) % 50 == 0:
        processed = result.indexed + result.skippedDuplicate
        try:
            # Percentage is mapped into the 10..90 band of the overall job.
            progressCb(
                min(90, 10 + int(80 * processed / max(1, limits.maxMessages))),
                f"gmail processed={processed}",
            )
        except Exception:
            # Progress reporting is best-effort and must not abort ingestion.
            pass
        logger.info(
            "ingestion.connection.bootstrap.progress part=gmail processed=%d skippedDup=%d failed=%d",
            processed, result.skippedDuplicate, result.failed,
            extra={
                "event": "ingestion.connection.bootstrap.progress",
                "part": "gmail",
                "connectionId": connectionId,
                "processed": processed,
                "skippedDup": result.skippedDuplicate,
                "failed": result.failed,
            },
        )

    # Yield control so other tasks on the event loop can run between messages.
    await asyncio.sleep(0)
async def _ingestAttachments(
    *,
    googleGetFn,
    knowledgeService,
    connectionId: str,
    mandateId: str,
    userId: str,
    message: Dict[str, Any],
    parentSyntheticId: str,
    limits: GmailBootstrapLimits,
    result: GmailBootstrapResult,
) -> None:
    """Ingest the file attachments of one Gmail message as child jobs.

    The MIME tree under ``message["payload"]`` is walked recursively; every
    part that carries both a ``filename`` and a ``body.attachmentId`` is
    treated as an attachment. For each one the bytes are fetched, run through
    the extraction pipeline and submitted with
    ``sourceKind="gmail_attachment"`` and ``provenance.parentId`` pointing at
    the parent message.

    Counter effects on ``result``:
      - ``skippedPolicy``: reported size exceeds ``limits.maxAttachmentBytes``,
        or extraction produced no usable content.
      - ``failed``: attachment fetch, extraction or ingestion failed.
      - ``attachmentsIndexed``: ``requestIngestion`` returned without raising.
    Attachments that decode to empty bytes are skipped without counting.
    """
    from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
    from modules.datamodels.datamodelExtraction import ExtractionOptions
    from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
    from modules.serviceCenter.services.serviceExtraction.subRegistry import (
        ExtractorRegistry, ChunkerRegistry,
    )

    messageId = message.get("id") or ""

    def _collectAttachmentStubs(part: Dict[str, Any], acc: List[Dict[str, Any]]) -> None:
        # Depth-first walk over nested MIME parts, collecting attachment stubs.
        filename = part.get("filename") or ""
        body = part.get("body") or {}
        attId = body.get("attachmentId")
        if filename and attId:
            acc.append({
                "filename": filename,
                "mimeType": part.get("mimeType") or "application/octet-stream",
                "attachmentId": attId,
                "size": int(body.get("size") or 0),
            })
        for sub in part.get("parts") or []:
            _collectAttachmentStubs(sub, acc)

    stubs: List[Dict[str, Any]] = []
    _collectAttachmentStubs(message.get("payload") or {}, stubs)
    if not stubs:
        return

    extractorRegistry = ExtractorRegistry()
    chunkerRegistry = ChunkerRegistry()

    for stub in stubs:
        # A size of 0 means "not reported"; only a known oversize is skipped.
        if stub["size"] and stub["size"] > limits.maxAttachmentBytes:
            result.skippedPolicy += 1
            continue

        attUrl = (
            f"https://gmail.googleapis.com/gmail/v1/users/me/messages/{messageId}"
            f"/attachments/{stub['attachmentId']}"
        )
        detail = await googleGetFn(attUrl)
        if not isinstance(detail, dict) or "error" in detail:
            result.failed += 1
            continue

        rawBytes = _decodeBase64Url(detail.get("data") or "")
        if not rawBytes:
            continue

        fileName = stub["filename"]
        mimeType = stub["mimeType"]
        syntheticId = _syntheticAttachmentId(connectionId, messageId, stub["attachmentId"])

        try:
            extracted = runExtraction(
                extractorRegistry, chunkerRegistry,
                rawBytes, fileName, mimeType,
                ExtractionOptions(mergeStrategy=None),
            )
        except Exception as exc:
            logger.warning("gmail attachment extract %s failed: %s", stub["attachmentId"], exc)
            result.failed += 1
            continue

        contentObjects: List[Dict[str, Any]] = []
        for part in getattr(extracted, "parts", None) or []:
            data = getattr(part, "data", None) or ""
            if not data or not str(data).strip():
                continue
            typeGroup = getattr(part, "typeGroup", "text") or "text"
            contentType = "text"
            if typeGroup == "image":
                contentType = "image"
            elif typeGroup in ("binary", "container"):
                contentType = "other"
            contentObjects.append({
                "contentObjectId": getattr(part, "id", ""),
                "contentType": contentType,
                "data": data,
                "contextRef": {
                    "containerPath": fileName,
                    "location": getattr(part, "label", None) or "attachment",
                    **(getattr(part, "metadata", None) or {}),
                },
            })
        if not contentObjects:
            result.skippedPolicy += 1
            continue

        try:
            await knowledgeService.requestIngestion(
                IngestionJob(
                    sourceKind="gmail_attachment",
                    sourceId=syntheticId,
                    fileName=fileName,
                    mimeType=mimeType,
                    userId=userId,
                    mandateId=mandateId,
                    contentObjects=contentObjects,
                    # Attachments must honour the same neutralisation setting
                    # as the parent message job.
                    neutralize=limits.neutralize,
                    provenance={
                        "connectionId": connectionId,
                        "authority": "google",
                        "service": "gmail",
                        "parentId": parentSyntheticId,
                        "externalItemId": stub["attachmentId"],
                        "parentMessageId": messageId,
                    },
                )
            )
            result.attachmentsIndexed += 1
        except Exception as exc:
            logger.warning("gmail attachment ingest %s failed: %s", stub["attachmentId"], exc)
            result.failed += 1
def _finalizeResult(connectionId: str, result: GmailBootstrapResult, startMs: float) -> Dict[str, Any]:
    """Log the end-of-run summary for a Gmail bootstrap and return it as a dict.

    ``startMs`` is a ``time.time()`` timestamp taken at the start of the run;
    the elapsed time is reported in milliseconds. At most the first 20 error
    strings are included in the returned summary.
    """
    elapsedMs = int((time.time() - startMs) * 1000)
    counters = {
        "indexed": result.indexed,
        "skippedDup": result.skippedDuplicate,
        "skippedPolicy": result.skippedPolicy,
        "attachmentsIndexed": result.attachmentsIndexed,
        "failed": result.failed,
    }
    logger.info(
        "ingestion.connection.bootstrap.done part=gmail connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d attachments=%d failed=%d durationMs=%d",
        connectionId,
        counters["indexed"], counters["skippedDup"], counters["skippedPolicy"],
        counters["attachmentsIndexed"], counters["failed"], elapsedMs,
        extra={
            "event": "ingestion.connection.bootstrap.done",
            "part": "gmail",
            "connectionId": connectionId,
            **counters,
            "durationMs": elapsedMs,
        },
    )
    return {
        "connectionId": result.connectionId,
        "indexed": counters["indexed"],
        "skippedDuplicate": counters["skippedDup"],
        "skippedPolicy": counters["skippedPolicy"],
        "attachmentsIndexed": counters["attachmentsIndexed"],
        "failed": counters["failed"],
        "durationMs": elapsedMs,
        "errors": result.errors[:20],
    }

View file

@ -0,0 +1,576 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Outlook bootstrap for the unified knowledge ingestion lane.
Unlike SharePoint, Outlook messages are "virtual documents": we never persist
file bytes in the store. Each message becomes a `sourceKind="outlook_message"`
IngestionJob whose `contentObjects` carry the header, snippet and cleaned body
so retrieval can show a compact answer without fetching Graph again.
Attachments are optional (`includeAttachments` limit flag) and enqueued as
child jobs with `sourceKind="outlook_attachment"` + `provenance.parentId`.
"""
from __future__ import annotations
import asyncio
import hashlib
import logging
import time
from dataclasses import dataclass, field
from typing import Any, Callable, Dict, List, Optional
from modules.serviceCenter.services.serviceKnowledge.subTextClean import cleanEmailBody
logger = logging.getLogger(__name__)
MAX_MESSAGES_DEFAULT = 500
MAX_FOLDERS_DEFAULT = 5
MAX_BODY_CHARS_DEFAULT = 8000
MAX_ATTACHMENT_BYTES_DEFAULT = 10 * 1024 * 1024
WELL_KNOWN_FOLDERS = ("inbox", "sentitems")
@dataclass
class OutlookBootstrapLimits:
    """Caps and content options applied to a single Outlook bootstrap run."""

    # Bootstrap stops once indexed + duplicate messages reach this count.
    maxMessages: int = MAX_MESSAGES_DEFAULT
    # Upper bound on the number of mail folders selected for ingestion.
    maxFolders: int = MAX_FOLDERS_DEFAULT
    # Character budget passed to the email body cleaner.
    maxBodyChars: int = MAX_BODY_CHARS_DEFAULT
    # When true, file attachments are ingested as child jobs.
    includeAttachments: bool = False
    # Attachments reporting a larger size are skipped.
    maxAttachmentBytes: int = MAX_ATTACHMENT_BYTES_DEFAULT
    # Only fetch messages newer than N days. None disables filter.
    maxAgeDays: Optional[int] = 90
    # Content depth: "metadata" | "snippet" | "full"
    mailContentDepth: str = "full"
    # Pass-through to IngestionJob.neutralize
    neutralize: bool = False
@dataclass
class OutlookBootstrapResult:
    """Mutable counters accumulated while an Outlook bootstrap runs."""

    # Connection this run belongs to; echoed in the final summary.
    connectionId: str
    # Messages whose ingestion handle reported status "indexed".
    indexed: int = 0
    # Messages whose ingestion handle reported status "duplicate".
    skippedDuplicate: int = 0
    # Items skipped on purpose (missing id, oversized or empty attachment, ...).
    skippedPolicy: int = 0
    # Items whose ingestion or extraction failed.
    failed: int = 0
    # Attachments submitted to the knowledge service without an exception.
    attachmentsIndexed: int = 0
    # Human-readable error strings; the final summary reports the first 20.
    errors: List[str] = field(default_factory=list)
def _syntheticMessageId(connectionId: str, messageId: str) -> str:
token = hashlib.sha256(f"{connectionId}:{messageId}".encode("utf-8")).hexdigest()[:16]
return f"om:{connectionId[:8]}:{token}"
def _syntheticAttachmentId(connectionId: str, messageId: str, attachmentId: str) -> str:
token = hashlib.sha256(
f"{connectionId}:{messageId}:{attachmentId}".encode("utf-8")
).hexdigest()[:16]
return f"oa:{connectionId[:8]}:{token}"
def _extractRecipient(recipient: Dict[str, Any]) -> str:
email = (recipient or {}).get("emailAddress") or {}
name = email.get("name") or ""
addr = email.get("address") or ""
if name and addr:
return f"{name} <{addr}>"
return addr or name
def _joinRecipients(recipients: List[Dict[str, Any]]) -> str:
    """Render a recipient list as a comma-separated string, dropping blanks."""
    rendered = [_extractRecipient(entry) for entry in (recipients or [])]
    return ", ".join(text for text in rendered if text)
def _buildContentObjects(
    message: Dict[str, Any],
    maxBodyChars: int,
    mailContentDepth: str = "full",
) -> List[Dict[str, Any]]:
    """Turn an Outlook message dict into the content objects to be ingested.

    `mailContentDepth` selects how much of the message is emitted:
      - "metadata": header only
      - "snippet":  header + `bodyPreview`
      - "full":     header + `bodyPreview` + cleaned body (default)
    The header object is always emitted; the snippet and body objects are
    emitted only when they are non-empty.
    """

    def _textObject(partName: str, text: str) -> Dict[str, Any]:
        # All parts are plain text and use the part name as object id.
        return {
            "contentObjectId": partName,
            "contentType": "text",
            "data": text,
            "contextRef": {"part": partName},
        }

    subject = message.get("subject") or "(no subject)"
    sender = _extractRecipient(message.get("from") or {})
    toLine = _joinRecipients(message.get("toRecipients") or [])
    ccLine = _joinRecipients(message.get("ccRecipients") or [])
    receivedAt = message.get("receivedDateTime") or ""

    headerPieces = [
        f"Subject: {subject}\n",
        f"From: {sender}\n",
        f"To: {toLine}\n",
    ]
    if ccLine:
        # The Cc line is omitted entirely when there are no Cc recipients.
        headerPieces.append(f"Cc: {ccLine}\n")
    headerPieces.append(f"Date: {receivedAt}")
    objects: List[Dict[str, Any]] = [_textObject("header", "".join(headerPieces))]

    preview = message.get("bodyPreview") or ""
    if preview and mailContentDepth in ("snippet", "full"):
        objects.append(_textObject("snippet", preview))

    if mailContentDepth != "full":
        return objects

    rawBody = (message.get("body") or {}).get("content") or ""
    cleaned = cleanEmailBody(rawBody, maxChars=maxBodyChars) if rawBody else ""
    if cleaned:
        objects.append(_textObject("body", cleaned))
    return objects
async def bootstrapOutlook(
    connectionId: str,
    *,
    progressCb: Optional[Callable[[int, Optional[str]], None]] = None,
    adapter: Any = None,
    connection: Any = None,
    knowledgeService: Any = None,
    limits: Optional[OutlookBootstrapLimits] = None,
) -> Dict[str, Any]:
    """Enumerate Outlook folders (inbox + sent by default) and ingest messages.

    Args:
        connectionId: Id of the UserConnection to bootstrap.
        progressCb: Optional ``(percent, message)`` callback.
        adapter, connection, knowledgeService: Injectable dependencies. If
            any of the three is ``None``, all three are resolved from
            ``connectionId``.
        limits: Injectable limits. When omitted they are derived from the
            stored connection preferences.

    Returns:
        The summary dict produced by ``_finalizeResult``.

    Raises:
        ValueError: From ``_resolveDependencies`` when the connection or a
            valid token cannot be found.
    """
    if not limits:
        # Preferences are only needed to derive default limits, so the lookup
        # is skipped when the caller injects its own limits.
        from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs
        prefs = loadConnectionPrefs(connectionId)
        limits = OutlookBootstrapLimits(
            includeAttachments=prefs.mailIndexAttachments,
            maxAgeDays=prefs.maxAgeDays if prefs.maxAgeDays > 0 else None,
            mailContentDepth=prefs.mailContentDepth,
            neutralize=prefs.neutralizeBeforeEmbed,
        )

    startMs = time.time()
    result = OutlookBootstrapResult(connectionId=connectionId)
    logger.info(
        "ingestion.connection.bootstrap.started part=outlook connectionId=%s",
        connectionId,
        extra={
            "event": "ingestion.connection.bootstrap.started",
            "part": "outlook",
            "connectionId": connectionId,
        },
    )

    if adapter is None or knowledgeService is None or connection is None:
        adapter, connection, knowledgeService = await _resolveDependencies(connectionId)

    mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else ""
    userId = str(getattr(connection, "userId", "") or "") if connection is not None else ""

    folderIds = await _selectFolderIds(adapter, limits)
    for folderId in folderIds:
        if result.indexed + result.skippedDuplicate >= limits.maxMessages:
            break
        try:
            await _ingestFolder(
                adapter=adapter,
                knowledgeService=knowledgeService,
                connectionId=connectionId,
                mandateId=mandateId,
                userId=userId,
                folderId=folderId,
                limits=limits,
                result=result,
                progressCb=progressCb,
            )
        except Exception as exc:
            # One broken folder must not abort the remaining folders.
            logger.error("outlook ingestion folder %s failed: %s", folderId, exc, exc_info=True)
            result.errors.append(f"folder({folderId}): {exc}")

    return _finalizeResult(connectionId, result, startMs)
async def _resolveDependencies(connectionId: str):
    """Resolve the Outlook adapter, connection record and knowledge service.

    Returns:
        Tuple ``(adapter, connection, knowledgeService)``.

    Raises:
        ValueError: If the connection does not exist or no access token is
            available for it.
    """
    from modules.interfaces.interfaceDbApp import getRootInterface
    from modules.auth import TokenManager
    from modules.connectors.providerMsft.connectorMsft import MsftConnector
    from modules.serviceCenter import getService
    from modules.serviceCenter.context import ServiceCenterContext
    from modules.security.rootAccess import getRootUser

    connection = getRootInterface().getUserConnectionById(connectionId)
    if connection is None:
        raise ValueError(f"UserConnection not found: {connectionId}")

    freshToken = TokenManager().getFreshToken(connectionId)
    accessToken = freshToken.tokenAccess if freshToken else None
    if not accessToken:
        raise ValueError(f"No valid token for connection {connectionId}")

    adapter = MsftConnector(connection, accessToken).getServiceAdapter("outlook")

    # The knowledge service is created under the root user, scoped to the
    # mandate of the connection.
    serviceContext = ServiceCenterContext(
        user=getRootUser(),
        mandate_id=str(getattr(connection, "mandateId", "") or ""),
    )
    return adapter, connection, getService("knowledge", serviceContext)
async def _selectFolderIds(adapter, limits: OutlookBootstrapLimits) -> List[str]:
    """Pick the folder ids to ingest, well-known folders first.

    The well-known folders are looked up one by one; lookups that raise or
    return an error are ignored. If fewer than ``limits.maxFolders`` ids were
    found, the remaining slots are filled from ``adapter.browse("/")`` in the
    order returned, skipping ids already chosen.
    """
    cap = limits.maxFolders
    chosen: List[str] = []

    for alias in WELL_KNOWN_FOLDERS:
        if len(chosen) >= cap:
            break
        try:
            folder = await adapter._graphGet(f"me/mailFolders/{alias}")
        except Exception:
            continue
        if isinstance(folder, dict) and "error" not in folder and folder.get("id"):
            chosen.append(folder["id"])

    if len(chosen) >= cap:
        return chosen

    try:
        listing = await adapter.browse("/")
    except Exception:
        # Browsing is only a fallback; keep whatever was found so far.
        listing = []
    for item in listing:
        folderId = (getattr(item, "metadata", {}) or {}).get("id")
        if not folderId or folderId in chosen:
            continue
        chosen.append(folderId)
        if len(chosen) >= cap:
            break
    return chosen
async def _ingestFolder(
    *,
    adapter,
    knowledgeService,
    connectionId: str,
    mandateId: str,
    userId: str,
    folderId: str,
    limits: OutlookBootstrapLimits,
    result: OutlookBootstrapResult,
    progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
    """Page through one mail folder via Graph and ingest its messages.

    Messages are requested newest first, optionally restricted to those
    received within ``limits.maxAgeDays``. Paging follows ``@odata.nextLink``
    until the message budget is used up, a page fails (recorded in
    ``result.errors``), or there is no further page.
    """

    def _handled() -> int:
        # Messages that count against the budget: newly indexed + duplicates.
        return result.indexed + result.skippedDuplicate

    budget = limits.maxMessages - _handled()
    if budget <= 0:
        return

    perPage = min(100, budget)
    fields = (
        "id,subject,from,toRecipients,ccRecipients,receivedDateTime,"
        "bodyPreview,body,internetMessageId,hasAttachments,changeKey"
    )
    nextEndpoint: Optional[str] = (
        f"me/mailFolders/{folderId}/messages"
        f"?$top={perPage}&$orderby=receivedDateTime desc&$select={fields}"
    )
    if limits.maxAgeDays:
        # Filter by age on the server so old messages are never transferred.
        from datetime import datetime, timezone, timedelta
        oldest = datetime.now(timezone.utc) - timedelta(days=limits.maxAgeDays)
        stamp = oldest.strftime("%Y-%m-%dT%H:%M:%SZ")
        nextEndpoint = f"{nextEndpoint}&$filter=receivedDateTime ge {stamp}"

    while nextEndpoint and _handled() < limits.maxMessages:
        try:
            batch = await adapter._graphGet(nextEndpoint)
        except Exception as exc:
            logger.warning("outlook graph page failed for folder %s: %s", folderId, exc)
            result.errors.append(f"graph({folderId}): {exc}")
            return
        if not isinstance(batch, dict) or "error" in batch:
            reason = batch.get("error") if isinstance(batch, dict) else "unknown"
            logger.warning("outlook graph page error for folder %s: %s", folderId, reason)
            result.errors.append(f"graph({folderId}): {reason}")
            return

        for item in batch.get("value", []) or []:
            if _handled() >= limits.maxMessages:
                break
            await _ingestMessage(
                adapter=adapter,
                knowledgeService=knowledgeService,
                connectionId=connectionId,
                mandateId=mandateId,
                userId=userId,
                message=item,
                limits=limits,
                result=result,
                progressCb=progressCb,
            )

        continuation = batch.get("@odata.nextLink")
        if not continuation:
            return
        # Strip Graph base so adapter._graphGet accepts the relative path.
        from modules.connectors.providerMsft.connectorMsft import _stripGraphBase
        nextEndpoint = _stripGraphBase(continuation)
async def _ingestMessage(
    *,
    adapter,
    knowledgeService,
    connectionId: str,
    mandateId: str,
    userId: str,
    message: Dict[str, Any],
    limits: OutlookBootstrapLimits,
    result: OutlookBootstrapResult,
    progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
    """Submit one Outlook message to the knowledge service and update counters.

    A message without an ``id`` is counted in ``result.skippedPolicy``. The
    status of the returned ingestion handle selects the counter that is
    bumped: ``"duplicate"`` -> ``skippedDuplicate``, ``"indexed"`` ->
    ``indexed``, anything else -> ``failed``. An exception raised by
    ``requestIngestion`` is logged, appended to ``result.errors``, counted as
    failed, and ends processing of this message.

    Attachments are ingested only when ``limits.includeAttachments`` is set
    and the message reports ``hasAttachments``; an attachment exception is
    recorded in ``result.errors`` but does not change the message's own
    counter. ``progressCb`` (if given) is called whenever indexed +
    duplicates is a multiple of 50.
    """
    from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob

    messageId = message.get("id")
    if not messageId:
        result.skippedPolicy += 1
        return

    # Used as contentVersion so an unchanged message is recognised on re-runs.
    revision = message.get("changeKey") or message.get("internetMessageId")
    subject = message.get("subject") or "(no subject)"
    syntheticId = _syntheticMessageId(connectionId, messageId)
    # `subject` is never empty here, but its first 80 characters may be pure
    # whitespace; fall back to the message id instead of emitting ".eml".
    fileName = f"{subject[:80].strip() or messageId}.eml"
    # Always at least the header is emitted, so `contentObjects` is non-empty.
    contentObjects = _buildContentObjects(
        message, limits.maxBodyChars, mailContentDepth=limits.mailContentDepth
    )

    try:
        handle = await knowledgeService.requestIngestion(
            IngestionJob(
                sourceKind="outlook_message",
                sourceId=syntheticId,
                fileName=fileName,
                mimeType="message/rfc822",
                userId=userId,
                mandateId=mandateId,
                contentObjects=contentObjects,
                contentVersion=revision,
                neutralize=limits.neutralize,
                provenance={
                    "connectionId": connectionId,
                    "authority": "msft",
                    "service": "outlook",
                    "externalItemId": messageId,
                    "internetMessageId": message.get("internetMessageId"),
                    "tier": limits.mailContentDepth,
                },
            )
        )
    except Exception as exc:
        logger.error("outlook ingestion %s failed: %s", messageId, exc, exc_info=True)
        result.failed += 1
        result.errors.append(f"ingest({messageId}): {exc}")
        return

    if handle.status == "duplicate":
        result.skippedDuplicate += 1
    elif handle.status == "indexed":
        result.indexed += 1
    else:
        result.failed += 1

    if limits.includeAttachments and message.get("hasAttachments"):
        try:
            await _ingestAttachments(
                adapter=adapter,
                knowledgeService=knowledgeService,
                connectionId=connectionId,
                mandateId=mandateId,
                userId=userId,
                messageId=messageId,
                parentSyntheticId=syntheticId,
                limits=limits,
                result=result,
            )
        except Exception as exc:
            logger.warning("outlook attachments %s failed: %s", messageId, exc)
            result.errors.append(f"attachments({messageId}): {exc}")

    if progressCb is not None and (result.indexed + result.skippedDuplicate) % 50 == 0:
        processed = result.indexed + result.skippedDuplicate
        try:
            # Percentage is mapped into the 10..90 band of the overall job.
            progressCb(
                min(90, 10 + int(80 * processed / max(1, limits.maxMessages))),
                f"outlook processed={processed}",
            )
        except Exception:
            # Progress reporting is best-effort and must not abort ingestion.
            pass
        logger.info(
            "ingestion.connection.bootstrap.progress part=outlook processed=%d skippedDup=%d failed=%d",
            processed, result.skippedDuplicate, result.failed,
            extra={
                "event": "ingestion.connection.bootstrap.progress",
                "part": "outlook",
                "connectionId": connectionId,
                "processed": processed,
                "skippedDup": result.skippedDuplicate,
                "failed": result.failed,
            },
        )

    # Yield control so other tasks on the event loop can run between messages.
    await asyncio.sleep(0)
async def _ingestAttachments(
    *,
    adapter,
    knowledgeService,
    connectionId: str,
    mandateId: str,
    userId: str,
    messageId: str,
    parentSyntheticId: str,
    limits: OutlookBootstrapLimits,
    result: OutlookBootstrapResult,
) -> None:
    """Child ingestion jobs for file attachments (skip inline & oversized).

    Lists the attachments of ``messageId`` via Graph and, for every
    ``fileAttachment`` that is not inline, decodes ``contentBytes``, runs the
    extraction pipeline and submits the result with
    ``sourceKind="outlook_attachment"`` and ``provenance.parentId`` pointing
    at the parent message.

    Counter effects on ``result``:
      - ``errors``: the attachment listing itself failed.
      - ``skippedPolicy``: oversized, undecodable, or no usable content.
      - ``failed``: extraction or ingestion raised.
      - ``attachmentsIndexed``: ``requestIngestion`` returned without raising.
    """
    from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
    from modules.datamodels.datamodelExtraction import ExtractionOptions
    from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
    from modules.serviceCenter.services.serviceExtraction.subRegistry import (
        ExtractorRegistry, ChunkerRegistry,
    )
    import base64

    page = await adapter._graphGet(f"me/messages/{messageId}/attachments")
    if not isinstance(page, dict) or "error" in page:
        # Surface the failure instead of dropping the attachments silently.
        err = page.get("error") if isinstance(page, dict) else "unknown"
        logger.warning("outlook attachment list %s failed: %s", messageId, err)
        result.errors.append(f"attachments({messageId}): {err}")
        return

    extractorRegistry = ExtractorRegistry()
    chunkerRegistry = ChunkerRegistry()

    for attachment in page.get("value", []) or []:
        # Only real file attachments carry `contentBytes`.
        if attachment.get("@odata.type") != "#microsoft.graph.fileAttachment":
            continue
        if attachment.get("isInline"):
            continue
        size = int(attachment.get("size") or 0)
        # A size of 0 means "not reported"; only a known oversize is skipped.
        if size and size > limits.maxAttachmentBytes:
            result.skippedPolicy += 1
            continue
        contentBytesB64 = attachment.get("contentBytes")
        if not contentBytesB64:
            continue
        try:
            rawBytes = base64.b64decode(contentBytesB64)
        except Exception:
            result.skippedPolicy += 1
            continue

        fileName = attachment.get("name") or "attachment"
        mimeType = attachment.get("contentType") or "application/octet-stream"
        attachmentId = attachment.get("id") or fileName
        syntheticId = _syntheticAttachmentId(connectionId, messageId, attachmentId)

        try:
            extracted = runExtraction(
                extractorRegistry, chunkerRegistry,
                rawBytes, fileName, mimeType,
                ExtractionOptions(mergeStrategy=None),
            )
        except Exception as exc:
            logger.warning("outlook attachment extract %s failed: %s", attachmentId, exc)
            result.failed += 1
            continue

        contentObjects: List[Dict[str, Any]] = []
        for part in getattr(extracted, "parts", None) or []:
            data = getattr(part, "data", None) or ""
            if not data or not str(data).strip():
                continue
            typeGroup = getattr(part, "typeGroup", "text") or "text"
            contentType = "text"
            if typeGroup == "image":
                contentType = "image"
            elif typeGroup in ("binary", "container"):
                contentType = "other"
            contentObjects.append({
                "contentObjectId": getattr(part, "id", ""),
                "contentType": contentType,
                "data": data,
                "contextRef": {
                    "containerPath": fileName,
                    "location": getattr(part, "label", None) or "attachment",
                    **(getattr(part, "metadata", None) or {}),
                },
            })
        if not contentObjects:
            result.skippedPolicy += 1
            continue

        try:
            await knowledgeService.requestIngestion(
                IngestionJob(
                    sourceKind="outlook_attachment",
                    sourceId=syntheticId,
                    fileName=fileName,
                    mimeType=mimeType,
                    userId=userId,
                    mandateId=mandateId,
                    contentObjects=contentObjects,
                    neutralize=limits.neutralize,
                    provenance={
                        "connectionId": connectionId,
                        "authority": "msft",
                        "service": "outlook",
                        "parentId": parentSyntheticId,
                        "externalItemId": attachmentId,
                        "parentMessageId": messageId,
                    },
                )
            )
            result.attachmentsIndexed += 1
        except Exception as exc:
            logger.warning("outlook attachment ingest %s failed: %s", attachmentId, exc)
            result.failed += 1
def _finalizeResult(connectionId: str, result: OutlookBootstrapResult, startMs: float) -> Dict[str, Any]:
    """Log the end-of-run summary for an Outlook bootstrap and return it as a dict.

    ``startMs`` is a ``time.time()`` timestamp taken at the start of the run;
    the elapsed time is reported in milliseconds. At most the first 20 error
    strings are included in the returned summary.
    """
    elapsedMs = int((time.time() - startMs) * 1000)
    counters = {
        "indexed": result.indexed,
        "skippedDup": result.skippedDuplicate,
        "skippedPolicy": result.skippedPolicy,
        "attachmentsIndexed": result.attachmentsIndexed,
        "failed": result.failed,
    }
    logger.info(
        "ingestion.connection.bootstrap.done part=outlook connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d attachments=%d failed=%d durationMs=%d",
        connectionId,
        counters["indexed"], counters["skippedDup"], counters["skippedPolicy"],
        counters["attachmentsIndexed"], counters["failed"], elapsedMs,
        extra={
            "event": "ingestion.connection.bootstrap.done",
            "part": "outlook",
            "connectionId": connectionId,
            **counters,
            "durationMs": elapsedMs,
        },
    )
    return {
        "connectionId": result.connectionId,
        "indexed": counters["indexed"],
        "skippedDuplicate": counters["skippedDup"],
        "skippedPolicy": counters["skippedPolicy"],
        "attachmentsIndexed": counters["attachmentsIndexed"],
        "failed": counters["failed"],
        "durationMs": elapsedMs,
        "errors": result.errors[:20],
    }

View file

@ -0,0 +1,433 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""SharePoint bootstrap for the unified knowledge ingestion lane.
Walks the SharePoint drive(s) reachable via a UserConnection, downloads each
file-like item, runs the standard content extraction pipeline and hands the
result to `KnowledgeService.requestIngestion`. Idempotency is provided by the
ingestion façade itself; repeat bootstraps therefore produce
`ingestion.skipped.duplicate` for every unchanged item because we pass the
Graph `eTag` as `contentVersion`.
"""
from __future__ import annotations
import asyncio
import hashlib
import logging
import time
from dataclasses import dataclass, field
from typing import Any, Callable, Dict, List, Optional
from modules.datamodels.datamodelExtraction import ExtractionOptions
logger = logging.getLogger(__name__)
MAX_ITEMS_DEFAULT = 500
MAX_BYTES_DEFAULT = 200 * 1024 * 1024
MAX_FILE_SIZE_DEFAULT = 25 * 1024 * 1024
SKIP_MIME_PREFIXES_DEFAULT = ("video/", "audio/")
MAX_DEPTH_DEFAULT = 4
MAX_SITES_DEFAULT = 3
@dataclass
class SharepointBootstrapLimits:
    """Caps and content options applied to a single SharePoint bootstrap run."""

    # The walk stops once indexed + duplicate items reach this count.
    maxItems: int = MAX_ITEMS_DEFAULT
    # The walk stops once `bytesProcessed` on the result reaches this total.
    maxBytes: int = MAX_BYTES_DEFAULT
    # Files reporting a larger size are skipped.
    maxFileSize: int = MAX_FILE_SIZE_DEFAULT
    # Files whose MIME type starts with one of these prefixes are skipped.
    skipMimePrefixes: tuple = SKIP_MIME_PREFIXES_DEFAULT
    # Folders nested deeper than this below a site root are not browsed.
    maxDepth: int = MAX_DEPTH_DEFAULT
    # Upper bound on the number of sites walked.
    maxSites: int = MAX_SITES_DEFAULT
    # Pass-through to IngestionJob.neutralize
    neutralize: bool = False
@dataclass
class SharepointBootstrapResult:
    """Mutable counters accumulated while a SharePoint bootstrap runs."""

    # Connection this run belongs to.
    connectionId: str
    # Counted together with `skippedDuplicate` against `maxItems`.
    indexed: int = 0
    # Counted together with `indexed` against `maxItems`.
    skippedDuplicate: int = 0
    # Items skipped on purpose (excluded MIME type, file too large).
    skippedPolicy: int = 0
    # Items that failed; presumably set by the per-file ingest step (verify there).
    failed: int = 0
    # Running byte total compared against `maxBytes`; presumably updated by the
    # per-file ingest step (verify there).
    bytesProcessed: int = 0
    # Human-readable error strings collected during the walk.
    errors: List[str] = field(default_factory=list)
def _syntheticFileId(connectionId: str, externalItemId: str) -> str:
"""Deterministic synthetic FileContentIndex id for a SharePoint item.
Stable across bootstraps idempotency works; independent of file name so
moves/renames don't duplicate chunks.
"""
token = hashlib.sha256(f"{connectionId}:{externalItemId}".encode("utf-8")).hexdigest()[:16]
return f"sp:{connectionId[:8]}:{token}"
def _toContentObjects(extracted, fileName: str) -> List[Dict[str, Any]]:
"""Translate ExtractionResult → content objects accepted by requestIngestion."""
parts = getattr(extracted, "parts", None) or []
out: List[Dict[str, Any]] = []
for part in parts:
data = getattr(part, "data", None) or ""
if not data or not str(data).strip():
continue
typeGroup = getattr(part, "typeGroup", "text") or "text"
contentType = "text"
if typeGroup == "image":
contentType = "image"
elif typeGroup in ("binary", "container"):
contentType = "other"
out.append({
"contentObjectId": getattr(part, "id", ""),
"contentType": contentType,
"data": data,
"contextRef": {
"containerPath": fileName,
"location": getattr(part, "label", None) or "file",
**(getattr(part, "metadata", None) or {}),
},
})
return out
async def bootstrapSharepoint(
    connectionId: str,
    *,
    progressCb: Optional[Callable[[int, Optional[str]], None]] = None,
    adapter: Any = None,
    connection: Any = None,
    knowledgeService: Any = None,
    limits: Optional[SharepointBootstrapLimits] = None,
    runExtractionFn: Optional[Callable[..., Any]] = None,
) -> Dict[str, Any]:
    """Enumerate SharePoint drives and ingest every reachable file via the façade.

    Parameters allow injection for tests; production callers pass only
    `connectionId` (and optionally a progressCb) and everything else is
    resolved against the registered services.

    Args:
        connectionId: Id of the UserConnection to bootstrap.
        progressCb: Optional ``(percent, message)`` callback.
        adapter, connection, knowledgeService: Injectable dependencies. If
            any of the three is ``None``, all three are resolved from
            ``connectionId``.
        limits: Injectable limits. When omitted, defaults are used with
            ``neutralize`` taken from the stored connection preferences.
        runExtractionFn: Injectable ``(bytes, name, mime, options)`` callable;
            defaults to the standard extraction pipeline.

    Returns:
        The summary dict produced by ``_finalizeResult``.

    Raises:
        ValueError: From ``_resolveDependencies`` when the connection or a
            valid token cannot be found.
    """
    if not limits:
        # Preferences are only needed to derive default limits, so the lookup
        # is skipped when the caller injects its own limits.
        from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs
        prefs = loadConnectionPrefs(connectionId)
        limits = SharepointBootstrapLimits(neutralize=prefs.neutralizeBeforeEmbed)

    startMs = time.time()
    result = SharepointBootstrapResult(connectionId=connectionId)
    logger.info(
        "ingestion.connection.bootstrap.started part=sharepoint connectionId=%s",
        connectionId,
        extra={
            "event": "ingestion.connection.bootstrap.started",
            "part": "sharepoint",
            "connectionId": connectionId,
        },
    )

    if adapter is None or knowledgeService is None or connection is None:
        adapter, connection, knowledgeService = await _resolveDependencies(connectionId)

    if runExtractionFn is None:
        from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
        from modules.serviceCenter.services.serviceExtraction.subRegistry import (
            ExtractorRegistry, ChunkerRegistry,
        )
        # Build the registries once and reuse them for every file.
        extractorRegistry = ExtractorRegistry()
        chunkerRegistry = ChunkerRegistry()

        def runExtractionFn(bytesData, name, mime, options):  # type: ignore[no-redef]
            return runExtraction(extractorRegistry, chunkerRegistry, bytesData, name, mime, options)

    mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else ""
    userId = str(getattr(connection, "userId", "") or "") if connection is not None else ""

    try:
        sites = await adapter.browse("/", limit=limits.maxSites)
    except Exception as exc:
        logger.error("sharepoint site discovery failed for %s: %s", connectionId, exc, exc_info=True)
        result.errors.append(f"site_discovery: {exc}")
        return _finalizeResult(connectionId, result, startMs)

    for site in sites[: limits.maxSites]:
        if result.indexed + result.skippedDuplicate >= limits.maxItems:
            break
        sitePath = getattr(site, "path", "") or ""
        try:
            await _walkFolder(
                adapter=adapter,
                knowledgeService=knowledgeService,
                runExtractionFn=runExtractionFn,
                connectionId=connectionId,
                mandateId=mandateId,
                userId=userId,
                folderPath=sitePath,
                depth=0,
                limits=limits,
                result=result,
                progressCb=progressCb,
            )
        except Exception as exc:
            # One broken site must not abort the remaining sites.
            logger.error("sharepoint walk failed for site %s: %s", sitePath, exc, exc_info=True)
            result.errors.append(f"walk({sitePath}): {exc}")

    return _finalizeResult(connectionId, result, startMs)
async def _resolveDependencies(connectionId: str):
    """Load connection, instantiate SharepointAdapter, and build a KnowledgeService.

    Runs with root privileges: bootstrap is a system operation triggered by an
    authenticated user via callback; it must not be gated by a per-user
    service-center context.

    Returns:
        Tuple ``(adapter, connection, knowledgeService)``.

    Raises:
        ValueError: If the connection does not exist or no access token is
            available for it.
    """
    from modules.interfaces.interfaceDbApp import getRootInterface
    from modules.auth import TokenManager
    from modules.connectors.providerMsft.connectorMsft import MsftConnector
    from modules.serviceCenter import getService
    from modules.serviceCenter.context import ServiceCenterContext
    from modules.security.rootAccess import getRootUser

    connection = getRootInterface().getUserConnectionById(connectionId)
    if connection is None:
        raise ValueError(f"UserConnection not found: {connectionId}")

    freshToken = TokenManager().getFreshToken(connectionId)
    accessToken = freshToken.tokenAccess if freshToken else None
    if not accessToken:
        raise ValueError(f"No valid token for connection {connectionId}")

    adapter = MsftConnector(connection, accessToken).getServiceAdapter("sharepoint")

    serviceContext = ServiceCenterContext(
        user=getRootUser(),
        mandate_id=str(getattr(connection, "mandateId", "") or ""),
    )
    return adapter, connection, getService("knowledge", serviceContext)
async def _walkFolder(
    *,
    adapter,
    knowledgeService,
    runExtractionFn,
    connectionId: str,
    mandateId: str,
    userId: str,
    folderPath: str,
    depth: int,
    limits: SharepointBootstrapLimits,
    result: SharepointBootstrapResult,
    progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
    """Recursively walk one SharePoint folder and ingest the files it contains.

    Folders are descended depth-first up to ``limits.maxDepth``. Files are
    handed to ``_ingestOne`` unless policy excludes them (MIME prefix listed in
    ``limits.skipMimePrefixes`` or a known size above ``limits.maxFileSize``),
    in which case ``result.skippedPolicy`` is incremented.

    The walk of this folder stops as soon as the item budget
    (``indexed + skippedDuplicate >= limits.maxItems``) or the byte budget
    (``bytesProcessed >= limits.maxBytes``) is exhausted. A failing ``browse``
    is recorded in ``result.errors`` and ends this folder only.

    All state is accumulated in ``result``; nothing is returned.
    """
    if depth > limits.maxDepth:
        return

    try:
        children = await adapter.browse(folderPath)
    except Exception as exc:
        logger.warning("sharepoint browse %s failed: %s", folderPath, exc)
        result.errors.append(f"browse({folderPath}): {exc}")
        return

    # Arguments that are forwarded unchanged to both the recursive call and
    # the per-file ingestion.
    shared = dict(
        adapter=adapter,
        knowledgeService=knowledgeService,
        runExtractionFn=runExtractionFn,
        connectionId=connectionId,
        mandateId=mandateId,
        userId=userId,
        limits=limits,
        result=result,
        progressCb=progressCb,
    )

    for child in children:
        # Budgets are re-checked per entry because nested calls mutate result.
        itemBudgetSpent = result.indexed + result.skippedDuplicate >= limits.maxItems
        if itemBudgetSpent or result.bytesProcessed >= limits.maxBytes:
            return

        childPath = getattr(child, "path", "") or ""

        if getattr(child, "isFolder", False):
            await _walkFolder(folderPath=childPath, depth=depth + 1, **shared)
            continue

        mimeType = getattr(child, "mimeType", None) or "application/octet-stream"
        if any(map(mimeType.startswith, limits.skipMimePrefixes)):
            result.skippedPolicy += 1
            continue

        # A missing/zero size is treated as "unknown" and is not filtered.
        byteSize = int(getattr(child, "size", 0) or 0)
        if byteSize and byteSize > limits.maxFileSize:
            result.skippedPolicy += 1
            continue

        itemMeta = getattr(child, "metadata", {}) or {}
        await _ingestOne(
            entry=child,
            entryPath=childPath,
            mimeType=mimeType,
            # Fall back to the path when the adapter supplies no item id.
            externalItemId=itemMeta.get("id") or childPath,
            revision=itemMeta.get("revision") or itemMeta.get("lastModifiedDateTime"),
            **shared,
        )
async def _ingestOne(
    *,
    adapter,
    knowledgeService,
    runExtractionFn,
    connectionId: str,
    mandateId: str,
    userId: str,
    entry,
    entryPath: str,
    mimeType: str,
    externalItemId: str,
    revision: Optional[str],
    limits: SharepointBootstrapLimits,
    result: SharepointBootstrapResult,
    progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
    """Download, extract and ingest a single SharePoint file.

    Pipeline: ``adapter.download`` -> ``runExtractionFn`` ->
    ``_toContentObjects`` -> ``knowledgeService.requestIngestion``. Every
    failure is contained: it is logged, counted on ``result`` and (where an
    error text exists) appended to ``result.errors``; nothing is raised to
    the caller.

    Counter effects on ``result``:
        * ``failed``           - download/extraction/ingestion raised, the
          download was empty, or the ingestion handle reported a status other
          than "indexed"/"duplicate".
        * ``skippedPolicy``    - extraction produced no content objects.
        * ``skippedDuplicate`` - ingestion handle status "duplicate".
        * ``indexed``          - ingestion handle status "indexed".
        * ``bytesProcessed``   - grows by the downloaded size, even when a
          later stage fails.

    Progress is reported (callback + structured log line) each time the number
    of processed items (``indexed + skippedDuplicate``) reaches a multiple of
    50, and only by the item that actually advanced that number.
    """
    from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob

    syntheticFileId = _syntheticFileId(connectionId, externalItemId)
    fileName = getattr(entry, "name", "") or externalItemId

    try:
        fileBytes = await adapter.download(entryPath)
    except Exception as exc:
        logger.warning("sharepoint download %s failed: %s", entryPath, exc)
        result.failed += 1
        result.errors.append(f"download({entryPath}): {exc}")
        return
    if not fileBytes:
        result.failed += 1
        return
    result.bytesProcessed += len(fileBytes)

    try:
        extracted = runExtractionFn(
            fileBytes, fileName, mimeType,
            ExtractionOptions(mergeStrategy=None),
        )
    except Exception as exc:
        logger.warning("sharepoint extraction %s failed: %s", entryPath, exc)
        result.failed += 1
        result.errors.append(f"extract({entryPath}): {exc}")
        return

    contentObjects = _toContentObjects(extracted, fileName)
    if not contentObjects:
        result.skippedPolicy += 1
        return

    provenance: Dict[str, Any] = {
        "connectionId": connectionId,
        "authority": "msft",
        "service": "sharepoint",
        "externalItemId": externalItemId,
        "externalPath": entryPath,
        "revision": revision,
    }
    try:
        handle = await knowledgeService.requestIngestion(
            IngestionJob(
                sourceKind="sharepoint_item",
                sourceId=syntheticFileId,
                fileName=fileName,
                mimeType=mimeType,
                userId=userId,
                mandateId=mandateId,
                contentObjects=contentObjects,
                contentVersion=revision,
                neutralize=limits.neutralize,
                provenance=provenance,
            )
        )
    except Exception as exc:
        logger.error("sharepoint ingestion %s failed: %s", entryPath, exc, exc_info=True)
        result.failed += 1
        result.errors.append(f"ingest({entryPath}): {exc}")
        return

    # Track whether THIS item moved the processed counter. Without it, a
    # failed item would re-trigger the progress report whenever the counter
    # happens to sit on 0 or on a multiple of 50.
    advanced = True
    if handle.status == "duplicate":
        result.skippedDuplicate += 1
    elif handle.status == "indexed":
        result.indexed += 1
    else:
        advanced = False
        result.failed += 1
        if handle.error:
            result.errors.append(f"ingest({entryPath}): {handle.error}")

    processed = result.indexed + result.skippedDuplicate
    if advanced and progressCb is not None and processed % 50 == 0:
        try:
            progressCb(
                min(90, 10 + int(80 * processed / max(1, limits.maxItems))),
                f"sharepoint processed={processed}",
            )
        except Exception as exc:
            # Progress reporting is best-effort and must never abort ingestion,
            # but leave a trace instead of swallowing the error silently.
            logger.debug("sharepoint progress callback failed: %s", exc)
        logger.info(
            "ingestion.connection.bootstrap.progress part=sharepoint processed=%d skippedDup=%d failed=%d",
            processed, result.skippedDuplicate, result.failed,
            extra={
                "event": "ingestion.connection.bootstrap.progress",
                "part": "sharepoint",
                "connectionId": connectionId,
                "processed": processed,
                "skippedDup": result.skippedDuplicate,
                "failed": result.failed,
            },
        )

    # Yield so the event loop can interleave other tasks (download/extract are
    # CPU-ish and extraction uses sync libs; cooperative scheduling prevents
    # starving other workers).
    await asyncio.sleep(0)
def _finalizeResult(connectionId: str, result: SharepointBootstrapResult, startMs: float) -> Dict[str, Any]:
    """Log the end of a SharePoint bootstrap run and build its summary dict.

    ``startMs`` is compared against ``time.time()``, i.e. despite its name it
    is a start timestamp in seconds; the elapsed time is converted to whole
    milliseconds here.

    Returns:
        Plain dict with the run counters, ``bytesProcessed``, ``durationMs``
        and at most the first 20 entries of ``result.errors``.
    """
    durationMs = int((time.time() - startMs) * 1000)

    # Counters shared by the human-readable message and the structured extras.
    counters = {
        "indexed": result.indexed,
        "skippedDup": result.skippedDuplicate,
        "skippedPolicy": result.skippedPolicy,
        "failed": result.failed,
    }
    logger.info(
        "ingestion.connection.bootstrap.done part=sharepoint connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d failed=%d durationMs=%d",
        connectionId,
        counters["indexed"],
        counters["skippedDup"],
        counters["skippedPolicy"],
        counters["failed"],
        durationMs,
        extra={
            "event": "ingestion.connection.bootstrap.done",
            "part": "sharepoint",
            "connectionId": connectionId,
            **counters,
            "durationMs": durationMs,
        },
    )

    summary: Dict[str, Any] = {
        "connectionId": result.connectionId,
        "indexed": result.indexed,
        "skippedDuplicate": result.skippedDuplicate,
        "skippedPolicy": result.skippedPolicy,
        "failed": result.failed,
        "bytesProcessed": result.bytesProcessed,
        "durationMs": durationMs,
        # Cap the error list so the summary stays small.
        "errors": result.errors[:20],
    }
    return summary

View file

@ -0,0 +1,107 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Text normalisation utilities used by knowledge ingestion.
The email body cleaning logic is intentionally regex-based and works on plain
text after an HTML-to-text pass so we never store unsanitised HTML/JS in the
knowledge store and retrieval stays robust (no extraneous markup tokens
eating embedding budget).
"""
from __future__ import annotations
import re
from typing import Optional
# Default upper bound (in characters) for a cleaned email body; this is the
# default value of cleanEmailBody's ``maxChars`` parameter.
DEFAULT_MAX_CHARS = 8000

# Line-anchored markers (English and German) that introduce a quoted reply
# chain. _stripQuotedThread cuts the text at the earliest match of any of them.
_QUOTE_MARKER_PATTERNS = [
    # "On <...> wrote:" reply header.
    re.compile(r"^\s*(?:On\s.+?\swrote:)\s*$", re.MULTILINE | re.IGNORECASE),
    # German equivalent: "Am <...> schrieb <...>:".
    re.compile(r"^\s*(?:Am\s.+?\sschrieb.+?:)\s*$", re.MULTILINE | re.IGNORECASE),
    # "-- Original Message --" separator (two or more dashes on each side).
    re.compile(r"^\s*-{2,}\s*Original\s*Message\s*-{2,}\s*$", re.MULTILINE | re.IGNORECASE),
    # German separator "-- Urspr... Nachricht --"; ".+" tolerates umlaut variants.
    re.compile(r"^\s*-{2,}\s*Urspr.+Nachricht\s*-{2,}\s*$", re.MULTILINE | re.IGNORECASE),
    # Header lines of an inlined message: From/Von and Sent/Gesendet.
    re.compile(r"^\s*From:\s+.+$", re.MULTILINE | re.IGNORECASE),
    re.compile(r"^\s*Von:\s+.+$", re.MULTILINE | re.IGNORECASE),
    re.compile(r"^\s*Sent:\s+.+$", re.MULTILINE | re.IGNORECASE),
    re.compile(r"^\s*Gesendet:\s+.+$", re.MULTILINE | re.IGNORECASE),
]

# Line-anchored markers that start a signature block. _stripSignature cuts the
# text at the earliest match of any of them.
_SIGNATURE_MARKERS = [
    # A line consisting only of two or more dashes.
    re.compile(r"^\s*-{2,}\s*$", re.MULTILINE),
    # A line consisting only of an em dash.
    re.compile(r"^\s*—\s*$", re.MULTILINE),
    # Common English and German closing phrases.
    re.compile(r"^\s*Best regards\b.*$", re.MULTILINE | re.IGNORECASE),
    re.compile(r"^\s*Kind regards\b.*$", re.MULTILINE | re.IGNORECASE),
    re.compile(r"^\s*Mit freundlichen Gr[üu]ßen\b.*$", re.MULTILINE | re.IGNORECASE),
    re.compile(r"^\s*Viele Gr[üu]ße\b.*$", re.MULTILINE | re.IGNORECASE),
    re.compile(r"^\s*Best,\s*$", re.MULTILINE | re.IGNORECASE),
]
def _htmlToText(html: str) -> str:
"""Prefer BeautifulSoup when available, fall back to regex."""
try:
from bs4 import BeautifulSoup # type: ignore
soup = BeautifulSoup(html, "html.parser")
for tag in soup(["script", "style", "head"]):
tag.decompose()
for br in soup.find_all(["br"]):
br.replace_with("\n")
for p in soup.find_all(["p", "div", "li", "tr"]):
p.append("\n")
text = soup.get_text()
except Exception:
# Minimal fallback: strip tags crudely.
text = re.sub(r"<br\s*/?>", "\n", html, flags=re.IGNORECASE)
text = re.sub(r"</(?:p|div|li|tr)>", "\n", text, flags=re.IGNORECASE)
text = re.sub(r"<[^>]+>", "", text)
# Collapse non-breaking + zero-width whitespace.
text = text.replace("\u00a0", " ").replace("\u200b", "")
return text
def _stripQuotedThread(text: str) -> str:
    """Cut off the quoted reply chain, keeping only the author's own text.

    The text is truncated at the earliest position where either one of the
    ``_QUOTE_MARKER_PATTERNS`` matches or a block of ``>``-prefixed lines
    begins; trailing whitespace of the remainder is removed. Without any
    match the text is returned right-stripped.
    """
    hits = [marker.search(text) for marker in _QUOTE_MARKER_PATTERNS]
    # Blocks of "> " quoted lines (often produced by Gmail/Thunderbird).
    hits.append(re.search(r"^(?:\s*>.*\n?)+", text, re.MULTILINE))

    cutAt = min((hit.start() for hit in hits if hit), default=len(text))
    return text[:cutAt].rstrip()
def _stripSignature(text: str) -> str:
    """Cut the text at the earliest signature marker.

    Every pattern in ``_SIGNATURE_MARKERS`` is searched; the text is truncated
    at the smallest match position and right-stripped. Without any match the
    text is returned right-stripped.
    """
    hits = (marker.search(text) for marker in _SIGNATURE_MARKERS)
    cutAt = min((hit.start() for hit in hits if hit), default=len(text))
    return text[:cutAt].rstrip()
def _collapseWhitespace(text: str) -> str:
text = re.sub(r"[ \t]+", " ", text)
text = re.sub(r"\n{3,}", "\n\n", text)
return text.strip()
def cleanEmailBody(html: str, maxChars: Optional[int] = DEFAULT_MAX_CHARS) -> str:
    """Return a compact plain-text view of an email body suitable for embedding.

    Steps: HTML to text (only when the input contains both "<" and ">"),
    remove the quoted reply chain, remove the signature, collapse whitespace,
    truncate. Always returns a string (possibly empty).

    Args:
        html: Raw email body, HTML or plain text. Falsy input yields "".
        maxChars: Maximum number of body characters to keep. When the cleaned
            text is longer it is cut to ``maxChars`` characters, right-stripped
            and a single "…" is appended to mark the truncation. ``None`` or
            ``0`` disables truncation.
    """
    if not html:
        return ""
    text = _htmlToText(html) if "<" in html and ">" in html else html
    text = _stripQuotedThread(text)
    text = _stripSignature(text)
    text = _collapseWhitespace(text)
    if maxChars and len(text) > maxChars:
        # Mark the cut explicitly; appending an empty string (as before) left
        # truncated bodies indistinguishable from complete ones.
        text = text[:maxChars].rstrip() + "…"
    return text

View file

@ -302,6 +302,30 @@ async def _executeWithRetry(executor, node, context, maxRetries: int = 0, retryD
raise lastError
def _substituteFeatureInstancePlaceholders(
graph: Dict[str, Any],
targetFeatureInstanceId: str,
) -> Dict[str, Any]:
"""Replace ``{{featureInstanceId}}`` placeholders in the serialised graph.
Works on the full JSON representation so that placeholders inside nested
parameter dicts, prompt strings, etc. are all caught. Already-resolved
concrete UUIDs (pre-baked by ``_copyTemplateWorkflows``) are left untouched
because the placeholder literal ``{{featureInstanceId}}`` will not match.
"""
import json as _json
raw = _json.dumps(graph)
if "{{featureInstanceId}}" not in raw:
return graph
replaced = raw.replace("{{featureInstanceId}}", targetFeatureInstanceId)
logger.debug(
"_substituteFeatureInstancePlaceholders: resolved %d occurrence(s) -> %s",
raw.count("{{featureInstanceId}}"),
targetFeatureInstanceId,
)
return _json.loads(replaced)
async def executeGraph(
graph: Dict[str, Any],
services: Any,
@ -315,6 +339,7 @@ async def executeGraph(
runId: Optional[str] = None,
run_envelope: Optional[Dict[str, Any]] = None,
label: Optional[str] = None,
targetFeatureInstanceId: Optional[str] = None,
) -> Dict[str, Any]:
"""
Execute automation2 graph. Returns { success, nodeOutputs, error?, stopped? }.
@ -322,14 +347,16 @@ async def executeGraph(
pauses the run, and returns { success: False, paused: True, taskId, runId }.
For resume: pass initialNodeOutputs (with result for the human node) and startAfterNodeId.
For fresh runs: pass run_envelope (unified start payload for the start node); normalized with userId into context.runEnvelope.
targetFeatureInstanceId: resolves {{featureInstanceId}} placeholders in the graph JSON before execution.
"""
logger.info(
"executeGraph start: instanceId=%s workflowId=%s userId=%s mandateId=%s resume=%s",
"executeGraph start: instanceId=%s workflowId=%s userId=%s mandateId=%s resume=%s targetInstance=%s",
instanceId,
workflowId,
userId,
mandateId,
startAfterNodeId is not None,
targetFeatureInstanceId,
)
from modules.workflows.processing.shared.methodDiscovery import discoverMethods
discoverMethods(services)
@ -338,6 +365,9 @@ async def executeGraph(
materializeFeatureInstanceRefs,
)
if targetFeatureInstanceId:
graph = _substituteFeatureInstancePlaceholders(graph, targetFeatureInstanceId)
# Phase-5 Schicht-4: typed-ref envelopes are materialized FIRST so the
# subsequent connection-ref pass and validation see the canonical shape.
graph = materializeFeatureInstanceRefs(graph)

View file

@ -377,7 +377,11 @@ class ActionNodeExecutor:
if nodeType.startswith("ai."):
out["prompt"] = promptText
out["response"] = extractedContext
out["context"] = f"{promptText}\n\n{extractedContext}" if promptText and extractedContext else (extractedContext or promptText)
inputContext = resolvedParams.get("context")
if inputContext is not None:
out["context"] = inputContext if isinstance(inputContext, str) else json.dumps(inputContext, ensure_ascii=False, default=str)
else:
out["context"] = ""
# Structured output
if extractedContext:
try:

View file

@ -0,0 +1,18 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Shared helpers for AI workflow actions."""
def applyCommonAiParams(parameters: dict, request) -> None:
    """Copy the node-level AI settings from ``parameters`` onto ``request``.

    * ``requireNeutralization``: when present (not ``None``) it is coerced to
      ``bool`` and stored on ``request.requireNeutralization``.
    * ``allowedModels``: when it is a non-empty list it is stored on
      ``request.options.allowedModels``; a falsy ``request.options`` is
      replaced by a fresh ``AiCallOptions`` first.

    The request is modified in place; nothing is returned.
    """
    neutralizeFlag = parameters.get("requireNeutralization")
    if neutralizeFlag is not None:
        request.requireNeutralization = bool(neutralizeFlag)

    modelWhitelist = parameters.get("allowedModels")
    if not modelWhitelist or not isinstance(modelWhitelist, list):
        return

    if not request.options:
        # Imported lazily, only when an options object has to be created.
        from modules.datamodels.datamodelAi import AiCallOptions
        request.options = AiCallOptions()
    request.options.allowedModels = modelWhitelist

View file

@ -67,6 +67,8 @@ async def consolidate(self, parameters: Dict[str, Any]) -> ActionResult:
prompt=prompt,
options=AiCallOptions(operationType=OperationTypeEnum.DATA_ANALYSE),
)
from modules.workflows.methods.methodAi._common import applyCommonAiParams
applyCommonAiParams(parameters, req)
resp = await ai_service.callAi(req)
except (SubscriptionInactiveException, BillingContextError):
raise

View file

@ -36,6 +36,10 @@ async def convertDocument(self, parameters: Dict[str, Any]) -> ActionResult:
}
if parentOperationId:
processParams["parentOperationId"] = parentOperationId
if parameters.get("allowedModels"):
processParams["allowedModels"] = parameters["allowedModels"]
if parameters.get("requireNeutralization") is not None:
processParams["requireNeutralization"] = parameters["requireNeutralization"]
return await self.process(processParams)

View file

@ -55,6 +55,16 @@ async def generateCode(self, parameters: Dict[str, Any]) -> ActionResult:
processingMode=ProcessingModeEnum.DETAILED
)
# Apply node-level AI params
allowedModels = parameters.get("allowedModels")
if allowedModels and isinstance(allowedModels, list):
options.allowedModels = allowedModels
requireNeutralization = parameters.get("requireNeutralization")
if requireNeutralization is not None:
_ctx = getattr(self.services, '_context', None)
if _ctx:
_ctx.requireNeutralization = bool(requireNeutralization)
# outputFormat: Optional - if None, formats determined from prompt by AI
aiResponse: AiResponse = await self.services.ai.callAiContent(
prompt=prompt,

View file

@ -59,6 +59,16 @@ async def generateDocument(self, parameters: Dict[str, Any]) -> ActionResult:
compressContext=False
)
# Apply node-level AI params
allowedModels = parameters.get("allowedModels")
if allowedModels and isinstance(allowedModels, list):
options.allowedModels = allowedModels
requireNeutralization = parameters.get("requireNeutralization")
if requireNeutralization is not None:
_ctx = getattr(self.services, '_context', None)
if _ctx:
_ctx.requireNeutralization = bool(requireNeutralization)
# outputFormat: Optional - if None, formats determined from prompt by AI
aiResponse: AiResponse = await self.services.ai.callAiContent(
prompt=prompt,

View file

@ -73,6 +73,49 @@ def _action_docs_to_content_parts(services, docs: List[Any]) -> List[ContentPart
logger.info(f"ai.process: Extracted {len(ec.parts)} parts from {name} (no persistence)")
return all_parts
def _resolve_file_refs_to_content_parts(services, fileIdRefs) -> List[ContentPart]:
    """Load referenced files from the file store and extract their content.

    Used ONLY for automation2 workflows, where documents are file-store
    references rather than chat message attachments. In the agent/chat
    context a ``DocumentItemReference`` holds ChatDocument IDs that must be
    resolved via ``getChatDocumentsFromDocumentList`` instead.

    Each reference's ``documentId`` is looked up through
    ``services.interfaceDbComponent`` and its bytes are passed to
    ``services.extraction``. References whose metadata or data is missing are
    skipped with a warning. Only parts that carry data, or whose ``typeGroup``
    is "image", are kept; kept parts get ``originalFileName`` set in their
    metadata unless already present.

    Returns:
        All kept parts across all references, or an empty list when either
        required service is missing.
    """
    from modules.datamodels.datamodelExtraction import ExtractionOptions, MergeStrategy

    fileStore = getattr(services, 'interfaceDbComponent', None)
    extractor = getattr(services, 'extraction', None)
    if not (fileStore and extractor):
        logger.warning("_resolve_file_refs_to_content_parts: missing interfaceDbComponent or extraction service")
        return []

    collected: List[ContentPart] = []
    # One options object is shared by all extractions of this call.
    extractionOptions = ExtractionOptions(prompt="", mergeStrategy=MergeStrategy())

    for ref in fileIdRefs:
        fileId = ref.documentId

        meta = fileStore.getFile(fileId)
        if not meta:
            logger.warning(f"_resolve_file_refs_to_content_parts: file {fileId} not found")
            continue

        payload = fileStore.getFileData(fileId)
        if not payload:
            logger.warning(f"_resolve_file_refs_to_content_parts: no data for file {fileId}")
            continue

        # Fall back to the id / a generic MIME type when metadata lacks them.
        fileName = getattr(meta, 'fileName', fileId)
        mimeType = getattr(meta, 'mimeType', 'application/octet-stream')

        extracted = extractor.extractContentFromBytes(
            documentBytes=payload,
            fileName=fileName,
            mimeType=mimeType,
            documentId=fileId,
            options=extractionOptions,
        )
        for part in extracted.parts:
            # Image parts are kept even when they carry no data.
            if not (part.data or getattr(part, "typeGroup", "") == "image"):
                continue
            part.metadata.setdefault("originalFileName", fileName)
            collected.append(part)
        logger.info(f"_resolve_file_refs_to_content_parts: extracted {len(extracted.parts)} parts from {fileName}")

    return collected
async def process(self, parameters: Dict[str, Any]) -> ActionResult:
operationId = None
try:
@ -129,6 +172,25 @@ async def process(self, parameters: Dict[str, Any]) -> ActionResult:
f"ai.process: Coerced documentList ({type(documentListParam).__name__}) "
f"to DocumentReferenceList with {len(documentList.references)} references"
)
# DocumentItemReferences carry either file-store IDs (automation2)
# or ChatDocument IDs (agent context with docItem: refs).
# Route based on context: if a chat workflow with messages exists,
# let getChatDocumentsFromDocumentList handle them (it resolves
# docItem:uuid via workflow.messages). Otherwise fall through to
# the file-store path for automation2.
from modules.datamodels.datamodelDocref import DocumentItemReference
fileIdRefs = [r for r in documentList.references if isinstance(r, DocumentItemReference)]
if fileIdRefs:
chatService = getattr(self.services, 'chat', None)
workflow = getattr(chatService, '_workflow', None) if chatService else None
hasChatContext = workflow and getattr(workflow, 'messages', None)
if not hasChatContext:
extractedParts = _resolve_file_refs_to_content_parts(self.services, fileIdRefs)
if extractedParts:
inline_content_parts = (inline_content_parts or []) + extractedParts
remaining = [r for r in documentList.references if not isinstance(r, DocumentItemReference)]
documentList = DocumentReferenceList(references=remaining)
# Optional: if omitted, formats determined from prompt. Default "txt" is validation fallback only.
resultType = parameters.get("resultType")
@ -157,7 +219,19 @@ async def process(self, parameters: Dict[str, Any]) -> ActionResult:
mimeMap = {"txt": "text/plain", "json": "application/json", "html": "text/html", "md": "text/markdown", "csv": "text/csv", "xml": "application/xml"}
output_mime_type = mimeMap.get(normalized_result_type, "text/plain") if normalized_result_type else "text/plain"
# Normalize context: workflow refs may resolve to dict/list instead of str
paramContext = parameters.get("context")
if paramContext is not None and not isinstance(paramContext, str):
try:
paramContext = json.dumps(paramContext, ensure_ascii=False, default=str)
parameters["context"] = paramContext
logger.info(f"ai.process: Serialized non-string context ({type(parameters.get('context')).__name__}) to JSON ({len(paramContext)} chars)")
except Exception as e:
logger.warning(f"ai.process: Failed to serialize context: {e}")
paramContext = str(paramContext)
parameters["context"] = paramContext
# Phase 7.3: Pass documentList and/or contentParts to AI service
contentParts: Optional[List[ContentPart]] = inline_content_parts
if "contentParts" in parameters and not inline_content_parts:
@ -212,6 +286,9 @@ async def process(self, parameters: Dict[str, Any]) -> ActionResult:
)
)
from modules.workflows.methods.methodAi._common import applyCommonAiParams
applyCommonAiParams(parameters, request)
aiResponse_obj = await self.services.ai.callAi(request)
# Convert AiCallResponse to AiResponse format
@ -243,6 +320,16 @@ async def process(self, parameters: Dict[str, Any]) -> ActionResult:
operationType=OperationTypeEnum.IMAGE_GENERATE if isImageGeneration else OperationTypeEnum.DATA_GENERATE
)
# Apply node-level AI params (allowedModels, requireNeutralization)
allowedModels = parameters.get("allowedModels")
if allowedModels and isinstance(allowedModels, list):
options.allowedModels = allowedModels
requireNeutralization = parameters.get("requireNeutralization")
if requireNeutralization is not None:
_ctx = getattr(self.services, '_context', None)
if _ctx:
_ctx.requireNeutralization = bool(requireNeutralization)
# Get generationIntent from parameters (required for DATA_GENERATE)
# Default to "document" if not provided (most common use case)
# For code generation, use ai.generateCode action or explicitly pass generationIntent="code"

View file

@ -39,6 +39,10 @@ async def summarizeDocument(self, parameters: Dict[str, Any]) -> ActionResult:
}
if parentOperationId:
processParams["parentOperationId"] = parentOperationId
if parameters.get("allowedModels"):
processParams["allowedModels"] = parameters["allowedModels"]
if parameters.get("requireNeutralization") is not None:
processParams["requireNeutralization"] = parameters["requireNeutralization"]
return await self.process(processParams)

View file

@ -41,6 +41,10 @@ async def translateDocument(self, parameters: Dict[str, Any]) -> ActionResult:
processParams["resultType"] = resultType
if parentOperationId:
processParams["parentOperationId"] = parentOperationId
if parameters.get("allowedModels"):
processParams["allowedModels"] = parameters["allowedModels"]
if parameters.get("requireNeutralization") is not None:
processParams["requireNeutralization"] = parameters["requireNeutralization"]
return await self.process(processParams)

View file

@ -56,6 +56,23 @@ class MethodAi(MethodBase):
required=False,
description="Document reference(s) in any format to use as input/context"
),
"context": WorkflowActionParameter(
name="context",
type="str",
frontendType=FrontendType.TEXTAREA,
required=False,
default="",
description="Additional context data (string or upstream-bound dict/list, e.g. accounting data) appended to the prompt. Non-string values are JSON-serialized."
),
"documentTheme": WorkflowActionParameter(
name="documentTheme",
type="str",
frontendType=FrontendType.SELECT,
frontendOptions=["general", "finance", "legal", "technical", "hr"],
required=False,
default="general",
description="Style hint for the document renderer (e.g. finance, legal). Used by the AI agent to choose colors and layout."
),
"resultType": WorkflowActionParameter(
name="resultType",
type="str",

View file

@ -31,8 +31,30 @@ async def list_tasks(self, parameters: Dict[str, Any]) -> ActionResult:
page = int(parameters.get("page") or 0)
include_closed = bool(parameters.get("includeClosed", False))
dateFilters = {}
for key in ("dateCreatedGt", "dateCreatedLt", "dateUpdatedGt", "dateUpdatedLt"):
val = parameters.get(key)
if val is not None and str(val).strip():
try:
dateFilters[key] = int(val)
except (ValueError, TypeError):
pass
rawCustomFields = parameters.get("customFields")
customFields = None
if rawCustomFields:
if isinstance(rawCustomFields, str):
try:
customFields = json.loads(rawCustomFields)
except json.JSONDecodeError:
return ActionResult.isFailure(error="customFields must be valid JSON array")
elif isinstance(rawCustomFields, list):
customFields = rawCustomFields
data = await self.services.clickup.getTasksInList(
list_id, page=page, include_closed=include_closed, subtasks=True
list_id, page=page, include_closed=include_closed, subtasks=True,
**dateFilters, customFields=customFields,
)
if isinstance(data, dict) and data.get("error"):
return ActionResult.isFailure(error=str(data.get("error")) + (data.get("body") or ""))

View file

@ -66,6 +66,41 @@ class MethodClickup(MethodBase):
default=False,
description="Include closed tasks",
),
"dateCreatedGt": WorkflowActionParameter(
name="dateCreatedGt",
type="int",
frontendType=FrontendType.NUMBER,
required=False,
description="Filter: created after this Unix ms timestamp",
),
"dateCreatedLt": WorkflowActionParameter(
name="dateCreatedLt",
type="int",
frontendType=FrontendType.NUMBER,
required=False,
description="Filter: created before this Unix ms timestamp",
),
"dateUpdatedGt": WorkflowActionParameter(
name="dateUpdatedGt",
type="int",
frontendType=FrontendType.NUMBER,
required=False,
description="Filter: updated after this Unix ms timestamp",
),
"dateUpdatedLt": WorkflowActionParameter(
name="dateUpdatedLt",
type="int",
frontendType=FrontendType.NUMBER,
required=False,
description="Filter: updated before this Unix ms timestamp",
),
"customFields": WorkflowActionParameter(
name="customFields",
type="str",
frontendType=FrontendType.TEXTAREA,
required=False,
description='JSON array of custom field filters per ClickUp API, e.g. [{"field_id":"abc","operator":"=","value":"123"}]',
),
},
execute=list_tasks.__get__(self, self.__class__),
),

View file

@ -243,6 +243,7 @@ class WorkflowScheduler:
runEnv = normalize_run_envelope(runEnv, user_id=str(eventUser.id) if eventUser else None)
_wfLabel = wf.get("label") if isinstance(wf, dict) else getattr(wf, "label", None)
_targetInstanceId = wf.get("targetFeatureInstanceId") if isinstance(wf, dict) else getattr(wf, "targetFeatureInstanceId", None)
result = await executeGraph(
graph=wf["graph"],
@ -254,6 +255,7 @@ class WorkflowScheduler:
automation2_interface=iface,
run_envelope=runEnv,
label=_wfLabel,
targetFeatureInstanceId=_targetInstanceId,
)
logger.info(
"WorkflowScheduler: executed workflow %s success=%s paused=%s",

View file

@ -0,0 +1,19 @@
# Archived one-shot scripts
Diese Scripts haben einmal eine konkrete Daten- oder Code-Migration ausgefuehrt
und werden nicht mehr aktiv aufgerufen. Sie bleiben hier liegen, falls jemand
spaeter auf einem alten DB-Dump oder einem alten Branch nochmal denselben Stand
herstellen muss.
KEIN aktives Tool. Nicht aus CI, nicht aus Docs verlinken. Bei Aufraeumarbeiten
(z.B. nach 6 Monaten ohne Anwendung) loeschen.
## Inhalt
| Datei | Migrationsthema | Archiviert am | Begruendung |
|-------|-----------------|---------------|-------------|
| `check_orphan_featureinstance.py` | Vor-Ort-Check mit hardcoded FeatureInstance-/Mandate-UUIDs | 2026-04-29 | Ad-hoc fuer einen konkreten Vorfall |
| `script_db_cleanup_duplicate_roles.py` | Cleanup doppelter Roles wegen `IS NULL`-Bug in `connectorDbPostgre` | 2026-04-29 | Bug ist laengst gefixt, Cleanup ueberall durchgelaufen |
| `migrate_async_to_sync.py` | One-shot Codemod `async def` -> `def` fuer FastAPI-Routes | 2026-04-29 | Refactor abgeschlossen |
| `i18n_rekey_plaintext_keys.py` | Frontend `t('dot.notation')` -> `t('Klartext')` Rekey | 2026-04-29 | Frontend-Migration abgeschlossen (siehe `wiki/c-work/4-done/2026-04-ui-i18n-dynamic-language-sets.md`) |
| `script_db_migrate_accessrules_objectkeys.py` | AccessRule-Items: kurz -> vollqualifiziert (Navigation-API) | 2026-04-29 | Navigation-API live, MIGRATION_MAP nur fuer trustee+realestate hardcoded |

Some files were not shown because too many files have changed in this diff Show more