Compare commits

...

23 commits

Author SHA1 Message Date
Ida
0659d0d21a ValueOn Lead-to-Offer walked through end to end; bug fixes in file generation and AI nodes 2026-05-03 18:01:10 +02:00
Ida
9115d9eec8 feat: unify workflow context picker — contextBuilder multi-select, lift type-blocking, user-language labels, backend serialization, fix circular ref crash 2026-05-03 15:50:11 +02:00
Ida
da974190ea fix: corrected all node definitions and set them in the backend - no mapping layer, clean source data instead; added missing dataRef parameters so every node can use context 2026-05-03 15:01:24 +02:00
Patrick Motsch
7942766931
Merge pull request #149 from valueonag/feat/demo-system-readieness
Feat/demo system readieness
2026-04-30 23:58:26 +02:00
ValueOn AG
c140bd14d4 fixed node handovers 2026-04-30 23:54:45 +02:00
Ida
06d9910ecd replaced file tree with grouping in the form generator 2026-04-30 12:37:46 +02:00
ValueOn AG
b500bfa6c1 plan D fixed 2026-04-29 23:27:52 +02:00
ValueOn AG
afd7e9d941 plan D implemented - generation styles 2026-04-29 23:12:46 +02:00
ValueOn AG
b12671bbb5 fixes before document generation refactor (styles) 2026-04-29 22:54:17 +02:00
ValueOn AG
880fa4d787 plans A+C implemented 2026-04-29 21:27:08 +02:00
Ida
72d3175f49 grouping in the form generator finished 2026-04-29 18:16:02 +02:00
Ida
ce671f61b6 feat: extended app-scheduler to index existing connections overnight 2026-04-29 14:39:40 +02:00
Ida
4a840e9e6e added neutralization option when indexing new connections 2026-04-29 14:39:40 +02:00
Ida
93cb6939dc feat: frontend consent integration 2026-04-29 14:39:40 +02:00
Ida
3add5c9a80 commit before rebase 2026-04-29 14:39:40 +02:00
Ida
6a5ff1ff7c feat(rag): P1 user-connection hooks + retrieval threshold fix
- connection.established/revoked callbacks from OAuth routes and
  connection management endpoints
- KnowledgeIngestionConsumer dispatches bootstrap job (established)
  and synchronous purge (revoked)
- FileContentIndex: add connectionId + sourceKind columns
- SharePoint bootstrap with @odata.nextLink pagination and eTag-based
  idempotency
- Outlook bootstrap treats messages as virtual documents with
  cleanEmailBody for HTML/quote/signature stripping
- fix(rag): lower buildAgentContext minScore thresholds from
  0.55/0.65/0.70 to 0.35 — previous values blocked all real matches
  from text-embedding-3-small
- 24 new unit tests covering purge, consumer dispatch, email cleaning
  and both bootstrap paths
2026-04-29 14:39:40 +02:00
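
For context, a minimal sketch of the minScore gate that the threshold change above targets; the function and variable names here are illustrative, not the project's actual API:

# Hypothetical sketch: cosine-score gating as described in the commit.
# text-embedding-3-small tends to produce cosine similarities well below
# 0.55 even for good matches, so a 0.55-0.70 gate can reject every real hit.
def filterByMinScore(hits: list[tuple[str, float]], minScore: float = 0.35) -> list[tuple[str, float]]:
    """Keep only (chunkId, cosineScore) pairs at or above the threshold."""
    return [(chunkId, score) for chunkId, score in hits if score >= minScore]

# A realistic 0.42 match survives at minScore=0.35 but was dropped at 0.55.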
Ida
dff3d41845 fix(rag): stable ingestion idempotency across re-extractions (AC4)
Re-indexing the same file always triggered a full embedding run —
ingestion.skipped.duplicate never fired. Two independent causes:

1. _computeIngestionHash included contentObjectId in its payload, but
   extractors generate fresh uuid4() per run, making the hash a
   per-run nonce. Now hashed over (contentType, data) in extractor
   order — stable across re-extractions, sensitive to content,
   ordering, and type changes.
2. _autoIndexFile upserted the fresh pre-scan FileContentIndex before
   requestIngestion's duplicate check, wiping structure._ingestion
   and status=indexed from the prior run. The pre-upsert now merges
   the existing _ingestion metadata and preserves the indexed status.

Verified end-to-end: a second PATCH /scope on an already-indexed file
logs ingestion.skipped.duplicate and returns in ~2s
with zero embedding API calls.

Adds test_ingestion_hash_stability.py (5 cases).
2026-04-29 14:39:40 +02:00
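
A minimal sketch of the stable-hash idea described in the commit above; the helper name and payload fields are illustrative, and content data is assumed to be text:

import hashlib
import json

def computeIngestionHash(contentObjects: list[dict]) -> str:
    # Hash over (contentType, data) pairs in extractor order, deliberately
    # excluding per-run identifiers such as contentObjectId so the digest is
    # stable across re-extractions yet sensitive to content, ordering, and
    # type changes.
    payload = [[obj["contentType"], obj["data"]] for obj in contentObjects]
    return hashlib.sha256(json.dumps(payload).encode("utf-8")).hexdigest()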
Ida
a7f4055130 fix(rag): preserve per-page granularity + remove on-demand extraction fallbacks
The default MergeStrategy concatenates every extracted text part into a
single ContentPart, collapsing a 500-page PDF into one chunk with a
blurred average embedding — RAG retrieval was effectively broken.

- ExtractionOptions.mergeStrategy is now Optional[MergeStrategy]; passing
  None preserves per-part granularity. Default factory kept for
  backward compatibility.
- routeDataFiles._autoIndexFile, _workspaceTools.readFile, and
  _documentTools.describeImage explicitly pass mergeStrategy=None.
- Agent tools no longer carry redundant extraction + requestIngestion
  fallback paths: the unified ingestion lane owns all corpus writes,
  and readFile/describeImage are pure consumers of the knowledge store.
- Unit test asserts runExtraction(mergeStrategy=None) keeps every part.
2026-04-29 14:39:40 +02:00
Ida
078b4eaaaf removed unnecessary test files 2026-04-29 14:39:40 +02:00
Ida
9d82d3d353 P0: injection facade 2026-04-29 14:39:40 +02:00
Patrick Motsch
ba21005401
Merge pull request #147 from valueonag/feat/demo-system-readieness
Feat/demo system readieness
2026-04-29 01:57:49 +02:00
Patrick Motsch
4d7ccb0418
Merge pull request #145 from valueonag/feat/demo-system-readieness
trustee agent fix
2026-04-27 08:08:32 +02:00
Patrick Motsch
e8abd553d0
Merge pull request #144 from valueonag/feat/demo-system-readieness
Feat/demo system readieness
2026-04-27 00:00:13 +02:00
133 changed files with 10662 additions and 3003 deletions

13
app.py
View file

@ -405,6 +405,16 @@ async def lifespan(app: FastAPI):
except Exception as e:
logger.warning(f"BackgroundJob recovery failed (non-critical): {e}")
# Subscribe knowledge ingestion to connection lifecycle events so OAuth
# connect/disconnect reliably trigger bootstrap/purge.
try:
from modules.serviceCenter.services.serviceKnowledge.subConnectorIngestConsumer import (
registerKnowledgeIngestionConsumer,
)
registerKnowledgeIngestionConsumer()
except Exception as e:
logger.warning(f"KnowledgeIngestionConsumer registration failed (non-critical): {e}")
yield
# --- Stop Managers ---
@ -672,6 +682,9 @@ app.include_router(navigationRouter)
from modules.routes.routeWorkflowDashboard import router as workflowDashboardRouter
app.include_router(workflowDashboardRouter)
from modules.routes.routeAutomationWorkspace import router as automationWorkspaceRouter
app.include_router(automationWorkspaceRouter)
# ============================================================================
# PLUG&PLAY FEATURE ROUTERS
# Dynamically load routers from feature containers in modules/features/
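
The hunk above registers the subscriber side. A rough sketch of the publisher side described in the commit messages follows; the event names match the commits, but the event-bus API shown is an assumption, not the project's actual interface:

# Hypothetical sketch of OAuth routes publishing the lifecycle events that
# KnowledgeIngestionConsumer reacts to (bootstrap on established, purge on revoked).
async def onOAuthCallbackCompleted(eventBus, connection):
    await eventBus.publish("connection.established", {"connectionId": connection.id})

async def onConnectionRevoked(eventBus, connection):
    await eventBus.publish("connection.revoked", {"connectionId": connection.id})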

View file

@ -351,6 +351,7 @@ class AiAnthropic(BaseConnectorAi):
# Parse response
anthropicResponse = response.json()
stop_reason = anthropicResponse.get("stop_reason")
# Extract content and tool_use blocks from response
content = ""
@ -374,9 +375,25 @@ class AiAnthropic(BaseConnectorAi):
if not content and not toolCalls:
logger.warning(f"Anthropic API returned empty content. Full response: {anthropicResponse}")
content = "[Anthropic API returned empty response]"
err = (
"Anthropic refused the request (content policy) — try another model or adjust the prompt."
if stop_reason == "refusal"
else f"Anthropic returned no assistant text (stop_reason={stop_reason or 'unknown'})."
)
return AiModelResponse(
content="",
success=False,
error=err,
modelId=model.name,
metadata={
"response_id": anthropicResponse.get("id", ""),
"stop_reason": stop_reason,
},
)
metadata = {"response_id": anthropicResponse.get("id", "")}
if stop_reason:
metadata["stop_reason"] = stop_reason
if toolCalls:
metadata["toolCalls"] = toolCalls
@ -492,6 +509,19 @@ class AiAnthropic(BaseConnectorAi):
f"Anthropic stream returned empty response: model={model.name}, "
f"stopReason={stopReason}"
)
err = (
"Anthropic refused the request (content policy) — try another model or adjust the prompt."
if stopReason == "refusal"
else f"Anthropic returned no assistant text (stop_reason={stopReason or 'unknown'})."
)
yield AiModelResponse(
content="",
success=False,
error=err,
modelId=model.name,
metadata={"stopReason": stopReason} if stopReason else {},
)
return
metadata: Dict[str, Any] = {}
if stopReason:

View file

@ -834,7 +834,10 @@ class DatabaseConnector:
createdTs = record.get("sysCreatedAt")
if createdTs is None or createdTs == 0 or createdTs == 0.0:
record["sysCreatedAt"] = currentTime
if effective_user_id:
# Do not wipe caller-provided sysCreatedBy (e.g. FileItem from createFile with
# real user). ContextVar can be "system" for the DB pool while the business
# user is set on the record from model_dump().
if effective_user_id and not record.get("sysCreatedBy"):
record["sysCreatedBy"] = effective_user_id
elif not record.get("sysCreatedBy"):
if effective_user_id:
@ -1531,7 +1534,7 @@ class DatabaseConnector:
createdTs = rec.get("sysCreatedAt")
if createdTs is None or createdTs == 0 or createdTs == 0.0:
rec["sysCreatedAt"] = currentTime
if effectiveUserId:
if effectiveUserId and not rec.get("sysCreatedBy"):
rec["sysCreatedBy"] = effectiveUserId
elif not rec.get("sysCreatedBy") and effectiveUserId:
rec["sysCreatedBy"] = effectiveUserId

View file

@ -126,6 +126,11 @@ def _stripGraphBase(url: str) -> str:
def _graphItemToExternalEntry(item: Dict[str, Any], basePath: str = "") -> ExternalEntry:
isFolder = "folder" in item
# Graph exposes the driveItem content hash as ``eTag`` (quoted) or
# ``cTag``; we normalise to a "revision" string so callers can use it as a
# stable ``contentVersion`` for idempotent ingestion without re-downloading
# file bytes.
revision = item.get("eTag") or item.get("cTag")
return ExternalEntry(
name=item.get("name", ""),
path=f"{basePath}/{item.get('name', '')}" if basePath else item.get("name", ""),
@ -137,6 +142,9 @@ def _graphItemToExternalEntry(item: Dict[str, Any], basePath: str = "") -> Exter
"id": item.get("id"),
"webUrl": item.get("webUrl"),
"childCount": item.get("folder", {}).get("childCount") if isFolder else None,
"revision": revision,
"lastModifiedDateTime": item.get("lastModifiedDateTime"),
"parentReference": item.get("parentReference", {}),
},
)
@ -167,21 +175,36 @@ class SharepointAdapter(_GraphApiMixin, ServiceAdapter):
return await self._discoverSites()
if not folderPath or folderPath == "/":
endpoint = f"sites/{siteId}/drive/root/children"
endpoint: Optional[str] = f"sites/{siteId}/drive/root/children?$top=200"
else:
cleanPath = folderPath.lstrip("/")
endpoint = f"sites/{siteId}/drive/root:/{cleanPath}:/children"
endpoint = f"sites/{siteId}/drive/root:/{cleanPath}:/children?$top=200"
result = await self._graphGet(endpoint)
if "error" in result:
logger.warning(f"SharePoint browse failed: {result['error']}")
return []
# Follow @odata.nextLink until a hard cap is reached so large libraries
# are fully enumerated (required for bootstrap). Per-page size uses
# Graph's max supported value to minimise round-trips.
effectiveLimit = int(limit) if limit is not None else None
items: List[Dict[str, Any]] = []
hardCap = 5000
while endpoint and len(items) < hardCap:
result = await self._graphGet(endpoint)
if "error" in result:
logger.warning(f"SharePoint browse failed: {result['error']}")
break
for raw in result.get("value", []) or []:
items.append(raw)
if effectiveLimit is not None and len(items) >= effectiveLimit:
break
if effectiveLimit is not None and len(items) >= effectiveLimit:
break
nextLink = result.get("@odata.nextLink")
endpoint = _stripGraphBase(nextLink) if nextLink else None
entries = [_graphItemToExternalEntry(item, path) for item in result.get("value", [])]
entries = [_graphItemToExternalEntry(item, path) for item in items]
if filter:
entries = [e for e in entries if _matchFilter(e, filter)]
if limit is not None:
entries = entries[: max(1, int(limit))]
if effectiveLimit is not None:
entries = entries[: max(1, effectiveLimit)]
return entries
async def _discoverSites(self) -> List[ExternalEntry]:

View file

@ -162,6 +162,7 @@ class AiCallOptions(BaseModel):
# Provider filtering (from UI multiselect or automation config)
allowedProviders: Optional[List[str]] = Field(default=None, description="List of allowed AI providers to use (empty = all RBAC-permitted)")
allowedModels: Optional[List[str]] = Field(default=None, description="Whitelist of allowed model names (AND-filter with allowedProviders). None/empty = all allowed.")
class AiCallRequest(BaseModel):

View file

@ -110,11 +110,13 @@ class DocumentReferenceList(BaseModel):
# docItem:documentId
references.append(DocumentItemReference(documentId=parts[0]))
# Unknown format - skip or log warning
else:
# Try to parse as simple string (backward compatibility)
# Assume it's a label if it doesn't match known patterns
if refStr:
if not refStr:
continue
import re
if re.match(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', refStr, re.I):
references.append(DocumentItemReference(documentId=refStr))
else:
references.append(DocumentListReference(label=refStr))
return cls(references=references)
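
A short illustration of what the bare-UUID check changes; the parser's method name is not shown above, so this is expressed as input/output pairs with made-up values:

# "docItem:42"                            -> DocumentItemReference(documentId="42")
# "550e8400-e29b-41d4-a716-446655440000"  -> DocumentItemReference(documentId=<uuid>)
#                                            (previously treated as a plain label)
# "Quarterly Report"                      -> DocumentListReference(label="Quarterly Report")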

View file

@ -95,7 +95,14 @@ class ExtractionOptions(BaseModel):
imageQuality: int = Field(default=85, ge=1, le=100, description="Image quality (1-100)")
# Merging strategy
mergeStrategy: MergeStrategy = Field(default_factory=MergeStrategy, description="Strategy for merging extraction results")
mergeStrategy: Optional[MergeStrategy] = Field(
default_factory=MergeStrategy,
description=(
"Strategy for merging extraction results. Pass None to skip merging entirely "
"(required for per-chunk ingestion pipelines like RAG, where per-page/per-section "
"granularity must be preserved for embedding)."
),
)
# Optional chunking parameters (for backward compatibility)
chunkAllowed: Optional[bool] = Field(default=None, description="Whether chunking is allowed")
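
A sketch of the two call patterns this change enables; ExtractionOptions is as defined above, the option wiring is illustrative:

# RAG ingestion lane: skip merging entirely so every extracted part keeps
# per-page/per-section granularity for embedding.
ragOptions = ExtractionOptions(mergeStrategy=None)

# Backward-compatible default: the factory still merges all parts into one.
legacyOptions = ExtractionOptions()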

View file

@ -1,82 +0,0 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""FileFolder: hierarchical folder structure for file organization."""
from typing import Optional
from pydantic import BaseModel, Field
from modules.datamodels.datamodelBase import PowerOnModel
from modules.shared.i18nRegistry import i18nModel
import uuid
@i18nModel("Dateiordner")
class FileFolder(PowerOnModel):
"""Hierarchischer Ordner fuer die Dateiverwaltung."""
id: str = Field(
default_factory=lambda: str(uuid.uuid4()),
description="Primary key",
json_schema_extra={"label": "ID", "frontend_type": "text", "frontend_readonly": True, "frontend_required": False},
)
name: str = Field(
description="Folder name",
json_schema_extra={"label": "Name", "frontend_type": "text", "frontend_readonly": False, "frontend_required": True},
)
parentId: Optional[str] = Field(
default=None,
description="Parent folder ID (null = root)",
json_schema_extra={
"label": "Uebergeordneter Ordner",
"frontend_type": "text",
"frontend_readonly": False,
"frontend_required": False,
"fk_target": {"db": "poweron_management", "table": "FileFolder", "labelField": "name"},
},
)
mandateId: Optional[str] = Field(
default=None,
description="Mandate context",
json_schema_extra={
"label": "Mandanten-ID",
"frontend_type": "text",
"frontend_readonly": True,
"frontend_required": False,
"fk_target": {"db": "poweron_app", "table": "Mandate", "labelField": "label"},
},
)
featureInstanceId: Optional[str] = Field(
default=None,
description="Feature instance context",
json_schema_extra={
"label": "Feature-Instanz-ID",
"frontend_type": "text",
"frontend_readonly": True,
"frontend_required": False,
"fk_target": {"db": "poweron_app", "table": "FeatureInstance", "labelField": "label"},
},
)
scope: str = Field(
default="personal",
description="Data visibility scope: personal, featureInstance, mandate, global. Inherited by files in this folder.",
json_schema_extra={
"label": "Sichtbarkeit",
"frontend_type": "select",
"frontend_readonly": False,
"frontend_required": False,
"frontend_options": [
{"value": "personal", "label": "Persönlich"},
{"value": "featureInstance", "label": "Feature-Instanz"},
{"value": "mandate", "label": "Mandant"},
{"value": "global", "label": "Global"},
],
},
)
neutralize: bool = Field(
default=False,
description="Whether files in this folder should be neutralized before AI processing. Inherited by new/moved files.",
json_schema_extra={
"label": "Neutralisieren",
"frontend_type": "checkbox",
"frontend_readonly": False,
"frontend_required": False,
},
)

View file

@ -68,17 +68,6 @@ class FileItem(PowerOnModel):
description="Tags for categorization and search",
json_schema_extra={"label": "Tags", "frontend_type": "tags", "frontend_readonly": False, "frontend_required": False},
)
folderId: Optional[str] = Field(
default=None,
description="ID of the parent folder",
json_schema_extra={
"label": "Ordner-ID",
"frontend_type": "text",
"frontend_readonly": False,
"frontend_required": False,
"fk_target": {"db": "poweron_management", "table": "FileFolder", "labelField": "name"},
},
)
description: Optional[str] = Field(
default=None,
description="User-provided description of the file",

View file

@ -6,7 +6,7 @@ Unified JSON document schema and helpers used by both generation prompts and ren
This defines a single canonical template and the supported section types.
"""
from typing import List
from typing import List, Literal, TypedDict
# Canonical list of supported section types across the system
supportedSectionTypes: List[str] = [
@ -18,6 +18,21 @@ supportedSectionTypes: List[str] = [
"image",
]
class InlineRun(TypedDict, total=False):
"""Single inline content run. Every paragraph/cell/list-item is a List[InlineRun]."""
type: Literal["text", "image", "link", "bold", "italic", "code"]
value: str # text content (for text/bold/italic/code/link-label)
fileId: str # for type=image: reference to FileItem
base64Data: str # for type=image: resolved base64 (post-processing)
mimeType: str # for type=image: e.g. "image/png"
widthPt: int # for type=image: optional render width
href: str # for type=link: URL target
supportedInlineRunTypes: List[str] = [
"text", "image", "link", "bold", "italic", "code",
]
# Canonical JSON template used for AI generation (documents array + sections)
# This template is used for STRUCTURE generation - sections have empty elements arrays.
# For content generation, elements arrays will be populated later.
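
A small illustration of a paragraph expressed as a List[InlineRun] under the schema above; the content values are made up:

paragraph: List[InlineRun] = [
    {"type": "text", "value": "See the "},
    {"type": "link", "value": "quarterly report", "href": "https://example.com/q3"},
    {"type": "text", "value": " and the chart: "},
    {"type": "image", "fileId": "file-123", "mimeType": "image/png", "widthPt": 240},
]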

View file

@ -90,6 +90,16 @@ class FileContentIndex(PowerOnModel):
description="Data visibility scope: personal, featureInstance, mandate, global",
json_schema_extra={"label": "Sichtbarkeit"},
)
sourceKind: str = Field(
default="file",
description="Origin of the indexed content: file, sharepoint_item, outlook_message, outlook_attachment, ...",
json_schema_extra={"label": "Quellenart"},
)
connectionId: Optional[str] = Field(
default=None,
description="UserConnection ID if this index entry originates from an external connector",
json_schema_extra={"label": "Connection-ID"},
)
neutralizationStatus: Optional[str] = Field(
default=None,
description="Neutralization status: completed, failed, skipped, None = not required",

View file

@ -13,6 +13,42 @@ import math
T = TypeVar('T')
# ---------------------------------------------------------------------------
# Table Grouping models
# ---------------------------------------------------------------------------
class TableGroupNode(BaseModel):
"""
A single node in a user-defined group tree for a FormGeneratorTable.
Items belong to exactly one group (no multi-membership).
Groups can be nested to arbitrary depth via subGroups.
"""
id: str
name: str
itemIds: List[str] = Field(default_factory=list)
subGroups: List['TableGroupNode'] = Field(default_factory=list)
order: int = 0
isExpanded: bool = True
TableGroupNode.model_rebuild()
class TableGrouping(BaseModel):
"""
Persisted grouping configuration for one (user, contextKey) pair.
Stored in table_groupings in poweron_app (auto-created).
contextKey convention: API path without /api/ prefix and without trailing slash.
Examples: "connections", "prompts", "admin/users", "trustee/{instanceId}/documents"
"""
id: str
userId: str
contextKey: str
rootGroups: List[TableGroupNode] = Field(default_factory=list)
updatedAt: Optional[float] = None
class SortField(BaseModel):
"""
Single sort field configuration.
@ -24,12 +60,23 @@ class SortField(BaseModel):
class PaginationParams(BaseModel):
"""
Complete pagination state including page, sorting, and filters.
Grouping extensions (both optional; omit when not using grouping):
groupId: Scope the request to items belonging to this group.
The backend resolves it to an itemIds IN-filter before
applying normal pagination/search/filter logic.
Also applied for mode=ids and mode=filterValues so that
bulk-select and filter-dropdowns respect the group scope.
saveGroupTree: If present, the backend persists this tree for the current
(user, contextKey) pair *before* fetching, then returns
the confirmed tree in the response groupTree field.
Omit on every request that does not change the group tree.
"""
page: int = Field(ge=1, description="Current page number (1-based)")
pageSize: int = Field(ge=1, le=1000, description="Number of items per page")
sort: List[SortField] = Field(default_factory=list, description="List of sort fields in priority order")
filters: Optional[Dict[str, Any]] = Field(
default=None,
default=None,
description="""Filter criteria dictionary. Supports:
- General search: {"search": "text"} - searches across all text fields (case-insensitive)
- Field-specific filters:
@ -38,6 +85,14 @@ class PaginationParams(BaseModel):
- Supported operators: equals/eq, contains, startsWith, endsWith, gt, gte, lt, lte, in, notIn
- Multiple filters are combined with AND logic"""
)
groupId: Optional[str] = Field(
default=None,
description="Scope request to items of this group (resolved server-side to itemIds IN-filter)",
)
saveGroupTree: Optional[List[Dict[str, Any]]] = Field(
default=None,
description="If set, persist this group tree before fetching (optimistic save)",
)
class PaginationRequest(BaseModel):
@ -74,10 +129,19 @@ class PaginationMetadata(BaseModel):
class PaginatedResponse(BaseModel, Generic[T]):
"""
Response containing paginated data and metadata.
groupTree is included when the endpoint supports table grouping and the
current user has a saved group tree for the requested contextKey.
It is None when grouping is not configured for the endpoint or the user
has not created any groups yet. Frontend must treat None as an empty tree.
"""
items: List[T] = Field(..., description="Array of items for current page")
pagination: Optional[PaginationMetadata] = Field(..., description="Pagination metadata (None if pagination not applied)")
groupTree: Optional[List[TableGroupNode]] = Field(
default=None,
description="Current group tree for this (user, contextKey) pair — None if no grouping configured",
)
model_config = ConfigDict(arbitrary_types_allowed=True)
@ -85,29 +149,33 @@ def normalize_pagination_dict(pagination_dict: Dict[str, Any]) -> Dict[str, Any]
"""
Normalize pagination dictionary to handle frontend variations.
Moves top-level "search" field into filters if present.
Grouping fields (groupId, saveGroupTree) are passed through as-is.
Args:
pagination_dict: Raw pagination dictionary from frontend
Returns:
Normalized pagination dictionary ready for PaginationParams parsing
"""
if not pagination_dict:
return pagination_dict
# Create a copy to avoid modifying the original
normalized = dict(pagination_dict)
# Ensure required fields have sensible defaults
if "page" not in normalized:
normalized["page"] = 1
if "pageSize" not in normalized:
normalized["pageSize"] = 25
# Move top-level "search" into filters if present
if "search" in normalized:
if "filters" not in normalized or normalized["filters"] is None:
normalized["filters"] = {}
normalized["filters"]["search"] = normalized.pop("search")
# groupId / saveGroupTree are valid PaginationParams fields — pass through unchanged.
# No transformation needed; Pydantic will validate them.
return normalized
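
A minimal sketch of the server-side groupId resolution the docstrings above describe; the recursive helper and the IN-filter shape are assumptions, not the project's exact implementation:

from typing import List

def collectItemIds(node: TableGroupNode) -> List[str]:
    # Gather itemIds from a group and, recursively, all of its subGroups.
    ids = list(node.itemIds)
    for sub in node.subGroups:
        ids.extend(collectItemIds(sub))
    return ids

def resolveGroupScope(params: PaginationParams, grouping: TableGrouping) -> PaginationParams:
    # Resolve groupId to an itemIds IN-filter before normal pagination,
    # search, and filter logic runs. Only root-level groups are looked up
    # here for brevity; the filter operator syntax is assumed.
    if not params.groupId:
        return params
    node = next((g for g in grouping.rootGroups if g.id == params.groupId), None)
    if node is None:
        return params
    filters = dict(params.filters or {})
    filters["id"] = {"operator": "in", "value": collectItemIds(node)}
    return params.model_copy(update={"filters": filters})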

View file

@ -475,7 +475,23 @@ class UserConnection(PowerOnModel):
description="OAuth scopes granted for this connection",
json_schema_extra={"frontend_type": "list", "frontend_readonly": True, "frontend_required": False, "label": "Gewährte Berechtigungen"},
)
knowledgeIngestionEnabled: bool = Field(
default=False,
description="Whether the user has consented to knowledge ingestion for this connection",
json_schema_extra={"frontend_type": "boolean", "frontend_readonly": False, "frontend_required": False, "label": "Wissensdatenbank aktiv"},
)
knowledgePreferences: Optional[Dict[str, Any]] = Field(
default=None,
description=(
"Per-connection knowledge ingestion preferences. schemaVersion=1 keys: "
"neutralizeBeforeEmbed (bool), mailContentDepth (metadata|snippet|full), "
"mailIndexAttachments (bool), filesIndexBinaries (bool), mimeAllowlist (list[str]), "
"clickupScope (titles|title_description|with_comments), "
"surfaceToggles (dict per authority), maxAgeDays (int)."
),
json_schema_extra={"frontend_type": "json", "frontend_readonly": False, "frontend_required": False, "label": "Wissenspräferenzen"},
)
@computed_field
@property
def connectionReference(self) -> str:
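
For illustration, a schemaVersion=1 preferences object using the keys documented above; the values (and whether schemaVersion itself is stored in the dict) are examples only:

examplePreferences = {
    "schemaVersion": 1,
    "neutralizeBeforeEmbed": True,
    "mailContentDepth": "snippet",        # metadata | snippet | full
    "mailIndexAttachments": False,
    "filesIndexBinaries": True,
    "mimeAllowlist": ["application/pdf", "text/plain"],
    "clickupScope": "title_description",  # titles | title_description | with_comments
    "surfaceToggles": {"msft": True, "clickup": False},
    "maxAgeDays": 365,
}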

View file

@ -174,14 +174,26 @@ async def indexSessionData(
for c in chunks
]
await knowledgeService.indexFile(
fileId=syntheticFileId,
fileName=f"coaching-session-{sessionId[:8]}",
mimeType="application/x-coaching-session",
userId=userId,
featureInstanceId=featureInstanceId,
mandateId=mandateId,
contentObjects=contentObjects,
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
await knowledgeService.requestIngestion(
IngestionJob(
sourceKind="coaching_session",
sourceId=syntheticFileId,
fileName=f"coaching-session-{sessionId[:8]}",
mimeType="application/x-coaching-session",
userId=userId,
featureInstanceId=featureInstanceId,
mandateId=mandateId,
contentObjects=contentObjects,
provenance={
"lane": "feature",
"feature": "commcoach",
"sessionId": sessionId,
"contextId": contextId,
"messageCount": len(messages or []),
},
)
)
logger.info(f"Successfully indexed coaching session {sessionId} ({len(chunks)} chunks)")
except Exception as e:

View file

@ -72,7 +72,7 @@ class AutoWorkflow(PowerOnModel):
},
)
featureInstanceId: str = Field(
description="Feature instance ID",
description="Feature instance ID (GE owner instance / RBAC scope)",
json_schema_extra={
"frontend_type": "text",
"frontend_readonly": True,
@ -81,6 +81,17 @@ class AutoWorkflow(PowerOnModel):
"fk_target": {"db": "poweron_app", "table": "FeatureInstance", "labelField": "label"},
},
)
targetFeatureInstanceId: Optional[str] = Field(
default=None,
description="Target feature instance for execution data scope. NULL for templates, mandatory for non-templates.",
json_schema_extra={
"frontend_type": "select",
"frontend_readonly": False,
"frontend_required": False,
"label": "Ziel-Instanz",
"fk_target": {"db": "poweron_app", "table": "FeatureInstance", "labelField": "label"},
},
)
label: str = Field(
description="User-friendly workflow name",
json_schema_extra={"frontend_type": "text", "frontend_required": True, "label": "Bezeichnung"},

View file

@ -12,17 +12,30 @@ import uuid
from typing import Dict, Any, List, Optional
def _make_json_serializable(obj: Any) -> Any:
_INTERNAL_SKIP_KEYS = frozenset({"_context", "_orderedNodes"})
def _make_json_serializable(obj: Any, _depth: int = 0) -> Any:
"""
Recursively convert bytes to base64 strings so structures can be JSON-serialized
for storage in JSONB columns.
Internal runtime keys (_context, _orderedNodes) are skipped: they hold live
Python objects (including back-references to nodeOutputs) and must never be
stored. A depth guard prevents runaway recursion on unexpected circular refs.
"""
if _depth > 50:
return None
if isinstance(obj, bytes):
return base64.b64encode(obj).decode("ascii")
if isinstance(obj, dict):
return {k: _make_json_serializable(v) for k, v in obj.items()}
return {
k: _make_json_serializable(v, _depth + 1)
for k, v in obj.items()
if k not in _INTERNAL_SKIP_KEYS
}
if isinstance(obj, list):
return [_make_json_serializable(v) for v in obj]
return [_make_json_serializable(v, _depth + 1) for v in obj]
return obj
from modules.datamodels.datamodelUam import User
@ -217,6 +230,8 @@ class GraphicalEditorObjects:
data["id"] = str(uuid.uuid4())
data["mandateId"] = self.mandateId
data["featureInstanceId"] = self.featureInstanceId
if not data.get("targetFeatureInstanceId") and not data.get("isTemplate"):
data["targetFeatureInstanceId"] = self.featureInstanceId
if "active" not in data or data.get("active") is None:
data["active"] = True
data["invocations"] = normalize_invocations_list(data.get("invocations"))

View file

@ -3,6 +3,15 @@
from modules.shared.i18nRegistry import t
_AI_COMMON_PARAMS = [
{"name": "requireNeutralization", "type": "bool", "required": False,
"frontendType": "checkbox", "default": False,
"description": t("Eingaben fuer diesen Call neutralisieren")},
{"name": "allowedModels", "type": "array", "required": False,
"frontendType": "modelMultiSelect", "default": [],
"description": t("Erlaubte LLM-Modelle (leer = alle erlaubten)")},
]
AI_NODES = [
{
"id": "ai.prompt",
@ -10,20 +19,25 @@ AI_NODES = [
"label": t("Prompt"),
"description": t("Prompt eingeben und KI führt aus"),
"parameters": [
{"name": "aiPrompt", "type": "string", "required": True, "frontendType": "textarea",
{"name": "aiPrompt", "type": "str", "required": True, "frontendType": "templateTextarea",
"description": t("KI-Prompt")},
{"name": "resultType", "type": "string", "required": False, "frontendType": "select",
{"name": "resultType", "type": "str", "required": False, "frontendType": "select",
"frontendOptions": {"options": ["txt", "json", "md", "csv", "xml", "html", "pdf", "docx", "xlsx", "pptx", "png", "jpg"]},
"description": t("Ausgabeformat"), "default": "txt"},
{"name": "documentList", "type": "string", "required": False, "frontendType": "hidden",
"description": t("Dokumentenliste (via Wire oder DataRef)"), "default": ""},
{"name": "simpleMode", "type": "boolean", "required": False, "frontendType": "checkbox",
{"name": "documentList", "type": "DocumentList", "required": False, "frontendType": "hidden",
"description": t("Dokumente aus vorherigen Schritten"), "default": ""},
{"name": "context", "type": "Any", "required": False, "frontendType": "contextBuilder",
"description": t("Daten aus vorherigen Schritten"), "default": ""},
{"name": "documentTheme", "type": "str", "required": False, "frontendType": "select",
"frontendOptions": {"options": ["general", "finance", "legal", "technical", "hr"]},
"description": t("Dokument-Thema (Style-Hinweis fuer den Renderer)"), "default": "general"},
{"name": "simpleMode", "type": "bool", "required": False, "frontendType": "checkbox",
"description": t("Einfacher Modus"), "default": True},
],
] + _AI_COMMON_PARAMS,
"inputs": 1,
"outputs": 1,
"inputPorts": {0: {"accepts": [
"DocumentList", "AiResult", "TextResult", "Transit", "LoopItem", "ActionResult",
"FormPayload", "DocumentList", "AiResult", "TextResult", "Transit", "LoopItem", "ActionResult",
]}},
"outputPorts": {0: {"schema": "AiResult"}},
"meta": {"icon": "mdi-robot", "color": "#9C27B0", "usesAi": True},
@ -36,12 +50,16 @@ AI_NODES = [
"label": t("Web-Recherche"),
"description": t("Recherche im Web"),
"parameters": [
{"name": "prompt", "type": "string", "required": True, "frontendType": "textarea",
{"name": "prompt", "type": "str", "required": True, "frontendType": "textarea",
"description": t("Recherche-Anfrage")},
],
{"name": "context", "type": "Any", "required": False, "frontendType": "contextBuilder",
"description": t("Daten aus vorherigen Schritten"), "default": ""},
{"name": "documentList", "type": "DocumentList", "required": False, "frontendType": "hidden",
"description": t("Dokumente aus vorherigen Schritten"), "default": ""},
] + _AI_COMMON_PARAMS,
"inputs": 1,
"outputs": 1,
"inputPorts": {0: {"accepts": ["Transit"]}},
"inputPorts": {0: {"accepts": ["FormPayload", "Transit", "AiResult", "DocumentList", "ActionResult"]}},
"outputPorts": {0: {"schema": "AiResult"}},
"meta": {"icon": "mdi-magnify", "color": "#9C27B0", "usesAi": True},
"_method": "ai",
@ -53,12 +71,12 @@ AI_NODES = [
"label": t("Dokument zusammenfassen"),
"description": t("Dokumentinhalt zusammenfassen"),
"parameters": [
{"name": "documentList", "type": "string", "required": True, "frontendType": "hidden",
"description": t("Dokumentenliste (via Wire oder DataRef)"), "default": ""},
{"name": "summaryLength", "type": "string", "required": False, "frontendType": "select",
{"name": "documentList", "type": "DocumentList", "required": True, "frontendType": "dataRef",
"description": t("Dokumente aus vorherigen Schritten")},
{"name": "summaryLength", "type": "str", "required": False, "frontendType": "select",
"frontendOptions": {"options": ["brief", "medium", "detailed"]},
"description": t("Kurz, mittel oder ausführlich"), "default": "medium"},
],
] + _AI_COMMON_PARAMS,
"inputs": 1,
"outputs": 1,
"inputPorts": {0: {"accepts": ["DocumentList", "Transit"]}},
@ -73,11 +91,11 @@ AI_NODES = [
"label": t("Dokument übersetzen"),
"description": t("Dokument in Zielsprache übersetzen"),
"parameters": [
{"name": "documentList", "type": "string", "required": True, "frontendType": "hidden",
"description": t("Dokumentenliste (via Wire oder DataRef)"), "default": ""},
{"name": "targetLanguage", "type": "string", "required": True, "frontendType": "text",
{"name": "documentList", "type": "DocumentList", "required": True, "frontendType": "dataRef",
"description": t("Dokumente aus vorherigen Schritten")},
{"name": "targetLanguage", "type": "str", "required": True, "frontendType": "text",
"description": t("Zielsprache (z.B. de, en, French)")},
],
] + _AI_COMMON_PARAMS,
"inputs": 1,
"outputs": 1,
"inputPorts": {0: {"accepts": ["DocumentList", "Transit"]}},
@ -92,12 +110,12 @@ AI_NODES = [
"label": t("Dokument konvertieren"),
"description": t("Dokument in anderes Format konvertieren"),
"parameters": [
{"name": "documentList", "type": "string", "required": True, "frontendType": "hidden",
"description": t("Dokumentenliste (via Wire oder DataRef)"), "default": ""},
{"name": "targetFormat", "type": "string", "required": True, "frontendType": "select",
{"name": "documentList", "type": "DocumentList", "required": True, "frontendType": "dataRef",
"description": t("Dokumente aus vorherigen Schritten")},
{"name": "targetFormat", "type": "str", "required": True, "frontendType": "select",
"frontendOptions": {"options": ["docx", "pdf", "xlsx", "csv", "txt", "html", "json", "md"]},
"description": t("Zielformat")},
],
] + _AI_COMMON_PARAMS,
"inputs": 1,
"outputs": 1,
"inputPorts": {0: {"accepts": ["DocumentList", "Transit"]}},
@ -112,12 +130,24 @@ AI_NODES = [
"label": t("Dokument generieren"),
"description": t("Dokument aus Prompt generieren"),
"parameters": [
{"name": "prompt", "type": "string", "required": True, "frontendType": "textarea",
{"name": "prompt", "type": "str", "required": True, "frontendType": "textarea",
"description": t("Generierungs-Prompt")},
],
{"name": "outputFormat", "type": "str", "required": False, "frontendType": "select",
"frontendOptions": {"options": ["docx", "pdf", "txt", "html", "md"]},
"description": t("Ausgabeformat"), "default": "docx"},
{"name": "title", "type": "str", "required": False, "frontendType": "text",
"description": t("Dokumenttitel (Metadaten / Dateiname)"), "default": ""},
{"name": "documentType", "type": "str", "required": False, "frontendType": "select",
"frontendOptions": {"options": ["letter", "memo", "proposal", "contract", "report", "email"]},
"description": t("Dokumentart (Inhaltshinweis fuer die KI)"), "default": "proposal"},
{"name": "context", "type": "Any", "required": False, "frontendType": "contextBuilder",
"description": t("Daten aus vorherigen Schritten"), "default": ""},
{"name": "documentList", "type": "DocumentList", "required": False, "frontendType": "hidden",
"description": t("Dokumente aus vorherigen Schritten"), "default": ""},
] + _AI_COMMON_PARAMS,
"inputs": 1,
"outputs": 1,
"inputPorts": {0: {"accepts": ["Transit"]}},
"inputPorts": {0: {"accepts": ["FormPayload", "Transit", "AiResult", "DocumentList", "ActionResult"]}},
"outputPorts": {0: {"schema": "DocumentList"}},
"meta": {"icon": "mdi-file-plus", "color": "#9C27B0", "usesAi": True},
"_method": "ai",
@ -129,15 +159,19 @@ AI_NODES = [
"label": t("Code generieren"),
"description": t("Code aus Beschreibung generieren"),
"parameters": [
{"name": "prompt", "type": "string", "required": True, "frontendType": "textarea",
{"name": "prompt", "type": "str", "required": True, "frontendType": "textarea",
"description": t("Code-Generierungs-Prompt")},
{"name": "resultType", "type": "string", "required": False, "frontendType": "select",
{"name": "resultType", "type": "str", "required": False, "frontendType": "select",
"frontendOptions": {"options": ["py", "js", "ts", "html", "java", "cpp", "txt", "json", "csv", "xml"]},
"description": t("Datei-Endung der erzeugten Code-Datei"), "default": "py"},
],
{"name": "context", "type": "Any", "required": False, "frontendType": "contextBuilder",
"description": t("Daten aus vorherigen Schritten"), "default": ""},
{"name": "documentList", "type": "DocumentList", "required": False, "frontendType": "hidden",
"description": t("Dokumente aus vorherigen Schritten"), "default": ""},
] + _AI_COMMON_PARAMS,
"inputs": 1,
"outputs": 1,
"inputPorts": {0: {"accepts": ["Transit"]}},
"inputPorts": {0: {"accepts": ["FormPayload", "Transit", "AiResult", "DocumentList", "ActionResult"]}},
"outputPorts": {0: {"schema": "AiResult"}},
"meta": {"icon": "mdi-code-tags", "color": "#9C27B0", "usesAi": True},
"_method": "ai",
@ -149,12 +183,12 @@ AI_NODES = [
"label": t("KI-Konsolidierung"),
"description": t("Gesammelte Ergebnisse mit KI zusammenfassen, klassifizieren oder semantisch zusammenführen"),
"parameters": [
{"name": "mode", "type": "string", "required": False, "frontendType": "select",
{"name": "mode", "type": "str", "required": False, "frontendType": "select",
"frontendOptions": {"options": ["summarize", "classify", "semanticMerge"]},
"description": t("Konsolidierungsmodus"), "default": "summarize"},
{"name": "prompt", "type": "string", "required": False, "frontendType": "textarea",
{"name": "prompt", "type": "str", "required": False, "frontendType": "textarea",
"description": t("Optionaler Prompt für die Konsolidierung"), "default": ""},
],
] + _AI_COMMON_PARAMS,
"inputs": 1,
"outputs": 1,
"inputPorts": {0: {"accepts": ["AggregateResult", "Transit"]}},

View file

@ -11,23 +11,23 @@ CLICKUP_NODES = [
"label": t("Aufgaben suchen"),
"description": t("Aufgaben in einem Workspace suchen"),
"parameters": [
{"name": "connectionReference", "type": "string", "required": True, "frontendType": "userConnection",
{"name": "connectionReference", "type": "str", "required": True, "frontendType": "userConnection",
"frontendOptions": {"authority": "clickup"},
"description": t("ClickUp-Verbindung")},
{"name": "teamId", "type": "string", "required": True, "frontendType": "text",
{"name": "teamId", "type": "str", "required": True, "frontendType": "text",
"description": t("Team-/Workspace-ID")},
{"name": "query", "type": "string", "required": True, "frontendType": "text",
{"name": "query", "type": "str", "required": True, "frontendType": "text",
"description": t("Suchbegriff")},
{"name": "page", "type": "number", "required": False, "frontendType": "number",
{"name": "page", "type": "int", "required": False, "frontendType": "number",
"description": t("Seite"), "default": 0},
{"name": "listId", "type": "string", "required": False, "frontendType": "clickupList",
{"name": "listId", "type": "str", "required": False, "frontendType": "clickupList",
"frontendOptions": {"dependsOn": "connectionReference"},
"description": t("In dieser Liste suchen")},
{"name": "includeClosed", "type": "boolean", "required": False, "frontendType": "checkbox",
{"name": "includeClosed", "type": "bool", "required": False, "frontendType": "checkbox",
"description": t("Erledigte einbeziehen"), "default": False},
{"name": "fullTaskData", "type": "boolean", "required": False, "frontendType": "checkbox",
{"name": "fullTaskData", "type": "bool", "required": False, "frontendType": "checkbox",
"description": t("Vollständige Daten"), "default": False},
{"name": "matchNameOnly", "type": "boolean", "required": False, "frontendType": "checkbox",
{"name": "matchNameOnly", "type": "bool", "required": False, "frontendType": "checkbox",
"description": t("Nur Titel"), "default": True},
],
"inputs": 1,
@ -44,15 +44,15 @@ CLICKUP_NODES = [
"label": t("Aufgaben auflisten"),
"description": t("Aufgaben einer Liste auflisten"),
"parameters": [
{"name": "connectionReference", "type": "string", "required": True, "frontendType": "userConnection",
{"name": "connectionReference", "type": "str", "required": True, "frontendType": "userConnection",
"frontendOptions": {"authority": "clickup"},
"description": t("ClickUp-Verbindung")},
{"name": "pathQuery", "type": "string", "required": True, "frontendType": "clickupList",
{"name": "pathQuery", "type": "str", "required": True, "frontendType": "clickupList",
"frontendOptions": {"dependsOn": "connectionReference"},
"description": t("Pfad zur Liste")},
{"name": "page", "type": "number", "required": False, "frontendType": "number",
{"name": "page", "type": "int", "required": False, "frontendType": "number",
"description": t("Seite"), "default": 0},
{"name": "includeClosed", "type": "boolean", "required": False, "frontendType": "checkbox",
{"name": "includeClosed", "type": "bool", "required": False, "frontendType": "checkbox",
"description": t("Erledigte einbeziehen"), "default": False},
],
"inputs": 1,
@ -69,12 +69,12 @@ CLICKUP_NODES = [
"label": t("Aufgabe abrufen"),
"description": t("Eine Aufgabe abrufen"),
"parameters": [
{"name": "connectionReference", "type": "string", "required": True, "frontendType": "userConnection",
{"name": "connectionReference", "type": "str", "required": True, "frontendType": "userConnection",
"frontendOptions": {"authority": "clickup"},
"description": t("ClickUp-Verbindung")},
{"name": "taskId", "type": "string", "required": False, "frontendType": "text",
{"name": "taskId", "type": "str", "required": False, "frontendType": "text",
"description": t("Task-ID")},
{"name": "pathQuery", "type": "string", "required": False, "frontendType": "text",
{"name": "pathQuery", "type": "str", "required": False, "frontendType": "text",
"description": t("Oder Pfad")},
],
"inputs": 1,
@ -91,34 +91,34 @@ CLICKUP_NODES = [
"label": t("Aufgabe erstellen"),
"description": t("Aufgabe erstellen"),
"parameters": [
{"name": "connectionReference", "type": "string", "required": True, "frontendType": "userConnection",
{"name": "connectionReference", "type": "str", "required": True, "frontendType": "userConnection",
"frontendOptions": {"authority": "clickup"},
"description": t("ClickUp-Verbindung")},
{"name": "pathQuery", "type": "string", "required": False, "frontendType": "clickupList",
{"name": "pathQuery", "type": "str", "required": False, "frontendType": "clickupList",
"frontendOptions": {"dependsOn": "connectionReference"},
"description": t("Pfad zur Liste")},
{"name": "listId", "type": "string", "required": False, "frontendType": "text",
{"name": "listId", "type": "str", "required": False, "frontendType": "text",
"description": t("Listen-ID")},
{"name": "name", "type": "string", "required": True, "frontendType": "text",
{"name": "name", "type": "str", "required": True, "frontendType": "text",
"description": t("Name")},
{"name": "description", "type": "string", "required": False, "frontendType": "textarea",
{"name": "description", "type": "str", "required": False, "frontendType": "textarea",
"description": t("Beschreibung")},
{"name": "taskStatus", "type": "string", "required": False, "frontendType": "text",
{"name": "taskStatus", "type": "str", "required": False, "frontendType": "text",
"description": t("Status")},
{"name": "taskPriority", "type": "string", "required": False, "frontendType": "select",
{"name": "taskPriority", "type": "str", "required": False, "frontendType": "select",
"frontendOptions": {"options": ["1", "2", "3", "4"]},
"description": t("Priorität 1-4")},
{"name": "taskDueDateMs", "type": "string", "required": False, "frontendType": "text",
{"name": "taskDueDateMs", "type": "str", "required": False, "frontendType": "text",
"description": t("Fälligkeit (ms)")},
{"name": "taskAssigneeIds", "type": "object", "required": False, "frontendType": "json",
"description": t("Zugewiesene")},
{"name": "taskTimeEstimateMs", "type": "string", "required": False, "frontendType": "text",
{"name": "taskTimeEstimateMs", "type": "str", "required": False, "frontendType": "text",
"description": t("Zeitschätzung (ms)")},
{"name": "taskTimeEstimateHours", "type": "string", "required": False, "frontendType": "text",
{"name": "taskTimeEstimateHours", "type": "str", "required": False, "frontendType": "text",
"description": t("Zeitschätzung (h)")},
{"name": "customFieldValues", "type": "object", "required": False, "frontendType": "json",
"description": t("Benutzerdefinierte Felder")},
{"name": "taskFields", "type": "string", "required": False, "frontendType": "json",
{"name": "taskFields", "type": "str", "required": False, "frontendType": "json",
"description": t("Zusätzliches JSON")},
],
"inputs": 1,
@ -135,14 +135,14 @@ CLICKUP_NODES = [
"label": t("Aufgabe aktualisieren"),
"description": t("Felder der Aufgabe ändern"),
"parameters": [
{"name": "connectionReference", "type": "string", "required": True, "frontendType": "userConnection",
{"name": "connectionReference", "type": "str", "required": True, "frontendType": "userConnection",
"frontendOptions": {"authority": "clickup"},
"description": t("ClickUp-Verbindung")},
{"name": "taskId", "type": "string", "required": False, "frontendType": "text",
{"name": "taskId", "type": "str", "required": False, "frontendType": "text",
"description": t("Task-ID")},
{"name": "path", "type": "string", "required": False, "frontendType": "text",
{"name": "path", "type": "str", "required": False, "frontendType": "text",
"description": t("Oder Pfad")},
{"name": "taskUpdate", "type": "string", "required": False, "frontendType": "json",
{"name": "taskUpdate", "type": "str", "required": False, "frontendType": "json",
"description": t("JSON-Body für PUT /task/{id}, z.B. {\"name\":\"...\",\"status\":\"...\"}")},
],
"inputs": 1,
@ -159,16 +159,16 @@ CLICKUP_NODES = [
"label": t("Anhang hochladen"),
"description": t("Datei an Task anhängen"),
"parameters": [
{"name": "connectionReference", "type": "string", "required": True, "frontendType": "userConnection",
{"name": "connectionReference", "type": "str", "required": True, "frontendType": "userConnection",
"frontendOptions": {"authority": "clickup"},
"description": t("ClickUp-Verbindung")},
{"name": "taskId", "type": "string", "required": False, "frontendType": "text",
{"name": "taskId", "type": "str", "required": False, "frontendType": "text",
"description": t("Task-ID")},
{"name": "path", "type": "string", "required": False, "frontendType": "text",
{"name": "path", "type": "str", "required": False, "frontendType": "text",
"description": t("Oder Pfad")},
{"name": "fileName", "type": "string", "required": False, "frontendType": "text",
{"name": "fileName", "type": "str", "required": False, "frontendType": "text",
"description": t("Dateiname")},
{"name": "content", "type": "string", "required": True, "frontendType": "hidden",
{"name": "content", "type": "str", "required": True, "frontendType": "hidden",
"description": t("Datei-Inhalt aus Upstream-Node (via Wire oder DataRef)"), "default": ""},
],
"inputs": 1,

View file

@ -10,7 +10,7 @@ CONTEXT_NODES = [
"label": t("Inhalt extrahieren"),
"description": t("Dokumentstruktur extrahieren ohne KI (Seiten, Abschnitte, Bilder, Tabellen)"),
"parameters": [
{"name": "documentList", "type": "string", "required": True, "frontendType": "hidden",
{"name": "documentList", "type": "str", "required": True, "frontendType": "hidden",
"description": t("Dokumentenliste (via Wire oder DataRef)"), "default": ""},
{"name": "extractionOptions", "type": "object", "required": False, "frontendType": "json",
"description": t(

View file

@ -10,7 +10,7 @@ DATA_NODES = [
"label": t("Sammeln"),
"description": t("Ergebnisse aus Schleifen-Iterationen sammeln"),
"parameters": [
{"name": "mode", "type": "string", "required": False, "frontendType": "select",
{"name": "mode", "type": "str", "required": False, "frontendType": "select",
"frontendOptions": {"options": ["collect", "concat", "sum", "count"]},
"description": t("Aggregationsmodus"), "default": "collect"},
],
@ -27,9 +27,9 @@ DATA_NODES = [
"label": t("Filtern"),
"description": t("Elemente nach Bedingung filtern"),
"parameters": [
{"name": "condition", "type": "string", "required": True, "frontendType": "filterExpression",
{"name": "condition", "type": "str", "required": True, "frontendType": "filterExpression",
"description": t("Filterbedingung")},
{"name": "udmContentType", "type": "string", "required": False, "frontendType": "select",
{"name": "udmContentType", "type": "str", "required": False, "frontendType": "select",
"frontendOptions": {"options": ["", "text", "image", "table", "code", "media", "link", "formula"]},
"description": t("UDM-ContentType-Filter (optional, leer = kein UDM-Filter)"), "default": ""},
],
@ -46,10 +46,10 @@ DATA_NODES = [
"label": t("Konsolidieren"),
"description": t("Gesammelte Ergebnisse deterministisch zusammenführen (Tabelle, CSV, Merge)"),
"parameters": [
{"name": "mode", "type": "string", "required": False, "frontendType": "select",
{"name": "mode", "type": "str", "required": False, "frontendType": "select",
"frontendOptions": {"options": ["table", "concat", "merge", "csvJoin"]},
"description": t("Konsolidierungsmodus"), "default": "table"},
{"name": "separator", "type": "string", "required": False, "frontendType": "text",
{"name": "separator", "type": "str", "required": False, "frontendType": "text",
"description": t("Trennzeichen (für concat/csvJoin)"), "default": "\n"},
],
"inputs": 1,

View file

@ -10,14 +10,14 @@ EMAIL_NODES = [
"label": t("E-Mail prüfen"),
"description": t("Neue E-Mails prüfen"),
"parameters": [
{"name": "connectionReference", "type": "string", "required": True, "frontendType": "userConnection",
{"name": "connectionReference", "type": "str", "required": True, "frontendType": "userConnection",
"frontendOptions": {"authority": "msft"},
"description": t("E-Mail-Konto Verbindung")},
{"name": "folder", "type": "string", "required": False, "frontendType": "text",
{"name": "folder", "type": "str", "required": False, "frontendType": "text",
"description": t("Ordner"), "default": "Inbox"},
{"name": "limit", "type": "number", "required": False, "frontendType": "number",
{"name": "limit", "type": "int", "required": False, "frontendType": "number",
"description": t("Max E-Mails"), "default": 100},
{"name": "filter", "type": "string", "required": False, "frontendType": "text",
{"name": "filter", "type": "str", "required": False, "frontendType": "text",
"description": t("Filter-Ausdruck (z.B. 'from:max@example.com hasAttachment:true betreff')"), "default": ""},
],
"inputs": 1,
@ -34,14 +34,14 @@ EMAIL_NODES = [
"label": t("E-Mail suchen"),
"description": t("E-Mails suchen"),
"parameters": [
{"name": "connectionReference", "type": "string", "required": True, "frontendType": "userConnection",
{"name": "connectionReference", "type": "str", "required": True, "frontendType": "userConnection",
"frontendOptions": {"authority": "msft"},
"description": t("E-Mail-Konto Verbindung")},
{"name": "query", "type": "string", "required": True, "frontendType": "text",
{"name": "query", "type": "str", "required": True, "frontendType": "text",
"description": t("Suchausdruck (z.B. 'from:max@example.com hasAttachments:true Rechnung')")},
{"name": "folder", "type": "string", "required": False, "frontendType": "text",
{"name": "folder", "type": "str", "required": False, "frontendType": "text",
"description": t("Ordner"), "default": "All"},
{"name": "limit", "type": "number", "required": False, "frontendType": "number",
{"name": "limit", "type": "int", "required": False, "frontendType": "number",
"description": t("Max E-Mails"), "default": 100},
],
"inputs": 1,
@ -59,19 +59,19 @@ EMAIL_NODES = [
"description": t(
"AI-gestützt einen E-Mail-Entwurf aus Kontext und optionalen Dokumenten erstellen"),
"parameters": [
{"name": "connectionReference", "type": "string", "required": True, "frontendType": "userConnection",
{"name": "connectionReference", "type": "str", "required": True, "frontendType": "userConnection",
"frontendOptions": {"authority": "msft"},
"description": t("E-Mail-Konto")},
{"name": "context", "type": "string", "required": False, "frontendType": "textarea",
"description": t("Kontext / Brief-Beschreibung für die KI-Komposition"), "default": ""},
{"name": "to", "type": "string", "required": False, "frontendType": "text",
{"name": "context", "type": "Any", "required": False, "frontendType": "templateTextarea",
"description": t("Daten aus vorherigen Schritten (oder direkte Beschreibung)"), "default": ""},
{"name": "to", "type": "str", "required": False, "frontendType": "text",
"description": t("Empfänger (komma-separiert, optional für Entwurf)"), "default": ""},
{"name": "documentList", "type": "string", "required": False, "frontendType": "hidden",
{"name": "documentList", "type": "str", "required": False, "frontendType": "hidden",
"description": t("Anhang-Dokumente (via Wire oder DataRef)"), "default": ""},
{"name": "emailContent", "type": "string", "required": False, "frontendType": "hidden",
{"name": "emailContent", "type": "str", "required": False, "frontendType": "hidden",
"description": t("Direkt vorbereiteter Inhalt {subject, body, to} (via Wire — überspringt KI)"),
"default": ""},
{"name": "emailStyle", "type": "string", "required": False, "frontendType": "select",
{"name": "emailStyle", "type": "str", "required": False, "frontendType": "select",
"frontendOptions": {"options": ["formal", "casual", "business"]},
"description": t("Stil"), "default": "business"},
],

View file

@ -12,23 +12,23 @@ FILE_NODES = [
"parameters": [
{"name": "contentSources", "type": "json", "required": False, "frontendType": "json",
"description": t("Kontext-Quellen"), "default": []},
{"name": "outputFormat", "type": "string", "required": True, "frontendType": "select",
{"name": "outputFormat", "type": "str", "required": True, "frontendType": "select",
"frontendOptions": {"options": ["docx", "pdf", "txt", "html", "md"]},
"description": t("Ausgabeformat"), "default": "docx"},
{"name": "title", "type": "string", "required": False, "frontendType": "text",
{"name": "title", "type": "str", "required": False, "frontendType": "text",
"description": t("Dokumenttitel")},
{"name": "templateName", "type": "string", "required": False, "frontendType": "select",
{"name": "templateName", "type": "str", "required": False, "frontendType": "select",
"frontendOptions": {"options": ["default", "corporate", "minimal"]},
"description": t("Stil-Vorlage")},
{"name": "language", "type": "string", "required": False, "frontendType": "select",
{"name": "language", "type": "str", "required": False, "frontendType": "select",
"frontendOptions": {"options": ["de", "en", "fr"]},
"description": t("Sprache"), "default": "de"},
{"name": "context", "type": "string", "required": False, "frontendType": "hidden",
"description": t("Inhalt (via Wire oder DataRef)"), "default": ""},
{"name": "context", "type": "Any", "required": False, "frontendType": "contextBuilder",
"description": t("Daten aus vorherigen Schritten"), "default": ""},
],
"inputs": 1,
"outputs": 1,
"inputPorts": {0: {"accepts": ["AiResult", "TextResult", "Transit"]}},
"inputPorts": {0: {"accepts": ["AiResult", "TextResult", "Transit", "FormPayload"]}},
"outputPorts": {0: {"schema": "DocumentList"}},
"meta": {"icon": "mdi-file-plus-outline", "color": "#2196F3", "usesAi": False},
"_method": "file",

View file

@ -12,7 +12,7 @@ FLOW_NODES = [
"parameters": [
{
"name": "condition",
"type": "string",
"type": "str",
"required": True,
"frontendType": "condition",
"description": t("Bedingung"),
@ -34,7 +34,7 @@ FLOW_NODES = [
"parameters": [
{
"name": "value",
"type": "string",
"type": "str",
"required": True,
"frontendType": "text",
"description": t("Zu vergleichender Wert"),
@ -62,14 +62,14 @@ FLOW_NODES = [
"parameters": [
{
"name": "items",
"type": "string",
"type": "str",
"required": True,
"frontendType": "text",
"description": t("Pfad zum Array"),
},
{
"name": "level",
"type": "string",
"type": "str",
"required": False,
"frontendType": "select",
"frontendOptions": {"options": ["auto", "documents", "structuralNodes", "contentBlocks"]},
@ -78,7 +78,7 @@ FLOW_NODES = [
},
{
"name": "concurrency",
"type": "number",
"type": "int",
"required": False,
"frontendType": "number",
"frontendOptions": {"min": 1, "max": 20},
@ -103,7 +103,7 @@ FLOW_NODES = [
"parameters": [
{
"name": "mode",
"type": "string",
"type": "str",
"required": False,
"frontendType": "select",
"frontendOptions": {"options": ["first", "all", "append"]},
@ -112,7 +112,7 @@ FLOW_NODES = [
},
{
"name": "inputCount",
"type": "number",
"type": "int",
"required": False,
"frontendType": "number",
"frontendOptions": {"min": 2, "max": 5},

View file

@ -3,6 +3,18 @@
from modules.shared.i18nRegistry import t
# Canonical form field types — single source of truth.
# portType maps to the PORT_TYPE_CATALOG primitive used by DataPicker / validateGraph.
FORM_FIELD_TYPES = [
{"id": "text", "label": "Text (einzeilig)", "portType": "str"},
{"id": "textarea", "label": "Text (mehrzeilig)", "portType": "str"},
{"id": "number", "label": "Zahl", "portType": "int"},
{"id": "boolean", "label": "Ja/Nein", "portType": "bool"},
{"id": "date", "label": "Datum", "portType": "str"},
{"id": "email", "label": "E-Mail", "portType": "str"},
{"id": "select", "label": "Auswahl", "portType": "str"},
]
INPUT_NODES = [
{
"id": "input.form",
@ -32,11 +44,11 @@ INPUT_NODES = [
"label": t("Genehmigung"),
"description": t("Benutzer genehmigt oder lehnt ab"),
"parameters": [
{"name": "title", "type": "string", "required": True, "frontendType": "text",
{"name": "title", "type": "str", "required": True, "frontendType": "text",
"description": t("Genehmigungstitel")},
{"name": "description", "type": "string", "required": False, "frontendType": "textarea",
{"name": "description", "type": "str", "required": False, "frontendType": "textarea",
"description": t("Was genehmigt werden soll")},
{"name": "approvalType", "type": "string", "required": False, "frontendType": "select",
{"name": "approvalType", "type": "str", "required": False, "frontendType": "select",
"frontendOptions": {"options": ["generic", "document"]},
"description": t("Typ: document oder generic"), "default": "generic"},
],
@ -53,14 +65,14 @@ INPUT_NODES = [
"label": t("Upload"),
"description": t("Benutzer lädt Datei(en) hoch"),
"parameters": [
{"name": "accept", "type": "string", "required": False, "frontendType": "text",
{"name": "accept", "type": "str", "required": False, "frontendType": "text",
"description": t("Accept-String"), "default": ""},
{"name": "allowedTypes", "type": "json", "required": False, "frontendType": "multiselect",
"frontendOptions": {"options": ["pdf", "docx", "xlsx", "pptx", "txt", "csv", "jpg", "png", "gif"]},
"description": t("Ausgewählte Dateitypen"), "default": []},
{"name": "maxSize", "type": "number", "required": False, "frontendType": "number",
{"name": "maxSize", "type": "int", "required": False, "frontendType": "number",
"description": t("Max. Dateigröße in MB"), "default": 10},
{"name": "multiple", "type": "boolean", "required": False, "frontendType": "checkbox",
{"name": "multiple", "type": "bool", "required": False, "frontendType": "checkbox",
"description": t("Mehrere Dateien erlauben"), "default": False},
],
"inputs": 1,
@ -76,9 +88,9 @@ INPUT_NODES = [
"label": t("Kommentar"),
"description": t("Benutzer fügt einen Kommentar hinzu"),
"parameters": [
{"name": "placeholder", "type": "string", "required": False, "frontendType": "text",
{"name": "placeholder", "type": "str", "required": False, "frontendType": "text",
"description": t("Platzhalter"), "default": ""},
{"name": "required", "type": "boolean", "required": False, "frontendType": "checkbox",
{"name": "required", "type": "bool", "required": False, "frontendType": "checkbox",
"description": t("Kommentar erforderlich"), "default": True},
],
"inputs": 1,
@ -94,9 +106,9 @@ INPUT_NODES = [
"label": t("Prüfung"),
"description": t("Benutzer prüft Inhalt"),
"parameters": [
{"name": "contentRef", "type": "string", "required": True, "frontendType": "text",
{"name": "contentRef", "type": "str", "required": True, "frontendType": "text",
"description": t("Referenz auf Inhalt")},
{"name": "reviewType", "type": "string", "required": False, "frontendType": "select",
{"name": "reviewType", "type": "str", "required": False, "frontendType": "select",
"frontendOptions": {"options": ["generic", "document"]},
"description": t("Art der Prüfung"), "default": "generic"},
],
@ -115,7 +127,7 @@ INPUT_NODES = [
"parameters": [
{"name": "options", "type": "json", "required": True, "frontendType": "keyValueRows",
"description": t("Optionen"), "default": []},
{"name": "multiple", "type": "boolean", "required": False, "frontendType": "checkbox",
{"name": "multiple", "type": "bool", "required": False, "frontendType": "checkbox",
"description": t("Mehrfachauswahl erlauben"), "default": False},
],
"inputs": 1,
@ -131,11 +143,11 @@ INPUT_NODES = [
"label": t("Bestätigung"),
"description": t("Benutzer bestätigt Ja/Nein"),
"parameters": [
{"name": "question", "type": "string", "required": True, "frontendType": "text",
{"name": "question", "type": "str", "required": True, "frontendType": "text",
"description": t("Zu bestätigende Frage")},
{"name": "confirmLabel", "type": "string", "required": False, "frontendType": "text",
{"name": "confirmLabel", "type": "str", "required": False, "frontendType": "text",
"description": t("Label für Bestätigen-Button"), "default": "Confirm"},
{"name": "rejectLabel", "type": "string", "required": False, "frontendType": "text",
{"name": "rejectLabel", "type": "str", "required": False, "frontendType": "text",
"description": t("Label für Ablehnen-Button"), "default": "Reject"},
],
"inputs": 1,

View file

@ -25,7 +25,7 @@ REDMINE_NODES = [
"description": t("Einzelnes Redmine-Ticket aus dem Mirror laden."),
"parameters": [
dict(_REDMINE_INSTANCE_PARAM),
{"name": "ticketId", "type": "number", "required": True, "frontendType": "number",
{"name": "ticketId", "type": "int", "required": True, "frontendType": "number",
"description": t("Redmine-Ticket-ID")},
],
"inputs": 1,
@ -43,17 +43,17 @@ REDMINE_NODES = [
"description": t("Tickets aus dem lokalen Mirror mit Filtern (Tracker, Status, Zeitraum, Zuweisung)."),
"parameters": [
dict(_REDMINE_INSTANCE_PARAM),
{"name": "trackerIds", "type": "string", "required": False, "frontendType": "text",
{"name": "trackerIds", "type": "str", "required": False, "frontendType": "text",
"description": t("Tracker-IDs (Komma-separiert)"), "default": ""},
{"name": "status", "type": "string", "required": False, "frontendType": "text",
{"name": "status", "type": "str", "required": False, "frontendType": "text",
"description": t("Status-Filter: open | closed | *"), "default": "*"},
{"name": "dateFrom", "type": "string", "required": False, "frontendType": "date",
{"name": "dateFrom", "type": "str", "required": False, "frontendType": "date",
"description": t("Zeitraum ab (ISO-Datum)"), "default": ""},
{"name": "dateTo", "type": "string", "required": False, "frontendType": "date",
{"name": "dateTo", "type": "str", "required": False, "frontendType": "date",
"description": t("Zeitraum bis (ISO-Datum)"), "default": ""},
{"name": "assignedToId", "type": "number", "required": False, "frontendType": "number",
{"name": "assignedToId", "type": "int", "required": False, "frontendType": "number",
"description": t("Nur Tickets dieses Benutzers (ID)")},
{"name": "limit", "type": "number", "required": False, "frontendType": "number",
{"name": "limit", "type": "int", "required": False, "frontendType": "number",
"description": t("Max. Anzahl Tickets (1-500)"), "default": 100},
],
"inputs": 1,
@ -71,21 +71,21 @@ REDMINE_NODES = [
"description": t("Neues Ticket in Redmine anlegen. Mirror wird sofort aktualisiert."),
"parameters": [
dict(_REDMINE_INSTANCE_PARAM),
{"name": "subject", "type": "string", "required": True, "frontendType": "text",
{"name": "subject", "type": "str", "required": True, "frontendType": "text",
"description": t("Ticket-Titel")},
{"name": "trackerId", "type": "number", "required": True, "frontendType": "number",
{"name": "trackerId", "type": "int", "required": True, "frontendType": "number",
"description": t("Tracker-ID (Userstory, Feature, Task, ...)")},
{"name": "description", "type": "string", "required": False, "frontendType": "textarea",
{"name": "description", "type": "str", "required": False, "frontendType": "textarea",
"description": t("Ticket-Beschreibung"), "default": ""},
{"name": "statusId", "type": "number", "required": False, "frontendType": "number",
{"name": "statusId", "type": "int", "required": False, "frontendType": "number",
"description": t("Status-ID (optional)")},
{"name": "priorityId", "type": "number", "required": False, "frontendType": "number",
{"name": "priorityId", "type": "int", "required": False, "frontendType": "number",
"description": t("Prioritaet-ID (optional)")},
{"name": "assignedToId", "type": "number", "required": False, "frontendType": "number",
{"name": "assignedToId", "type": "int", "required": False, "frontendType": "number",
"description": t("Zugewiesene Benutzer-ID (optional)")},
{"name": "parentIssueId", "type": "number", "required": False, "frontendType": "number",
{"name": "parentIssueId", "type": "int", "required": False, "frontendType": "number",
"description": t("Uebergeordnetes Ticket (optional)")},
{"name": "customFields", "type": "string", "required": False, "frontendType": "textarea",
{"name": "customFields", "type": "str", "required": False, "frontendType": "textarea",
"description": t("Custom Fields als JSON {id: value}"), "default": ""},
],
"inputs": 1,
@ -103,25 +103,25 @@ REDMINE_NODES = [
"description": t("Felder eines Redmine-Tickets aktualisieren. Nur gesetzte Felder werden uebertragen."),
"parameters": [
dict(_REDMINE_INSTANCE_PARAM),
{"name": "ticketId", "type": "number", "required": True, "frontendType": "number",
{"name": "ticketId", "type": "int", "required": True, "frontendType": "number",
"description": t("Ticket-ID")},
{"name": "subject", "type": "string", "required": False, "frontendType": "text",
{"name": "subject", "type": "str", "required": False, "frontendType": "text",
"description": t("Neuer Titel")},
{"name": "description", "type": "string", "required": False, "frontendType": "textarea",
{"name": "description", "type": "str", "required": False, "frontendType": "textarea",
"description": t("Neue Beschreibung")},
{"name": "trackerId", "type": "number", "required": False, "frontendType": "number",
{"name": "trackerId", "type": "int", "required": False, "frontendType": "number",
"description": t("Neuer Tracker")},
{"name": "statusId", "type": "number", "required": False, "frontendType": "number",
{"name": "statusId", "type": "int", "required": False, "frontendType": "number",
"description": t("Neuer Status")},
{"name": "priorityId", "type": "number", "required": False, "frontendType": "number",
{"name": "priorityId", "type": "int", "required": False, "frontendType": "number",
"description": t("Neue Prioritaet")},
{"name": "assignedToId", "type": "number", "required": False, "frontendType": "number",
{"name": "assignedToId", "type": "int", "required": False, "frontendType": "number",
"description": t("Neue Zuweisung")},
{"name": "parentIssueId", "type": "number", "required": False, "frontendType": "number",
{"name": "parentIssueId", "type": "int", "required": False, "frontendType": "number",
"description": t("Neues Parent-Ticket")},
{"name": "notes", "type": "string", "required": False, "frontendType": "textarea",
{"name": "notes", "type": "str", "required": False, "frontendType": "textarea",
"description": t("Kommentar (Journal-Eintrag)"), "default": ""},
{"name": "customFields", "type": "string", "required": False, "frontendType": "textarea",
{"name": "customFields", "type": "str", "required": False, "frontendType": "textarea",
"description": t("Custom Fields als JSON {id: value}"), "default": ""},
],
"inputs": 1,
@ -139,13 +139,13 @@ REDMINE_NODES = [
"description": t("Aggregierte Kennzahlen (KPIs, Durchsatz, Status-Verteilung, Backlog) aus dem Mirror."),
"parameters": [
dict(_REDMINE_INSTANCE_PARAM),
{"name": "dateFrom", "type": "string", "required": False, "frontendType": "date",
{"name": "dateFrom", "type": "str", "required": False, "frontendType": "date",
"description": t("Zeitraum ab")},
{"name": "dateTo", "type": "string", "required": False, "frontendType": "date",
{"name": "dateTo", "type": "str", "required": False, "frontendType": "date",
"description": t("Zeitraum bis")},
{"name": "bucket", "type": "string", "required": False, "frontendType": "text",
{"name": "bucket", "type": "str", "required": False, "frontendType": "text",
"description": t("Bucket: day | week | month"), "default": "week"},
{"name": "trackerIds", "type": "string", "required": False, "frontendType": "text",
{"name": "trackerIds", "type": "str", "required": False, "frontendType": "text",
"description": t("Tracker-IDs (Komma-separiert)"), "default": ""},
],
"inputs": 1,
@ -163,7 +163,7 @@ REDMINE_NODES = [
"description": t("Tickets und Beziehungen aus Redmine in den lokalen Mirror uebernehmen."),
"parameters": [
dict(_REDMINE_INSTANCE_PARAM),
{"name": "force", "type": "boolean", "required": False, "frontendType": "checkbox",
{"name": "force", "type": "bool", "required": False, "frontendType": "checkbox",
"description": t("Vollsync erzwingen (ignoriert lastSyncAt)"), "default": False},
],
"inputs": 1,

View file

@ -10,14 +10,14 @@ SHAREPOINT_NODES = [
"label": t("Datei finden"),
"description": t("Datei nach Pfad oder Suche finden"),
"parameters": [
{"name": "connectionReference", "type": "string", "required": True, "frontendType": "userConnection",
{"name": "connectionReference", "type": "str", "required": True, "frontendType": "userConnection",
"frontendOptions": {"authority": "msft"},
"description": t("SharePoint-Verbindung")},
{"name": "searchQuery", "type": "string", "required": True, "frontendType": "text",
{"name": "searchQuery", "type": "str", "required": True, "frontendType": "text",
"description": t("Suchanfrage oder Pfad")},
{"name": "site", "type": "string", "required": False, "frontendType": "text",
{"name": "site", "type": "str", "required": False, "frontendType": "text",
"description": t("Optionaler Site-Hinweis"), "default": ""},
{"name": "maxResults", "type": "number", "required": False, "frontendType": "number",
{"name": "maxResults", "type": "int", "required": False, "frontendType": "number",
"description": t("Max Ergebnisse"), "default": 1000},
],
"inputs": 1,
@ -34,10 +34,10 @@ SHAREPOINT_NODES = [
"label": t("Datei lesen"),
"description": t("Inhalt aus Datei extrahieren"),
"parameters": [
{"name": "connectionReference", "type": "string", "required": True, "frontendType": "userConnection",
{"name": "connectionReference", "type": "str", "required": True, "frontendType": "userConnection",
"frontendOptions": {"authority": "msft"},
"description": t("SharePoint-Verbindung")},
{"name": "pathQuery", "type": "string", "required": True, "frontendType": "sharepointFile",
{"name": "pathQuery", "type": "str", "required": True, "frontendType": "sharepointFile",
"frontendOptions": {"dependsOn": "connectionReference"},
"description": t("Dateipfad")},
],
@ -55,13 +55,13 @@ SHAREPOINT_NODES = [
"label": t("Datei hochladen"),
"description": t("Datei zu SharePoint hochladen"),
"parameters": [
{"name": "connectionReference", "type": "string", "required": True, "frontendType": "userConnection",
{"name": "connectionReference", "type": "str", "required": True, "frontendType": "userConnection",
"frontendOptions": {"authority": "msft"},
"description": t("SharePoint-Verbindung")},
{"name": "pathQuery", "type": "string", "required": True, "frontendType": "sharepointFolder",
{"name": "pathQuery", "type": "str", "required": True, "frontendType": "sharepointFolder",
"frontendOptions": {"dependsOn": "connectionReference"},
"description": t("Zielordner-Pfad")},
{"name": "content", "type": "string", "required": True, "frontendType": "hidden",
{"name": "content", "type": "str", "required": True, "frontendType": "hidden",
"description": t("Datei-Inhalt aus Upstream-Node (via Wire oder DataRef)"), "default": ""},
],
"inputs": 1,
@ -78,10 +78,10 @@ SHAREPOINT_NODES = [
"label": t("Dateien auflisten"),
"description": t("Dateien in Ordner auflisten"),
"parameters": [
{"name": "connectionReference", "type": "string", "required": True, "frontendType": "userConnection",
{"name": "connectionReference", "type": "str", "required": True, "frontendType": "userConnection",
"frontendOptions": {"authority": "msft"},
"description": t("SharePoint-Verbindung")},
{"name": "pathQuery", "type": "string", "required": False, "frontendType": "sharepointFolder",
{"name": "pathQuery", "type": "str", "required": False, "frontendType": "sharepointFolder",
"frontendOptions": {"dependsOn": "connectionReference"},
"description": t("Ordnerpfad"), "default": "/"},
],
@ -99,10 +99,10 @@ SHAREPOINT_NODES = [
"label": t("Datei herunterladen"),
"description": t("Datei vom Pfad herunterladen"),
"parameters": [
{"name": "connectionReference", "type": "string", "required": True, "frontendType": "userConnection",
{"name": "connectionReference", "type": "str", "required": True, "frontendType": "userConnection",
"frontendOptions": {"authority": "msft"},
"description": t("SharePoint-Verbindung")},
{"name": "pathQuery", "type": "string", "required": True, "frontendType": "sharepointFile",
{"name": "pathQuery", "type": "str", "required": True, "frontendType": "sharepointFile",
"frontendOptions": {"dependsOn": "connectionReference"},
"description": t("Vollständiger Dateipfad")},
],
@ -120,13 +120,13 @@ SHAREPOINT_NODES = [
"label": t("Datei kopieren"),
"description": t("Datei an Ziel kopieren"),
"parameters": [
{"name": "connectionReference", "type": "string", "required": True, "frontendType": "userConnection",
{"name": "connectionReference", "type": "str", "required": True, "frontendType": "userConnection",
"frontendOptions": {"authority": "msft"},
"description": t("SharePoint-Verbindung")},
{"name": "sourcePath", "type": "string", "required": True, "frontendType": "sharepointFile",
{"name": "sourcePath", "type": "str", "required": True, "frontendType": "sharepointFile",
"frontendOptions": {"dependsOn": "connectionReference"},
"description": t("Quelldatei-Pfad")},
{"name": "destPath", "type": "string", "required": True, "frontendType": "sharepointFolder",
{"name": "destPath", "type": "str", "required": True, "frontendType": "sharepointFolder",
"frontendOptions": {"dependsOn": "connectionReference"},
"description": t("Zielordner")},
],

View file

@ -46,7 +46,7 @@ TRIGGER_NODES = [
"parameters": [
{
"name": "cron",
"type": "string",
"type": "str",
"required": False,
"frontendType": "cron",
"description": t("Cron-Ausdruck"),

View file

@ -25,11 +25,11 @@ TRUSTEE_NODES = [
"description": t("Buchhaltungsdaten aus externem System importieren/aktualisieren."),
"parameters": [
dict(_TRUSTEE_INSTANCE_PARAM),
{"name": "forceRefresh", "type": "boolean", "required": False, "frontendType": "checkbox",
{"name": "forceRefresh", "type": "bool", "required": False, "frontendType": "checkbox",
"description": t("Import erzwingen"), "default": False},
{"name": "dateFrom", "type": "string", "required": False, "frontendType": "date",
{"name": "dateFrom", "type": "str", "required": False, "frontendType": "date",
"description": t("Startdatum"), "default": ""},
{"name": "dateTo", "type": "string", "required": False, "frontendType": "date",
{"name": "dateTo", "type": "str", "required": False, "frontendType": "date",
"description": t("Enddatum"), "default": ""},
],
"inputs": 1,
@ -46,14 +46,14 @@ TRUSTEE_NODES = [
"label": t("Dokumente extrahieren"),
"description": t("Dokumenttyp und Daten aus PDF/JPG per AI extrahieren."),
"parameters": [
{"name": "connectionReference", "type": "string", "required": False, "frontendType": "userConnection",
{"name": "connectionReference", "type": "str", "required": False, "frontendType": "userConnection",
"frontendOptions": {"authority": "msft"},
"description": t("SharePoint-Verbindung"), "default": ""},
{"name": "sharepointFolder", "type": "string", "required": False, "frontendType": "sharepointFolder",
{"name": "sharepointFolder", "type": "str", "required": False, "frontendType": "sharepointFolder",
"frontendOptions": {"dependsOn": "connectionReference"},
"description": t("SharePoint-Ordnerpfad"), "default": ""},
dict(_TRUSTEE_INSTANCE_PARAM),
{"name": "prompt", "type": "string", "required": False, "frontendType": "textarea",
{"name": "prompt", "type": "str", "required": False, "frontendType": "textarea",
"description": t("AI-Prompt für Extraktion"), "default": ""},
],
"inputs": 1,
@ -77,7 +77,7 @@ TRUSTEE_NODES = [
# is List[ActionDocument] (see datamodelChat.ActionResult). The
# DataPicker uses this string to filter compatible upstream paths.
{"name": "documentList", "type": "List[ActionDocument]", "required": True, "frontendType": "dataRef",
"description": t("Dokumentenliste — gebunden via DataRef.")},
"description": t("Dokumente aus vorherigen Schritten")},
dict(_TRUSTEE_INSTANCE_PARAM),
],
"inputs": 1,
@ -95,7 +95,7 @@ TRUSTEE_NODES = [
"description": t("Trustee-Positionen in Buchhaltungssystem übertragen."),
"parameters": [
{"name": "documentList", "type": "List[ActionDocument]", "required": True, "frontendType": "dataRef",
"description": t("Verarbeitete Dokumentenliste — gebunden via DataRef.")},
"description": t("Dokumente aus vorherigen Schritten")},
dict(_TRUSTEE_INSTANCE_PARAM),
],
"inputs": 1,
@ -113,25 +113,25 @@ TRUSTEE_NODES = [
"description": t("Daten aus der Trustee-DB lesen (Lookup, Aggregation, Roh-Export). Pendant zu refreshAccountingData ohne externen Sync."),
"parameters": [
dict(_TRUSTEE_INSTANCE_PARAM),
{"name": "mode", "type": "string", "required": True, "frontendType": "select",
{"name": "mode", "type": "str", "required": True, "frontendType": "select",
"frontendOptions": {"options": ["lookup", "raw", "aggregate"]},
"description": t("Abfragemodus"), "default": "lookup"},
{"name": "entity", "type": "string", "required": True, "frontendType": "select",
{"name": "entity", "type": "str", "required": True, "frontendType": "select",
"frontendOptions": {"options": ["tenantWithRent", "contact", "journalLines", "accounts", "balances"]},
"description": t("Entität, die gelesen werden soll"), "default": "tenantWithRent"},
{"name": "tenantNameRef", "type": "string", "required": False, "frontendType": "text",
{"name": "tenantNameRef", "type": "str", "required": False, "frontendType": "text",
"frontendOptions": {"dependsOn": "entity", "showWhen": ["tenantWithRent", "contact"]},
"description": t("Mietername (oder {{wire.feld}} aus Upstream)"), "default": ""},
{"name": "tenantAddressRef", "type": "string", "required": False, "frontendType": "text",
{"name": "tenantAddressRef", "type": "str", "required": False, "frontendType": "text",
"frontendOptions": {"dependsOn": "entity", "showWhen": ["tenantWithRent", "contact"]},
"description": t("Mieteradresse (Toleranz für Tippfehler)"), "default": ""},
{"name": "period", "type": "string", "required": False, "frontendType": "text",
{"name": "period", "type": "str", "required": False, "frontendType": "text",
"frontendOptions": {"dependsOn": "entity", "showWhen": ["tenantWithRent", "journalLines", "balances"]},
"description": t("Zeitraum (YYYY oder YYYY-MM-DD/YYYY-MM-DD)"), "default": ""},
{"name": "rentAccountPattern", "type": "string", "required": False, "frontendType": "text",
{"name": "rentAccountPattern", "type": "str", "required": False, "frontendType": "text",
"frontendOptions": {"dependsOn": "entity", "showWhen": ["tenantWithRent"]},
"description": t("Konto-Filter für Mietzins (z.B. '6000-6099' oder '6*')"), "default": ""},
{"name": "filterJson", "type": "string", "required": False, "frontendType": "textarea",
{"name": "filterJson", "type": "str", "required": False, "frontendType": "textarea",
"frontendOptions": {"dependsOn": "mode", "showWhen": ["raw", "aggregate"]},
"description": t("Optionaler JSON-Filter für mode=raw/aggregate"), "default": ""},
],

View file

@ -9,6 +9,7 @@ import logging
from typing import Dict, List, Any, Optional
from modules.features.graphicalEditor.nodeDefinitions import STATIC_NODE_TYPES
from modules.features.graphicalEditor.nodeDefinitions.input import FORM_FIELD_TYPES
from modules.features.graphicalEditor.nodeAdapter import bindsActionFromLegacy
from modules.features.graphicalEditor.portTypes import PORT_TYPE_CATALOG, SYSTEM_VARIABLES
from modules.shared.i18nRegistry import normalizePrimaryLanguageTag, resolveText
@ -119,6 +120,7 @@ def getNodeTypesForApi(
"categories": categories,
"portTypeCatalog": catalogSerialized,
"systemVariables": SYSTEM_VARIABLES,
"formFieldTypes": FORM_FIELD_TYPES,
}
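
For orientation, a trimmed, hypothetical sketch of the node-types payload the frontend now receives (keys from the hunk above; values illustrative only):

# Hypothetical trimmed response shape of getNodeTypesForApi.
node_types_payload = {
    "categories": [...],         # node-type categories
    "portTypeCatalog": {...},    # serialized PORT_TYPE_CATALOG
    "systemVariables": [...],    # SYSTEM_VARIABLES passthrough
    "formFieldTypes": [          # new: canonical form field types
        {"id": "text", "label": "Text (einzeilig)", "portType": "str"},
        {"id": "number", "label": "Zahl", "portType": "int"},
    ],
}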

View file

@ -34,6 +34,8 @@ class PortField(BaseModel):
# FeatureInstanceRef.featureCode). Pickers/validators use it to filter compatible
# producers by sub-type. Type must be "str" when discriminator is True.
discriminator: bool = False
# Surfaces this field at the top of the DataPicker list as the most common pick.
recommended: bool = False
class PortSchema(BaseModel):
@ -83,7 +85,7 @@ PORT_TYPE_CATALOG: Dict[str, PortSchema] = {
PortField(name="listId", type="str", description="ClickUp-Listen-ID"),
PortField(name="name", type="str", required=False, description="Listenname"),
PortField(name="spaceId", type="str", required=False, description="Space-ID"),
PortField(name="folderId", type="str", required=False, description="Ordner-ID"),
PortField(name="groupId", type="str", required=False, description="Gruppen-ID für die Gruppierungszuordnung"),
PortField(name="connection", type="ConnectionRef", required=False,
description="ClickUp-Verbindung"),
]),
@ -153,7 +155,7 @@ PORT_TYPE_CATALOG: Dict[str, PortSchema] = {
]),
"DocumentList": PortSchema(name="DocumentList", fields=[
PortField(name="documents", type="List[Document]",
description="Dokumentenliste"),
description="Dokumente aus vorherigen Schritten", recommended=True),
PortField(name="connection", type="ConnectionRef", required=False,
description="Verbindung, mit der die Liste erzeugt wurde"),
PortField(name="source", type="SharePointFolderRef", required=False,
@ -219,9 +221,9 @@ PORT_TYPE_CATALOG: Dict[str, PortSchema] = {
PortField(name="prompt", type="str",
description="Prompt"),
PortField(name="response", type="str",
description="Antworttext"),
description="Antworttext", recommended=True),
PortField(name="responseData", type="Dict", required=False,
description="Strukturierte Antwort"),
description="Strukturierte Antwort (nur bei JSON-Ausgabe)"),
PortField(name="context", type="str",
description="Kontext"),
PortField(name="documents", type="List[Document]",
@ -658,8 +660,11 @@ def normalizeToSchema(raw: Any, schemaName: str) -> Dict[str, Any]:
if not schema or schemaName == "Transit":
return result
# Only default **required** fields. Optional fields stay absent so DataRefs / context
# resolution never pick a synthetic `{}` or `[]` (e.g. AiResult.responseData when the
# model returned plain text only).
for field in schema.fields:
if field.name not in result:
if field.name not in result and field.required:
result[field.name] = _defaultForType(field.type)
return result
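
Since only required fields are defaulted now, here is a minimal self-contained sketch of the behavior (field shapes mirror the AiResult catalog entries above; _defaultForType is simplified, names otherwise hypothetical):

# Hypothetical sketch of the required-only defaulting rule.
FIELDS = [
    {"name": "prompt", "type": "str", "required": True},
    {"name": "response", "type": "str", "required": True},
    {"name": "responseData", "type": "Dict", "required": False},
]

def _default_for_type(type_name):
    return {"str": "", "Dict": {}, "int": 0}.get(type_name)

def normalize(raw):
    result = dict(raw)
    for field in FIELDS:
        if field["name"] not in result and field["required"]:
            result[field["name"]] = _default_for_type(field["type"])
    return result

# responseData stays absent, so DataRef / context resolution can never pick
# up a synthetic {} when the model returned plain text only:
print(normalize({"prompt": "p", "response": "plain text"}))
# -> {'prompt': 'p', 'response': 'plain text'}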
@ -740,6 +745,9 @@ def _resolveTransitChain(
def deriveFormPayloadSchemaFromParam(node: Dict[str, Any], param_key: str) -> Optional[PortSchema]:
"""Derive output schema from a field-builder JSON list (``fields``, ``formFields``, …)."""
from modules.features.graphicalEditor.nodeDefinitions.input import FORM_FIELD_TYPES
_FORM_TYPE_TO_PORT: Dict[str, str] = {f["id"]: f["portType"] for f in FORM_FIELD_TYPES}
fields_param = (node.get("parameters") or {}).get(param_key)
if not fields_param or not isinstance(fields_param, list):
return None
@ -749,9 +757,11 @@ def deriveFormPayloadSchemaFromParam(node: Dict[str, Any], param_key: str) -> Op
_desc = resolveText(lab) if lab is not None else fname
if not str(_desc).strip():
_desc = fname
raw_type = str(ftype) if ftype is not None else "str"
port_type = _FORM_TYPE_TO_PORT.get(raw_type, raw_type)
portFields.append(PortField(
name=fname,
type=str(ftype) if ftype is not None else "str",
type=port_type,
description=_desc,
required=required,
))
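
A minimal sketch of the new form-type translation, assuming the trimmed FORM_FIELD_TYPES from input.py above; the .get(raw_type, raw_type) fallback keeps unknown ids unchanged:

# Hypothetical trimmed mapping -- ids and portTypes from FORM_FIELD_TYPES above.
FORM_FIELD_TYPES = [
    {"id": "text", "portType": "str"},
    {"id": "number", "portType": "int"},
    {"id": "boolean", "portType": "bool"},
]
_FORM_TYPE_TO_PORT = {f["id"]: f["portType"] for f in FORM_FIELD_TYPES}

def to_port_type(raw_type):
    # Unknown ids pass through unchanged, matching the .get(raw_type, raw_type) fallback.
    return _FORM_TYPE_TO_PORT.get(raw_type, raw_type)

assert to_port_type("number") == "int"   # form-builder type -> catalog primitive
assert to_port_type("str") == "str"      # already a port type: passes through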

View file

@ -111,6 +111,44 @@ def _validateInstanceAccess(instanceId: str, context: RequestContext) -> str:
return str(instance.mandateId) if instance.mandateId else ""
def _validateTargetInstance(
workflowData: Dict[str, Any],
ownerInstanceId: str,
context: RequestContext,
) -> None:
"""Enforce targetFeatureInstanceId rules for non-template workflows.
- Templates (isTemplate=True) may omit targetFeatureInstanceId.
- Non-templates MUST have a non-empty targetFeatureInstanceId.
- If the targetFeatureInstanceId differs from the GE owner instance,
the user must also have FeatureAccess on that target instance.
"""
if workflowData.get("isTemplate"):
return
targetId = workflowData.get("targetFeatureInstanceId")
if not targetId:
return
if targetId == ownerInstanceId:
return
from modules.interfaces.interfaceDbApp import getRootInterface
rootInterface = getRootInterface()
targetInstance = rootInterface.getFeatureInstance(targetId)
if not targetInstance:
raise HTTPException(
status_code=400,
detail=routeApiMsg("targetFeatureInstanceId refers to a non-existent feature instance"),
)
targetAccess = rootInterface.getFeatureAccess(str(context.user.id), targetId)
if not targetAccess or not targetAccess.enabled:
raise HTTPException(
status_code=403,
detail=routeApiMsg("Access denied to target feature instance"),
)
@router.get("/{instanceId}/node-types")
@limiter.limit("60/minute")
def get_node_types(
@ -318,9 +356,12 @@ async def post_execute(
workflowId = body.get("workflowId")
req_nodes = graph.get("nodes") or []
workflow_for_envelope: Optional[Dict[str, Any]] = None
targetFeatureInstanceId: Optional[str] = None
if workflowId and not str(workflowId).startswith("transient-"):
iface = getGraphicalEditorInterface(context.user, mandateId, instanceId)
workflow_for_envelope = iface.getWorkflow(workflowId)
if workflow_for_envelope:
targetFeatureInstanceId = workflow_for_envelope.get("targetFeatureInstanceId")
if workflowId and len(req_nodes) == 0:
iface = getGraphicalEditorInterface(context.user, mandateId, instanceId)
wf = iface.getWorkflow(workflowId)
@ -328,10 +369,18 @@ async def post_execute(
graph = wf["graph"]
logger.info("graphicalEditor execute: loaded graph from workflow %s", workflowId)
workflow_for_envelope = wf
targetFeatureInstanceId = wf.get("targetFeatureInstanceId")
if not workflowId:
import uuid
workflowId = f"transient-{uuid.uuid4().hex[:12]}"
logger.info("graphicalEditor execute: using transient workflowId=%s", workflowId)
if targetFeatureInstanceId and targetFeatureInstanceId != instanceId:
_validateTargetInstance(
{"targetFeatureInstanceId": targetFeatureInstanceId},
instanceId,
context,
)
nodes_count = len(graph.get("nodes") or [])
connections_count = len(graph.get("connections") or [])
logger.info(
@ -363,6 +412,7 @@ async def post_execute(
automation2_interface=ge_interface,
run_envelope=run_env,
label=_wfLabel,
targetFeatureInstanceId=targetFeatureInstanceId,
)
logger.info(
"graphicalEditor execute result: success=%s error=%s nodeOutputs_keys=%s failedNode=%s paused=%s",
@ -1371,6 +1421,7 @@ def create_workflow(
) -> dict:
"""Create a new workflow."""
mandateId = _validateInstanceAccess(instanceId, context)
_validateTargetInstance(body, instanceId, context)
iface = getGraphicalEditorInterface(context.user, mandateId, instanceId)
created = iface.createWorkflow(body)
return created
@ -1388,6 +1439,11 @@ def update_workflow(
"""Update a workflow."""
mandateId = _validateInstanceAccess(instanceId, context)
iface = getGraphicalEditorInterface(context.user, mandateId, instanceId)
existing = iface.getWorkflow(workflowId)
if not existing:
raise HTTPException(status_code=404, detail=routeApiMsg("Workflow not found"))
merged = {**existing, **body}
_validateTargetInstance(merged, instanceId, context)
updated = iface.updateWorkflow(workflowId, body)
if not updated:
raise HTTPException(status_code=404, detail=routeApiMsg("Workflow not found"))
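
Taken together, the target-instance checks added in this file behave roughly as sketched below; this is a stand-in with hypothetical ids, not the real HTTPException / FeatureAccess plumbing:

# Minimal sketch of the _validateTargetInstance rule set.
def validate_target(workflow, owner_id, has_access):
    if workflow.get("isTemplate"):
        return "ok (templates skip the check)"
    target = workflow.get("targetFeatureInstanceId")
    if not target or target == owner_id:
        return "ok (no target / owner instance itself)"
    return "ok" if has_access(target) else "403 (no FeatureAccess on target)"

access = lambda t: t == "inst-A"   # the user only has access to inst-A
print(validate_target({"isTemplate": True}, "inst-A", access))                    # templates pass
print(validate_target({"targetFeatureInstanceId": "inst-A"}, "inst-A", access))  # same instance passes
print(validate_target({"targetFeatureInstanceId": "inst-B"}, "inst-A", access))  # -> 403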

View file

@ -361,6 +361,17 @@ QUICK_ACTIONS = [
# The placeholder {{featureInstanceId}} is replaced by _copyTemplateWorkflows.
# ---------------------------------------------------------------------------
_FINANCE_STYLE_HINT = (
"\n\nWenn du ein Dokument erstellst, verwende einen professionellen Finanz-Stil:\n"
"- Schriftart: Calibri\n"
"- Primaerfarbe: #1F3864 (Dunkelblau)\n"
"- Akzentfarbe: #2980B9\n"
"- Tabellen mit dunklem Header (#1F3864, weisse Schrift)\n"
"- Konservatives, seriöses Layout\n"
"Nutze den style-Parameter von renderDocument um diese Vorgaben umzusetzen."
)
def _buildAnalysisWorkflowGraph(prompt: str) -> Dict[str, Any]:
"""Build a standard analysis graph: trigger -> refreshAccountingData -> ai.prompt."""
return {
@ -370,8 +381,9 @@ def _buildAnalysisWorkflowGraph(prompt: str) -> Dict[str, Any]:
"parameters": {"featureInstanceId": "{{featureInstanceId}}", "forceRefresh": False}, "position": {"x": 250, "y": 0}},
{"id": "analyse", "type": "ai.prompt", "label": "Analyse", "_method": "ai", "_action": "process",
"parameters": {
"aiPrompt": prompt,
"aiPrompt": prompt + _FINANCE_STYLE_HINT,
"context": {"type": "ref", "nodeId": "refresh", "path": ["data", "accountingData"]},
"requireNeutralization": False,
"simpleMode": False,
}, "position": {"x": 500, "y": 0}},
],
@ -440,15 +452,33 @@ TEMPLATE_WORKFLOWS = [
{"id": "analyse", "type": "ai.prompt", "label": "Budget-Analyse", "_method": "ai", "_action": "process",
"parameters": {
"aiPrompt": (
"Fuehre einen Budget-Soll/Ist-Vergleich durch.\n"
"Die Budget-Datei (Excel) wurde als Dokument uebergeben. "
"Die aktuellen Buchhaltungsdaten sind im Kontext verfuegbar.\n"
"1. Lies die Soll-Werte aus dem uebergebenen Budget-Dokument\n"
"2. Vergleiche sie mit den Ist-Werten aus der Buchhaltung pro Konto\n"
"3. Berechne die Abweichung (absolut und prozentual)\n"
"4. Erstelle ein Abweichungs-Chart (Balkendiagramm: Soll vs. Ist pro Konto)\n"
"5. Markiere kritische Abweichungen (>10%) und gib eine kurze Einschaetzung"
"Fuehre einen Budget-Soll/Ist-Vergleich durch und liefere EIN Excel-Dokument "
"mit folgender Struktur:\n\n"
"1. Tabelle \"Konten-Vergleich\" -- EINE Tabelle, EINE Zeile pro Konto:\n"
" Spalten: Konto-Nr | Konto-Name | Soll | Ist | Abweichung absolut | "
"Abweichung % | Status (OK / Warnung / Kritisch).\n"
"2. EINE Visualisierung \"Soll vs. Ist gesamt\" -- ein einziges "
"Balkendiagramm UNTER der Tabelle, das ALLE Konten in einer Grafik "
"gegenueberstellt (gruppierte Balken: Soll und Ist je Konto).\n"
"3. Kurzer Management-Summary-Absatz (3-5 Saetze) UNTER dem Chart "
"mit den 3 groessten Abweichungen (>10%) und einer fachlichen "
"Einschaetzung.\n\n"
"Verwende die uebergebene Budget-Datei als Soll-Quelle und die im "
"Kontext bereitgestellten Buchhaltungsdaten als Ist-Quelle.\n"
"WICHTIG: Erstelle KEINEN separaten Chart pro Konto. Nur EIN "
"Uebersichts-Chart ueber alle Konten ist gewuenscht.\n\n"
"Hinweis: Das documentTheme ist 'finance'. Wenn du ein Dokument erstellst, "
"verwende einen professionellen Finanz-Stil:\n"
"- Schriftart: Calibri\n"
"- Primaerfarbe: #1F3864 (Dunkelblau)\n"
"- Akzentfarbe: #2980B9\n"
"- Tabellen mit dunklem Header (#1F3864, weisse Schrift)\n"
"- Konservatives, seriöses Layout\n"
"Nutze den style-Parameter von renderDocument um diese Vorgaben umzusetzen."
),
"resultType": "xlsx",
"documentTheme": "finance",
"requireNeutralization": False,
"documentList": {"type": "ref", "nodeId": "trigger", "path": ["payload", "documentList"]},
"context": {"type": "ref", "nodeId": "refresh", "path": ["data", "accountingData"]},
"simpleMode": False,

View file

@ -2,8 +2,8 @@
# All rights reserved.
"""Workspace feature data models — WorkspaceUserSettings."""
from typing import Optional
from pydantic import BaseModel, Field
from typing import List, Optional
from pydantic import Field
from modules.datamodels.datamodelBase import PowerOnModel
from modules.shared.i18nRegistry import i18nModel
import uuid
@ -52,3 +52,18 @@ class WorkspaceUserSettings(PowerOnModel):
description="Max agent rounds override (None = instance default)",
json_schema_extra={"label": "Max. Agenten-Runden", "frontend_type": "number", "frontend_readonly": False, "frontend_required": False},
)
requireNeutralization: bool = Field(
default=False,
description="Default neutralization setting for this user",
json_schema_extra={"label": "Neutralisierung", "frontend_type": "checkbox", "frontend_readonly": False, "frontend_required": False},
)
allowedProviders: List[str] = Field(
default_factory=list,
description="Allowed AI providers (empty = all permitted by RBAC)",
json_schema_extra={"label": "Erlaubte Provider", "frontend_type": "multiselect", "frontend_readonly": False, "frontend_required": False},
)
allowedModels: List[str] = Field(
default_factory=list,
description="Allowed AI models (empty = all permitted)",
json_schema_extra={"label": "Erlaubte Modelle", "frontend_type": "modelMultiSelect", "frontend_readonly": False, "frontend_required": False},
)
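
For illustration, a populated settings record would serialize roughly as follows (provider and model names are placeholders):

# Hypothetical example record; provider/model names are placeholders.
example_settings = {
    "requireNeutralization": True,
    "allowedProviders": ["openai"],  # empty list = all providers permitted by RBAC
    "allowedModels": ["model-a"],    # empty list = all models permitted
}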

View file

@ -110,6 +110,7 @@ class WorkspaceInputRequest(BaseModel):
workflowId: Optional[str] = Field(default=None, description="Continue existing workflow")
userLanguage: str = Field(default="en", description="User language code")
allowedProviders: List[str] = Field(default_factory=list, description="Restrict AI to these providers")
allowedModels: List[str] = Field(default_factory=list, description="Restrict AI to these models")
requireNeutralization: Optional[bool] = Field(default=None, description="Per-request neutralization override")
@ -635,6 +636,7 @@ async def streamWorkspaceStart(
userLanguage=userInput.userLanguage,
instanceConfig=instanceConfig,
allowedProviders=userInput.allowedProviders,
allowedModels=userInput.allowedModels,
requireNeutralization=userInput.requireNeutralization,
billingFeatureCode=wsBillingFeatureCode,
)
@ -692,6 +694,7 @@ async def _runWorkspaceAgent(
userLanguage: str = "en",
instanceConfig: Dict[str, Any] = None,
allowedProviders: List[str] = None,
allowedModels: List[str] = None,
requireNeutralization: Optional[bool] = None,
billingFeatureCode: Optional[str] = None,
):
@ -715,6 +718,9 @@ async def _runWorkspaceAgent(
logger.info(f"Workspace agent: allowedProviders={allowedProviders}")
else:
logger.debug("Workspace agent: no allowedProviders in request")
if allowedModels:
aiService.services.allowedModels = allowedModels
logger.info(f"Workspace agent: allowedModels={allowedModels}")
if requireNeutralization is not None:
ctx.requireNeutralization = requireNeutralization
@ -1202,7 +1208,7 @@ async def patchWorkspaceWorkflowAttachments(
# ---------------------------------------------------------------------------
# File and folder list endpoints
# File endpoints
# ---------------------------------------------------------------------------
@router.get("/{instanceId}/files")
@ -1210,7 +1216,6 @@ async def patchWorkspaceWorkflowAttachments(
async def listWorkspaceFiles(
request: Request,
instanceId: str = Path(...),
folderId: Optional[str] = Query(None),
tags: Optional[str] = Query(None),
search: Optional[str] = Query(None),
context: RequestContext = Depends(getRequestContext),
@ -1265,30 +1270,6 @@ async def getFileContent(
return Response(content=content, media_type=mimeType)
@router.get("/{instanceId}/folders")
@limiter.limit("300/minute")
async def listWorkspaceFolders(
request: Request,
instanceId: str = Path(...),
parentId: Optional[str] = Query(None),
context: RequestContext = Depends(getRequestContext),
):
_mandateId, _ = _validateInstanceAccess(instanceId, context)
try:
from modules.serviceCenter import getService
from modules.serviceCenter.context import ServiceCenterContext
ctx = ServiceCenterContext(
user=context.user,
mandate_id=_mandateId or "",
feature_instance_id=instanceId,
)
chatService = getService("chat", ctx)
folders = chatService.listFolders(parentId=parentId)
return JSONResponse({"folders": folders or []})
except Exception:
return JSONResponse({"folders": []})
@router.get("/{instanceId}/datasources")
@limiter.limit("300/minute")
async def listWorkspaceDataSources(
@ -2139,6 +2120,76 @@ async def updateGeneralSettings(
return await getGeneralSettings(request, instanceId, context)
# =========================================================================
# User-level AI settings (neutralisation, providers, models)
# =========================================================================
@router.get("/{instanceId}/user-settings")
@limiter.limit("120/minute")
async def getWorkspaceUserSettings(
request: Request,
instanceId: str = Path(...),
context: RequestContext = Depends(getRequestContext),
):
"""Get the current user's workspace AI settings (auto-creates with defaults if not exists)."""
_mandateId, _ = _validateInstanceAccess(instanceId, context)
wsInterface = _getWorkspaceInterface(context, instanceId)
userId = str(context.user.id)
settings = wsInterface.getWorkspaceUserSettings(userId)
if settings:
return JSONResponse({
"requireNeutralization": settings.requireNeutralization,
"allowedProviders": settings.allowedProviders,
"allowedModels": settings.allowedModels,
})
data = {
"userId": userId,
"mandateId": str(context.mandateId) if context.mandateId else "",
"featureInstanceId": instanceId,
}
created = wsInterface.saveWorkspaceUserSettings(data)
return JSONResponse({
"requireNeutralization": created.requireNeutralization,
"allowedProviders": created.allowedProviders,
"allowedModels": created.allowedModels,
})
@router.put("/{instanceId}/user-settings")
@limiter.limit("120/minute")
async def putWorkspaceUserSettings(
request: Request,
instanceId: str = Path(...),
body: dict = Body(...),
context: RequestContext = Depends(getRequestContext),
):
"""Save the current user's workspace AI settings."""
_mandateId, _ = _validateInstanceAccess(instanceId, context)
wsInterface = _getWorkspaceInterface(context, instanceId)
userId = str(context.user.id)
data = {
"userId": userId,
"mandateId": str(context.mandateId) if context.mandateId else "",
"featureInstanceId": instanceId,
}
if "requireNeutralization" in body:
data["requireNeutralization"] = bool(body["requireNeutralization"])
if "allowedProviders" in body:
data["allowedProviders"] = body["allowedProviders"]
if "allowedModels" in body:
data["allowedModels"] = body["allowedModels"]
saved = wsInterface.saveWorkspaceUserSettings(data)
return JSONResponse({
"requireNeutralization": saved.requireNeutralization,
"allowedProviders": saved.allowedProviders,
"allowedModels": saved.allowedModels,
})
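
A sketch of driving the new endpoint from a client, assuming a hypothetical mount prefix; the path segment and body keys come from the routes above:

# Hypothetical mount prefix and placeholders; path and body keys from the routes above.
import requests

resp = requests.put(
    "https://host/api/workspace/<instanceId>/user-settings",  # prefix is an assumption
    headers={"Authorization": "Bearer <token>"},
    json={
        "requireNeutralization": True,
        "allowedProviders": ["openai"],   # placeholder provider name
        "allowedModels": [],              # empty = all models permitted
    },
)
print(resp.json())  # echoes the saved settings, see putWorkspaceUserSettings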
# =========================================================================
# RAG / Knowledge — anonymised instance statistics (presentation / KPIs)
# =========================================================================

View file

@ -0,0 +1,198 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Lightweight Bootstrap-Telemetrie fuer entfernte Migrationsroutinen.
Wenn eine idempotente Bootstrap-Migration (z.B. ``_migrateAndDropSysAdminRole``)
aus dem Boot-Pfad entfernt wird, koennte ein theoretischer Edge-Case (alte
DB-Restore, manueller INSERT) wieder Legacy-Daten ins System bringen. Damit das
nicht still bleibt, ruft ``initBootstrap`` nach Abschluss aller Init-Schritte
einmalig ``runLegacyDataChecks`` auf -- das logged WARN bei Restbestand.
Designprinzipien:
- KEINE Schreibzugriffe (rein lesend).
- Process-lokal gecached (``_cache``), damit identische Boots/Reloads den Check
nur einmal laufen lassen.
- Pro Check eine Recordset-Abfrage; Ausnahmen werden als WARN geloggt, nicht
re-raised, damit Telemetrie den Boot nie crasht.
"""
from __future__ import annotations
import logging
from typing import Any
from modules.connectors.connectorDbPostgre import DatabaseConnector
from modules.datamodels.datamodelRbac import Role
from modules.datamodels.datamodelUam import Mandate
from modules.shared.mandateNameUtils import isValidMandateName
logger = logging.getLogger(__name__)
_alreadyRan: bool = False
def runLegacyDataChecks(db: DatabaseConnector) -> None:
"""Logged WARN, falls noch Legacy-Daten existieren, die durch entfernte
Migrationsroutinen behandelt wurden. Prozessweit nur einmal aktiv.
Aufruf: am Ende von ``initBootstrap``.
"""
global _alreadyRan
if _alreadyRan:
return
_alreadyRan = True
_checkMandateDescription(db)
_checkMandateSlugRules(db)
_checkLegacyRootMandate(db)
_checkSysadminRole(db)
_backfillTargetFeatureInstanceId()
def _safe(checkName: str, fn) -> Any:
try:
return fn()
except Exception as exc:
logger.warning(
"Legacy-data telemetry check '%s' failed: %s: %s",
checkName, type(exc).__name__, exc,
)
return None
def _checkMandateDescription(db: DatabaseConnector) -> None:
def _do() -> None:
rows = db.getRecordset(Mandate)
bad = [
r.get("id") for r in rows
if r.get("description") and not r.get("label")
]
if bad:
logger.warning(
"Legacy-data check: %d Mandate row(s) still have description "
"but empty label (removed migration: _migrateMandateDescriptionToLabel). "
"Run scripts/script_db_audit_legacy_state.py for details. IDs: %s",
len(bad), bad[:5],
)
_safe("mandate-description", _do)
def _checkMandateSlugRules(db: DatabaseConnector) -> None:
def _do() -> None:
rows = db.getRecordset(Mandate)
seen: set[str] = set()
bad: list[str] = []
for r in sorted(rows, key=lambda x: str(x.get("id", ""))):
mid = r.get("id")
if not mid:
continue
name = (r.get("name") or "").strip()
labelRaw = r.get("label")
labelEmpty = not (labelRaw or "").strip() if labelRaw is not None else True
invalid = not isValidMandateName(name)
collides = name in seen
if not invalid and not collides:
seen.add(name)
if labelEmpty or invalid or collides:
bad.append(str(mid))
if bad:
logger.warning(
"Legacy-data check: %d Mandate row(s) violate slug/label rules "
"(removed migration: _migrateMandateNameLabelSlugRules). "
"Run scripts/script_db_audit_legacy_state.py for details. IDs: %s",
len(bad), bad[:5],
)
_safe("mandate-slug-rules", _do)
def _checkLegacyRootMandate(db: DatabaseConnector) -> None:
def _do() -> None:
legacy = db.getRecordset(Mandate, recordFilter={"name": "Root"})
rootRows = db.getRecordset(Mandate, recordFilter={"name": "root"})
legacyByFlag = [r for r in rootRows if not r.get("isSystem")]
all_ = list(legacy) + legacyByFlag
if all_:
logger.warning(
"Legacy-data check: %d Root-Mandate row(s) still in legacy form "
"(removed migration: initRootMandate-legacy-branch). IDs: %s",
len(all_), [r.get("id") for r in all_][:5],
)
_safe("root-mandate-legacy", _do)
def _checkSysadminRole(db: DatabaseConnector) -> None:
def _do() -> None:
rootMandates = db.getRecordset(
Mandate, recordFilter={"name": "root", "isSystem": True}
)
if not rootMandates:
return
rootId = str(rootMandates[0].get("id"))
rows = db.getRecordset(
Role,
recordFilter={
"roleLabel": "sysadmin",
"mandateId": rootId,
"featureInstanceId": None,
},
)
if rows:
logger.warning(
"Legacy-data check: %d 'sysadmin' role(s) still present in root mandate "
"(removed migration: _migrateAndDropSysAdminRole). "
"Authority is now User.isPlatformAdmin -- migrate manually. IDs: %s",
len(rows), [r.get("id") for r in rows],
)
_safe("sysadmin-role", _do)
def _backfillTargetFeatureInstanceId() -> None:
"""Idempotent backfill: set targetFeatureInstanceId = featureInstanceId
for all non-template AutoWorkflow rows where it is still NULL.
Connects to ``poweron_graphicaleditor`` independently.
"""
def _do() -> None:
from modules.shared.configuration import APP_CONFIG
from modules.features.graphicalEditor.datamodelFeatureGraphicalEditor import AutoWorkflow
dbHost = APP_CONFIG.get("DB_HOST", "localhost")
dbUser = APP_CONFIG.get("DB_USER")
dbPassword = APP_CONFIG.get("DB_PASSWORD_SECRET") or APP_CONFIG.get("DB_PASSWORD")
dbPort = int(APP_CONFIG.get("DB_PORT", 5432))
geDb = DatabaseConnector(
dbHost=dbHost,
dbDatabase="poweron_graphicaleditor",
dbUser=dbUser,
dbPassword=dbPassword,
dbPort=dbPort,
userId=None,
)
if not geDb._ensureTableExists(AutoWorkflow):
return
rows = geDb.getRecordset(AutoWorkflow) or []
backfilled = 0
for r in rows:
if r.get("isTemplate"):
continue
if r.get("targetFeatureInstanceId"):
continue
srcId = r.get("featureInstanceId")
if not srcId:
continue
geDb.recordModify(AutoWorkflow, r["id"], {"targetFeatureInstanceId": srcId})
backfilled += 1
if backfilled:
logger.info(
"targetFeatureInstanceId backfill: set %d non-template AutoWorkflow row(s) "
"to their featureInstanceId",
backfilled,
)
_safe("backfill-targetFeatureInstanceId", _do)

View file

@ -111,6 +111,19 @@ class AiObjects:
processingTime=0.0, bytesSent=0, bytesReceived=0, errorCount=1,
)
allowedModels = getattr(options, 'allowedModels', None) if options else None
if allowedModels:
filteredModels = [m for m in availableModels if m.name in allowedModels]
if filteredModels:
availableModels = filteredModels
else:
errorMsg = f"No models match allowedModels {allowedModels} (providers={allowedProviders}) for operation {options.operationType}"
logger.error(errorMsg)
return AiCallResponse(
content=errorMsg, modelName="error", priceCHF=0.0,
processingTime=0.0, bytesSent=0, bytesReceived=0, errorCount=1,
)
failoverModelList = modelSelector.getFailoverModelList(prompt, context, options, availableModels)
if not failoverModelList:
@ -364,6 +377,19 @@ class AiObjects:
)
return
allowedModels = getattr(options, 'allowedModels', None) if options else None
if allowedModels:
filtered = [m for m in availableModels if m.name in allowedModels]
if filtered:
availableModels = filtered
else:
yield AiCallResponse(
content=f"No models match allowedModels {allowedModels} (providers={allowedProviders}) for operation {options.operationType}",
modelName="error", priceCHF=0.0, processingTime=0.0,
bytesSent=0, bytesReceived=0, errorCount=1,
)
return
failoverModelList = modelSelector.getFailoverModelList(
request.prompt, request.context or "", options, availableModels
)
@ -516,6 +542,14 @@ class AiObjects:
else:
logger.warning(f"No embedding models match allowedProviders {allowedProviders}")
allowedModels = getattr(options, 'allowedModels', None) if options else None
if allowedModels:
filtered = [m for m in availableModels if m.name in allowedModels]
if filtered:
availableModels = filtered
else:
logger.warning(f"No embedding models match allowedModels {allowedModels}")
failoverModelList = modelSelector.getFailoverModelList(
combinedText, "", options, availableModels
)
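
The same whitelist pattern is applied in all three call paths (sync, streaming, embeddings); its semantics in isolation, with hypothetical model objects:

# Minimal sketch of the allowedModels whitelist semantics.
from types import SimpleNamespace

availableModels = [SimpleNamespace(name="model-a"), SimpleNamespace(name="model-b")]
allowedModels = ["model-b"]

filtered = [m for m in availableModels if m.name in allowedModels]
if filtered:
    availableModels = filtered            # restrict the failover candidates
else:
    print("no overlap -> error/warning, never a silent fallback")
print([m.name for m in availableModels])  # -> ['model-b']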

View file

@ -56,14 +56,8 @@ def initBootstrap(db: DatabaseConnector) -> None:
logger.info("Starting system bootstrap")
# Initialize root mandate
mandateId = initRootMandate(db)
# Migrate existing mandate records: description -> label
_migrateMandateDescriptionToLabel(db)
_migrateMandateNameLabelSlugRules(db)
# Clean up duplicate roles and fix corrupted templates FIRST
_deduplicateRoles(db)
# Initialize system role TEMPLATES (mandateId=None, isSystemRole=True)
@ -76,14 +70,6 @@ def initBootstrap(db: DatabaseConnector) -> None:
# This also serves as migration for existing mandates that don't have instance roles yet
_ensureAllMandatesHaveSystemRoles(db)
# Migration: eliminate the legacy ``sysadmin`` role in root mandate
# (replaced by ``User.isPlatformAdmin`` flag — see
# wiki/c-work/4-done/2026-04-sysadmin-authority-split.md).
# Idempotent: noop after first successful run.
if mandateId:
_migrateAndDropSysAdminRole(db, mandateId)
# Ensure UI rules for navigation items (admin/user/viewer roles)
_ensureUiContextRules(db)
# Initialize admin user
@ -129,9 +115,22 @@ def initBootstrap(db: DatabaseConnector) -> None:
# Bootstrap system workflow templates for graphical editor
_bootstrapSystemTemplates(db)
# Sync feature template workflows (update graph of existing instance workflows
# whose templateSourceId matches a current code-defined template)
_syncFeatureTemplateWorkflows()
# Ensure billing settings and accounts exist for all mandates
_bootstrapBilling()
# Telemetry: warn if residues of the removed idempotent migrations
# resurface (edge case: old DB restore or similar).
# Never writes, never fails the boot.
try:
from modules.interfaces._legacyMigrationTelemetry import runLegacyDataChecks
runLegacyDataChecks(db)
except Exception as e:
logger.warning(f"Legacy-data telemetry skipped: {e}")
def _bootstrapBilling() -> None:
"""
@ -195,6 +194,97 @@ def _bootstrapSystemTemplates(db: DatabaseConnector) -> None:
logger.warning(f"System workflow template bootstrap failed: {e}")
def _syncFeatureTemplateWorkflows() -> None:
"""Sync existing instance-scoped workflows with current code-defined templates.
For each feature that exposes getTemplateWorkflows(), find all AutoWorkflow
rows whose templateSourceId matches a template ID and update their graph
if the code-defined version has changed. Preserves instance-specific
fields (label, tags, targetFeatureInstanceId, invocations, active).
Idempotent, runs on every boot.
"""
import json
try:
from modules.system.registry import loadFeatureMainModules
from modules.features.graphicalEditor.datamodelFeatureGraphicalEditor import AutoWorkflow
from modules.features.graphicalEditor.interfaceFeatureGraphicalEditor import graphicalEditorDatabase
mainModules = loadFeatureMainModules()
templatesBySourceId: dict = {}
for featureCode, mod in mainModules.items():
getTemplateWorkflows = getattr(mod, "getTemplateWorkflows", None)
if not getTemplateWorkflows:
continue
try:
templates = getTemplateWorkflows() or []
except Exception:
continue
for tpl in templates:
tplId = tpl.get("id")
if tplId:
templatesBySourceId[tplId] = tpl
if not templatesBySourceId:
logger.info("_syncFeatureTemplateWorkflows: no templates found, skipping")
return
logger.info(f"_syncFeatureTemplateWorkflows: found {len(templatesBySourceId)} template(s): {list(templatesBySourceId.keys())}")
greenfieldDb = DatabaseConnector(
dbHost=APP_CONFIG.get("DB_HOST", "localhost"),
dbDatabase=graphicalEditorDatabase,
dbUser=APP_CONFIG.get("DB_USER"),
dbPassword=APP_CONFIG.get("DB_PASSWORD_SECRET") or APP_CONFIG.get("DB_PASSWORD"),
)
updated = 0
for sourceId, tpl in templatesBySourceId.items():
instances = greenfieldDb.getRecordset(AutoWorkflow, recordFilter={
"templateSourceId": sourceId,
"isTemplate": False,
})
if not instances:
continue
canonicalGraph = tpl.get("graph", {})
for inst in instances:
instId = inst.get("id") if isinstance(inst, dict) else getattr(inst, "id", None)
targetInstanceId = (
inst.get("targetFeatureInstanceId") if isinstance(inst, dict)
else getattr(inst, "targetFeatureInstanceId", None)
) or ""
graphJson = json.dumps(canonicalGraph)
graphJson = graphJson.replace("{{featureInstanceId}}", targetInstanceId)
newGraph = json.loads(graphJson)
existingGraph = inst.get("graph") if isinstance(inst, dict) else getattr(inst, "graph", None)
if isinstance(existingGraph, str):
try:
existingGraph = json.loads(existingGraph)
except Exception:
existingGraph = None
if existingGraph == newGraph:
logger.debug(f"_syncFeatureTemplateWorkflows: graph unchanged for workflow {instId} (template={sourceId})")
continue
logger.debug(f"_syncFeatureTemplateWorkflows: graph DIFFERS for workflow {instId} (template={sourceId}), updating")
greenfieldDb.recordModify(AutoWorkflow, instId, {"graph": newGraph})
updated += 1
logger.info(f"_syncFeatureTemplateWorkflows: updated graph for workflow {instId} (template={sourceId})")
if updated:
logger.info(f"_syncFeatureTemplateWorkflows: synced {updated} workflow(s) with current templates")
else:
logger.info("_syncFeatureTemplateWorkflows: all instance graphs already match current templates")
greenfieldDb.close()
except Exception as e:
logger.warning(f"Feature template workflow sync failed: {e}")
def _buildSystemTemplates():
"""Build the graph definitions for platform system templates."""
return [
@ -396,21 +486,12 @@ def initRootMandate(db: DatabaseConnector) -> Optional[str]:
Returns:
Mandate ID if created or found, None otherwise
"""
# Find existing root mandate by name AND isSystem flag
existingMandates = db.getRecordset(Mandate, recordFilter={"name": "root", "isSystem": True})
if existingMandates:
mandateId = existingMandates[0].get("id")
logger.info(f"Root mandate already exists with ID {mandateId}")
return mandateId
# Check for legacy root mandates (name="Root" without isSystem flag) and migrate
legacyMandates = db.getRecordset(Mandate, recordFilter={"name": "Root"})
if legacyMandates:
mandateId = legacyMandates[0].get("id")
logger.info(f"Migrating legacy Root mandate {mandateId}: setting name='root', isSystem=True")
db.recordModify(Mandate, mandateId, {"name": "root", "isSystem": True})
return mandateId
logger.info("Creating Root mandate")
rootMandate = Mandate(name="root", label="Root", isSystem=True, enabled=True)
createdMandate = db.recordCreate(Mandate, rootMandate)
@ -419,98 +500,6 @@ def initRootMandate(db: DatabaseConnector) -> Optional[str]:
return mandateId
def _migrateMandateDescriptionToLabel(db: DatabaseConnector) -> None:
"""
Migration: Rename 'description' field to 'label' in all Mandate records.
Copies existing 'description' values to 'label' and removes the old field.
Safe to run multiple times (idempotent).
"""
allMandates = db.getRecordset(Mandate)
migratedCount = 0
for mandateRecord in allMandates:
mandateId = mandateRecord.get("id")
hasDescription = "description" in mandateRecord and mandateRecord.get("description") is not None
hasLabel = "label" in mandateRecord and mandateRecord.get("label") is not None
if hasDescription and not hasLabel:
# Copy description to label
updateData = {"label": mandateRecord["description"]}
db.recordModify(Mandate, mandateId, updateData)
migratedCount += 1
logger.info(f"Migrated mandate {mandateId}: description -> label")
if migratedCount > 0:
logger.info(f"Migrated {migratedCount} mandate(s) from description to label")
else:
logger.debug("No mandate description->label migration needed")
def _migrateMandateNameLabelSlugRules(db: DatabaseConnector) -> None:
"""
Migration: normalize Mandate.name to the slug rules ([a-z0-9-], length 2..32,
single-hyphen-separated segments) and ensure Mandate.label is non-empty.
Rules (see wiki/c-work/1-plan/2026-04-mandate-name-label-logic.md):
1. If ``label`` is empty/None set ``label := name`` (or "Mandate" when both empty).
2. If ``name`` is not a valid slug, or collides with an earlier mandate in stable id
order, allocate a unique slug from the (now non-empty) ``label`` using
``slugifyMandateName`` + ``allocateUniqueMandateSlug``.
Idempotent: a second run is a no-op because all valid names stay valid and stay unique.
Each rename and label fill-in is logged for audit.
"""
from modules.shared.mandateNameUtils import (
allocateUniqueMandateSlug,
isValidMandateName,
slugifyMandateName,
)
allRows = db.getRecordset(Mandate)
if not allRows:
return
sortedRows = sorted(allRows, key=lambda r: str(r.get("id", "")))
used: set[str] = set()
labelFills = 0
nameRenames: list[tuple[str, str, str]] = []
for rec in sortedRows:
mid = rec.get("id")
if not mid:
continue
name = (rec.get("name") or "").strip()
labelRaw = rec.get("label")
label = (labelRaw or "").strip() if labelRaw is not None else ""
if not label:
label = name if name else "Mandate"
db.recordModify(Mandate, mid, {"label": label})
labelFills += 1
logger.info(f"Mandate {mid}: filled empty label with '{label}'")
nameFits = isValidMandateName(name)
nameCollides = name in used
if nameFits and not nameCollides:
used.add(name)
continue
base = slugifyMandateName(label) or "mn"
newName = allocateUniqueMandateSlug(base, used)
used.add(newName)
if newName != name:
db.recordModify(Mandate, mid, {"name": newName})
nameRenames.append((str(mid), name, newName))
logger.info(f"Mandate {mid}: renamed name '{name}' -> '{newName}'")
if labelFills or nameRenames:
logger.info(
"Mandate name/label slug migration: %d label fill-in(s), %d name rename(s)",
labelFills, len(nameRenames),
)
else:
logger.debug("No mandate name/label slug migration needed")
def initAdminUser(db: DatabaseConnector, mandateId: Optional[str]) -> Optional[str]:
"""
Creates the Admin user if it doesn't exist.
@ -837,101 +826,6 @@ def copySystemRolesToMandate(db: DatabaseConnector, mandateId: str) -> int:
return copiedCount
def _migrateAndDropSysAdminRole(db: DatabaseConnector, mandateId: str) -> None:
"""
One-shot migration: eliminate the legacy ``sysadmin`` role in the root mandate.
Authority semantics moved to two orthogonal flags on User:
- ``isSysAdmin`` Infrastructure-Operator (RBAC bypass)
- ``isPlatformAdmin`` Cross-Mandate-Governance (no bypass)
Migration steps (idempotent):
1. Find sysadmin role(s) in the root mandate. If none exist, nothing to do.
2. For every UserMandateRole row referencing such a role: set
``user.isPlatformAdmin = True`` (preserves cross-mandate authority).
3. Delete those UserMandateRole rows.
4. Delete AccessRules attached to the sysadmin role.
5. Delete the sysadmin Role record.
Args:
db: Database connector instance
mandateId: Root mandate ID
"""
sysadminRoles = db.getRecordset(
Role,
recordFilter={"roleLabel": "sysadmin", "mandateId": mandateId, "featureInstanceId": None},
)
if not sysadminRoles:
logger.debug("Sysadmin role migration: no legacy sysadmin role present, nothing to do")
return
sysadminRoleIds = [str(r.get("id")) for r in sysadminRoles if r.get("id")]
logger.warning(
f"Sysadmin role migration: found {len(sysadminRoleIds)} legacy sysadmin role(s) "
f"in root mandate, migrating to isPlatformAdmin flag"
)
# 1) Promote every holder to isPlatformAdmin=True
promoted = 0
for sysadminRoleId in sysadminRoleIds:
umRoleRows = db.getRecordset(
UserMandateRole, recordFilter={"roleId": sysadminRoleId}
)
userMandateIds = [str(r.get("userMandateId")) for r in umRoleRows if r.get("userMandateId")]
if not userMandateIds:
continue
# Resolve userIds via UserMandate
userIds = set()
for umId in userMandateIds:
ums = db.getRecordset(UserMandate, recordFilter={"id": umId})
for um in ums:
uid = um.get("userId") if isinstance(um, dict) else getattr(um, "userId", None)
if uid:
userIds.add(str(uid))
for userId in userIds:
users = db.getRecordset(UserInDB, recordFilter={"id": userId})
if not users:
continue
current = users[0].get("isPlatformAdmin", False)
if not current:
db.recordModify(UserInDB, userId, {"isPlatformAdmin": True})
promoted += 1
logger.warning(
f"Sysadmin role migration: granted isPlatformAdmin=True to user {userId}"
)
# 2) Delete UserMandateRole rows
for umRow in umRoleRows:
rowId = umRow.get("id") if isinstance(umRow, dict) else getattr(umRow, "id", None)
if rowId:
try:
db.recordDelete(UserMandateRole, str(rowId))
except Exception as e:
logger.error(f"Sysadmin role migration: failed to drop UserMandateRole {rowId}: {e}")
# 3) Delete AccessRules
accessRules = db.getRecordset(AccessRule, recordFilter={"roleId": sysadminRoleId})
for ar in accessRules:
arId = ar.get("id") if isinstance(ar, dict) else getattr(ar, "id", None)
if arId:
try:
db.recordDelete(AccessRule, str(arId))
except Exception as e:
logger.error(f"Sysadmin role migration: failed to drop AccessRule {arId}: {e}")
# 4) Delete the Role
try:
db.recordDelete(Role, sysadminRoleId)
except Exception as e:
logger.error(f"Sysadmin role migration: failed to drop Role {sysadminRoleId}: {e}")
logger.warning(
f"Sysadmin role migration: completed; promoted {promoted} user(s) to isPlatformAdmin"
)
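A minimal post-run check, reusing the exact filter the migration itself queries with (sketch; db and rootMandateId come from the surrounding init code):
# Sketch: after the migration, no legacy sysadmin role may remain.
leftovers = db.getRecordset(
    Role,
    recordFilter={"roleLabel": "sysadmin", "mandateId": rootMandateId, "featureInstanceId": None},
)
assert not leftovers, "legacy sysadmin role still present"
Because step 1 exits early when nothing matches, running the migration twice is safe.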
def _getRoleId(db: DatabaseConnector, roleLabel: str) -> Optional[str]:
"""
Get role ID by label, using cache or database lookup.

View file

@ -1268,19 +1268,7 @@ class AppObjects:
result = []
for conn_dict in connections:
try:
# Create UserConnection object
connection = UserConnection(
id=conn_dict["id"],
userId=conn_dict["userId"],
authority=conn_dict.get("authority"),
externalId=conn_dict.get("externalId", ""),
externalUsername=conn_dict.get("externalUsername", ""),
externalEmail=conn_dict.get("externalEmail"),
status=conn_dict.get("status", "pending"),
connectedAt=conn_dict.get("connectedAt"),
lastChecked=conn_dict.get("lastChecked"),
expiresAt=conn_dict.get("expiresAt"),
)
connection = UserConnection.model_validate(conn_dict)
result.append(connection)
except Exception as e:
logger.error(
@ -1293,6 +1281,28 @@ class AppObjects:
logger.error(f"Error getting user connections: {str(e)}")
return []
def getActiveKnowledgeConnections(self) -> List[UserConnection]:
"""Return all UserConnections with knowledgeIngestionEnabled=True and status=active.
Used by the daily re-sync scheduler to determine which connections to re-index.
"""
try:
rows = self.db.getRecordset(
UserConnection,
recordFilter={"knowledgeIngestionEnabled": True, "status": ConnectionStatus.ACTIVE.value},
)
result = []
for row in rows or []:
try:
conn = UserConnection.model_validate(row) if isinstance(row, dict) else row
result.append(conn)
except Exception as _e:
logger.warning(f"getActiveKnowledgeConnections: could not parse row: {_e}")
return result
except Exception as e:
logger.error(f"getActiveKnowledgeConnections failed: {e}")
return []
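A sketch of how the daily re-sync scheduler mentioned above might consume this method (the scheduler body is not part of this diff; triggerReindex is a hypothetical stand-in for the actual dispatch):
def nightlyReindexJob(appObjects):
    # Sketch only: re-index every active, ingestion-enabled connection.
    for conn in appObjects.getActiveKnowledgeConnections():
        triggerReindex(conn.id)  # hypothetical dispatch helper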
def getUserConnectionById(self, connectionId: str) -> Optional[UserConnection]:
"""Get a single UserConnection by ID or by reference string (connection:authority:username)."""
try:
@ -1317,18 +1327,21 @@ class AppObjects:
if connections:
conn_dict = connections[0]
return UserConnection(
id=conn_dict["id"],
userId=conn_dict["userId"],
authority=conn_dict.get("authority"),
externalId=conn_dict.get("externalId", ""),
externalUsername=conn_dict.get("externalUsername", ""),
externalEmail=conn_dict.get("externalEmail"),
status=conn_dict.get("status", "pending"),
connectedAt=conn_dict.get("connectedAt"),
lastChecked=conn_dict.get("lastChecked"),
expiresAt=conn_dict.get("expiresAt"),
)
try:
return UserConnection.model_validate(conn_dict)
except Exception:
return UserConnection(
id=conn_dict["id"],
userId=conn_dict["userId"],
authority=conn_dict.get("authority"),
externalId=conn_dict.get("externalId", ""),
externalUsername=conn_dict.get("externalUsername", ""),
externalEmail=conn_dict.get("externalEmail"),
status=conn_dict.get("status", "pending"),
connectedAt=conn_dict.get("connectedAt"),
lastChecked=conn_dict.get("lastChecked"),
expiresAt=conn_dict.get("expiresAt"),
)
return None
except Exception as e:
logger.error(f"Error getting user connection by ID: {str(e)}")
@ -4014,6 +4027,59 @@ class AppObjects:
logger.error(f"Error deleting role {roleId}: {str(e)}")
raise
# -------------------------------------------------------------------------
# Table Grouping (user-defined groups for FormGeneratorTable instances)
# -------------------------------------------------------------------------
def getTableGrouping(self, contextKey: str):
"""
Load the group tree for the current user and the given contextKey.
Returns a TableGrouping instance or None if no grouping has been saved yet.
contextKey identifies the table instance, e.g. "connections", "prompts",
"admin/users", "trustee/{instanceId}/documents".
"""
from modules.datamodels.datamodelPagination import TableGrouping
try:
records = self.db.getRecordset(
TableGrouping,
recordFilter={"userId": str(self.userId), "contextKey": contextKey},
)
if not records:
return None
row = records[0]
return TableGrouping.model_validate(row) if isinstance(row, dict) else row
except Exception as e:
logger.error(f"getTableGrouping failed for user={self.userId} key={contextKey}: {e}")
return None
def upsertTableGrouping(self, contextKey: str, rootGroups: list):
"""
Create or replace the group tree for the current user and contextKey.
rootGroups is a list of TableGroupNode-compatible dicts (the full tree).
Returns the saved TableGrouping instance.
"""
from modules.datamodels.datamodelPagination import TableGrouping
from modules.shared.timeUtils import getUtcTimestamp
try:
existing = self.getTableGrouping(contextKey)
data = {
"id": existing.id if existing else str(uuid.uuid4()),
"userId": str(self.userId),
"contextKey": contextKey,
"rootGroups": rootGroups,
"updatedAt": getUtcTimestamp(),
}
if existing:
self.db.recordModify(TableGrouping, existing.id, data)
else:
self.db.recordCreate(TableGrouping, data)
return TableGrouping.model_validate(data)
except Exception as e:
logger.error(f"upsertTableGrouping failed for user={self.userId} key={contextKey}: {e}")
raise
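For illustration, a caller-side sketch of the grouping API; the node shape follows the TableGroupNode-compatible dicts used throughout this changeset (appInterface stands in for an AppObjects instance):
# Sketch: save a one-group tree for the connections table, then read it back.
tree = [{
    "id": "g1",
    "name": "Production",
    "itemIds": ["conn-123"],
    "subGroups": [],
    "meta": {},
}]
appInterface.upsertTableGrouping("connections", tree)
saved = appInterface.getTableGrouping("connections")  # TableGrouping or None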
# Public Methods

View file

@ -93,6 +93,46 @@ class KnowledgeObjects:
self.db.recordModify(FileContentIndex, fileId, {"status": status})
return True
def deleteFileContentIndexByConnectionId(self, connectionId: str) -> Dict[str, int]:
"""Delete all FileContentIndex rows (and their ContentChunks) for a connection.
Used when a UserConnection is revoked / disconnected, so the knowledge corpus
no longer references data whose access the user has withdrawn. Returns a dict
with counts to support observability logs.
"""
if not connectionId:
return {"indexRows": 0, "chunks": 0}
rows = self.db.getRecordset(
FileContentIndex, recordFilter={"connectionId": connectionId}
)
mandateIds: set = set()
chunkCount = 0
indexCount = 0
for row in rows:
fid = row.get("id") if isinstance(row, dict) else getattr(row, "id", None)
mid = row.get("mandateId") if isinstance(row, dict) else getattr(row, "mandateId", "")
if not fid:
continue
chunks = self.db.getRecordset(ContentChunk, recordFilter={"fileId": fid})
for chunk in chunks:
if self.db.recordDelete(ContentChunk, chunk["id"]):
chunkCount += 1
if self.db.recordDelete(FileContentIndex, fid):
indexCount += 1
if mid:
mandateIds.add(str(mid))
for mid in mandateIds:
try:
from modules.interfaces.interfaceDbBilling import _getRootInterface
_getRootInterface().reconcileMandateStorageBilling(mid)
except Exception as ex:
logger.warning("reconcileMandateStorageBilling after connection purge failed: %s", ex)
return {"indexRows": indexCount, "chunks": chunkCount}
def deleteFileContentIndex(self, fileId: str) -> bool:
"""Delete a FileContentIndex and all associated ContentChunks."""
existing = self.getFileContentIndex(fileId)
@ -603,41 +643,10 @@ def aggregateMandateRagTotalBytes(mandateId: str) -> int:
if rid and str(rid) not in byId:
byId[str(rid)] = row
# DEPRECATED: file-ID-correlation fallback from poweron_management.
# Only needed for pre-migration data where mandateId/featureInstanceId on the
# FileContentIndex are empty. Safe to remove once all environments are migrated.
_fallbackCount = 0
try:
from modules.datamodels.datamodelFiles import FileItem
from modules.interfaces.interfaceDbManagement import ComponentObjects
mgmtDb = ComponentObjects().db
knowledgeIf = getInterface(None)
fileIds: set = set()
for f in mgmtDb.getRecordset(FileItem, recordFilter={"mandateId": mandateId}):
fid = f.get("id") if isinstance(f, dict) else getattr(f, "id", None)
if fid:
fileIds.add(str(fid))
for instId in instIds:
for f in mgmtDb.getRecordset(FileItem, recordFilter={"featureInstanceId": instId}):
fid = f.get("id") if isinstance(f, dict) else getattr(f, "id", None)
if fid:
fileIds.add(str(fid))
for fid in fileIds:
if fid in byId:
continue
row = knowledgeIf.getFileContentIndex(fid)
if row:
byId[fid] = row
_fallbackCount += 1
except Exception as e:
logger.warning("aggregateMandateRagTotalBytes fallback failed: %s", e)
total = sum(int(r.get("totalSize") or 0) for r in byId.values())
logger.info(
"aggregateMandateRagTotalBytes(%s): %d indexes, %d bytes (fallback: %d)",
mandateId, len(byId), total, _fallbackCount,
"aggregateMandateRagTotalBytes(%s): %d indexes, %d bytes",
mandateId, len(byId), total,
)
return total

View file

@ -20,7 +20,6 @@ from modules.security.rbac import RbacClass
from modules.datamodels.datamodelRbac import AccessRuleContext
from modules.datamodels.datamodelUam import AccessLevel
from modules.datamodels.datamodelFiles import FilePreview, FileItem, FileData
from modules.datamodels.datamodelFileFolder import FileFolder
from modules.datamodels.datamodelUtils import Prompt
from modules.datamodels.datamodelMessaging import (
MessagingSubscription,
@ -115,7 +114,15 @@ class ComponentObjects:
# Update database context
self.db.updateContext(self.userId)
def _effective_user_id(self) -> Optional[str]:
"""User id for audit + FileData writes; singleton hub may unset userId but keep currentUser."""
if self.userId:
return self.userId
if self.currentUser is not None:
return getattr(self.currentUser, "id", None)
return None
def __del__(self):
"""Cleanup method to close database connection."""
if hasattr(self, 'db') and self.db is not None:
@ -1103,15 +1110,12 @@ class ComponentObjects:
return newfileName
counter += 1
def createFile(self, name: str, mimeType: str, content: bytes, folderId: Optional[str] = None) -> FileItem:
def createFile(self, name: str, mimeType: str, content: bytes) -> FileItem:
"""Creates a new file entry if user has permission. Computes fileHash and fileSize from content.
Duplicate check: if a file with the same user + fileHash + fileName already exists,
the existing file is returned instead of creating a new one.
Same hash with different name is allowed (intentional copy by user).
Args:
folderId: Optional parent folder ID. None/empty means the root folder.
"""
if not self.checkRbacPermission(FileItem, "create"):
raise PermissionError("No permission to create files")
@ -1139,11 +1143,6 @@ class ComponentObjects:
else:
scope = "personal"
# Normalize folderId: treat empty string as "no folder" (= root) NULL in DB
normalizedFolderId: Optional[str] = folderId
if isinstance(normalizedFolderId, str) and not normalizedFolderId.strip():
normalizedFolderId = None
fileItem = FileItem(
mandateId=mandateId,
featureInstanceId=featureInstanceId,
@ -1152,12 +1151,32 @@ class ComponentObjects:
mimeType=mimeType,
fileSize=fileSize,
fileHash=fileHash,
folderId=normalizedFolderId,
)
# Ensure audit user is always stored: workflow/singleton contexts sometimes leave
# the connector without _current_user_id, so _saveRecord skips sysCreatedBy →
# getFile/createFileData RBAC then breaks (None != self.userId).
uid = self._effective_user_id()
if uid:
fileItem = fileItem.model_copy(update={"sysCreatedBy": str(uid)})
# Store in database
self.db.recordCreate(FileItem, fileItem)
verify = self.db.getRecordset(FileItem, recordFilter={"id": fileItem.id})
verify_creator = (verify[0].get("sysCreatedBy") if verify else None)
logger.info(
"createFile: id=%s name=%s scope=%s model_sysCreatedBy=%r db_sysCreatedBy=%r mandateId=%r featureInstanceId=%r "
"verify_rows=%s db=%s",
fileItem.id,
uniqueName,
fileItem.scope,
getattr(fileItem, "sysCreatedBy", None),
verify_creator,
mandateId or None,
featureInstanceId if featureInstanceId else None,
len(verify) if verify else 0,
getattr(self.db, "dbDatabase", "?"),
)
return fileItem
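Given the duplicate check in the docstring, a repeat call with the same owner, bytes, and name hands back the first row instead of inserting a new one (sketch; componentObjects and pdfBytes are placeholders):
first = componentObjects.createFile("report.pdf", "application/pdf", pdfBytes)
second = componentObjects.createFile("report.pdf", "application/pdf", pdfBytes)
assert second.id == first.id  # dedup hit; a different name would create a copy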
def _isFileOwner(self, file) -> bool:
@ -1277,382 +1296,47 @@ class ComponentObjects:
self.db.connection.rollback()
raise FileDeletionError(f"Error deleting files in batch: {str(e)}")
# ---- Folder methods ----
_RESERVED_FOLDER_NAMES = {"(Global)"}
def _validateFolderName(self, name: str, parentId: Optional[str], excludeFolderId: Optional[str] = None):
"""Ensures folder name is not reserved and is unique within parent."""
if name in self._RESERVED_FOLDER_NAMES:
raise ValueError(f"Folder name '{name}' is reserved")
if not name or not name.strip():
raise ValueError("Folder name cannot be empty")
existingFolders = self.db.getRecordset(FileFolder, recordFilter={"parentId": parentId or ""})
for f in existingFolders:
if f.get("name") == name and f.get("id") != excludeFolderId:
raise ValueError(f"Folder '{name}' already exists in this directory")
def _isDescendantOf(self, folderId: str, ancestorId: str) -> bool:
"""Checks if folderId is a descendant of ancestorId (circular reference check)."""
visited = set()
currentId = folderId
while currentId:
if currentId == ancestorId:
return True
if currentId in visited:
break
visited.add(currentId)
folders = self.db.getRecordset(FileFolder, recordFilter={"id": currentId})
if not folders:
break
currentId = folders[0].get("parentId")
return False
def _ensureFeatureInstanceFolder(self, featureInstanceId: str, mandateId: str = "") -> Optional[str]:
"""Return the folder ID for a feature instance, creating it on first use.
The folder is named after the feature instance label."""
existing = self.db.getRecordset(
FileFolder,
recordFilter={
"featureInstanceId": featureInstanceId,
"sysCreatedBy": self.userId or "",
},
)
if existing:
return existing[0].get("id")
# Resolve the instance label for the folder name
folderName = featureInstanceId[:8]
def _ensureFeatureInstanceGroup(self, featureInstanceId: str, contextKey: str = "files/list") -> Optional[str]:
"""Return the groupId of the default group for a feature instance.
Creates the group if it doesn't exist yet."""
try:
from modules.datamodels.datamodelFeatures import FeatureInstance
from modules.security.rootAccess import getRootDbAppConnector
dbApp = getRootDbAppConnector()
instances = dbApp.getRecordset(FeatureInstance, recordFilter={"id": featureInstanceId})
if instances:
folderName = instances[0].get("label") or folderName
import modules.interfaces.interfaceDbApp as _appIface
appInterface = _appIface.getInterface(self._currentUser)
existing = appInterface.getTableGrouping(contextKey)
nodes = [n.model_dump() if hasattr(n, 'model_dump') else (n if isinstance(n, dict) else vars(n)) for n in (existing.rootGroups if existing else [])]
# Look for group with name matching featureInstanceId
def _find(nds):
for nd in nds:
nid = nd.get("id") if isinstance(nd, dict) else getattr(nd, "id", None)
nmeta = nd.get("meta", {}) if isinstance(nd, dict) else getattr(nd, "meta", {})
if (nmeta or {}).get("featureInstanceId") == featureInstanceId:
return nid
subs = nd.get("subGroups", []) if isinstance(nd, dict) else getattr(nd, "subGroups", [])
result = _find(subs)
if result:
return result
return None
found = _find(nodes)
if found:
return found
# Create new group
import uuid
newId = str(uuid.uuid4())
newGroup = {
"id": newId,
"name": featureInstanceId,
"itemIds": [],
"subGroups": [],
"meta": {"featureInstanceId": featureInstanceId},
}
nodes.append(newGroup)
appInterface.upsertTableGrouping(contextKey, nodes)
return newId
except Exception as e:
logger.warning(f"Could not resolve feature instance label: {e}")
logger.error(f"_ensureFeatureInstanceGroup failed: {e}")
return None
folder = FileFolder(
name=folderName,
parentId=None,
mandateId=mandateId,
featureInstanceId=featureInstanceId,
)
created = self.db.recordCreate(FileFolder, folder)
return created.get("id") if isinstance(created, dict) else getattr(created, "id", None)
def getFolder(self, folderId: str) -> Optional[Dict[str, Any]]:
"""Returns a folder by ID if it belongs to the current user."""
folders = self.db.getRecordset(FileFolder, recordFilter={"id": folderId, "sysCreatedBy": self.userId or ""})
return folders[0] if folders else None
def listFolders(self, parentId: Optional[str] = None) -> List[Dict[str, Any]]:
"""List folders visible to the current user.
Own folders are always returned. Other users' folders are only
returned when they contain files visible to the current user.
Each folder is enriched with ``fileCount``."""
recordFilter = {}
if parentId is not None:
recordFilter["parentId"] = parentId
folders = self.db.getRecordset(FileFolder, recordFilter=recordFilter if recordFilter else None)
if not folders:
return folders
folderIds = [f["id"] for f in folders if f.get("id")]
fileCounts: Dict[str, int] = {}
try:
from modules.interfaces.interfaceRbac import buildFilesScopeWhereClause
scopeClause = buildFilesScopeWhereClause(
self.currentUser, "FileItem", self.db,
self.mandateId, self.featureInstanceId,
[], [],
)
self.db._ensure_connection()
with self.db.connection.cursor() as cursor:
baseQuery = (
'SELECT "folderId", COUNT(*) AS cnt '
'FROM "FileItem" '
'WHERE "folderId" = ANY(%s)'
)
queryValues: list = [folderIds]
if scopeClause:
baseQuery += ' AND (' + scopeClause["condition"] + ')'
queryValues.extend(scopeClause["values"])
baseQuery += ' GROUP BY "folderId"'
cursor.execute(baseQuery, queryValues)
for row in cursor.fetchall():
fileCounts[row["folderId"]] = row["cnt"]
except Exception as e:
logger.warning(f"Could not count files per folder: {e}")
userId = self.userId or ""
result = []
for folder in folders:
fc = fileCounts.get(folder.get("id", ""), 0)
folder["fileCount"] = fc
isOwn = folder.get("sysCreatedBy") == userId
if isOwn or fc > 0:
result.append(folder)
return result
def createFolder(self, name: str, parentId: Optional[str] = None) -> Dict[str, Any]:
"""Create a new folder with unique name validation."""
self._validateFolderName(name, parentId)
folder = FileFolder(
name=name,
parentId=parentId,
mandateId=self.mandateId or "",
featureInstanceId=self.featureInstanceId or "",
)
return self.db.recordCreate(FileFolder, folder)
def renameFolder(self, folderId: str, newName: str) -> bool:
"""Rename a folder with unique name validation."""
folder = self.getFolder(folderId)
if not folder:
raise FileNotFoundError(f"Folder {folderId} not found")
self._validateFolderName(newName, folder.get("parentId"), excludeFolderId=folderId)
return self.db.recordModify(FileFolder, folderId, {"name": newName})
def updateFolder(self, folderId: str, updateData: Dict[str, Any]) -> bool:
"""
Update folder metadata (e.g. ``scope``, ``neutralize``). Owner-only,
same access model as renameFolder/moveFolder. Use ``renameFolder`` for
``name`` changes (uniqueness validation) and ``moveFolder`` for
``parentId`` changes (cycle/uniqueness validation).
"""
if not updateData:
return True
folder = self.getFolder(folderId)
if not folder:
raise FileNotFoundError(f"Folder {folderId} not found")
forbiddenKeys = {"id", "sysCreatedBy", "sysCreatedAt", "sysUpdatedAt"}
cleaned: Dict[str, Any] = {k: v for k, v in updateData.items() if k not in forbiddenKeys}
if "name" in cleaned:
self._validateFolderName(cleaned["name"], folder.get("parentId"), excludeFolderId=folderId)
return self.db.recordModify(FileFolder, folderId, cleaned)
def moveFolder(self, folderId: str, targetParentId: Optional[str] = None) -> bool:
"""Move a folder to a new parent, with circular reference and unique name checks."""
folder = self.getFolder(folderId)
if not folder:
raise FileNotFoundError(f"Folder {folderId} not found")
if targetParentId and self._isDescendantOf(targetParentId, folderId):
raise ValueError("Cannot move folder into its own subtree")
self._validateFolderName(folder.get("name", ""), targetParentId, excludeFolderId=folderId)
return self.db.recordModify(FileFolder, folderId, {"parentId": targetParentId})
def moveFilesBatch(self, fileIds: List[str], targetFolderId: Optional[str] = None) -> Dict[str, Any]:
"""Move multiple files with one SQL update.
Owner can always move; non-owners need RBAC ALL level."""
uniqueIds = [str(fid) for fid in dict.fromkeys(fileIds or []) if fid]
if not uniqueIds:
return {"movedFiles": 0}
if targetFolderId:
targetFolder = self.getFolder(targetFolderId)
if not targetFolder:
raise FileNotFoundError(f"Target folder {targetFolderId} not found")
try:
self.db._ensure_connection()
with self.db.connection.cursor() as cursor:
cursor.execute(
'SELECT "id", "sysCreatedBy" FROM "FileItem" WHERE "id" = ANY(%s)',
(uniqueIds,),
)
rows = cursor.fetchall()
foundIds = {row["id"] for row in rows}
missing = sorted(set(uniqueIds) - foundIds)
if missing:
raise FileNotFoundError(f"Files not found: {missing}")
for row in rows:
self._requireFileWriteAccess(row, row["id"], "update")
accessibleIds = [row["id"] for row in rows]
cursor.execute(
'UPDATE "FileItem" SET "folderId" = %s, "sysModifiedAt" = %s, "sysModifiedBy" = %s '
'WHERE "id" = ANY(%s)',
(targetFolderId, getUtcTimestamp(), self.userId or "", accessibleIds),
)
movedFiles = cursor.rowcount
self.db.connection.commit()
return {"movedFiles": movedFiles}
except Exception as e:
logger.error(f"Error moving files in batch: {e}")
self.db.connection.rollback()
raise FileError(f"Error moving files in batch: {str(e)}")
def moveFoldersBatch(self, folderIds: List[str], targetParentId: Optional[str] = None) -> Dict[str, Any]:
"""Move multiple folders with one SQL update after validation."""
uniqueIds = [str(fid) for fid in dict.fromkeys(folderIds or []) if fid]
if not uniqueIds:
return {"movedFolders": 0}
foldersToMove: List[Dict[str, Any]] = []
for folderId in uniqueIds:
folder = self.getFolder(folderId)
if not folder:
raise FileNotFoundError(f"Folder {folderId} not found")
if targetParentId and self._isDescendantOf(targetParentId, folderId):
raise ValueError("Cannot move folder into its own subtree")
foldersToMove.append(folder)
existingInTarget = self.db.getRecordset(
FileFolder,
recordFilter={"parentId": targetParentId or "", "sysCreatedBy": self.userId or ""},
)
existingNames = {f.get("name"): f.get("id") for f in existingInTarget}
movingNames: Dict[str, str] = {}
movingIds = set(uniqueIds)
for folder in foldersToMove:
name = folder.get("name", "")
folderId = folder.get("id")
if name in movingNames and movingNames[name] != folderId:
raise ValueError(f"Folder '{name}' already exists in this move batch")
movingNames[name] = folderId
existingId = existingNames.get(name)
if existingId and existingId not in movingIds:
raise ValueError(f"Folder '{name}' already exists in target directory")
try:
self.db._ensure_connection()
with self.db.connection.cursor() as cursor:
cursor.execute(
'UPDATE "FileFolder" SET "parentId" = %s, "sysModifiedAt" = %s, "sysModifiedBy" = %s '
'WHERE "id" = ANY(%s) AND "sysCreatedBy" = %s',
(targetParentId, getUtcTimestamp(), self.userId or "", uniqueIds, self.userId or ""),
)
movedFolders = cursor.rowcount
self.db.connection.commit()
return {"movedFolders": movedFolders}
except Exception as e:
logger.error(f"Error moving folders in batch: {e}")
self.db.connection.rollback()
raise FileError(f"Error moving folders in batch: {str(e)}")
def deleteFolder(self, folderId: str, recursive: bool = False) -> Dict[str, Any]:
"""Delete a folder. If recursive, deletes all contents. Returns summary of deletions."""
folder = self.getFolder(folderId)
if not folder:
raise FileNotFoundError(f"Folder {folderId} not found")
childFolders = self.db.getRecordset(FileFolder, recordFilter={"parentId": folderId, "sysCreatedBy": self.userId or ""})
childFiles = self._getFilesByCurrentUser(recordFilter={"folderId": folderId})
if not recursive and (childFolders or childFiles):
raise ValueError(
f"Folder '{folder.get('name')}' is not empty "
f"({len(childFiles)} files, {len(childFolders)} subfolders). "
f"Use recursive=true to delete contents."
)
deletedFiles = 0
deletedFolders = 0
if recursive:
for subFolder in childFolders:
subResult = self.deleteFolder(subFolder["id"], recursive=True)
deletedFiles += subResult.get("deletedFiles", 0)
deletedFolders += subResult.get("deletedFolders", 0)
for childFile in childFiles:
try:
self.deleteFile(childFile["id"])
deletedFiles += 1
except Exception as e:
logger.warning(f"Failed to delete file {childFile['id']} during folder deletion: {e}")
self.db.recordDelete(FileFolder, folderId)
deletedFolders += 1
return {"deletedFiles": deletedFiles, "deletedFolders": deletedFolders}
def deleteFoldersBatch(self, folderIds: List[str], recursive: bool = True) -> Dict[str, Any]:
"""Delete multiple folders and their content in batched SQL calls."""
uniqueIds = [str(fid) for fid in dict.fromkeys(folderIds or []) if fid]
if not uniqueIds:
return {"deletedFiles": 0, "deletedFolders": 0}
if not recursive:
deletedFiles = 0
deletedFolders = 0
for folderId in uniqueIds:
result = self.deleteFolder(folderId, recursive=False)
deletedFiles += result.get("deletedFiles", 0)
deletedFolders += result.get("deletedFolders", 0)
return {"deletedFiles": deletedFiles, "deletedFolders": deletedFolders}
try:
self.db._ensure_connection()
with self.db.connection.cursor() as cursor:
cursor.execute(
'SELECT "id" FROM "FileFolder" WHERE "id" = ANY(%s) AND "sysCreatedBy" = %s',
(uniqueIds, self.userId or ""),
)
rootAccessibleIds = [row["id"] for row in cursor.fetchall()]
if len(rootAccessibleIds) != len(uniqueIds):
missingIds = sorted(set(uniqueIds) - set(rootAccessibleIds))
raise FileNotFoundError(f"Folders not found or not accessible: {missingIds}")
cursor.execute(
"""
WITH RECURSIVE folder_tree AS (
SELECT "id"
FROM "FileFolder"
WHERE "id" = ANY(%s) AND "sysCreatedBy" = %s
UNION ALL
SELECT child."id"
FROM "FileFolder" child
INNER JOIN folder_tree ft ON child."parentId" = ft."id"
WHERE child."sysCreatedBy" = %s
)
SELECT DISTINCT "id" FROM folder_tree
""",
(rootAccessibleIds, self.userId or "", self.userId or ""),
)
allFolderIds = [row["id"] for row in cursor.fetchall()]
cursor.execute(
'SELECT "id" FROM "FileItem" WHERE "folderId" = ANY(%s) AND "sysCreatedBy" = %s',
(allFolderIds, self.userId or ""),
)
allFileIds = [row["id"] for row in cursor.fetchall()]
if allFileIds:
cursor.execute('DELETE FROM "FileData" WHERE "id" = ANY(%s)', (allFileIds,))
cursor.execute(
'DELETE FROM "FileItem" WHERE "id" = ANY(%s) AND "sysCreatedBy" = %s',
(allFileIds, self.userId or ""),
)
deletedFiles = cursor.rowcount
else:
deletedFiles = 0
cursor.execute(
'DELETE FROM "FileFolder" WHERE "id" = ANY(%s) AND "sysCreatedBy" = %s',
(allFolderIds, self.userId or ""),
)
deletedFolders = cursor.rowcount
self.db.connection.commit()
return {"deletedFiles": deletedFiles, "deletedFolders": deletedFolders}
except Exception as e:
logger.error(f"Error deleting folders in batch: {e}")
self.db.connection.rollback()
raise FileDeletionError(f"Error deleting folders in batch: {str(e)}")
def copyFile(self, sourceFileId: str, targetFolderId: Optional[str] = None, newFileName: Optional[str] = None) -> FileItem:
def copyFile(self, sourceFileId: str, newFileName: Optional[str] = None) -> FileItem:
"""Create a full duplicate of a file (FileItem + FileData)."""
sourceFile = self.getFile(sourceFileId)
if not sourceFile:
@ -1665,11 +1349,6 @@ class ComponentObjects:
fileName = newFileName or sourceFile.fileName
copiedFile = self.createFile(fileName, sourceFile.mimeType, sourceData)
if targetFolderId:
self.updateFile(copiedFile.id, {"folderId": targetFolderId})
elif sourceFile.folderId:
self.updateFile(copiedFile.id, {"folderId": sourceFile.folderId})
self.createFileData(copiedFile.id, sourceData)
return copiedFile
@ -1694,14 +1373,134 @@ class ComponentObjects:
return success
# FileData methods - data operations
def _getFileItemForDataWrite(self, fileId: str) -> Optional[FileItem]:
"""Resolve FileItem for storing FileData: RBAC-aware getFile, then same-user row fallback.
createFile() can insert a row that getFile() still hides (e.g. scope NULL vs GROUP rules,
or connector / context edge cases). The creator must still be allowed to attach blob data.
"""
logger.info(
"[FileData] resolve start fileId=%s iface_userId=%r effective_uid=%r mandateId=%r featureInstanceId=%r db=%s",
fileId,
self.userId,
self._effective_user_id(),
self.mandateId,
self.featureInstanceId,
getattr(self.db, "dbDatabase", "?"),
)
file = self.getFile(fileId)
if file:
logger.info("[FileData] getFile OK fileId=%s", fileId)
return file
uid = self._effective_user_id()
if not uid:
logger.error(
"[FileData] FAIL no user id fileId=%s userId=%r hasCurrentUser=%s",
fileId,
self.userId,
self.currentUser is not None,
)
return None
uid_s = str(uid)
rows = self.db.getRecordset(FileItem, recordFilter={"id": fileId})
if not rows:
logger.error(
"[FileData] FAIL no FileItem row fileId=%s (createFile committed to same db? db=%s)",
fileId,
getattr(self.db, "dbDatabase", "?"),
)
return None
row = dict(rows[0])
creator = row.get("sysCreatedBy")
creator_s = str(creator) if creator is not None else None
if creator_s != uid_s:
if not creator_s:
try:
self.db.recordModify(FileItem, fileId, {"sysCreatedBy": uid_s})
row["sysCreatedBy"] = uid_s
logger.warning(
"[FileData] patched NULL sysCreatedBy fileId=%s -> %s",
fileId,
uid_s,
)
except Exception as e:
logger.error(
"[FileData] FAIL patch sysCreatedBy fileId=%s: %s",
fileId,
e,
exc_info=True,
)
return None
else:
# _saveRecord used to overwrite explicit creators with contextvar "system"
if creator_s == "system":
try:
self.db.recordModify(FileItem, fileId, {"sysCreatedBy": uid_s})
row["sysCreatedBy"] = uid_s
logger.warning(
"[FileData] patched sysCreatedBy system→user fileId=%s -> %s",
fileId,
uid_s,
)
except Exception as e:
logger.error(
"[FileData] FAIL patch system sysCreatedBy fileId=%s: %s",
fileId,
e,
exc_info=True,
)
return None
else:
logger.error(
"[FileData] FAIL creator mismatch fileId=%s row.sysCreatedBy=%r (%s) effective_uid=%r (%s) scope=%r",
fileId,
creator,
type(creator).__name__,
uid,
type(uid).__name__,
row.get("scope"),
)
return None
logger.info(
"[FileData] RBAC miss, owner fallback OK fileId=%s scope=%r sysCreatedBy=%r",
fileId,
row.get("scope"),
row.get("sysCreatedBy"),
)
try:
if row.get("sysCreatedAt") is None or row.get("sysCreatedAt") in (0, 0.0):
row["sysCreatedAt"] = getUtcTimestamp()
if row.get("scope") is None:
row["scope"] = "personal"
if row.get("neutralize") is None:
row["neutralize"] = False
return FileItem(**row)
except Exception as e:
logger.error(
"[FileData] FAIL FileItem(**row) fileId=%s keys=%s err=%s",
fileId,
list(row.keys()),
e,
exc_info=True,
)
return None
def createFileData(self, fileId: str, data: bytes) -> bool:
"""Stores the binary data of a file in the database."""
try:
logger.info(
"[FileData] createFileData enter fileId=%s bytes=%s",
fileId,
len(data) if data is not None else 0,
)
# Check file access
file = self.getFile(fileId)
file = self._getFileItemForDataWrite(fileId)
if not file:
logger.error(f"File with ID {fileId} not found when storing data")
logger.error(
"[FileData] FAIL _getFileItemForDataWrite returned None fileId=%s",
fileId,
)
return False
# Determine if this is a text-based format
@ -1745,13 +1544,11 @@ class ComponentObjects:
}
self.db.recordCreate(FileData, fileDataObj)
# Clear cache to ensure fresh data
logger.debug(f"Successfully stored data for file {fileId} (base64Encoded: {base64Encoded})")
logger.info("[FileData] recordCreate OK fileId=%s base64Encoded=%s", fileId, base64Encoded)
return True
except Exception as e:
logger.error(f"Error storing data for file {fileId}: {str(e)}")
logger.error("Error storing data for file %s: %s", fileId, e, exc_info=True)
return False
def getFileData(self, fileId: str) -> Optional[bytes]:
@ -1884,18 +1681,14 @@ class ComponentObjects:
logger.error(f"Error getting file content: {str(e)}")
return None
def saveUploadedFile(self, fileContent: bytes, fileName: str, folderId: Optional[str] = None) -> tuple[FileItem, str]:
"""Saves an uploaded file if user has permission.
Args:
folderId: Optional parent folder ID. None means root folder.
"""
def saveUploadedFile(self, fileContent: bytes, fileName: str) -> tuple[FileItem, str]:
"""Saves an uploaded file if user has permission."""
try:
# Check file creation permission
if not self.checkRbacPermission(FileItem, "create"):
raise PermissionError("No permission to upload files")
logger.debug(f"Starting upload process for file: {fileName} (folderId={folderId!r})")
logger.debug(f"Starting upload process for file: {fileName}")
if not isinstance(fileContent, bytes):
logger.error(f"Invalid fileContent type: {type(fileContent)}")
@ -1921,7 +1714,6 @@ class ComponentObjects:
name=fileName,
mimeType=mimeType,
content=fileContent,
folderId=folderId,
)
# Save binary data

View file

@ -347,6 +347,7 @@ class FeatureInterface:
"templateSourceId": templateId,
"templateScope": "instance",
"active": True,
"targetFeatureInstanceId": instanceId,
})
copied += 1
except Exception as e:

View file

@ -204,7 +204,6 @@ TABLE_NAMESPACE = {
# Files - user-owned
"FileItem": "files",
"FileData": "files",
"FileFolder": "files",
# Automation - user-owned
"AutomationDefinition": "automation",
"AutomationTemplate": "automation",
@ -529,8 +528,7 @@ def getRecordsetPaginatedWithRBAC(
if val is None:
# val=None in pagination.filters means "match empty/null"
# (same convention as connectorDbPostgre._buildPaginationClauses).
# Covers both historical empty-string values and true NULLs
# e.g. root-folder files where folderId may be "" or NULL.
# Covers both historical empty-string values and true NULLs.
whereConditions.append(f'("{key}" IS NULL OR "{key}"::TEXT = \'\')')
continue
if isinstance(val, dict):
@ -689,8 +687,7 @@ def getDistinctColumnValuesWithRBAC(
if val is None:
# val=None in pagination.filters means "match empty/null"
# (same convention as connectorDbPostgre._buildPaginationClauses).
# Covers both historical empty-string values and true NULLs
# e.g. root-folder files where folderId may be "" or NULL.
# Covers both historical empty-string values and true NULLs.
whereConditions.append(f'("{key}" IS NULL OR "{key}"::TEXT = \'\')')
continue
if isinstance(val, dict):
@ -749,6 +746,7 @@ def buildFilesScopeWhereClause(
Only own files: sysCreatedBy = currentUser
WITH instance context (instance pages):
- scope = 'personal' AND sysCreatedBy = me (creator's personal files; e.g. workflow outputs)
- sysCreatedBy = me AND featureInstanceId = X (own personal files of this instance)
- scope = 'featureInstance' AND featureInstanceId = X
- scope = 'mandate' AND mandateId = M (M = mandate of the instance)
@ -782,6 +780,15 @@ def buildFilesScopeWhereClause(
scopeParts: List[str] = []
scopeValues: List = []
# Personal files created by this user must remain visible even when the request
# carries mandate/instance context (GROUP reads use this clause). Otherwise
# createFile → createFileData → getFile fails and workflow outputs vanish from /files.
# Also treat scope IS NULL as legacy/personal for the owner (column default not applied).
scopeParts.append(
'(("scope" = \'personal\' OR "scope" IS NULL) AND "sysCreatedBy" = %s)'
)
scopeValues.append(currentUser.id)
if featureInstanceId:
# 1) Own personal files of this specific instance
scopeParts.append('("sysCreatedBy" = %s AND "featureInstanceId" = %s)')

View file

View file

@ -0,0 +1,240 @@
"""
One-time migration: Convert the FileFolder tree + FileItem.folderId into table_groupings.
Run this BEFORE dropping the physical FileFolder table and FileItem.folderId column
from the database (those are separate Alembic/SQL steps).
Usage:
python -m modules.migrations.migrate_folders_to_groups [--dry-run] [--verbose]
Steps:
1. For each distinct (userId, mandateId) combination that has FileFolder records:
a. Build the full folder tree (recursive)
b. Write it as a TableGroupNode tree into table_groupings (contextKey='files/list');
merges with any existing groups rather than overwriting
c. For each FileItem with a folderId that maps into this tree,
add its id to the matching group's itemIds
2. Print a summary (rows migrated, groups created, files assigned)
3. If not --dry-run: commits the inserts/updates
NOTE: Schema changes (ALTER TABLE DROP COLUMN, DROP TABLE) are intentionally
NOT performed by this script. Run the corresponding Alembic migration
(migrations/versions/xxxx_drop_folder_columns.py) afterwards.
"""
import argparse
import json
import logging
import uuid
from typing import Optional
logger = logging.getLogger(__name__)
# ── Helpers ──────────────────────────────────────────────────────────────────
def _build_tree(folders: list, parent_id: Optional[str]) -> list:
"""Recursively build TableGroupNode-compatible dicts from a flat folder list."""
children = [f for f in folders if f.get("parentId") == parent_id]
result = []
for folder in children:
node = {
"id": str(uuid.uuid4()),
"name": folder["name"],
"itemIds": [],
"subGroups": _build_tree(folders, folder["id"]),
"meta": {"migratedFromFolderId": folder["id"]},
}
result.append(node)
return result
def _assign_files_to_nodes(nodes: list, files_by_folder: dict) -> list:
"""Recursively assign file IDs to group nodes based on folder mapping."""
for node in nodes:
folder_id = (node.get("meta") or {}).get("migratedFromFolderId")
if folder_id and folder_id in files_by_folder:
node["itemIds"] = list(files_by_folder[folder_id])
node["subGroups"] = _assign_files_to_nodes(node.get("subGroups", []), files_by_folder)
return nodes
def _count_items(nodes: list) -> int:
total = 0
for node in nodes:
total += len(node.get("itemIds", []))
total += _count_items(node.get("subGroups", []))
return total
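A tiny worked example of the helpers above (the real group ids are random uuid4 values; "f1"/"f2" stand in for folder rows):
folders = [
    {"id": "f1", "name": "Contracts", "parentId": None},
    {"id": "f2", "name": "2026", "parentId": "f1"},
]
roots = _build_tree(folders, None)   # one root "Contracts" containing "2026"
roots = _assign_files_to_nodes(roots, {"f2": ["file-a", "file-b"]})
assert _count_items(roots) == 2      # both files landed in the "2026" group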
def _now_ts() -> str:
from modules.shared.timeUtils import getUtcTimestamp
return getUtcTimestamp()
# ── Main migration ────────────────────────────────────────────────────────────
def run_migration(dry_run: bool = True, verbose: bool = False):
"""Main migration entry point."""
logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)
logger.info(f"Starting folder→group migration (dry_run={dry_run})")
from modules.connectors.connectorDbPostgre import getCachedConnector
connector = getCachedConnector()
if not connector or not connector.connection:
logger.error("Could not obtain a DB connection. Aborting.")
return
conn = connector.connection
cur = conn.cursor()
# ── 1. Check that the source tables still exist ───────────────────────────
cur.execute("""
SELECT EXISTS (
SELECT 1 FROM information_schema.tables
WHERE table_name = 'FileFolder'
)
""")
folder_table_exists = cur.fetchone()[0]
cur.execute("""
SELECT EXISTS (
SELECT 1 FROM information_schema.columns
WHERE table_name = 'FileItem' AND column_name = 'folderId'
)
""")
folder_column_exists = cur.fetchone()[0]
if not folder_table_exists and not folder_column_exists:
logger.info("FileFolder table and FileItem.folderId column not found — migration already applied or not needed.")
return
if not folder_table_exists:
logger.warning("FileFolder table missing but FileItem.folderId column still present. Only file assignments will be migrated.")
if not folder_column_exists:
logger.warning("FileItem.folderId column missing but FileFolder table still present. Only group tree structure will be migrated.")
# ── 2. Load all folders ───────────────────────────────────────────────────
folders_by_user: dict = {}
if folder_table_exists:
cur.execute('SELECT "id", "name", "parentId", "sysCreatedBy", "mandateId" FROM "FileFolder"')
for row in cur.fetchall():
fid, fname, parent_id, user_id, mandate_id = row
key = (str(user_id), str(mandate_id) if mandate_id else "")
folders_by_user.setdefault(key, []).append({
"id": fid, "name": fname, "parentId": parent_id,
})
logger.info(f"Loaded folders for {len(folders_by_user)} (user, mandate) combinations")
# ── 3. Load file→folder assignments ──────────────────────────────────────
files_by_key: dict = {}
if folder_column_exists:
cur.execute(
'SELECT "id", "folderId", "sysCreatedBy", "mandateId" FROM "FileItem" WHERE "folderId" IS NOT NULL AND "folderId" != \'\''
)
for row in cur.fetchall():
file_id, folder_id, user_id, mandate_id = row
key = (str(user_id), str(mandate_id) if mandate_id else "")
files_by_key.setdefault(key, {}).setdefault(folder_id, []).append(file_id)
total_files = sum(
sum(len(v) for v in d.values()) for d in files_by_key.values()
)
logger.info(f"Found {total_files} file→folder assignments across {len(files_by_key)} (user, mandate) combos")
# ── 4. Combine and upsert groupings ──────────────────────────────────────
all_keys = set(folders_by_user.keys()) | set(files_by_key.keys())
stats = {"groups_created": 0, "groupings_upserted": 0, "files_assigned": 0}
for key in all_keys:
user_id, mandate_id = key
folders = folders_by_user.get(key, [])
files_by_folder = files_by_key.get(key, {})
# Build tree
roots = _build_tree(folders, None)
roots = _assign_files_to_nodes(roots, files_by_folder)
# Handle files in unknown folders (folder no longer in tree)
known_folder_ids = {f["id"] for f in folders}
for folder_id, file_ids in files_by_folder.items():
if folder_id not in known_folder_ids:
# Orphaned files: put them in an "Orphaned" group
roots.append({
"id": str(uuid.uuid4()),
"name": f"Orphaned (folder {folder_id[:8]}…)",
"itemIds": file_ids,
"subGroups": [],
"meta": {"migratedFromFolderId": folder_id, "orphaned": True},
})
if not roots:
continue
n_items = _count_items(roots)
stats["groups_created"] += len(roots)
stats["files_assigned"] += n_items
context_key = "files/list"
if verbose:
logger.debug(f" user={user_id} mandate={mandate_id}: {len(roots)} root groups, {n_items} files")
if not dry_run:
# Check for existing grouping
cur.execute(
'SELECT "id", "rootGroups" FROM "TableGrouping" WHERE "userId" = %s AND "contextKey" = %s',
(user_id, context_key),
)
existing_row = cur.fetchone()
if existing_row:
existing_id, existing_raw = existing_row
existing_roots = json.loads(existing_raw) if isinstance(existing_raw, str) else (existing_raw or [])
# Merge: append migrated groups (avoid duplicates by migratedFromFolderId)
existing_meta_ids = {
(n.get("meta") or {}).get("migratedFromFolderId")
for n in existing_roots
if (n.get("meta") or {}).get("migratedFromFolderId")
}
new_roots = existing_roots + [
r for r in roots
if (r.get("meta") or {}).get("migratedFromFolderId") not in existing_meta_ids
]
cur.execute(
'UPDATE "TableGrouping" SET "rootGroups" = %s, "updatedAt" = %s WHERE "id" = %s',
(json.dumps(new_roots), _now_ts(), existing_id),
)
else:
new_id = str(uuid.uuid4())
cur.execute(
'INSERT INTO "TableGrouping" ("id", "userId", "contextKey", "rootGroups", "updatedAt") VALUES (%s, %s, %s, %s, %s)',
(new_id, user_id, context_key, json.dumps(roots), _now_ts()),
)
stats["groupings_upserted"] += 1
# ── 5. Summary ────────────────────────────────────────────────────────────
if not dry_run:
conn.commit()
logger.info("Migration committed.")
else:
logger.info("DRY RUN — no changes written.")
logger.info(
f"Summary: groupings_upserted={stats['groupings_upserted']}, "
f"groups_created={stats['groups_created']}, "
f"files_assigned={stats['files_assigned']}"
)
logger.info(
"Next steps (run after verifying data):\n"
" 1. Run Alembic migration to DROP COLUMN FileItem.folderId\n"
" 2. Run Alembic migration to DROP TABLE FileFolder"
)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Migrate FileFolder tree to table_groupings")
parser.add_argument("--dry-run", action="store_true", default=True, help="Preview only, no DB writes (default)")
parser.add_argument("--execute", action="store_true", help="Actually write to DB (disables dry-run)")
parser.add_argument("--verbose", action="store_true", help="Show per-user details")
args = parser.parse_args()
dry_run = not args.execute
run_migration(dry_run=dry_run, verbose=args.verbose)
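Note that --dry-run is already the default (store_true with default=True makes the flag itself a no-op); writes only happen with --execute. Typical invocations, per the argparse setup above:
# Preview only (default behaviour):
#   python -m modules.migrations.migrate_folders_to_groups --verbose
# Actually write the groupings:
#   python -m modules.migrations.migrate_folders_to_groups --execute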

View file

@ -0,0 +1,305 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
User-facing Automation Workspace API.
Lists workflow runs the user can access (via FeatureAccess on
targetFeatureInstanceId) and provides detail views with step logs
and linked files. Designed for the "Workspace" tab under
Nutzung > Automation.
"""
import logging
import math
from typing import Optional
from fastapi import APIRouter, Depends, Request, Query, Path, HTTPException
from slowapi import Limiter
from slowapi.util import get_remote_address
from modules.auth.authentication import getRequestContext, RequestContext
from modules.connectors.connectorDbPostgre import DatabaseConnector
from modules.shared.configuration import APP_CONFIG
from modules.features.graphicalEditor.datamodelFeatureGraphicalEditor import (
AutoRun,
AutoStepLog,
AutoWorkflow,
)
from modules.features.graphicalEditor.interfaceFeatureGraphicalEditor import graphicalEditorDatabase
from modules.shared.i18nRegistry import apiRouteContext
routeApiMsg = apiRouteContext("routeAutomationWorkspace")
logger = logging.getLogger(__name__)
limiter = Limiter(key_func=get_remote_address)
router = APIRouter(prefix="/api/automations/runs", tags=["AutomationWorkspace"])
def _getDb() -> DatabaseConnector:
return DatabaseConnector(
dbHost=APP_CONFIG.get("DB_HOST", "localhost"),
dbDatabase=graphicalEditorDatabase,
dbUser=APP_CONFIG.get("DB_USER"),
dbPassword=APP_CONFIG.get("DB_PASSWORD_SECRET") or APP_CONFIG.get("DB_PASSWORD"),
dbPort=int(APP_CONFIG.get("DB_PORT", 5432)),
userId=None,
)
def _getUserAccessibleInstanceIds(userId: str) -> list[str]:
"""Return all featureInstanceIds the user has enabled FeatureAccess for."""
from modules.interfaces.interfaceDbApp import getRootInterface
rootIface = getRootInterface()
allAccess = rootIface.getFeatureAccessesForUser(userId) or []
return [
a.featureInstanceId
for a in allAccess
if a.featureInstanceId and a.enabled
]
_FILE_REF_KEYS = ("fileId", "documentId", "fileIds", "documents")
def _extractFileIdsFromValue(value, accumulator: set[str]) -> None:
"""Recursively scan a value (dict/list/str) for file id references."""
if isinstance(value, dict):
for key, sub in value.items():
if key in _FILE_REF_KEYS:
_collectFileIdsFromRef(sub, accumulator)
else:
_extractFileIdsFromValue(sub, accumulator)
elif isinstance(value, list):
for item in value:
_extractFileIdsFromValue(item, accumulator)
def _collectFileIdsFromRef(val, accumulator: set[str]) -> None:
"""Add file ids from a value located under a known file-reference key."""
if isinstance(val, str) and val:
accumulator.add(val)
elif isinstance(val, list):
for v in val:
if isinstance(v, str) and v:
accumulator.add(v)
elif isinstance(v, dict) and v.get("id"):
accumulator.add(v["id"])
elif isinstance(val, dict) and val.get("id"):
accumulator.add(val["id"])
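A quick worked example of the scan (values are made up; both the flat string form and the nested list form are collected):
found: set[str] = set()
_extractFileIdsFromValue(
    {"result": {"fileId": "a1", "documents": [{"id": "b2"}, "c3"]}},
    found,
)
assert found == {"a1", "b2", "c3"}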
@router.get("")
@limiter.limit("60/minute")
def listWorkspaceRuns(
request: Request,
scope: str = Query("mine", description="mine = own runs, mandate = all accessible"),
status: Optional[str] = Query(None, description="Filter by run status"),
targetInstanceId: Optional[str] = Query(None, description="Filter by targetFeatureInstanceId"),
workflowId: Optional[str] = Query(None, description="Filter by workflow"),
limit: int = Query(50, ge=1, le=200),
offset: int = Query(0, ge=0),
context: RequestContext = Depends(getRequestContext),
) -> dict:
"""List workflow runs visible to the user.
scope=mine: only runs owned by the user.
scope=mandate: all runs where the user has FeatureAccess on the
workflow's targetFeatureInstanceId.
"""
db = _getDb()
if not db._ensureTableExists(AutoRun):
return {"runs": [], "total": 0, "limit": limit, "offset": offset}
userId = str(context.user.id) if context.user else None
if not userId:
raise HTTPException(status_code=401, detail=routeApiMsg("Authentication required"))
accessibleInstanceIds = _getUserAccessibleInstanceIds(userId)
if not accessibleInstanceIds:
return {"runs": [], "total": 0, "limit": limit, "offset": offset}
if not db._ensureTableExists(AutoWorkflow):
return {"runs": [], "total": 0, "limit": limit, "offset": offset}
wfFilter: dict = {}
if targetInstanceId:
if targetInstanceId not in accessibleInstanceIds:
raise HTTPException(status_code=403, detail=routeApiMsg("Access denied to target instance"))
wfFilter["targetFeatureInstanceId"] = targetInstanceId
workflows = db.getRecordset(AutoWorkflow, recordFilter=wfFilter or None) or []
visibleWfIds: set[str] = set()
wfMap: dict = {}
for wf in workflows:
wfDict = dict(wf)
tid = wfDict.get("targetFeatureInstanceId") or wfDict.get("featureInstanceId")
if tid and tid in accessibleInstanceIds:
wfId = wfDict.get("id")
if wfId:
visibleWfIds.add(wfId)
wfMap[wfId] = wfDict
if workflowId:
if workflowId not in visibleWfIds:
return {"runs": [], "total": 0, "limit": limit, "offset": offset}
visibleWfIds = {workflowId}
if not visibleWfIds:
return {"runs": [], "total": 0, "limit": limit, "offset": offset}
allRuns = db.getRecordset(AutoRun, recordFilter={}) or []
filtered = []
for r in allRuns:
row = dict(r)
if row.get("workflowId") not in visibleWfIds:
continue
if scope == "mine" and row.get("ownerId") != userId:
continue
if status and row.get("status") != status:
continue
filtered.append(row)
filtered.sort(
key=lambda x: x.get("startedAt") or x.get("sysCreatedAt") or 0,
reverse=True,
)
total = len(filtered)
page = filtered[offset: offset + limit]
from modules.routes.routeHelpers import enrichRowsWithFkLabels, resolveMandateLabels, resolveInstanceLabels
for row in page:
wf = wfMap.get(row.get("workflowId"), {})
row["workflowLabel"] = row.get("label") or wf.get("label") or row.get("workflowId", "")
row["targetFeatureInstanceId"] = wf.get("targetFeatureInstanceId") or wf.get("featureInstanceId")
enrichRowsWithFkLabels(
page,
labelResolvers={
"mandateId": resolveMandateLabels,
"targetFeatureInstanceId": resolveInstanceLabels,
},
)
for row in page:
row["targetInstanceLabel"] = row.pop("targetFeatureInstanceIdLabel", None)
row["mandateLabel"] = row.pop("mandateIdLabel", None)
return {"runs": page, "total": total, "limit": limit, "offset": offset}
@router.get("/{runId}/detail")
@limiter.limit("60/minute")
def getWorkspaceRunDetail(
request: Request,
runId: str = Path(..., description="Run ID"),
context: RequestContext = Depends(getRequestContext),
) -> dict:
"""Get full detail for a single run: metadata, step logs, linked files."""
db = _getDb()
userId = str(context.user.id) if context.user else None
if not userId:
raise HTTPException(status_code=401, detail=routeApiMsg("Authentication required"))
if not db._ensureTableExists(AutoRun):
raise HTTPException(status_code=404, detail=routeApiMsg("Run not found"))
runs = db.getRecordset(AutoRun, recordFilter={"id": runId})
if not runs:
raise HTTPException(status_code=404, detail=routeApiMsg("Run not found"))
run = dict(runs[0])
wfId = run.get("workflowId")
workflow: dict = {}
if wfId and db._ensureTableExists(AutoWorkflow):
wfs = db.getRecordset(AutoWorkflow, recordFilter={"id": wfId})
if wfs:
workflow = dict(wfs[0])
tid = workflow.get("targetFeatureInstanceId") or workflow.get("featureInstanceId")
accessibleIds = _getUserAccessibleInstanceIds(userId)
isOwner = run.get("ownerId") == userId
if not isOwner and (not tid or tid not in accessibleIds) and not context.isPlatformAdmin:
raise HTTPException(status_code=403, detail=routeApiMsg("Access denied"))
steps: list = []
if db._ensureTableExists(AutoStepLog):
stepRecords = db.getRecordset(AutoStepLog, recordFilter={"runId": runId}) or []
steps = [dict(s) for s in stepRecords]
steps.sort(key=lambda s: s.get("startedAt") or 0)
allFileIds: set[str] = set()
perStepFileIds: list[tuple[set[str], set[str]]] = []
for step in steps:
inputIds: set[str] = set()
outputIds: set[str] = set()
_extractFileIdsFromValue(step.get("inputSnapshot") or {}, inputIds)
_extractFileIdsFromValue(step.get("output") or {}, outputIds)
perStepFileIds.append((inputIds, outputIds))
allFileIds.update(inputIds)
allFileIds.update(outputIds)
nodeOutputs = run.get("nodeOutputs") or {}
runLevelIds: set[str] = set()
_extractFileIdsFromValue(nodeOutputs, runLevelIds)
allFileIds.update(runLevelIds)
fileMetaById: dict[str, dict] = {}
try:
from modules.datamodels.datamodelFiles import FileItem
from modules.interfaces.interfaceDbManagement import ComponentObjects
mgmtDb = ComponentObjects().db
if mgmtDb._ensureTableExists(FileItem):
for fid in allFileIds:
try:
rec = mgmtDb.getRecord(FileItem, fid)
if rec:
recDict = dict(rec)
fileMetaById[fid] = {
"id": fid,
"fileName": recDict.get("fileName") or recDict.get("name"),
}
except Exception:
pass
except Exception as e:
logger.warning("getWorkspaceRunDetail: file lookup failed: %s", e)
def _resolveFileList(ids: set[str]) -> list[dict]:
return [fileMetaById[fid] for fid in ids if fid in fileMetaById]
assignedFileIds: set[str] = set()
for step, (inputIds, outputIds) in zip(steps, perStepFileIds):
step["inputFiles"] = _resolveFileList(inputIds)
step["outputFiles"] = _resolveFileList(outputIds)
assignedFileIds.update(inputIds)
assignedFileIds.update(outputIds)
unassignedFiles = _resolveFileList(allFileIds - assignedFileIds)
allFiles = _resolveFileList(allFileIds)
run["workflowLabel"] = run.get("label") or workflow.get("label") or wfId
run["targetFeatureInstanceId"] = tid
targetInstanceLabel = None
if tid:
try:
from modules.routes.routeHelpers import resolveInstanceLabels
labelMap = resolveInstanceLabels([tid])
targetInstanceLabel = labelMap.get(tid)
except Exception:
pass
run["targetInstanceLabel"] = targetInstanceLabel
return {
"run": run,
"workflow": {
"id": workflow.get("id"),
"label": workflow.get("label"),
"targetFeatureInstanceId": tid,
"featureInstanceId": workflow.get("featureInstanceId"),
"tags": workflow.get("tags", []),
} if workflow else None,
"steps": steps,
"files": allFiles,
"unassignedFiles": unassignedFiles,
}
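
A note on _extractFileIdsFromValue: the helper is defined elsewhere in this module, so only its call sites appear in this hunk. Below is a minimal sketch of the shape such a walker could take; the matched key names are illustrative assumptions, not the real implementation:

def _extractFileIdsFromValue(value, ids: set) -> None:
    # Recursively walk dicts/lists and collect anything that looks like a file ID.
    # The key names ("fileId", "fileIds") are assumptions for illustration only.
    if isinstance(value, dict):
        for key, val in value.items():
            if key == "fileId" and isinstance(val, str):
                ids.add(val)
            elif key == "fileIds" and isinstance(val, list):
                ids.update(str(v) for v in val)
            else:
                _extractFileIdsFromValue(val, ids)
    elif isinstance(value, list):
        for item in value:
            _extractFileIdsFromValue(item, ids)
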


@@ -152,10 +152,28 @@ async def get_connections(
- GET /api/connections/?mode=filterValues&column=status
- GET /api/connections/?mode=ids
"""
from modules.routes.routeHelpers import handleFilterValuesInMemory, handleIdsInMemory, enrichRowsWithFkLabels
from modules.routes.routeHelpers import (
handleFilterValuesInMemory, handleIdsInMemory, enrichRowsWithFkLabels,
handleGroupingInRequest, applyGroupScopeFilter,
)
CONTEXT_KEY = "connections"
# Parse pagination params early — needed for grouping in all modes
paginationParams = None
if pagination:
try:
paginationDict = json.loads(pagination)
if paginationDict:
paginationDict = normalize_pagination_dict(paginationDict)
paginationParams = PaginationParams(**paginationDict)
except (json.JSONDecodeError, ValueError) as e:
raise HTTPException(status_code=400, detail=f"Invalid pagination parameter: {str(e)}")
interface = getInterface(currentUser)
groupCtx = handleGroupingInRequest(paginationParams, interface, CONTEXT_KEY)
def _buildEnhancedItems():
interface = getInterface(currentUser)
connections = interface.getUserConnections(currentUser.id)
items = []
for connection in connections:
@@ -182,6 +200,7 @@ async def get_connections(
try:
items = _buildEnhancedItems()
enrichRowsWithFkLabels(items, UserConnection)
items = applyGroupScopeFilter(items, groupCtx.itemIds)
return handleFilterValuesInMemory(items, column, pagination)
except Exception as e:
logger.error(f"Error getting filter values for connections: {str(e)}")
@@ -189,63 +208,40 @@ async def get_connections(
if mode == "ids":
try:
return handleIdsInMemory(_buildEnhancedItems(), pagination)
items = applyGroupScopeFilter(_buildEnhancedItems(), groupCtx.itemIds)
return handleIdsInMemory(items, pagination)
except Exception as e:
logger.error(f"Error getting IDs for connections: {str(e)}")
raise HTTPException(status_code=500, detail=str(e))
try:
interface = getInterface(currentUser)
# NOTE: Cannot use db.getRecordsetPaginated() here because each connection
# is enriched with computed tokenStatus/tokenExpiresAt (requires per-row DB lookup).
# Token refresh may also trigger a re-fetch. Connections per user are typically < 10,
# so in-memory pagination is acceptable.
# Parse pagination parameter
paginationParams = None
if pagination:
try:
paginationDict = json.loads(pagination)
if paginationDict:
# Normalize pagination dict (handles top-level "search" field)
paginationDict = normalize_pagination_dict(paginationDict)
paginationParams = PaginationParams(**paginationDict)
except (json.JSONDecodeError, ValueError) as e:
raise HTTPException(
status_code=400,
detail=f"Invalid pagination parameter: {str(e)}"
)
# SECURITY FIX: All users (including admins) can only see their own connections.
# This prevents admins from seeing other users' connections and causing confusion.
connections = interface.getUserConnections(currentUser.id)
# Perform silent token refresh for expired OAuth connections
try:
refresh_result = await token_refresh_service.refresh_expired_tokens(currentUser.id)
if refresh_result.get("refreshed", 0) > 0:
logger.info(f"Silently refreshed {refresh_result['refreshed']} tokens for user {currentUser.id}")
# Re-fetch connections to get updated token status
connections = interface.getUserConnections(currentUser.id)
except Exception as e:
logger.warning(f"Silent token refresh failed for user {currentUser.id}: {str(e)}")
# Continue with original connections even if refresh fails
# Enhance each connection with token status information and convert to dict
enhanced_connections_dict = []
for connection in connections:
# Get token status for this connection
tokenStatus, tokenExpiresAt = getTokenStatusForConnection(interface, connection.id)
# Convert to dict for filtering/sorting
connection_dict = {
"id": connection.id,
"userId": connection.userId,
"authority": connection.authority.value if hasattr(connection.authority, 'value') else str(connection.authority),
"externalId": connection.externalId,
"externalUsername": connection.externalUsername or "",
"externalEmail": connection.externalEmail, # Keep None instead of converting to empty string
"externalEmail": connection.externalEmail,
"status": connection.status.value if hasattr(connection.status, 'value') else str(connection.status),
"connectedAt": connection.connectedAt,
"lastChecked": connection.lastChecked,
@@ -254,24 +250,26 @@ async def get_connections(
"tokenExpiresAt": tokenExpiresAt
}
enhanced_connections_dict.append(connection_dict)
enrichRowsWithFkLabels(enhanced_connections_dict, UserConnection)
enhanced_connections_dict = applyGroupScopeFilter(enhanced_connections_dict, groupCtx.itemIds)
if paginationParams is None:
return {
"items": enhanced_connections_dict,
"pagination": None,
"groupTree": groupCtx.groupTree,
}
# Apply filtering if provided
if paginationParams.filters:
component_interface = ComponentObjects()
component_interface.setUserContext(currentUser)
enhanced_connections_dict = component_interface._applyFilters(
enhanced_connections_dict,
enhanced_connections_dict,
paginationParams.filters
)
# Apply sorting if provided
if paginationParams.sort:
component_interface = ComponentObjects()
@@ -280,14 +278,14 @@ async def get_connections(
enhanced_connections_dict,
paginationParams.sort
)
totalItems = len(enhanced_connections_dict)
totalPages = math.ceil(totalItems / paginationParams.pageSize) if totalItems > 0 else 0
startIdx = (paginationParams.page - 1) * paginationParams.pageSize
endIdx = startIdx + paginationParams.pageSize
paged_connections = enhanced_connections_dict[startIdx:endIdx]
return {
"items": paged_connections,
"pagination": PaginationMetadata(
@@ -298,6 +296,7 @@ async def get_connections(
sort=paginationParams.sort,
filters=paginationParams.filters
).model_dump(),
"groupTree": groupCtx.groupTree,
}
except HTTPException:
@@ -351,11 +350,18 @@ def create_connection(
externalUsername="", # Will be set after OAuth
status=ConnectionStatus.PENDING # Start with PENDING status
)
# Apply knowledge consent + preferences from request body before persisting
knowledge_enabled = connection_data.get("knowledgeIngestionEnabled")
if isinstance(knowledge_enabled, bool):
connection.knowledgeIngestionEnabled = knowledge_enabled
knowledge_prefs = connection_data.get("knowledgePreferences")
if isinstance(knowledge_prefs, dict):
connection.knowledgePreferences = knowledge_prefs
# Save connection record - models now handle timestamp serialization automatically
interface.db.recordModify(UserConnection, connection.id, connection.model_dump())
return connection
except HTTPException:
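
The consent handling above gives clients a small, explicit contract: both fields are optional and type-checked before they are applied. A hedged example request body; the preference keys shown are invented for illustration:

payload = {
    "knowledgeIngestionEnabled": True,   # applied only if it is actually a bool
    "knowledgePreferences": {            # applied only if it is actually a dict
        "neutralize": False,             # hypothetical preference key
    },
}
# POSTing this body creates the connection in PENDING status with the consent
# flags already persisted, before the OAuth flow fills in the external identity.
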
@@ -586,8 +592,25 @@ def disconnect_service(
detail=routeApiMsg("Connection not found")
)
# Update connection status
connection.status = ConnectionStatus.INACTIVE
# Fire revoked event BEFORE DB status change so knowledge purge and
# status mutation form one logical step; subscribers see the
# connection as it was. INACTIVE does not exist on the enum — REVOKED
# is the correct terminal-but-retained state (deleted rows are
# handled in DELETE /{id}).
try:
from modules.shared.callbackRegistry import callbackRegistry
callbackRegistry.trigger(
"connection.revoked",
connectionId=connectionId,
authority=str(getattr(connection.authority, "value", connection.authority) or ""),
userId=str(currentUser.id),
reason="disconnected",
)
except Exception as _cbErr:
logger.warning("connection.revoked callback failed for %s: %s", connectionId, _cbErr)
connection.status = ConnectionStatus.REVOKED
connection.lastChecked = getUtcTimestamp()
# Update connection record - models now handle timestamp serialization automatically
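
On the consumer side, a subscriber to this event would receive exactly the keyword arguments passed to trigger() above. The register() call below is an assumed API shape inferred from the trigger usage, not a confirmed signature:

from modules.shared.callbackRegistry import callbackRegistry

def _onConnectionRevoked(connectionId: str, authority: str, userId: str, reason: str) -> None:
    # Hypothetical handler: a knowledge consumer could purge indexed content
    # for the revoked connection here, keyed by connectionId.
    logger.info("connection.revoked received: %s (reason=%s)", connectionId, reason)

callbackRegistry.register("connection.revoked", _onConnectionRevoked)  # assumed API
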
@@ -636,6 +659,23 @@ def delete_connection(
detail=routeApiMsg("Connection not found")
)
# Fire revoked event BEFORE the row disappears so consumers still
# have authority/connection context for observability; purge itself
# targets FileContentIndex rows by connectionId, which are unaffected
# by the UserConnection delete.
try:
from modules.shared.callbackRegistry import callbackRegistry
callbackRegistry.trigger(
"connection.revoked",
connectionId=connectionId,
authority=str(getattr(connection.authority, "value", connection.authority) or ""),
userId=str(currentUser.id),
reason="deleted",
)
except Exception as _cbErr:
logger.warning("connection.revoked callback failed for %s: %s", connectionId, _cbErr)
# Remove the connection - only need connectionId since permissions are verified
interface.removeUserConnection(connectionId)


@@ -12,7 +12,6 @@ from modules.auth import limiter, getCurrentUser, getRequestContext, RequestCont
# Import interfaces
import modules.interfaces.interfaceDbManagement as interfaceDbManagement
from modules.datamodels.datamodelFiles import FileItem, FilePreview
from modules.datamodels.datamodelFileFolder import FileFolder
from modules.shared.attributeUtils import getModelAttributeDefinitions
from modules.datamodels.datamodelUam import User
from modules.datamodels.datamodelPagination import PaginationParams, PaginatedResponse, PaginationMetadata, normalize_pagination_dict
@@ -77,7 +76,7 @@ async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user):
"""Background task: pre-scan + extraction + knowledge indexing.
Step 1: Structure Pre-Scan (AI-free) -> FileContentIndex (persisted)
Step 2: Content extraction via runExtraction -> ContentParts
Step 3: KnowledgeService.indexFile -> chunking + embedding -> Knowledge Store"""
Step 3: KnowledgeService.requestIngestion -> idempotent chunking + embedding -> Knowledge Store"""
userId = user.id if hasattr(user, "id") else str(user)
try:
mgmtInterface = interfaceDbManagement.getInterface(user)
@@ -122,9 +121,30 @@ async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user):
f"{contentIndex.totalObjects} objects"
)
# Persist FileContentIndex immediately
# Persist FileContentIndex immediately.
# IMPORTANT: preserve `_ingestion` metadata and `status="indexed"` from any
# prior successful run — otherwise this upsert wipes the idempotency cache
# and requestIngestion cannot detect duplicates (AC4 breaks).
from modules.interfaces.interfaceDbKnowledge import getInterface as getKnowledgeInterface
knowledgeDb = getKnowledgeInterface()
try:
_existing = knowledgeDb.getFileContentIndex(fileId)
except Exception:
_existing = None
if _existing:
_existingStruct = (
_existing.get("structure") if isinstance(_existing, dict)
else getattr(_existing, "structure", {})
) or {}
_existingStatus = (
_existing.get("status") if isinstance(_existing, dict)
else getattr(_existing, "status", "")
) or ""
if "_ingestion" in _existingStruct:
contentIndex.structure = dict(contentIndex.structure or {})
contentIndex.structure["_ingestion"] = _existingStruct["_ingestion"]
if _existingStatus == "indexed":
contentIndex.status = "indexed"
knowledgeDb.upsertFileContentIndex(contentIndex)
# Step 2: Content extraction (AI-free, produces ContentParts)
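
The duplicate detection this merge protects can be pictured as a content-derived digest recorded under structure["_ingestion"]. A hedged sketch of what such a check might compare; the field names beyond "_ingestion" are assumptions:

import hashlib
import json

def _isDuplicateIngestion(existingIndex: dict, contentObjects: list) -> bool:
    # Digest over the extracted content itself, so an unchanged file that is
    # re-extracted later maps to the same value. Field names are illustrative.
    payload = [(o.get("contentType"), o.get("data")) for o in contentObjects]
    digest = hashlib.sha256(json.dumps(payload, default=str).encode()).hexdigest()
    prior = ((existingIndex or {}).get("structure") or {}).get("_ingestion") or {}
    return prior.get("hash") == digest  # "hash" key is an assumption
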
@@ -134,7 +154,10 @@ async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user):
extractorRegistry = ExtractorRegistry()
chunkerRegistry = ChunkerRegistry()
options = ExtractionOptions()
# mergeStrategy=None: keep per-page / per-section granularity for RAG ingestion.
# The default MergeStrategy concatenates all text parts into a single blob, which
# collapses a 500-page PDF into one ContentChunk and destroys semantic retrieval.
options = ExtractionOptions(mergeStrategy=None)
extracted = runExtraction(
extractorRegistry, chunkerRegistry,
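
The mergeStrategy comment above is worth a concrete picture. A toy illustration of the granularity difference (numbers invented):

pages = [f"Page {i}: ..." for i in range(1, 501)]
merged_chunks = ["\n".join(pages)]  # default merge strategy: one retrievable unit
per_page_chunks = list(pages)       # mergeStrategy=None: 500 retrievable units
# A query about page 312 can only ever match the single merged blob as a whole,
# while per-page chunks let the embedding index rank page 312 directly.
assert len(merged_chunks) == 1 and len(per_page_chunks) == 500
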
@@ -181,15 +204,21 @@ async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user):
)
knowledgeService = getService("knowledge", ctx)
await knowledgeService.indexFile(
fileId=fileId,
fileName=fileName,
mimeType=mimeType,
userId=userId,
featureInstanceId=str(feature_instance_id) if feature_instance_id else "",
mandateId=str(mandate_id) if mandate_id else "",
contentObjects=contentObjects,
structure=contentIndex.structure,
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
await knowledgeService.requestIngestion(
IngestionJob(
sourceKind="file",
sourceId=fileId,
fileName=fileName,
mimeType=mimeType,
userId=userId,
featureInstanceId=str(feature_instance_id) if feature_instance_id else "",
mandateId=str(mandate_id) if mandate_id else "",
contentObjects=contentObjects,
structure=contentIndex.structure,
provenance={"lane": "upload", "route": "routeDataFiles._autoIndexFile"},
)
)
# Re-acquire interface after await to avoid stale user context from the singleton
@@ -249,7 +278,6 @@ def get_files(
try:
paginationDict = json.loads(pagination)
if paginationDict:
# Normalize pagination dict (handles top-level "search" field)
paginationDict = normalize_pagination_dict(paginationDict)
paginationParams = PaginationParams(**paginationDict)
except (json.JSONDecodeError, ValueError) as e:
@@ -257,51 +285,43 @@ def get_files(
status_code=400,
detail=f"Invalid pagination parameter: {str(e)}"
)
from modules.routes.routeHelpers import (
handleIdsMode,
handleFilterValuesInMemory,
handleGroupingInRequest, applyGroupScopeFilter,
)
import modules.interfaces.interfaceDbApp as _appIface
managementInterface = interfaceDbManagement.getInterface(
currentUser,
mandateId=str(context.mandateId) if context.mandateId else None,
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None
)
appInterface = _appIface.getInterface(currentUser)
groupCtx = handleGroupingInRequest(paginationParams, appInterface, "files/list")
def _filesToDicts(fileItems):
return [f.model_dump() if hasattr(f, "model_dump") else (dict(f) if not isinstance(f, dict) else f) for f in fileItems]
if mode == "filterValues":
if not column:
raise HTTPException(status_code=400, detail="column parameter required for mode=filterValues")
allFiles = managementInterface.getAllFiles()
items = allFiles if isinstance(allFiles, list) else (allFiles.items if hasattr(allFiles, "items") else [])
itemDicts = [f.model_dump() if hasattr(f, "model_dump") else (dict(f) if not isinstance(f, dict) else f) for f in items]
itemDicts = _filesToDicts(items)
enrichRowsWithFkLabels(itemDicts, FileItem)
itemDicts = applyGroupScopeFilter(itemDicts, groupCtx.itemIds)
return handleFilterValuesInMemory(itemDicts, column, pagination)
if mode == "ids":
recordFilter = {"sysCreatedBy": managementInterface.userId}
return handleIdsMode(managementInterface.db, FileItem, pagination, recordFilter)
recordFilter = None
if paginationParams and paginationParams.filters and "folderId" in paginationParams.filters:
fVal = paginationParams.filters.get("folderId")
# For a concrete folderId we use recordFilter (exact equality).
# For null / empty (= "root") we keep it in pagination.filters so the
# connector applies `IS NULL OR = ''`: files predating the folderId
# fix were stored with an empty string instead of NULL.
if fVal is None or (isinstance(fVal, str) and fVal.strip() == ""):
paginationParams.filters["folderId"] = None
else:
paginationParams.filters.pop("folderId")
recordFilter = {"folderId": fVal}
result = managementInterface.getAllFiles(pagination=paginationParams, recordFilter=recordFilter)
def _filesToDicts(items):
return [f.model_dump() if hasattr(f, "model_dump") else (dict(f) if not isinstance(f, dict) else f) for f in items]
result = managementInterface.getAllFiles(pagination=paginationParams)
if paginationParams:
enriched = enrichRowsWithFkLabels(_filesToDicts(result.items), FileItem)
enriched = applyGroupScopeFilter(enrichRowsWithFkLabels(_filesToDicts(result.items), FileItem), groupCtx.itemIds)
return {
"items": enriched,
"pagination": PaginationMetadata(
@@ -312,11 +332,12 @@ def get_files(
sort=paginationParams.sort,
filters=paginationParams.filters
).model_dump(),
"groupTree": groupCtx.groupTree,
}
else:
items = result if isinstance(result, list) else (result.items if hasattr(result, "items") else [result])
enriched = enrichRowsWithFkLabels(_filesToDicts(items), FileItem)
return {"items": enriched, "pagination": None}
enriched = applyGroupScopeFilter(enrichRowsWithFkLabels(_filesToDicts(items), FileItem), groupCtx.itemIds)
return {"items": enriched, "pagination": None, "groupTree": groupCtx.groupTree}
except HTTPException:
raise
except Exception as e:
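
For reference, grouping rides inside the same pagination query parameter these routes already parse; a schematic request (values invented):

pagination = {
    "page": 1,
    "pageSize": 25,
    "groupId": "grp-123",      # resolved to itemIds and stripped by handleGroupingInRequest
    # "saveGroupTree": [...],  # optional: persist an updated tree in the same call
}
# The dict is JSON-encoded into the `pagination` query parameter of the list route.
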
@@ -327,6 +348,36 @@ def get_files(
)
def _addFileToGroup(appInterface, fileId: str, groupId: str, contextKey: str = "files/list"):
"""Add a file to a group in the persisted groupTree (upsert)."""
from modules.routes.routeHelpers import _collectItemIds
try:
existing = appInterface.getTableGrouping(contextKey)
if not existing:
return
nodes = [n.model_dump() if hasattr(n, 'model_dump') else n for n in existing.rootGroups]
def _add(nds):
for nd in nds:
nid = nd.get("id") if isinstance(nd, dict) else getattr(nd, "id", None)
if nid == groupId:
itemIds = list(nd.get("itemIds", []) if isinstance(nd, dict) else getattr(nd, "itemIds", []))
if fileId not in itemIds:
itemIds.append(fileId)
if isinstance(nd, dict):
nd["itemIds"] = itemIds
else:
nd.itemIds = itemIds
return True
subs = nd.get("subGroups", []) if isinstance(nd, dict) else getattr(nd, "subGroups", [])
if _add(subs):
return True
return False
_add(nodes)
appInterface.upsertTableGrouping(contextKey, nodes)
except Exception as e:
logger.warning(f"_addFileToGroup failed: {e}")
@router.post("/upload", status_code=status.HTTP_201_CREATED)
@limiter.limit("10/minute")
async def upload_file(
@@ -334,7 +385,7 @@ async def upload_file(
file: UploadFile = File(...),
workflowId: Optional[str] = Form(None),
featureInstanceId: Optional[str] = Form(None),
folderId: Optional[str] = Form(None),
groupId: Optional[str] = Form(None),
currentUser: User = Depends(getCurrentUser),
context: RequestContext = Depends(getRequestContext),
) -> JSONResponse:
@@ -358,31 +409,22 @@ async def upload_file(
status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE,
detail=f"File too large. Maximum size: {interfaceDbManagement.APP_CONFIG.get('File_Management_MAX_UPLOAD_SIZE_MB')}MB"
)
# Normalize folderId: empty string / "null" / "root" → None (root folder)
normalizedFolderId: Optional[str] = folderId
if isinstance(normalizedFolderId, str):
trimmed = normalizedFolderId.strip()
if not trimmed or trimmed.lower() in {"null", "none", "root"}:
normalizedFolderId = None
else:
normalizedFolderId = trimmed
# Save file via LucyDOM interface in the database
fileItem, duplicateType = managementInterface.saveUploadedFile(
fileContent, file.filename, folderId=normalizedFolderId
fileContent, file.filename
)
if featureInstanceId and not fileItem.featureInstanceId:
managementInterface.updateFile(fileItem.id, {"featureInstanceId": featureInstanceId})
fileItem.featureInstanceId = featureInstanceId
# For exact duplicates we keep the existing record, but move it into the
# target folder so the user actually sees their upload land where they expect.
if duplicateType == "exact_duplicate" and normalizedFolderId != getattr(fileItem, "folderId", None):
managementInterface.updateFile(fileItem.id, {"folderId": normalizedFolderId})
fileItem.folderId = normalizedFolderId
# Add to group if groupId was provided
if groupId:
import modules.interfaces.interfaceDbApp as _appIface
appInterface = _appIface.getInterface(currentUser)
_addFileToGroup(appInterface, fileItem.id, groupId)
# Determine response message based on duplicate type
if duplicateType == "exact_duplicate":
message = f"File '{file.filename}' already exists with identical content. Reusing existing file."
@@ -447,347 +489,6 @@ async def upload_file(
detail=f"Error during file upload: {str(e)}"
)
# ── Folder endpoints (MUST be before /{fileId} catch-all) ─────────────────────
@router.get("/folders", response_model=List[Dict[str, Any]])
@limiter.limit("30/minute")
def list_folders(
request: Request,
parentId: Optional[str] = Query(None, description="Parent folder ID (omit for all folders)"),
currentUser: User = Depends(getCurrentUser),
context: RequestContext = Depends(getRequestContext)
) -> List[Dict[str, Any]]:
"""List folders for the current user."""
try:
mgmt = interfaceDbManagement.getInterface(
currentUser,
mandateId=str(context.mandateId) if context.mandateId else None,
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
)
if parentId is not None:
return mgmt.listFolders(parentId=parentId)
return mgmt.listFolders()
except Exception as e:
logger.error(f"Error listing folders: {e}")
raise HTTPException(status_code=500, detail=str(e))
@router.post("/folders", status_code=status.HTTP_201_CREATED)
@limiter.limit("10/minute")
def create_folder(
request: Request,
body: Dict[str, Any] = Body(...),
currentUser: User = Depends(getCurrentUser),
context: RequestContext = Depends(getRequestContext)
) -> Dict[str, Any]:
"""Create a new folder."""
name = body.get("name", "")
parentId = body.get("parentId")
if not name:
raise HTTPException(status_code=400, detail=routeApiMsg("name is required"))
try:
mgmt = interfaceDbManagement.getInterface(
currentUser,
mandateId=str(context.mandateId) if context.mandateId else None,
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
)
return mgmt.createFolder(name=name, parentId=parentId)
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
except Exception as e:
logger.error(f"Error creating folder: {e}")
raise HTTPException(status_code=500, detail=str(e))
@router.put("/folders/{folderId}")
@limiter.limit("10/minute")
def rename_folder(
request: Request,
folderId: str = Path(...),
body: Dict[str, Any] = Body(...),
currentUser: User = Depends(getCurrentUser),
context: RequestContext = Depends(getRequestContext)
) -> Dict[str, Any]:
"""Rename a folder."""
newName = body.get("name", "")
if not newName:
raise HTTPException(status_code=400, detail=routeApiMsg("name is required"))
try:
mgmt = interfaceDbManagement.getInterface(
currentUser,
mandateId=str(context.mandateId) if context.mandateId else None,
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
)
mgmt.renameFolder(folderId, newName)
return {"success": True, "folderId": folderId, "name": newName}
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
except Exception as e:
logger.error(f"Error renaming folder: {e}")
raise HTTPException(status_code=500, detail=str(e))
@router.delete("/folders/{folderId}")
@limiter.limit("10/minute")
def delete_folder(
request: Request,
folderId: str = Path(...),
recursive: bool = Query(False, description="Delete folder contents recursively"),
currentUser: User = Depends(getCurrentUser),
context: RequestContext = Depends(getRequestContext)
) -> Dict[str, Any]:
"""Delete a folder. Use recursive=true to delete non-empty folders."""
try:
mgmt = interfaceDbManagement.getInterface(
currentUser,
mandateId=str(context.mandateId) if context.mandateId else None,
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
)
return mgmt.deleteFolder(folderId, recursive=recursive)
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
except Exception as e:
logger.error(f"Error deleting folder: {e}")
raise HTTPException(status_code=500, detail=str(e))
@router.post("/folders/{folderId}/move")
@limiter.limit("10/minute")
def move_folder(
request: Request,
folderId: str = Path(...),
body: Dict[str, Any] = Body(...),
currentUser: User = Depends(getCurrentUser),
context: RequestContext = Depends(getRequestContext)
) -> Dict[str, Any]:
"""Move a folder to a new parent."""
targetParentId = body.get("targetParentId")
try:
mgmt = interfaceDbManagement.getInterface(
currentUser,
mandateId=str(context.mandateId) if context.mandateId else None,
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
)
mgmt.moveFolder(folderId, targetParentId)
return {"success": True, "folderId": folderId, "parentId": targetParentId}
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
except Exception as e:
logger.error(f"Error moving folder: {e}")
raise HTTPException(status_code=500, detail=str(e))
@router.patch("/folders/{folderId}/scope")
@limiter.limit("10/minute")
def _updateFolderScope(
request: Request,
folderId: str = Path(..., description="ID of the folder"),
scope: str = Body(..., embed=True),
context: RequestContext = Depends(getRequestContext),
) -> Dict[str, Any]:
"""Update the scope of a folder. Propagates to all files inside (recursively). Global scope requires sysAdmin."""
validScopes = {"personal", "featureInstance", "mandate", "global"}
if scope not in validScopes:
raise HTTPException(status_code=400, detail=f"Invalid scope: {scope}. Must be one of {validScopes}")
if scope == "global" and not context.isSysAdmin:
raise HTTPException(status_code=403, detail=routeApiMsg("Only sysadmins can set global scope"))
try:
mgmt = interfaceDbManagement.getInterface(
context.user,
mandateId=str(context.mandateId) if context.mandateId else None,
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
)
folder = mgmt.getFolder(folderId)
if not folder:
raise HTTPException(status_code=404, detail=routeApiMsg("Folder not found"))
mgmt.updateFolder(folderId, {"scope": scope})
fileIds = _collectFolderFileIds(mgmt, folderId)
for fid in fileIds:
try:
mgmt.updateFile(fid, {"scope": scope})
except Exception as e:
logger.error("Folder scope propagation: failed to update file %s: %s", fid, e)
logger.info("Updated scope=%s for folder %s: %d files affected", scope, folderId, len(fileIds))
return {"folderId": folderId, "scope": scope, "filesUpdated": len(fileIds)}
except HTTPException:
raise
except Exception as e:
logger.error(f"Error updating folder scope: {e}")
raise HTTPException(status_code=500, detail=str(e))
@router.patch("/folders/{folderId}/neutralize")
@limiter.limit("10/minute")
def updateFolderNeutralize(
request: Request,
background_tasks: BackgroundTasks,
folderId: str = Path(..., description="ID of the folder"),
neutralize: bool = Body(..., embed=True),
context: RequestContext = Depends(getRequestContext),
) -> Dict[str, Any]:
"""Toggle neutralization on a folder. Propagates to all files inside (recursively).
When turning ON: all files in the folder get ``neutralize=True``, their
knowledge indexes are purged synchronously, and background re-indexing
is triggered.
When turning OFF: files revert to ``neutralize=False`` unless they were
individually marked (not implemented yet -- all are reverted).
"""
try:
mgmt = interfaceDbManagement.getInterface(
context.user,
mandateId=str(context.mandateId) if context.mandateId else None,
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
)
folder = mgmt.getFolder(folderId)
if not folder:
raise HTTPException(status_code=404, detail=routeApiMsg("Folder not found"))
mgmt.updateFolder(folderId, {"neutralize": neutralize})
fileIds = _collectFolderFileIds(mgmt, folderId)
logger.info("Folder neutralize toggle %s for folder %s: %d files affected", neutralize, folderId, len(fileIds))
from modules.interfaces.interfaceDbKnowledge import getInterface as getKnowledgeInterface
knowledgeDb = getKnowledgeInterface()
for fid in fileIds:
try:
mgmt.updateFile(fid, {"neutralize": neutralize})
if neutralize:
try:
knowledgeDb.deleteFileContentIndex(fid)
except Exception as e:
logger.warning("Folder neutralize: failed to purge index for file %s: %s", fid, e)
else:
try:
from modules.datamodels.datamodelKnowledge import FileContentIndex
indices = knowledgeDb.db.getRecordset(FileContentIndex, recordFilter={"id": fid})
for idx in indices:
idxId = idx.get("id") if isinstance(idx, dict) else getattr(idx, "id", None)
if idxId:
knowledgeDb.db.recordModify(FileContentIndex, idxId, {
"neutralizationStatus": "original",
"isNeutralized": False,
})
except Exception as e:
logger.warning("Folder neutralize OFF: metadata update failed for %s: %s", fid, e)
except Exception as e:
logger.error("Folder neutralize: failed to update file %s: %s", fid, e)
for fid in fileIds:
fileMeta = mgmt.getFile(fid)
if fileMeta:
fn = fileMeta.fileName if hasattr(fileMeta, "fileName") else fileMeta.get("fileName", "")
mt = fileMeta.mimeType if hasattr(fileMeta, "mimeType") else fileMeta.get("mimeType", "")
async def _reindex(fileId=fid, fileName=fn, mimeType=mt):
try:
await _autoIndexFile(fileId=fileId, fileName=fileName, mimeType=mimeType, user=context.user)
except Exception as ex:
logger.error("Folder neutralize re-index failed for %s: %s", fileId, ex)
background_tasks.add_task(_reindex)
return {"folderId": folderId, "neutralize": neutralize, "filesUpdated": len(fileIds)}
except HTTPException:
raise
except Exception as e:
logger.error(f"Error updating folder neutralize flag: {e}")
raise HTTPException(status_code=500, detail=str(e))
def _collectFolderFileIds(mgmt, folderId: str) -> List[str]:
"""Recursively collect all file IDs in a folder and its sub-folders."""
fileIds = []
try:
files = mgmt.listFiles(folderId=folderId)
if isinstance(files, dict):
files = files.get("files", [])
for f in (files or []):
fid = f.get("id") if isinstance(f, dict) else getattr(f, "id", None)
if fid:
fileIds.append(fid)
except Exception as e:
logger.warning("_collectFolderFileIds: listFiles failed for folder %s: %s", folderId, e)
try:
subFolders = mgmt.listFolders(parentId=folderId)
for sf in (subFolders or []):
sfId = sf.get("id") if isinstance(sf, dict) else getattr(sf, "id", None)
if sfId:
fileIds.extend(_collectFolderFileIds(mgmt, sfId))
except Exception as e:
logger.warning("_collectFolderFileIds: listFolders failed for folder %s: %s", folderId, e)
return fileIds
@router.get("/folders/{folderId}/download")
@limiter.limit("10/minute")
def download_folder(
request: Request,
folderId: str = Path(..., description="ID of the folder to download as ZIP"),
currentUser: User = Depends(getCurrentUser),
context: RequestContext = Depends(getRequestContext)
) -> Response:
"""Download a folder (including subfolders) as a ZIP archive."""
import io
import zipfile
import urllib.parse
try:
mgmt = interfaceDbManagement.getInterface(
currentUser,
mandateId=str(context.mandateId) if context.mandateId else None,
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
)
folder = mgmt.getFolder(folderId)
if not folder:
raise HTTPException(status_code=404, detail=f"Folder {folderId} not found")
folderName = folder.get("name", "download")
def _collectFiles(parentId: str, pathPrefix: str):
"""Recursively collect (zipPath, fileId) tuples."""
entries = []
for f in mgmt._getFilesByCurrentUser(recordFilter={"folderId": parentId}):
fname = f.get("fileName") or f.get("name") or f.get("id", "file")
entries.append((f"{pathPrefix}{fname}", f["id"]))
for sub in mgmt.listFolders(parentId=parentId):
subName = sub.get("name", sub["id"])
entries.extend(_collectFiles(sub["id"], f"{pathPrefix}{subName}/"))
return entries
fileEntries = _collectFiles(folderId, "")
if not fileEntries:
raise HTTPException(status_code=404, detail=routeApiMsg("Folder is empty"))
buf = io.BytesIO()
with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf:
for zipPath, fileId in fileEntries:
data = mgmt.getFileData(fileId)
if data:
zf.writestr(zipPath, data)
buf.seek(0)
zipBytes = buf.getvalue()
encodedName = urllib.parse.quote(f"{folderName}.zip")
return Response(
content=zipBytes,
media_type="application/zip",
headers={
"Content-Disposition": f"attachment; filename*=UTF-8''{encodedName}"
}
)
except HTTPException:
raise
except Exception as e:
logger.error(f"Error downloading folder as ZIP: {e}")
raise HTTPException(status_code=500, detail=f"Error downloading folder: {str(e)}")
@router.post("/batch-delete")
@@ -798,13 +499,11 @@ def batch_delete_items(
currentUser: User = Depends(getCurrentUser),
context: RequestContext = Depends(getRequestContext)
) -> Dict[str, Any]:
"""Batch delete files/folders with a single SQL-backed operation per type."""
"""Batch delete files."""
fileIds = body.get("fileIds") or []
folderIds = body.get("folderIds") or []
recursiveFolders = bool(body.get("recursiveFolders", True))
if not isinstance(fileIds, list) or not isinstance(folderIds, list):
raise HTTPException(status_code=400, detail=routeApiMsg("fileIds and folderIds must be arrays"))
if not isinstance(fileIds, list):
raise HTTPException(status_code=400, detail=routeApiMsg("fileIds must be an array"))
try:
mgmt = interfaceDbManagement.getInterface(
@@ -813,17 +512,12 @@ def batch_delete_items(
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
)
result = {"deletedFiles": 0, "deletedFolders": 0}
result = {"deletedFiles": 0}
if fileIds:
fileResult = mgmt.deleteFilesBatch(fileIds)
result["deletedFiles"] += fileResult.get("deletedFiles", 0)
if folderIds:
folderResult = mgmt.deleteFoldersBatch(folderIds, recursive=recursiveFolders)
result["deletedFiles"] += folderResult.get("deletedFiles", 0)
result["deletedFolders"] += folderResult.get("deletedFolders", 0)
return result
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
@@ -832,45 +526,189 @@ def batch_delete_items(
raise HTTPException(status_code=500, detail=str(e))
@router.post("/batch-move")
@limiter.limit("10/minute")
def batch_move_items(
request: Request,
body: Dict[str, Any] = Body(...),
currentUser: User = Depends(getCurrentUser),
context: RequestContext = Depends(getRequestContext)
) -> Dict[str, Any]:
"""Batch move files/folders with a single SQL-backed operation per type."""
fileIds = body.get("fileIds") or []
folderIds = body.get("folderIds") or []
targetFolderId = body.get("targetFolderId")
targetParentId = body.get("targetParentId")
if not isinstance(fileIds, list) or not isinstance(folderIds, list):
raise HTTPException(status_code=400, detail=routeApiMsg("fileIds and folderIds must be arrays"))
# ── Group bulk endpoints ──────────────────────────────────────────────────────
def _get_group_item_ids(contextKey: str, groupId: str, appInterface) -> set:
"""Collect all file IDs in a group and its sub-groups from the stored groupTree."""
from modules.routes.routeHelpers import _collectItemIds
try:
mgmt = interfaceDbManagement.getInterface(
existing = appInterface.getTableGrouping(contextKey)
if not existing:
return set()
nodes = [n.model_dump() if hasattr(n, 'model_dump') else n for n in existing.rootGroups]
result = _collectItemIds(nodes, groupId)
return result or set()
except Exception as e:
logger.error(f"_get_group_item_ids failed for groupId={groupId}: {e}")
return set()
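
The stored groupTree these endpoints traverse is a recursive node structure with id, itemIds, and subGroups, as the code above reads it. A hand-written example of what the resolution yields:

groupTree = [
    {
        "id": "grp-root",
        "itemIds": ["file-a"],
        "subGroups": [
            {"id": "grp-child", "itemIds": ["file-b", "file-c"], "subGroups": []},
        ],
    },
]
# _collectItemIds(groupTree, "grp-root")  -> {"file-a", "file-b", "file-c"}
# _collectItemIds(groupTree, "grp-child") -> {"file-b", "file-c"}
# _collectItemIds(groupTree, "missing")   -> None (group not in the tree)
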
@router.patch("/groups/{groupId}/scope")
@limiter.limit("60/minute")
def patch_group_scope(
request: Request,
groupId: str = Path(..., description="Group ID"),
body: dict = Body(...),
currentUser: User = Depends(getCurrentUser),
context: RequestContext = Depends(getRequestContext),
):
"""Set scope for all files in a group (recursive)."""
scope = body.get("scope")
if not scope:
raise HTTPException(status_code=400, detail="scope is required")
try:
import modules.interfaces.interfaceDbApp as _appIface
managementInterface = interfaceDbManagement.getInterface(
currentUser,
mandateId=str(context.mandateId) if context.mandateId else None,
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
)
result = {"movedFiles": 0, "movedFolders": 0}
if fileIds:
fileResult = mgmt.moveFilesBatch(fileIds, targetFolderId=targetFolderId)
result["movedFiles"] += fileResult.get("movedFiles", 0)
if folderIds:
folderResult = mgmt.moveFoldersBatch(folderIds, targetParentId=targetParentId)
result["movedFolders"] += folderResult.get("movedFolders", 0)
return result
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
appInterface = _appIface.getInterface(currentUser)
fileIds = _get_group_item_ids("files/list", groupId, appInterface)
updated = 0
for fid in fileIds:
try:
managementInterface.updateFile(fid, {"scope": scope})
updated += 1
except Exception as e:
logger.error(f"patch_group_scope: failed to update file {fid}: {e}")
return {"groupId": groupId, "scope": scope, "filesUpdated": updated}
except HTTPException:
raise
except Exception as e:
logger.error(f"Error in batch move: {e}")
logger.error(f"patch_group_scope error: {e}")
raise HTTPException(status_code=500, detail=str(e))
@router.patch("/groups/{groupId}/neutralize")
@limiter.limit("60/minute")
def patch_group_neutralize(
request: Request,
groupId: str = Path(..., description="Group ID"),
body: dict = Body(...),
currentUser: User = Depends(getCurrentUser),
context: RequestContext = Depends(getRequestContext),
):
"""Toggle neutralize for all files in a group (recursive, incl. knowledge purge/reindex)."""
neutralize = body.get("neutralize")
if neutralize is None:
raise HTTPException(status_code=400, detail="neutralize is required")
try:
import modules.interfaces.interfaceDbApp as _appIface
managementInterface = interfaceDbManagement.getInterface(
currentUser,
mandateId=str(context.mandateId) if context.mandateId else None,
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
)
appInterface = _appIface.getInterface(currentUser)
fileIds = _get_group_item_ids("files/list", groupId, appInterface)
updated = 0
for fid in fileIds:
try:
managementInterface.updateFile(fid, {"neutralize": neutralize})
if not neutralize:
try:
from modules.interfaces import interfaceDbKnowledge
kIface = interfaceDbKnowledge.getInterface(currentUser)
kIface.purgeFileKnowledge(fid)
except Exception as ke:
logger.warning(f"patch_group_neutralize: knowledge purge failed for {fid}: {ke}")
updated += 1
except Exception as e:
logger.error(f"patch_group_neutralize: failed for file {fid}: {e}")
return {"groupId": groupId, "neutralize": neutralize, "filesUpdated": updated}
except HTTPException:
raise
except Exception as e:
logger.error(f"patch_group_neutralize error: {e}")
raise HTTPException(status_code=500, detail=str(e))
@router.get("/groups/{groupId}/download")
@limiter.limit("20/minute")
async def download_group_zip(
request: Request,
groupId: str = Path(..., description="Group ID"),
currentUser: User = Depends(getCurrentUser),
context: RequestContext = Depends(getRequestContext),
):
"""Download all files in a group as a ZIP archive."""
import io, zipfile
try:
import modules.interfaces.interfaceDbApp as _appIface
managementInterface = interfaceDbManagement.getInterface(
currentUser,
mandateId=str(context.mandateId) if context.mandateId else None,
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
)
appInterface = _appIface.getInterface(currentUser)
fileIds = _get_group_item_ids("files/list", groupId, appInterface)
if not fileIds:
raise HTTPException(status_code=404, detail="Group not found or empty")
buf = io.BytesIO()
with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf:
for fid in fileIds:
try:
fileMeta = managementInterface.getFile(fid)
fileData = managementInterface.getFileData(fid)
if fileMeta and fileData:
name = (fileMeta.get("fileName") if isinstance(fileMeta, dict) else getattr(fileMeta, "fileName", fid)) or fid
zf.writestr(name, fileData)
except Exception as fe:
logger.warning(f"download_group_zip: skipping file {fid}: {fe}")
buf.seek(0)
from fastapi.responses import StreamingResponse
return StreamingResponse(
buf,
media_type="application/zip",
headers={"Content-Disposition": f'attachment; filename="group-{groupId}.zip"'},
)
except HTTPException:
raise
except Exception as e:
logger.error(f"download_group_zip error: {e}")
raise HTTPException(status_code=500, detail=str(e))
@router.delete("/groups/{groupId}")
@limiter.limit("30/minute")
def delete_group(
request: Request,
groupId: str = Path(..., description="Group ID"),
deleteItems: bool = Query(False, description="If true, also delete all files in the group"),
currentUser: User = Depends(getCurrentUser),
context: RequestContext = Depends(getRequestContext),
):
"""Remove a group from the groupTree. Optionally delete all its files."""
try:
import modules.interfaces.interfaceDbApp as _appIface
appInterface = _appIface.getInterface(currentUser)
fileIds = _get_group_item_ids("files/list", groupId, appInterface)
# Remove group from tree
existing = appInterface.getTableGrouping("files/list")
if existing:
from modules.routes.routeHelpers import _removeGroupFromTree
newRoots = _removeGroupFromTree([n.model_dump() if hasattr(n, 'model_dump') else n for n in existing.rootGroups], groupId)
appInterface.upsertTableGrouping("files/list", newRoots)
# Optionally delete files
deletedFiles = 0
if deleteItems:
managementInterface = interfaceDbManagement.getInterface(
currentUser,
mandateId=str(context.mandateId) if context.mandateId else None,
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
)
for fid in fileIds:
try:
managementInterface.deleteFile(fid)
deletedFiles += 1
except Exception as e:
logger.error(f"delete_group: failed to delete file {fid}: {e}")
return {"groupId": groupId, "deletedFiles": deletedFiles}
except HTTPException:
raise
except Exception as e:
logger.error(f"delete_group error: {e}")
raise HTTPException(status_code=500, detail=str(e))
@@ -1071,7 +909,7 @@ def update_file(
) -> FileItem:
"""Update file info"""
try:
_EDITABLE_FIELDS = {"fileName", "scope", "tags", "description", "folderId", "neutralize"}
_EDITABLE_FIELDS = {"fileName", "scope", "tags", "description", "neutralize"}
safeData = {k: v for k, v in file_info.items() if k in _EDITABLE_FIELDS}
if not safeData:
raise HTTPException(status_code=400, detail=routeApiMsg("No editable fields provided"))
@@ -1226,37 +1064,3 @@ def preview_file(
)
@router.post("/{fileId}/move")
@limiter.limit("10/minute")
def move_file(
request: Request,
fileId: str = Path(...),
body: Dict[str, Any] = Body(...),
currentUser: User = Depends(getCurrentUser),
context: RequestContext = Depends(getRequestContext)
) -> Dict[str, Any]:
"""Move a file to a different folder."""
targetFolderId = body.get("targetFolderId")
try:
mgmt = interfaceDbManagement.getInterface(
currentUser,
mandateId=str(context.mandateId) if context.mandateId else None,
featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None,
)
mgmt.updateFile(fileId, {"folderId": targetFolderId})
if targetFolderId:
try:
targetFolder = mgmt.getFolder(targetFolderId)
folderNeut = (targetFolder.get("neutralize") if isinstance(targetFolder, dict)
else getattr(targetFolder, "neutralize", False)) if targetFolder else False
if folderNeut:
mgmt.updateFile(fileId, {"neutralize": True})
logger.info("File %s moved to neutralized folder %s — inherited neutralize=True", fileId, targetFolderId)
except Exception as e:
logger.warning("File move: folder neutralize inheritance check failed for %s: %s", fileId, e)
return {"success": True, "fileId": fileId, "folderId": targetFolderId}
except Exception as e:
logger.error(f"Error moving file: {e}")
raise HTTPException(status_code=500, detail=str(e))


@@ -112,8 +112,8 @@ def get_mandates(
status_code=status.HTTP_403_FORBIDDEN,
detail=routeApiMsg("Admin role required")
)
# Parse pagination parameter
# Parse pagination parameter early — needed for grouping in all modes
paginationParams = None
if pagination:
try:
@@ -126,14 +126,24 @@ def get_mandates(
status_code=400,
detail=f"Invalid pagination parameter: {str(e)}"
)
from modules.routes.routeHelpers import (
handleFilterValuesInMemory, handleIdsInMemory,
handleFilterValuesMode, handleIdsMode,
parseCrossFilterPagination,
handleGroupingInRequest, applyGroupScopeFilter,
)
appInterface = interfaceDbApp.getRootInterface()
groupCtx = handleGroupingInRequest(paginationParams, appInterface, "mandates")
def _mandateItemsForAdmin():
items = []
for mid in adminMandateIds:
m = appInterface.getMandate(mid)
if m and getattr(m, "enabled", True):
items.append(m.model_dump() if hasattr(m, 'model_dump') else m if isinstance(m, dict) else vars(m))
return items
if mode == "filterValues":
if not column:
@@ -144,54 +154,42 @@ def get_mandates(
values = appInterface.db.getDistinctColumnValues(Mandate, column, crossPagination)
return JSONResponse(content=sorted(values, key=lambda v: str(v).lower()))
else:
mandateItems = []
for mid in adminMandateIds:
m = appInterface.getMandate(mid)
if m and getattr(m, "enabled", True):
mandateItems.append(m.model_dump() if hasattr(m, 'model_dump') else m if isinstance(m, dict) else vars(m))
mandateItems = applyGroupScopeFilter(_mandateItemsForAdmin(), groupCtx.itemIds)
return handleFilterValuesInMemory(mandateItems, column, pagination)
if mode == "ids":
if isPlatformAdmin:
return handleIdsMode(appInterface.db, Mandate, pagination)
else:
mandateItems = []
for mid in adminMandateIds:
m = appInterface.getMandate(mid)
if m and getattr(m, "enabled", True):
mandateItems.append(m.model_dump() if hasattr(m, 'model_dump') else m if isinstance(m, dict) else vars(m))
mandateItems = applyGroupScopeFilter(_mandateItemsForAdmin(), groupCtx.itemIds)
return handleIdsInMemory(mandateItems, pagination)
if isPlatformAdmin:
result = appInterface.getAllMandates(pagination=paginationParams)
else:
allMandates = []
for mandateId in adminMandateIds:
mandate = appInterface.getMandate(mandateId)
if mandate and getattr(mandate, "enabled", True):
mandateDict = mandate if isinstance(mandate, dict) else mandate.model_dump() if hasattr(mandate, 'model_dump') else vars(mandate)
allMandates.append(mandateDict)
result = allMandates
paginationParams = None
if paginationParams and hasattr(result, 'items'):
return PaginatedResponse(
items=result.items,
pagination=PaginationMetadata(
currentPage=paginationParams.page,
pageSize=paginationParams.pageSize,
totalItems=result.totalItems,
totalPages=result.totalPages,
sort=paginationParams.sort,
filters=paginationParams.filters
items = result.items if hasattr(result, 'items') else (result if isinstance(result, list) else [])
items = applyGroupScopeFilter(
[i.model_dump() if hasattr(i, 'model_dump') else (i if isinstance(i, dict) else vars(i)) for i in items],
groupCtx.itemIds,
)
if paginationParams and hasattr(result, 'items'):
return PaginatedResponse(
items=items,
pagination=PaginationMetadata(
currentPage=paginationParams.page,
pageSize=paginationParams.pageSize,
totalItems=result.totalItems,
totalPages=result.totalPages,
sort=paginationParams.sort,
filters=paginationParams.filters
),
groupTree=groupCtx.groupTree,
)
)
else:
return PaginatedResponse(items=items, pagination=None, groupTree=groupCtx.groupTree)
else:
items = result if isinstance(result, list) else (result.items if hasattr(result, 'items') else result)
return PaginatedResponse(
items=items,
pagination=None
)
mandateItems = applyGroupScopeFilter(_mandateItemsForAdmin(), groupCtx.itemIds)
return PaginatedResponse(items=mandateItems, pagination=None, groupTree=groupCtx.groupTree)
except HTTPException:
raise
except Exception as e:
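
After this change every grouped list route answers with the same envelope; schematically (values invented):

response = {
    "items": [{"id": "mandate-1", "label": "Example Mandate"}],  # group-scoped rows
    "pagination": {"currentPage": 1, "pageSize": 25, "totalItems": 1, "totalPages": 1},
    "groupTree": [{"id": "grp-1", "itemIds": ["mandate-1"], "subGroups": []}],
}
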


@@ -44,27 +44,15 @@ def get_prompts(
- filterValues: distinct values for a column (cross-filtered)
- ids: all IDs matching current filters
"""
from modules.routes.routeHelpers import handleFilterValuesInMemory, handleIdsInMemory, enrichRowsWithFkLabels
from modules.routes.routeHelpers import (
handleFilterValuesInMemory, handleIdsInMemory, enrichRowsWithFkLabels,
handleGroupingInRequest, applyGroupScopeFilter,
)
from modules.interfaces.interfaceDbApp import getInterface as getAppInterface
def _promptsToEnrichedDicts(promptItems):
dicts = [r.model_dump() if hasattr(r, 'model_dump') else (dict(r) if not isinstance(r, dict) else r) for r in promptItems]
enrichRowsWithFkLabels(dicts, Prompt)
return dicts
if mode == "filterValues":
if not column:
raise HTTPException(status_code=400, detail="column parameter required for mode=filterValues")
managementInterface = interfaceDbManagement.getInterface(currentUser)
result = managementInterface.getAllPrompts(pagination=None)
items = _promptsToEnrichedDicts(result)
return handleFilterValuesInMemory(items, column, pagination)
if mode == "ids":
managementInterface = interfaceDbManagement.getInterface(currentUser)
result = managementInterface.getAllPrompts(pagination=None)
items = _promptsToEnrichedDicts(result)
return handleIdsInMemory(items, pagination)
CONTEXT_KEY = "prompts"
# Parse pagination params early — needed for grouping in all modes
paginationParams = None
if pagination:
try:
@@ -74,12 +62,35 @@ def get_prompts(
paginationParams = PaginationParams(**paginationDict)
except (json.JSONDecodeError, ValueError) as e:
raise HTTPException(status_code=400, detail=f"Invalid pagination parameter: {str(e)}")
appInterface = getAppInterface(currentUser)
groupCtx = handleGroupingInRequest(paginationParams, appInterface, CONTEXT_KEY)
def _promptsToEnrichedDicts(promptItems):
dicts = [r.model_dump() if hasattr(r, 'model_dump') else (dict(r) if not isinstance(r, dict) else r) for r in promptItems]
enrichRowsWithFkLabels(dicts, Prompt)
return dicts
managementInterface = interfaceDbManagement.getInterface(currentUser)
if mode == "filterValues":
if not column:
raise HTTPException(status_code=400, detail="column parameter required for mode=filterValues")
result = managementInterface.getAllPrompts(pagination=None)
items = _promptsToEnrichedDicts(result)
items = applyGroupScopeFilter(items, groupCtx.itemIds)
return handleFilterValuesInMemory(items, column, pagination)
if mode == "ids":
result = managementInterface.getAllPrompts(pagination=None)
items = _promptsToEnrichedDicts(result)
items = applyGroupScopeFilter(items, groupCtx.itemIds)
return handleIdsInMemory(items, pagination)
result = managementInterface.getAllPrompts(pagination=paginationParams)
if paginationParams:
items = _promptsToEnrichedDicts(result.items)
items = applyGroupScopeFilter(_promptsToEnrichedDicts(result.items), groupCtx.itemIds)
return {
"items": items,
"pagination": PaginationMetadata(
@@ -90,12 +101,14 @@ def get_prompts(
sort=paginationParams.sort,
filters=paginationParams.filters
).model_dump(),
"groupTree": groupCtx.groupTree,
}
else:
items = _promptsToEnrichedDicts(result)
items = applyGroupScopeFilter(_promptsToEnrichedDicts(result), groupCtx.itemIds)
return {
"items": items,
"pagination": None,
"groupTree": groupCtx.groupTree,
}


@@ -208,6 +208,21 @@ def get_users(
- GET /api/users/ (no pagination - returns all users in mandate)
- GET /api/users/?pagination={"page":1,"pageSize":10,"sort":[]}
"""
# Parse pagination early — needed for grouping in all modes
_paginationParams = None
if pagination:
try:
_pd = json.loads(pagination)
if _pd:
_pd = normalize_pagination_dict(_pd)
_paginationParams = PaginationParams(**_pd)
except (json.JSONDecodeError, ValueError) as e:
raise HTTPException(status_code=400, detail=f"Invalid pagination parameter: {str(e)}")
from modules.routes.routeHelpers import handleGroupingInRequest as _handleGrouping, applyGroupScopeFilter as _applyGroupScope
_appInterfaceForGrouping = interfaceDbApp.getInterface(context.user, mandateId=context.mandateId)
_groupCtx = _handleGrouping(_paginationParams, _appInterfaceForGrouping, "users")
if mode == "filterValues":
if not column:
raise HTTPException(status_code=400, detail="column parameter required for mode=filterValues")
@@ -217,27 +232,15 @@ def get_users(
return _getUserFilterOrIds(context, pagination, idsMode=True)
try:
paginationParams = None
if pagination:
try:
paginationDict = json.loads(pagination)
if paginationDict:
paginationDict = normalize_pagination_dict(paginationDict)
paginationParams = PaginationParams(**paginationDict)
except (json.JSONDecodeError, ValueError) as e:
raise HTTPException(
status_code=400,
detail=f"Invalid pagination parameter: {str(e)}"
)
appInterface = interfaceDbApp.getInterface(context.user, mandateId=context.mandateId)
paginationParams = _paginationParams
appInterface = _appInterfaceForGrouping
if context.mandateId:
# Get users for specific mandate using getUsersByMandate
result = appInterface.getUsersByMandate(str(context.mandateId), paginationParams)
if paginationParams and hasattr(result, 'items'):
enriched = enrichRowsWithFkLabels(_usersToDicts(result.items), User)
enriched = _applyGroupScope(enrichRowsWithFkLabels(_usersToDicts(result.items), User), _groupCtx.itemIds)
return {
"items": enriched,
"pagination": PaginationMetadata(
@@ -248,17 +251,18 @@ def get_users(
sort=paginationParams.sort,
filters=paginationParams.filters
).model_dump(),
"groupTree": _groupCtx.groupTree,
}
else:
users = result if isinstance(result, list) else result.items if hasattr(result, 'items') else []
enriched = enrichRowsWithFkLabels(_usersToDicts(users), User)
return {"items": enriched, "pagination": None}
enriched = _applyGroupScope(enrichRowsWithFkLabels(_usersToDicts(users), User), _groupCtx.itemIds)
return {"items": enriched, "pagination": None, "groupTree": _groupCtx.groupTree}
elif context.isPlatformAdmin:
# PlatformAdmin without mandateId — DB-level pagination via interface
result = appInterface.getAllUsers(paginationParams)
if paginationParams and hasattr(result, 'items'):
enriched = enrichRowsWithFkLabels(_usersToDicts(result.items), User)
enriched = _applyGroupScope(enrichRowsWithFkLabels(_usersToDicts(result.items), User), _groupCtx.itemIds)
return {
"items": enriched,
"pagination": PaginationMetadata(
@@ -269,11 +273,12 @@ def get_users(
sort=paginationParams.sort,
filters=paginationParams.filters
).model_dump(),
"groupTree": _groupCtx.groupTree,
}
else:
users = result if isinstance(result, list) else (result.items if hasattr(result, 'items') else [])
enriched = enrichRowsWithFkLabels(_usersToDicts(users), User)
return {"items": enriched, "pagination": None}
enriched = _applyGroupScope(enrichRowsWithFkLabels(_usersToDicts(users), User), _groupCtx.itemIds)
return {"items": enriched, "pagination": None, "groupTree": _groupCtx.groupTree}
else:
# Non-SysAdmin without mandateId: aggregate users across all admin mandates
rootInterface = getRootInterface()
@@ -313,16 +318,16 @@ def get_users(
]
from modules.routes.routeHelpers import applyFiltersAndSort as _applyFiltersAndSortHelper
filteredUsers = _applyFiltersAndSortHelper(allUsers, paginationParams)
filteredUsers = _applyGroupScope(_applyFiltersAndSortHelper(allUsers, paginationParams), _groupCtx.itemIds)
enriched = enrichRowsWithFkLabels(filteredUsers, User)
if paginationParams:
import math
totalItems = len(enriched)
totalPages = math.ceil(totalItems / paginationParams.pageSize) if totalItems > 0 else 0
startIdx = (paginationParams.page - 1) * paginationParams.pageSize
endIdx = startIdx + paginationParams.pageSize
return {
"items": enriched[startIdx:endIdx],
"pagination": PaginationMetadata(
@@ -333,9 +338,10 @@ def get_users(
sort=paginationParams.sort,
filters=paginationParams.filters
).model_dump(),
"groupTree": _groupCtx.groupTree,
}
else:
return {"items": enriched, "pagination": None}
return {"items": enriched, "pagination": None, "groupTree": _groupCtx.groupTree}
except HTTPException:
raise
except Exception as e:


@@ -701,3 +701,157 @@ def paginateInMemory(
offset = (paginationParams.page - 1) * paginationParams.pageSize
pageItems = items[offset:offset + paginationParams.pageSize]
return pageItems, totalItems
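
A hedged usage sketch of paginateInMemory, assuming the signature paginateInMemory(items, paginationParams) and no filters or sort in play:

rows = [{"id": str(i)} for i in range(95)]
params = PaginationParams(page=2, pageSize=40)   # constructor kwargs assumed
pageItems, totalItems = paginateInMemory(rows, params)
assert totalItems == 95
assert len(pageItems) == 40 and pageItems[0]["id"] == "40"
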
# ---------------------------------------------------------------------------
# Table Grouping helpers
# ---------------------------------------------------------------------------
from dataclasses import dataclass, field as dc_field
@dataclass
class GroupingContext:
"""
Result of handleGroupingInRequest.
Carries the group tree for the response and the resolved item-ID set for
group-scope filtering (None = no active group scope).
"""
groupTree: Optional[list] # List[TableGroupNode] serialised as dicts — for response
itemIds: Optional[set] # Set[str] when groupId was set, else None
def _collectItemIds(nodes: list, groupId: str) -> Optional[set]:
"""
Recursively search *nodes* for a node whose id == groupId and collect
all itemIds from it and all its descendant subGroups.
Returns None if the group is not found.
"""
for node in nodes:
nodeId = node.get("id") if isinstance(node, dict) else getattr(node, "id", None)
if nodeId == groupId:
ids: set = set()
_collectAllIds(node, ids)
return ids
subGroups = node.get("subGroups", []) if isinstance(node, dict) else getattr(node, "subGroups", [])
result = _collectItemIds(subGroups, groupId)
if result is not None:
return result
return None
def _collectAllIds(node, ids: set) -> None:
"""Collect itemIds from a node and all its descendants into ids."""
nodeItemIds = node.get("itemIds", []) if isinstance(node, dict) else getattr(node, "itemIds", [])
for iid in nodeItemIds:
ids.add(str(iid))
subGroups = node.get("subGroups", []) if isinstance(node, dict) else getattr(node, "subGroups", [])
for child in subGroups:
_collectAllIds(child, ids)
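For orientation, a minimal sketch of the resolution semantics these two helpers implement (the sample tree is hypothetical):

tree = [
    {"id": "g1", "itemIds": ["a"], "subGroups": [
        {"id": "g2", "itemIds": ["b", "c"], "subGroups": []},
    ]},
]
assert _collectItemIds(tree, "g1") == {"a", "b", "c"}  # includes descendant subGroups
assert _collectItemIds(tree, "g2") == {"b", "c"}
assert _collectItemIds(tree, "missing") is None        # unknown group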
def _removeGroupFromTree(nodes: list, groupId: str) -> list:
"""Remove a group node (and all descendants) from the tree by id."""
result = []
for node in nodes:
nodeId = node.get("id") if isinstance(node, dict) else getattr(node, "id", None)
if nodeId == groupId:
continue # skip this node (remove it)
subGroups = node.get("subGroups", []) if isinstance(node, dict) else getattr(node, "subGroups", [])
filtered_sub = _removeGroupFromTree(subGroups, groupId)
if isinstance(node, dict):
node = {**node, "subGroups": filtered_sub}
result.append(node)
return result
def handleGroupingInRequest(
paginationParams: Optional[PaginationParams],
interface,
contextKey: str,
) -> GroupingContext:
"""
Central grouping handler; call it at the start of every list route that
supports table grouping.
Steps (in order):
1. If paginationParams.saveGroupTree is set:
persist the new tree via interface.upsertTableGrouping, then clear
saveGroupTree from paginationParams so it is not treated as a filter.
2. Load the current group tree from the DB (used in step 3 and response).
3. If paginationParams.groupId is set:
resolve it to a Set[str] of itemIds (including all sub-groups),
then clear groupId from paginationParams so it is not treated as a
normal filter field.
4. Return a GroupingContext with groupTree (for the response) and itemIds
(for applyGroupScopeFilter).
The caller does NOT need to handle any grouping logic itself; just call
applyGroupScopeFilter(items, groupCtx.itemIds) and embed groupCtx.groupTree
in the response dict.
"""
from modules.datamodels.datamodelPagination import TableGroupNode
groupTree = None
itemIds = None
if paginationParams is None:
try:
existing = interface.getTableGrouping(contextKey)
if existing:
groupTree = [n.model_dump() if hasattr(n, "model_dump") else n for n in existing.rootGroups]
except Exception as e:
logger.warning(f"handleGroupingInRequest: getTableGrouping failed: {e}")
return GroupingContext(groupTree=groupTree, itemIds=None)
# Step 1: persist saveGroupTree if present
if paginationParams.saveGroupTree is not None:
try:
saved = interface.upsertTableGrouping(contextKey, paginationParams.saveGroupTree)
groupTree = [n.model_dump() if hasattr(n, "model_dump") else n for n in saved.rootGroups]
except Exception as e:
logger.error(f"handleGroupingInRequest: upsertTableGrouping failed: {e}")
paginationParams.saveGroupTree = None
# Step 2: load current tree (only if not already set from save above)
if groupTree is None:
try:
existing = interface.getTableGrouping(contextKey)
if existing:
groupTree = [n.model_dump() if hasattr(n, "model_dump") else n for n in existing.rootGroups]
except Exception as e:
logger.warning(f"handleGroupingInRequest: getTableGrouping failed: {e}")
# Step 3: resolve groupId to itemIds set
if paginationParams.groupId is not None:
targetGroupId = paginationParams.groupId
paginationParams.groupId = None # remove so it is not treated as a normal filter
if groupTree:
itemIds = _collectItemIds(groupTree, targetGroupId)
if itemIds is None:
logger.warning(
f"handleGroupingInRequest: groupId={targetGroupId!r} not found in tree "
f"for contextKey={contextKey!r} — returning empty set"
)
itemIds = set() # unknown group → show nothing rather than everything
else:
# groupId sent but no tree saved yet → return empty (nothing belongs to any group)
logger.warning(
f"handleGroupingInRequest: groupId={targetGroupId!r} set but no tree exists "
f"for contextKey={contextKey!r} — returning empty set"
)
itemIds = set()
return GroupingContext(groupTree=groupTree, itemIds=itemIds)
def applyGroupScopeFilter(items: List[Dict[str, Any]], itemIds: Optional[set]) -> List[Dict[str, Any]]:
"""
Filter items to those whose "id" field is in itemIds.
Returns items unchanged when itemIds is None (no active group scope).
Works for both normal list items and for mode=ids / mode=filterValues flows;
call it before handleIdsInMemory / handleFilterValuesInMemory.
"""
if itemIds is None:
return items
return [item for item in items if str(item.get("id", "")) in itemIds]
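Taken together, a list route needs only three touch points; a minimal sketch, assuming the route receives paginationParams and an interface that backs get/upsertTableGrouping (route and fetch names are hypothetical):

def list_widgets(paginationParams, interface):
    # 1. Persist/load the tree and resolve groupId -> itemIds.
    groupCtx = handleGroupingInRequest(paginationParams, interface, contextKey="widgets/list")
    # 2. Fetch as usual, 3. apply the group scope and embed the tree.
    items = applyGroupScopeFilter(interface.getAllWidgets(), groupCtx.itemIds)
    return {"items": items, "pagination": None, "groupTree": groupCtx.groupTree}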

View file

@ -241,6 +241,29 @@ async def auth_connect_callback(
)
interface.saveConnectionToken(token)
try:
from modules.shared.callbackRegistry import callbackRegistry
if connection.knowledgeIngestionEnabled:
callbackRegistry.trigger(
"connection.established",
connectionId=connection.id,
authority=str(getattr(connection.authority, "value", connection.authority) or "clickup"),
userId=str(user.id),
)
else:
logger.info(
"ingestion.connection.bootstrap.skipped — knowledge ingestion disabled by user",
extra={
"event": "ingestion.connection.bootstrap.skipped",
"connectionId": connection.id,
"authority": "clickup",
"reason": "consent_disabled",
},
)
except Exception as _cbErr:
logger.warning("connection.established callback failed for %s: %s", connection.id, _cbErr)
return HTMLResponse(
content=f"""
<html>

View file

@ -479,6 +479,29 @@ async def auth_connect_callback(
)
interface.saveConnectionToken(token)
try:
from modules.shared.callbackRegistry import callbackRegistry
if connection.knowledgeIngestionEnabled:
callbackRegistry.trigger(
"connection.established",
connectionId=connection.id,
authority=str(getattr(connection.authority, "value", connection.authority) or "google"),
userId=str(user.id),
)
else:
logger.info(
"ingestion.connection.bootstrap.skipped — knowledge ingestion disabled by user",
extra={
"event": "ingestion.connection.bootstrap.skipped",
"connectionId": connection.id,
"authority": "google",
"reason": "consent_disabled",
},
)
except Exception as _cbErr:
logger.warning("connection.established callback failed for %s: %s", connection.id, _cbErr)
return HTMLResponse(
content=f"""
<html>

View file

@ -420,6 +420,29 @@ async def auth_connect_callback(
)
interface.saveConnectionToken(token)
try:
from modules.shared.callbackRegistry import callbackRegistry
if connection.knowledgeIngestionEnabled:
callbackRegistry.trigger(
"connection.established",
connectionId=connection.id,
authority=str(getattr(connection.authority, "value", connection.authority) or "msft"),
userId=str(user.id),
)
else:
logger.info(
"ingestion.connection.bootstrap.skipped — knowledge ingestion disabled by user",
extra={
"event": "ingestion.connection.bootstrap.skipped",
"connectionId": connection.id,
"authority": "msft",
"reason": "consent_disabled",
},
)
except Exception as _cbErr:
logger.warning("connection.established callback failed for %s: %s", connection.id, _cbErr)
return HTMLResponse(
content=f"""
<html>

View file
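All three OAuth callbacks (clickup, google, msft) publish the same event shape, so one consumer can serve them all. A sketch of the receiving side; only trigger(event, **kwargs) is visible in these diffs, so the register method name is an assumption:

from modules.shared.callbackRegistry import callbackRegistry

def onConnectionEstablished(connectionId: str, authority: str, userId: str) -> None:
    # Keyword names mirror the trigger(...) calls above; a real handler would
    # dispatch the knowledge-ingestion bootstrap job from here.
    ...

callbackRegistry.register("connection.established", onConnectionEstablished)  # assumed API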

@ -11,8 +11,6 @@ from modules.serviceCenter.services.serviceAgent.toolRegistry import ToolRegistr
from modules.serviceCenter.services.serviceAgent.coreTools._helpers import (
_getOrCreateTempFolder,
_looksLikeBinary,
_resolveFileScope,
_MAX_TOOL_RESULT_CHARS,
)
@ -392,65 +390,7 @@ def _registerDocumentTools(registry: ToolRegistry, services):
if chunkMime:
mimeType = chunkMime
# 2) File not yet indexed -> trigger extraction via ExtractionService, then retry
if not imageData and knowledgeService and not knowledgeService.isFileIndexed(fileId):
try:
chatService = services.chat
fileInfo = chatService.getFileInfo(fileId)
fileContent = chatService.getFileContent(fileId)
if fileContent and fileInfo:
rawData = fileContent.get("data", "")
if isinstance(rawData, str) and len(rawData) > 100:
rawBytes = _b64.b64decode(rawData)
elif isinstance(rawData, bytes):
rawBytes = rawData
else:
rawBytes = None
if rawBytes:
from modules.serviceCenter.services.serviceExtraction.subRegistry import ExtractorRegistry
from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
from modules.datamodels.datamodelExtraction import ExtractionOptions
fileMime = fileInfo.get("mimeType", "application/octet-stream")
fileName = fileInfo.get("fileName", fileId)
extracted = runExtraction(
ExtractorRegistry(), None,
rawBytes, fileName, fileMime, ExtractionOptions(),
)
contentObjects = []
for part in extracted.parts:
tg = (part.typeGroup or "").lower()
ct = "image" if tg == "image" else "text"
if not part.data or not part.data.strip():
continue
contentObjects.append({
"contentObjectId": part.id,
"contentType": ct,
"data": part.data,
"contextRef": {"containerPath": fileName, "location": part.label, **(part.metadata or {})},
})
if contentObjects:
_diFiId, _diMId = _resolveFileScope(fileId, context)
await knowledgeService.indexFile(
fileId=fileId, fileName=fileName, mimeType=fileMime,
userId=context.get("userId", ""), contentObjects=contentObjects,
featureInstanceId=_diFiId,
mandateId=_diMId,
)
chunks = knowledgeService._knowledgeDb.getContentChunks(fileId)
imageChunks = [c for c in (chunks or []) if c.get("contentType") == "image"]
if pageIndex is not None:
imageChunks = [c for c in imageChunks if c.get("contextRef", {}).get("pageIndex") == pageIndex]
if imageChunks:
imageData = imageChunks[0].get("data", "")
except Exception as extractErr:
logger.warning(f"describeImage: on-demand extraction failed: {extractErr}")
# 3) Direct image file (not a container) - use raw file data
# 2) Direct image file (not a container) - use raw file data
if not imageData:
chatService = services.chat
fileContent = chatService.getFileContent(fileId)
@ -460,7 +400,7 @@ def _registerDocumentTools(registry: ToolRegistry, services):
imageData = fileContent.get("data", "")
mimeType = fileMimeType
# 4) PDF page rendering: render the requested page as an image via PyMuPDF
# 3) PDF page rendering: render the requested page as an image via PyMuPDF
if not imageData:
chatService = services.chat
fileInfo = chatService.getFileInfo(fileId) if hasattr(chatService, "getFileInfo") else None

View file

@ -1,6 +1,6 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Shared helpers for core agent tools (file scope, binary detection, temp folder)."""
"""Shared helpers for core agent tools (file scope, binary detection, group helpers)."""
import logging
import uuid
@ -46,39 +46,60 @@ def _looksLikeBinary(data: bytes, sampleSize: int = 1024) -> bool:
return nonPrintable / len(sample) > 0.10
def _getOrCreateInstanceFolder(chatService, featureInstanceId: str, mandateId: str = "") -> Optional[str]:
"""Return the folder ID for a feature instance, creating it on first use.
Delegates to interfaceDbManagement._ensureFeatureInstanceFolder.
AI tools call this when saving a file without an explicit folderId
so that instance-produced files land in a named folder automatically.
"""
try:
dbMgmt = chatService.interfaceDbComponent
return dbMgmt._ensureFeatureInstanceFolder(featureInstanceId, mandateId)
except Exception as e:
logger.warning(f"Could not get/create instance folder for {featureInstanceId}: {e}")
return None
def _getOrCreateTempFolder(chatService) -> Optional[str]:
"""Return the ID of the root-level 'Temp' folder, creating it if it doesn't exist."""
"""Deprecated stub: folder-based organisation has been replaced by grouping.
Returns None unconditionally so callers skip the (now removed) folderId
assignment. Remove callers incrementally and delete this stub afterwards.
"""
logger.debug("_getOrCreateTempFolder called folder support removed, returning None")
return None
async def _getOrCreateInstanceGroup(
appInterface,
featureInstanceId: str,
contextKey: str = "files/list",
) -> Optional[str]:
"""Return groupId of the default group for a feature instance; create if needed."""
try:
allFolders = chatService.interfaceDbComponent.listFolders()
tempFolder = next(
(f for f in allFolders
if f.get("name") == "Temp" and not f.get("parentId")),
None,
)
if tempFolder:
return tempFolder.get("id")
newFolder = chatService.interfaceDbComponent.createFolder("Temp", parentId=None)
return newFolder.get("id") if newFolder else None
existing = appInterface.getTableGrouping(contextKey)
nodes = [
n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n))
for n in (existing.rootGroups if existing else [])
]
def _find(nds):
for nd in nds:
meta = nd.get("meta", {}) if isinstance(nd, dict) else getattr(nd, "meta", {})
if (meta or {}).get("featureInstanceId") == featureInstanceId:
return nd.get("id") if isinstance(nd, dict) else getattr(nd, "id", None)
found = _find(nd.get("subGroups", []) if isinstance(nd, dict) else getattr(nd, "subGroups", []))
if found:
return found
return None
found = _find(nodes)
if found:
return found
newId = str(uuid.uuid4())
nodes.append({"id": newId, "name": featureInstanceId, "itemIds": [], "subGroups": [], "meta": {"featureInstanceId": featureInstanceId}})
appInterface.upsertTableGrouping(contextKey, nodes)
return newId
except Exception as e:
logger.warning(f"Could not get/create Temp folder: {e}")
logger.error(f"_getOrCreateInstanceGroup: {e}")
return None
async def _getOrCreateTempGroup(
appInterface,
sessionId: str,
contextKey: str = "files/list",
) -> Optional[str]:
"""Return groupId of a temporary group for a session; create if needed."""
return await _getOrCreateInstanceGroup(appInterface, f"_temp_{sessionId}", contextKey)
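Because the lookup matches on meta.featureInstanceId rather than the display name, repeated calls are idempotent; a sketch (run inside an async context, appInterface being whatever object backs get/upsertTableGrouping):

gid1 = await _getOrCreateInstanceGroup(appInterface, "fi-123")
gid2 = await _getOrCreateInstanceGroup(appInterface, "fi-123")
assert gid1 == gid2  # second call finds the existing node instead of creating one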
def _attachFileAsChatDocument(
services: Any,
fileItem: Any,

View file

@ -25,142 +25,11 @@ def _registerMediaTools(registry: ToolRegistry, services):
# ---- Document rendering tool ----
def _markdownToDocumentJson(markdown: str, title: str, language: str = "de") -> Dict[str, Any]:
"""Convert markdown content to the standard document JSON format expected by renderers."""
import re as _re
sections = []
order = 0
lines = markdown.split("\n")
i = 0
def _nextId():
nonlocal order
order += 1
return f"s_{order}"
while i < len(lines):
line = lines[i]
# --- Headings ---
headingMatch = _re.match(r'^(#{1,6})\s+(.+)', line)
if headingMatch:
level = len(headingMatch.group(1))
text = headingMatch.group(2).strip()
sections.append({
"id": _nextId(), "content_type": "heading", "order": order,
"elements": [{"content": {"text": text, "level": level}}],
})
i += 1
continue
# --- Fenced code blocks ---
codeMatch = _re.match(r'^```(\w*)', line)
if codeMatch:
lang = codeMatch.group(1) or "text"
codeLines = []
i += 1
while i < len(lines) and not lines[i].startswith("```"):
codeLines.append(lines[i])
i += 1
i += 1
sections.append({
"id": _nextId(), "content_type": "code_block", "order": order,
"elements": [{"content": {"code": "\n".join(codeLines), "language": lang}}],
})
continue
# --- Tables ---
tableMatch = _re.match(r'^\|(.+)\|$', line)
if tableMatch and (i + 1) < len(lines) and _re.match(r'^\|[\s\-:|]+\|$', lines[i + 1]):
headerCells = [c.strip() for c in tableMatch.group(1).split("|")]
i += 2
rows = []
while i < len(lines) and _re.match(r'^\|(.+)\|$', lines[i]):
rowCells = [c.strip() for c in lines[i][1:-1].split("|")]
rows.append(rowCells)
i += 1
sections.append({
"id": _nextId(), "content_type": "table", "order": order,
"elements": [{"content": {"headers": headerCells, "rows": rows}}],
})
continue
# --- Bullet / numbered lists ---
listMatch = _re.match(r'^(\s*)([-*+]|\d+[.)]) (.+)', line)
if listMatch:
isNumbered = bool(_re.match(r'\d+[.)]', listMatch.group(2)))
items = []
while i < len(lines) and _re.match(r'^(\s*)([-*+]|\d+[.)]) (.+)', lines[i]):
m = _re.match(r'^(\s*)([-*+]|\d+[.)]) (.+)', lines[i])
items.append({"text": m.group(3).strip()})
i += 1
sections.append({
"id": _nextId(), "content_type": "bullet_list", "order": order,
"elements": [{"content": {"items": items, "list_type": "numbered" if isNumbered else "bullet"}}],
})
continue
# --- Empty lines (skip) ---
if not line.strip():
i += 1
continue
# --- Images: ![alt](file:fileId) or ![alt](url) ---
imgMatch = _re.match(r'^!\[([^\]]*)\]\(([^)]+)\)', line)
if imgMatch:
altText = imgMatch.group(1).strip() or "Image"
src = imgMatch.group(2).strip()
fileId = ""
if src.startswith("file:"):
fileId = src[5:]
sections.append({
"id": _nextId(), "content_type": "image", "order": order,
"elements": [{
"content": {
"altText": altText,
"base64Data": "",
"_fileRef": fileId,
"_srcUrl": src if not fileId else "",
}
}],
})
i += 1
continue
# --- Paragraph (collect consecutive non-empty lines) ---
paraLines = []
while i < len(lines) and lines[i].strip() and not _re.match(r'^(#{1,6}\s|```|\|.+\||!\[|(\s*)([-*+]|\d+[.)]) )', lines[i]):
paraLines.append(lines[i])
i += 1
if paraLines:
sections.append({
"id": _nextId(), "content_type": "paragraph", "order": order,
"elements": [{"content": {"text": " ".join(paraLines)}}],
})
continue
i += 1
if not sections:
sections.append({
"id": _nextId(), "content_type": "paragraph", "order": order,
"elements": [{"content": {"text": markdown.strip() or "(empty)"}}],
})
return {
"metadata": {
"split_strategy": "single_document",
"source_documents": [],
"extraction_method": "agent_rendering",
"title": title,
"language": language,
},
"documents": [{
"id": "doc_1",
"title": title,
"sections": sections,
}],
}
"""Delegate to the consolidated parser in subDocumentUtility."""
from modules.serviceCenter.services.serviceGeneration.subDocumentUtility import markdownToDocumentJson
result = markdownToDocumentJson(markdown, title, language)
result["metadata"]["extraction_method"] = "agent_rendering"
return result
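The consolidated parser returns the same envelope the removed inline code produced; for orientation, the approximate shape for a one-heading input (values abbreviated, not a verbatim dump):

doc = _markdownToDocumentJson("# Title\n\nHello world", "Demo")
# {
#   "metadata": {"title": "Demo", "language": "de",
#                "extraction_method": "agent_rendering", ...},
#   "documents": [{"id": "doc_1", "title": "Demo", "sections": [
#       {"content_type": "heading", ...},
#       {"content_type": "paragraph", ...}]}],
# }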
async def _renderDocument(args: Dict[str, Any], context: Dict[str, Any]):
"""Render agent-produced markdown content into any document format via the RendererRegistry."""
@ -245,35 +114,75 @@ def _registerMediaTools(registry: ToolRegistry, services):
except Exception as e:
logger.warning(f"renderDocument: knowledge service unavailable: {e}")
resolvedImages = 0
def _resolveImageRef(targetObj, fileRefKey="_fileRef", fileIdKey="fileId"):
"""Resolve a single image reference dict to base64Data in-place."""
nonlocal resolvedImages
fileRef = targetObj.get(fileRefKey, "") or targetObj.get(fileIdKey, "")
if not fileRef or targetObj.get("base64Data"):
return
if knowledgeService:
chunks = knowledgeService._knowledgeDb.getContentChunks(fileRef)
imageChunks = [c for c in (chunks or []) if c.get("contentType") == "image"]
if imageChunks:
targetObj["base64Data"] = imageChunks[0].get("data", "")
chunkMime = imageChunks[0].get("contextRef", {}).get("mimeType", "image/png")
targetObj["mimeType"] = chunkMime
resolvedImages += 1
if not targetObj.get("base64Data"):
try:
rawBytes = services.chat.getFileData(fileRef)
if rawBytes:
import base64 as _b64
targetObj["base64Data"] = _b64.b64encode(rawBytes).decode("ascii")
targetObj["mimeType"] = "image/png"
resolvedImages += 1
except Exception as e:
logger.warning(f"renderDocument: image resolve failed for fileRef={fileRef}: {e}")
targetObj.pop("_fileRef", None)
targetObj.pop("_srcUrl", None)
def _resolveInlineRuns(runsList):
"""Scan a list of inline runs and resolve any image runs with fileId."""
for run in runsList:
if run.get("type") == "image" and run.get("fileId") and not run.get("base64Data"):
_resolveImageRef(run, fileRefKey="fileId", fileIdKey="fileId")
for doc in structuredContent.get("documents", []):
for section in doc.get("sections", []):
if section.get("content_type") != "image":
cType = section.get("content_type")
# Block-level image sections
if cType == "image":
for element in section.get("elements", []):
contentObj = element.get("content", {})
_resolveImageRef(contentObj)
continue
for element in section.get("elements", []):
contentObj = element.get("content", {})
fileRef = contentObj.get("_fileRef", "")
if not fileRef or contentObj.get("base64Data"):
continue
if knowledgeService:
chunks = knowledgeService._knowledgeDb.getContentChunks(fileRef)
imageChunks = [c for c in (chunks or []) if c.get("contentType") == "image"]
if imageChunks:
contentObj["base64Data"] = imageChunks[0].get("data", "")
chunkMime = imageChunks[0].get("contextRef", {}).get("mimeType", "image/png")
contentObj["mimeType"] = chunkMime
resolvedImages += 1
if not contentObj.get("base64Data"):
try:
rawBytes = services.chat.getFileData(fileRef)
if rawBytes:
import base64 as _b64
contentObj["base64Data"] = _b64.b64encode(rawBytes).decode("ascii")
contentObj["mimeType"] = "image/png"
resolvedImages += 1
except Exception as e:
logger.warning(f"renderDocument: image resolve failed for fileRef={fileRef}: {e}")
contentObj.pop("_fileRef", None)
contentObj.pop("_srcUrl", None)
# Paragraphs with inlineRuns
if cType == "paragraph":
for element in section.get("elements", []):
runs = element.get("content", {}).get("inlineRuns")
if runs:
_resolveInlineRuns(runs)
continue
# Bullet lists - items are List[List[InlineRun]]
if cType == "bullet_list":
for element in section.get("elements", []):
items = element.get("content", {}).get("items", [])
for item in items:
if isinstance(item, list):
_resolveInlineRuns(item)
continue
# Tables - headers and row cells are List[InlineRun]
if cType == "table":
for element in section.get("elements", []):
contentObj = element.get("content", {})
for cell in contentObj.get("headers", []):
if isinstance(cell, list):
_resolveInlineRuns(cell)
for row in contentObj.get("rows", []):
for cell in row:
if isinstance(cell, list):
_resolveInlineRuns(cell)
sectionCount = len((structuredContent.get("documents") or [{}])[0].get("sections", []))
logger.info(f"renderDocument: parsed {sectionCount} sections from markdown ({len(content)} chars), resolved {resolvedImages} image(s), format={outputFormat}")
@ -285,6 +194,7 @@ def _registerMediaTools(registry: ToolRegistry, services):
language=language,
title=title,
userPrompt=content,
style=args.get("style"),
)
if not documents:
@ -367,6 +277,20 @@ def _registerMediaTools(registry: ToolRegistry, services):
"outputFormat": {"type": "string", "description": "Target format: pdf, docx, xlsx, pptx, csv, html, md, json, txt", "default": "pdf"},
"title": {"type": "string", "description": "Document title", "default": "Document"},
"language": {"type": "string", "description": "Document language (ISO 639-1)", "default": "de"},
"style": {
"type": "object",
"description": (
"Optional style overrides for the rendered document. Supports nested keys: "
"fonts (primary, monospace), colors (primary, secondary, accent, background), "
"headings (h1-h4 with sizePt, weight, color, spaceBeforePt, spaceAfterPt), "
"paragraph (sizePt, lineSpacing, color), table (headerBg, headerFg, headerSizePt, "
"bodySizePt, rowBandingEven, rowBandingOdd, borderColor, borderWidthPt), "
"list (bulletChar, indentPt, sizePt), image (defaultWidthPt, maxWidthPt, alignment), "
"codeBlock (fontSizePt, background, borderColor), "
"page (format, marginsPt, showPageNumbers, headerHeight, footerHeight, headerLogo, headerText, footerText). "
"Only provided keys override defaults; omitted keys keep their default values."
),
},
},
},
readOnly=False,
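All style keys are optional and only the provided leaves override renderer defaults; an illustrative override (values are examples, not the defaults):

style = {
    "fonts": {"primary": "Helvetica"},
    "headings": {"h1": {"sizePt": 24, "weight": "bold"}},
    "table": {"headerBg": "#eeeeee", "rowBandingEven": "#fafafa"},
    "page": {"format": "A4", "showPageNumbers": True},
}
# Omitted sections (colors, paragraph, list, image, codeBlock, ...) keep their defaults.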

View file

@ -11,10 +11,9 @@ from modules.serviceCenter.services.serviceAgent.toolRegistry import ToolRegistr
from modules.serviceCenter.services.serviceAgent.coreTools._helpers import (
_attachFileAsChatDocument,
_formatToolFileResult,
_getOrCreateInstanceFolder,
_getOrCreateTempFolder,
_getOrCreateInstanceGroup,
_getOrCreateTempGroup,
_looksLikeBinary,
_resolveFileScope,
_MAX_TOOL_RESULT_CHARS,
)
@ -50,6 +49,7 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
return ToolResult(toolCallId="", toolName="readFile", success=False, error="fileId is required")
try:
knowledgeService = services.getService("knowledge") if hasattr(services, "getService") else None
fileStatus = None
# 1) Knowledge Store: return already-extracted text chunks
if knowledgeService:
@ -77,7 +77,8 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
data=f"[File {fileId} is currently being processed (status: {fileStatus}). Try again shortly.]",
)
# 2) Not indexed yet: try on-demand extraction
# 2) Not indexed yet: inspect file type to decide how to serve the agent
# (binary -> instruct agent to wait / re-upload; text -> decode raw bytes inline)
chatService = services.chat
fileInfo = chatService.getFileInfo(fileId)
if not fileInfo:
@ -100,83 +101,14 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
isBinary = _looksLikeBinary(rawBytes)
if isBinary:
try:
from modules.serviceCenter.services.serviceExtraction.subRegistry import ExtractorRegistry, ChunkerRegistry
from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
from modules.datamodels.datamodelExtraction import ExtractionOptions
extracted = runExtraction(
ExtractorRegistry(), ChunkerRegistry(),
rawBytes, fileName, mimeType, ExtractionOptions(),
)
contentObjects = []
for part in extracted.parts:
tg = (part.typeGroup or "").lower()
ct = "image" if tg == "image" else "text"
if not part.data or not part.data.strip():
continue
contentObjects.append({
"contentObjectId": part.id,
"contentType": ct,
"data": part.data,
"contextRef": {
"containerPath": fileName,
"location": part.label or "file",
**(part.metadata or {}),
},
})
if contentObjects:
if knowledgeService:
try:
userId = context.get("userId", "")
_fiId, _mId = _resolveFileScope(fileId, context)
await knowledgeService.indexFile(
fileId=fileId, fileName=fileName, mimeType=mimeType,
userId=userId, contentObjects=contentObjects,
featureInstanceId=_fiId,
mandateId=_mId,
)
except Exception as e:
logger.warning(f"readFile: knowledge indexing failed for {fileId}: {e}")
joined = ""
if knowledgeService:
_chunks = knowledgeService._knowledgeDb.getContentChunks(fileId)
_textChunks = [
c for c in (_chunks or [])
if c.get("contentType") != "image" and c.get("data")
]
if _textChunks:
joined = "\n\n".join(c["data"] for c in _textChunks)
if not joined:
textParts = [o["data"] for o in contentObjects if o["contentType"] != "image"]
joined = "\n\n".join(textParts) if textParts else ""
if joined:
chunked = _applyOffsetLimit(joined, offset, limit)
if chunked is not None:
return ToolResult(toolCallId="", toolName="readFile", success=True, data=chunked)
if len(joined) > _MAX_TOOL_RESULT_CHARS:
joined = joined[:_MAX_TOOL_RESULT_CHARS] + f"\n\n[Truncated showing first {_MAX_TOOL_RESULT_CHARS} chars of {len(joined)}. Use offset/limit to read specific sections.]"
return ToolResult(
toolCallId="", toolName="readFile", success=True,
data=joined,
)
imgCount = sum(1 for o in contentObjects if o["contentType"] == "image")
return ToolResult(
toolCallId="", toolName="readFile", success=True,
data=f"[Extracted {len(contentObjects)} content objects from '{fileName}' "
f"({imgCount} images, no readable text). "
f"Use describeImage(fileId='{fileId}') to analyze visual content.]",
)
except Exception as extractErr:
logger.warning(f"readFile extraction failed for {fileId} ({fileName}): {extractErr}")
return ToolResult(
toolCallId="", toolName="readFile", success=True,
data=f"[Binary file: '{fileName}', type={mimeType}, size={len(rawBytes)} bytes. "
f"Text extraction not available. Use describeImage for images.]",
data=(
f"[File '{fileName}' ({mimeType}) is not yet indexed "
f"(status: {fileStatus or 'unknown'}). Indexing runs automatically "
f"on upload. Please wait a few seconds and retry, or re-upload the file. "
f"For visual content use describeImage(fileId='{fileId}').]"
),
)
# 3) Text file: decode raw bytes
@ -237,7 +169,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
try:
chatService = services.chat
files = chatService.listFiles(
folderId=args.get("folderId"),
tags=args.get("tags"),
search=args.get("search"),
)
@ -290,18 +221,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
except Exception as e:
return ToolResult(toolCallId="", toolName="searchInFileContent", success=False, error=str(e))
async def _listFolders(args: Dict[str, Any], context: Dict[str, Any]):
try:
chatService = services.chat
folders = chatService.listFolders(parentId=args.get("parentId"))
folderList = "\n".join(
f"- {f.get('name', 'unnamed')} (id: {f.get('id', '?')})"
for f in folders
) if folders else "No folders found."
return ToolResult(toolCallId="", toolName="listFolders", success=True, data=folderList)
except Exception as e:
return ToolResult(toolCallId="", toolName="listFolders", success=False, error=str(e))
async def _webSearch(args: Dict[str, Any], context: Dict[str, Any]):
query = args.get("query", "")
if not query:
@ -339,35 +258,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
except Exception as e:
return ToolResult(toolCallId="", toolName="tagFile", success=False, error=str(e))
async def _moveFile(args: Dict[str, Any], context: Dict[str, Any]):
fileId = args.get("fileId", "")
targetFolderId = args.get("targetFolderId")
if not fileId:
return ToolResult(toolCallId="", toolName="moveFile", success=False, error="fileId is required")
try:
chatService = services.chat
chatService.interfaceDbComponent.updateFile(fileId, {"folderId": targetFolderId})
return ToolResult(
toolCallId="", toolName="moveFile", success=True,
data=f"File {fileId} moved to folder {targetFolderId or 'root'}"
)
except Exception as e:
return ToolResult(toolCallId="", toolName="moveFile", success=False, error=str(e))
async def _createFolder(args: Dict[str, Any], context: Dict[str, Any]):
name = args.get("name", "")
if not name:
return ToolResult(toolCallId="", toolName="createFolder", success=False, error="name is required")
try:
chatService = services.chat
folder = chatService.createFolder(name=name, parentId=args.get("parentId"))
return ToolResult(
toolCallId="", toolName="createFolder", success=True,
data=f"Folder '{name}' created (id: {folder.get('id', '?')})"
)
except Exception as e:
return ToolResult(toolCallId="", toolName="createFolder", success=False, error=str(e))
async def _writeFile(args: Dict[str, Any], context: Dict[str, Any]):
content = args.get("content", "")
mode = args.get("mode", "create")
@ -422,12 +312,52 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
fiId = context.get("featureInstanceId") or (services.featureInstanceId if services else "")
if fiId:
dbMgmt.updateFile(fileItem.id, {"featureInstanceId": fiId})
if args.get("folderId"):
dbMgmt.updateFile(fileItem.id, {"folderId": args["folderId"]})
if args.get("groupId"):
try:
appIface = chatService.interfaceDbApp
existing = appIface.getTableGrouping("files/list")
nodes = [n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n)) for n in (existing.rootGroups if existing else [])]
def _addToGroup(nds, gid, fid):
for nd in nds:
nid = nd.get("id") if isinstance(nd, dict) else getattr(nd, "id", None)
if nid == gid:
ids = list(nd.get("itemIds", []) if isinstance(nd, dict) else getattr(nd, "itemIds", []))
if fid not in ids:
ids.append(fid)
if isinstance(nd, dict):
nd["itemIds"] = ids
return True
if _addToGroup(nd.get("subGroups", []) if isinstance(nd, dict) else getattr(nd, "subGroups", []), gid, fid):
return True
return False
_addToGroup(nodes, args["groupId"], fileItem.id)
appIface.upsertTableGrouping("files/list", nodes)
except Exception as _ge:
logger.warning(f"writeFile: failed to add file to group {args['groupId']}: {_ge}")
elif fiId:
instanceFolderId = _getOrCreateInstanceFolder(chatService, fiId, context.get("mandateId", ""))
if instanceFolderId:
dbMgmt.updateFile(fileItem.id, {"folderId": instanceFolderId})
try:
appIface = chatService.interfaceDbApp
instanceGroupId = await _getOrCreateInstanceGroup(appIface, fiId)
if instanceGroupId:
existing = appIface.getTableGrouping("files/list")
nodes = [n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n)) for n in (existing.rootGroups if existing else [])]
def _addToGroup2(nds, gid, fid):
for nd in nds:
nid = nd.get("id") if isinstance(nd, dict) else getattr(nd, "id", None)
if nid == gid:
ids = list(nd.get("itemIds", []) if isinstance(nd, dict) else getattr(nd, "itemIds", []))
if fid not in ids:
ids.append(fid)
if isinstance(nd, dict):
nd["itemIds"] = ids
return True
if _addToGroup2(nd.get("subGroups", []) if isinstance(nd, dict) else getattr(nd, "subGroups", []), gid, fid):
return True
return False
_addToGroup2(nodes, instanceGroupId, fileItem.id)
appIface.upsertTableGrouping("files/list", nodes)
except Exception as _ge:
logger.warning(f"writeFile: failed to add file to instance group for {fiId}: {_ge}")
if args.get("tags"):
dbMgmt.updateFile(fileItem.id, {"tags": args["tags"]})
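Both groupId branches above, and _addItemsToGroup further down, re-implement the same depth-first walk; a possible shared helper, sketched for dict-shaped nodes only (not part of the diff):

def _appendItemToGroupTree(nodes: list, groupId: str, itemId: str) -> bool:
    """Append itemId to the node with id == groupId; True when found."""
    for nd in nodes:
        if nd.get("id") == groupId:
            ids = list(nd.get("itemIds", []))
            if itemId not in ids:
                ids.append(itemId)
            nd["itemIds"] = ids
            return True
        if _appendItemToGroupTree(nd.get("subGroups", []), groupId, itemId):
            return True
    return False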
@ -480,13 +410,13 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
registry.register(
"listFiles", _listFiles,
description=(
"List files in the local workspace. Filter by folder, tags, or search term. "
"List files in the local workspace. Filter by tags or search term. "
"To filter by group, use listItemsInGroup. "
"For external data sources, use browseDataSource instead."
),
parameters={
"type": "object",
"properties": {
"folderId": {"type": "string", "description": "Filter by folder ID"},
"tags": {"type": "array", "items": {"type": "string"}, "description": "Filter by tags (any match)"},
"search": {"type": "string", "description": "Search in file names and descriptions"},
}
@ -513,18 +443,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
readOnly=True
)
registry.register(
"listFolders", _listFolders,
description="List folders in the local workspace. For external data sources, use browseDataSource instead.",
parameters={
"type": "object",
"properties": {
"parentId": {"type": "string", "description": "Parent folder ID (omit for root)"},
}
},
readOnly=True
)
registry.register(
"webSearch", _webSearch,
description="Search the web for general information. Use readUrl to fetch content from a known URL instead.",
@ -550,34 +468,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
readOnly=False
)
registry.register(
"moveFile", _moveFile,
description="Move a file to a different folder in the local workspace.",
parameters={
"type": "object",
"properties": {
"fileId": {"type": "string", "description": "The file ID to move"},
"targetFolderId": {"type": "string", "description": "Target folder ID (null for root)"},
},
"required": ["fileId"]
},
readOnly=False
)
registry.register(
"createFolder", _createFolder,
description="Create a new folder in the local workspace.",
parameters={
"type": "object",
"properties": {
"name": {"type": "string", "description": "Folder name"},
"parentId": {"type": "string", "description": "Parent folder ID (omit for root)"},
},
"required": ["name"]
},
readOnly=False
)
registry.register(
"writeFile", _writeFile,
description=(
@ -598,7 +488,7 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
"content": {"type": "string", "description": "Content to write/append"},
"mode": {"type": "string", "enum": ["create", "append", "overwrite"], "description": "Write mode (default: create)"},
"fileId": {"type": "string", "description": "File ID (required for mode=append/overwrite)"},
"folderId": {"type": "string", "description": "Target folder ID (mode=create only)"},
"groupId": {"type": "string", "description": "Group ID to place the file in (mode=create only). Omit to use the instance default group."},
"tags": {"type": "array", "items": {"type": "string"}, "description": "Tags (mode=create only)"},
},
"required": ["content"]
@ -758,55 +648,7 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
readOnly=True
)
# ---- Phase 2: deleteFolder, renameFolder, moveFolder, copyFile, editFile ----
async def _deleteFolder(args: Dict[str, Any], context: Dict[str, Any]):
folderId = args.get("folderId", "")
recursive = args.get("recursive", False)
if not folderId:
return ToolResult(toolCallId="", toolName="deleteFolder", success=False, error="folderId is required")
try:
chatService = services.chat
result = chatService.interfaceDbComponent.deleteFolder(folderId, recursive=recursive)
summary = f"Deleted {result.get('deletedFolders', 1)} folder(s) and {result.get('deletedFiles', 0)} file(s)"
return ToolResult(
toolCallId="", toolName="deleteFolder", success=True, data=summary,
sideEvents=[{"type": "folderDeleted", "data": {"folderId": folderId, **result}}],
)
except Exception as e:
return ToolResult(toolCallId="", toolName="deleteFolder", success=False, error=str(e))
async def _renameFolder(args: Dict[str, Any], context: Dict[str, Any]):
folderId = args.get("folderId", "")
newName = args.get("newName", "")
if not folderId or not newName:
return ToolResult(toolCallId="", toolName="renameFolder", success=False, error="folderId and newName are required")
try:
chatService = services.chat
chatService.interfaceDbComponent.renameFolder(folderId, newName)
return ToolResult(
toolCallId="", toolName="renameFolder", success=True,
data=f"Folder {folderId} renamed to '{newName}'",
sideEvents=[{"type": "folderUpdated", "data": {"folderId": folderId, "name": newName}}],
)
except Exception as e:
return ToolResult(toolCallId="", toolName="renameFolder", success=False, error=str(e))
async def _moveFolder(args: Dict[str, Any], context: Dict[str, Any]):
folderId = args.get("folderId", "")
targetParentId = args.get("targetParentId")
if not folderId:
return ToolResult(toolCallId="", toolName="moveFolder", success=False, error="folderId is required")
try:
chatService = services.chat
chatService.interfaceDbComponent.moveFolder(folderId, targetParentId)
return ToolResult(
toolCallId="", toolName="moveFolder", success=True,
data=f"Folder {folderId} moved to {targetParentId or 'root'}",
sideEvents=[{"type": "folderUpdated", "data": {"folderId": folderId, "parentId": targetParentId}}],
)
except Exception as e:
return ToolResult(toolCallId="", toolName="moveFolder", success=False, error=str(e))
# ---- Phase 2: copyFile, editFile ----
async def _copyFile(args: Dict[str, Any], context: Dict[str, Any]):
fileId = args.get("fileId", "")
@ -816,7 +658,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
chatService = services.chat
copiedFile = chatService.interfaceDbComponent.copyFile(
fileId,
targetFolderId=args.get("targetFolderId"),
newFileName=args.get("newFileName"),
)
return ToolResult(
@ -891,48 +732,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
except Exception as e:
return ToolResult(toolCallId="", toolName="replaceInFile", success=False, error=str(e))
registry.register(
"deleteFolder", _deleteFolder,
description="Delete a folder from the local workspace. Set recursive=true to delete all contents.",
parameters={
"type": "object",
"properties": {
"folderId": {"type": "string", "description": "The folder ID to delete"},
"recursive": {"type": "boolean", "description": "If true, delete folder and all contents (files and subfolders). Default: false"},
},
"required": ["folderId"]
},
readOnly=False
)
registry.register(
"renameFolder", _renameFolder,
description="Rename a folder in the local workspace.",
parameters={
"type": "object",
"properties": {
"folderId": {"type": "string", "description": "The folder ID to rename"},
"newName": {"type": "string", "description": "New folder name"},
},
"required": ["folderId", "newName"]
},
readOnly=False
)
registry.register(
"moveFolder", _moveFolder,
description="Move a folder to a different parent in the local workspace.",
parameters={
"type": "object",
"properties": {
"folderId": {"type": "string", "description": "The folder ID to move"},
"targetParentId": {"type": "string", "description": "Target parent folder ID (null/omit for root)"},
},
"required": ["folderId"]
},
readOnly=False
)
registry.register(
"copyFile", _copyFile,
description="Create an independent copy of a file in the local workspace.",
@ -940,7 +739,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
"type": "object",
"properties": {
"fileId": {"type": "string", "description": "The file ID to copy"},
"targetFolderId": {"type": "string", "description": "Target folder for the copy (default: same folder)"},
"newFileName": {"type": "string", "description": "New file name (default: same name, auto-numbered if duplicate)"},
},
"required": ["fileId"]
@ -948,6 +746,137 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
readOnly=False
)
# ---- Group tools (replaces folder-based tools) ----
async def _listGroups(args: Dict[str, Any], context: Dict[str, Any]):
contextKey = args.get("contextKey", "files/list")
try:
chatService = services.chat
appInterface = chatService.interfaceDbApp
existing = appInterface.getTableGrouping(contextKey)
if not existing:
return ToolResult(toolCallId="", toolName="listGroups", success=True, data="No groups found.")
def _flatten(nodes, depth=0):
result = []
for n in nodes:
nd = n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n))
result.append({"id": nd.get("id"), "name": nd.get("name"), "depth": depth, "itemCount": len(nd.get("itemIds", []))})
result.extend(_flatten(nd.get("subGroups", []), depth + 1))
return result
groups = _flatten(existing.rootGroups)
lines = "\n".join(
f"{' ' * g['depth']}- {g['name']} (id: {g['id']}, items: {g['itemCount']})"
for g in groups
) if groups else "No groups found."
return ToolResult(toolCallId="", toolName="listGroups", success=True, data=lines)
except Exception as e:
return ToolResult(toolCallId="", toolName="listGroups", success=False, error=str(e))
async def _listItemsInGroup(args: Dict[str, Any], context: Dict[str, Any]):
groupId = args.get("groupId", "")
contextKey = args.get("contextKey", "files/list")
if not groupId:
return ToolResult(toolCallId="", toolName="listItemsInGroup", success=False, error="groupId is required")
try:
from modules.routes.routeHelpers import _collectItemIds
chatService = services.chat
appInterface = chatService.interfaceDbApp
existing = appInterface.getTableGrouping(contextKey)
if not existing:
return ToolResult(toolCallId="", toolName="listItemsInGroup", success=True, data="No groups found.")
nodes = [n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n)) for n in existing.rootGroups]
ids = _collectItemIds(nodes, groupId)
itemList = list(ids) if ids else []
return ToolResult(
toolCallId="", toolName="listItemsInGroup", success=True,
data="\n".join(f"- {fid}" for fid in itemList) if itemList else "No items in group.",
)
except Exception as e:
return ToolResult(toolCallId="", toolName="listItemsInGroup", success=False, error=str(e))
async def _addItemsToGroup(args: Dict[str, Any], context: Dict[str, Any]):
groupId = args.get("groupId", "")
itemIds = args.get("itemIds", [])
contextKey = args.get("contextKey", "files/list")
if not groupId:
return ToolResult(toolCallId="", toolName="addItemsToGroup", success=False, error="groupId is required")
if not itemIds:
return ToolResult(toolCallId="", toolName="addItemsToGroup", success=False, error="itemIds is required")
try:
chatService = services.chat
appInterface = chatService.interfaceDbApp
existing = appInterface.getTableGrouping(contextKey)
nodes = [n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n)) for n in (existing.rootGroups if existing else [])]
def _add(nds):
for nd in nds:
nid = nd.get("id") if isinstance(nd, dict) else getattr(nd, "id", None)
if nid == groupId:
existing_ids = list(nd.get("itemIds", []) if isinstance(nd, dict) else getattr(nd, "itemIds", []))
for fid in itemIds:
if fid not in existing_ids:
existing_ids.append(fid)
if isinstance(nd, dict):
nd["itemIds"] = existing_ids
return True
if _add(nd.get("subGroups", []) if isinstance(nd, dict) else getattr(nd, "subGroups", [])):
return True
return False
found = _add(nodes)
if not found:
return ToolResult(toolCallId="", toolName="addItemsToGroup", success=False, error=f"Group {groupId} not found")
appInterface.upsertTableGrouping(contextKey, nodes)
return ToolResult(
toolCallId="", toolName="addItemsToGroup", success=True,
data=f"Added {len(itemIds)} item(s) to group {groupId}",
)
except Exception as e:
return ToolResult(toolCallId="", toolName="addItemsToGroup", success=False, error=str(e))
registry.register(
"listGroups", _listGroups,
description="List all groups in the file grouping tree. Groups replace folders for organising files.",
parameters={
"type": "object",
"properties": {
"contextKey": {"type": "string", "description": "Grouping context key (default: 'files/list')"},
}
},
readOnly=True
)
registry.register(
"listItemsInGroup", _listItemsInGroup,
description="List all file IDs assigned to a specific group (includes sub-groups recursively).",
parameters={
"type": "object",
"properties": {
"groupId": {"type": "string", "description": "The group ID to inspect"},
"contextKey": {"type": "string", "description": "Grouping context key (default: 'files/list')"},
},
"required": ["groupId"]
},
readOnly=True
)
registry.register(
"addItemsToGroup", _addItemsToGroup,
description="Add one or more file IDs to an existing group.",
parameters={
"type": "object",
"properties": {
"groupId": {"type": "string", "description": "The group ID to add files to"},
"itemIds": {"type": "array", "items": {"type": "string"}, "description": "List of file IDs to add"},
"contextKey": {"type": "string", "description": "Grouping context key (default: 'files/list')"},
},
"required": ["groupId", "itemIds"]
},
readOnly=False
)
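A typical agent sequence over these three tools (IDs and output lines are illustrative):

# listGroups()                                     -> "- Reports (id: g-42, items: 3)"
# addItemsToGroup(groupId="g-42", itemIds=["f-7"]) -> "Added 1 item(s) to group g-42"
# listItemsInGroup(groupId="g-42")                 -> "- f-7" plus the prior items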
registry.register(
"replaceInFile", _replaceInFile,
description=(

View file

@ -268,24 +268,19 @@ class AgentService:
info = chatService.getFileInfo(fid)
if not info:
folderInfo = chatService.interfaceDbComponent.getFolder(fid)
if folderInfo:
folderName = folderInfo.get("name", fid)
folderFiles = chatService.listFiles(folderId=fid)
desc = f"### Folder: {folderName}\n - id: {fid}\n - type: folder\n - contains: {len(folderFiles)} file(s)"
if folderFiles:
desc += "\n - files:"
for ff in folderFiles[:30]:
ffName = ff.get("fileName", "?")
ffId = ff.get("id", "?")
ffMime = ff.get("mimeType", "?")
ffSize = ff.get("fileSize", ff.get("size", "?"))
desc += f"\n * {ffName} (id: {ffId}, type: {ffMime}, size: {ffSize} bytes)"
if len(folderFiles) > 30:
desc += f"\n ... and {len(folderFiles) - 30} more files"
desc += f'\nUse `listFiles(folderId="{fid}")` to get the full file list, then `readFile(fileId)` to read individual files.'
fileDescriptions.append(desc)
continue
# Check if fid is a group ID
try:
groupFileIds = chatService.listFilesInGroup(fid)
if groupFileIds:
allGroups = chatService.listGroups()
groupInfo = next((g for g in allGroups if g.get("id") == fid), None)
groupName = groupInfo.get("name", fid) if groupInfo else fid
desc = f"### Group: {groupName}\n - id: {fid}\n - type: group\n - contains: {len(groupFileIds)} file(s)"
desc += f'\nUse `listItemsInGroup(groupId="{fid}")` to get file IDs, then `readFile(fileId)` to read each.'
fileDescriptions.append(desc)
continue
except Exception:
pass
fileDescriptions.append(f"### File id: {fid}")
continue
@ -333,7 +328,7 @@ class AgentService:
"These files/folders have been uploaded and processed through the extraction pipeline.\n"
"Use `readFile(fileId)` to read text content, `readContentObjects(fileId)` for structured access, "
"or `describeImage(fileId)` for image analysis.\n"
"For folders, use `listFiles(folderId)` to get the files inside, then `readFile(fileId)` for each.\n"
"For groups, use `listItemsInGroup(groupId)` to get the file IDs inside, then `readFile(fileId)` for each.\n"
"For large PDFs/DOCX, avoid huge `renderDocument` tool JSON: build markdown with "
"`writeFile` (create + append), then `renderDocument(sourceFileId=that file id, outputFormat=...)`.\n"
"For small docs you may pass `content` inline. Embed images with `![alt](file:fileId)` in markdown.\n\n"

View file

@ -51,6 +51,10 @@ class _ServicesAdapter:
def workflow(self):
return self._context.workflow
@workflow.setter
def workflow(self, value):
self._context.workflow = value
@property
def chat(self):
return self._get_service("chat")
@ -86,7 +90,7 @@ class _ServicesAdapter:
return getattr(w, "featureCode", None) if w else None
def __getattr__(self, name: str):
if name in ("allowedProviders", "preferredProviders", "currentUserLanguage"):
if name in ("allowedProviders", "allowedModels", "preferredProviders", "currentUserLanguage"):
return getattr(self.workflow, name, None) if self.workflow else None
raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
@ -164,12 +168,29 @@ class AiService:
# SPEECH_TEAMS: Dedicated pipeline, bypasses standard model selection
if request.options and request.options.operationType == OperationTypeEnum.SPEECH_TEAMS:
return await self._handleSpeechTeams(request)
# FAIL-SAFE: Pre-flight billing validation (like 0 CHF credit card check)
self._preflightBillingCheck()
# Balance & provider permission checks
await self._checkBillingBeforeAiCall()
_opType = request.options.operationType if request.options else None
_isNeutralizationCall = _opType in (
OperationTypeEnum.NEUTRALIZATION_TEXT,
OperationTypeEnum.NEUTRALIZATION_IMAGE,
)
if not _isNeutralizationCall:
# FAIL-SAFE: Pre-flight billing validation (like 0 CHF credit card check)
self._preflightBillingCheck()
# Balance & provider permission checks
await self._checkBillingBeforeAiCall()
else:
# Neutralization calls are system-level operations (connector anonymization).
# They run without a mandate context (e.g. personal-scope connections) and
# are billed the same way as embedding calls: best-effort, skipped when no
# billing settings exist for an empty mandate.
logger.debug(
"callAi: skipping billing preflight for neutralization call "
"(operationType=%s, user=%s)",
_opType,
getattr(getattr(self.services, 'user', None), 'id', 'unknown'),
)
# Calculate effective allowedProviders: RBAC ∩ Workflow
effectiveProviders = self._calculateEffectiveProviders()
@ -177,6 +198,11 @@ class AiService:
request.options = request.options.model_copy(update={'allowedProviders': effectiveProviders})
logger.debug(f"Effective allowedProviders for AI request: {effectiveProviders}")
# Calculate effective allowedModels: Workflow ∩ Request (node-level)
effectiveModels = self._calculateEffectiveModels(request)
if effectiveModels and request.options:
request.options = request.options.model_copy(update={'allowedModels': effectiveModels})
# Neutralize prompt if enabled (before AI call)
_wasNeutralized = False
_excludedDocs: List[str] = []
@ -218,13 +244,25 @@ class AiService:
Rehydration happens on the final AiCallResponse (not on individual str deltas).
"""
await self.ensureAiObjectsInitialized()
self._preflightBillingCheck()
await self._checkBillingBeforeAiCall()
_streamOpType = request.options.operationType if request.options else None
_isNeutralizationStream = _streamOpType in (
OperationTypeEnum.NEUTRALIZATION_TEXT,
OperationTypeEnum.NEUTRALIZATION_IMAGE,
)
if not _isNeutralizationStream:
self._preflightBillingCheck()
await self._checkBillingBeforeAiCall()
effectiveProviders = self._calculateEffectiveProviders()
if effectiveProviders and request.options:
request.options = request.options.model_copy(update={'allowedProviders': effectiveProviders})
# Calculate effective allowedModels: Workflow ∩ Request (node-level)
effectiveModels = self._calculateEffectiveModels(request)
if effectiveModels and request.options:
request.options = request.options.model_copy(update={'allowedModels': effectiveModels})
# Neutralize prompt if enabled (before streaming)
_wasNeutralized = False
_excludedDocs: List[str] = []
@ -1240,6 +1278,43 @@ detectedIntent-Werte:
logger.warning(f"Error calculating effective providers: {e}")
return None
def _calculateEffectiveModels(self, request: AiCallRequest = None) -> Optional[List[str]]:
"""
Calculate effective allowed models: Workflow.allowedModels ∩ request.options.allowedModels.
AND-logic intersection:
- If workflow specifies allowedModels, start with those.
- If request (node-level) also specifies allowedModels, intersect.
- Returns None if no model filtering is needed.
"""
try:
effectiveModels = None
# Workflow-level allowedModels (from automation config)
workflowModels = getattr(self.services, 'allowedModels', None)
if workflowModels:
effectiveModels = list(workflowModels)
# Request-level (node-level) allowedModels
requestModels = None
if request and request.options and request.options.allowedModels:
requestModels = request.options.allowedModels
if requestModels:
if effectiveModels:
effectiveModels = [m for m in effectiveModels if m in requestModels]
else:
effectiveModels = list(requestModels)
if effectiveModels:
logger.debug(f"Model filter: Workflow={workflowModels}, Request={requestModels}, Effective={effectiveModels}")
return effectiveModels if effectiveModels else None
except Exception as e:
logger.warning(f"Error calculating effective models: {e}")
return None
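The intersection is plain AND-logic; a worked example with hypothetical model names:

workflowModels = ["gpt-4o", "claude-3-5"]        # from automation config
requestModels = ["claude-3-5", "mistral-large"]  # node-level request
effective = [m for m in workflowModels if m in requestModels]
assert effective == ["claude-3-5"]
# With workflowModels unset the request list passes through; with neither
# present the method returns None and no model filtering is applied.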
async def ensureAiObjectsInitialized(self):
"""Ensure aiObjects is initialized and submodules are ready."""
if self.aiObjects is None:

View file

@ -142,6 +142,8 @@ class AiCallLooper:
MAX_MERGE_FAILS = 3
mergeFailCount = 0 # Global counter for merge failures across entire loop
lastValidCompletePart = None # Store last successfully parsed completePart for fallback
MAX_CONSECUTIVE_EMPTY_RESPONSES = 3
consecutive_empty_responses = 0
# Get parent operation ID for iteration operations (parentId should be operationId, not log entry ID)
parentOperationId = operationId # Use the parent's operationId directly
@ -284,8 +286,26 @@ class AiCallLooper:
break
if not result or not result.strip():
logger.warning(f"Iteration {iteration}: Empty response, stopping")
break
consecutive_empty_responses += 1
logger.warning(
"Iteration %s: Empty AI response (consecutive %s/%s) modelName=%s errorCount=%s",
iteration,
consecutive_empty_responses,
MAX_CONSECUTIVE_EMPTY_RESPONSES,
getattr(response, "modelName", None),
getattr(response, "errorCount", None),
)
if iterationOperationId:
self.services.chat.progressLogFinish(iterationOperationId, False)
if consecutive_empty_responses >= MAX_CONSECUTIVE_EMPTY_RESPONSES:
logger.error(
"Stopping loop: %s consecutive empty responses from model",
consecutive_empty_responses,
)
break
continue
consecutive_empty_responses = 0
# Check if this is a text response (not document generation)
# Text responses don't need JSON parsing - return immediately after first successful response
@ -535,7 +555,12 @@ class AiCallLooper:
# This code path should never be reached because all registered use cases
# return early when JSON is complete. This would only execute for use cases that
# require section extraction, but no such use cases are currently registered.
logger.error(f"Unexpected code path: reached end of loop without return for use case '{useCaseId}'")
logger.error(
"End of callAiWithLooping without success for use case %r (iterations=%s, lastResultLen=%s)",
useCaseId,
iteration,
len(result) if isinstance(result, str) else 0,
)
return result if result else ""
def _isJsonStringIncomplete(self, jsonString: str) -> bool:

View file

@ -90,8 +90,7 @@ class StructureGenerator:
)
try:
# Build the chapter-structure prompt with content index
structurePrompt = self._buildChapterStructurePrompt(
structurePrompt, templateStructure = self._buildChapterStructurePrompt(
userPrompt=userPrompt,
contentParts=contentParts,
outputFormat=outputFormat
@ -108,12 +107,6 @@ class StructureGenerator:
resultFormat="json"
)
structurePrompt, templateStructure = self._buildChapterStructurePrompt(
userPrompt=userPrompt,
contentParts=contentParts,
outputFormat=outputFormat
)
# Create prompt builder for continuation support
async def buildChapterStructurePromptWithContinuation(
continuationContext: Any,
@ -196,6 +189,13 @@ CRITICAL:
contentParts=None # Do not pass ContentParts - only metadata needed, not content extraction
)
if not isinstance(aiResponseJson, str) or not aiResponseJson.strip():
raise ValueError(
"Structure generation returned no JSON text from the model (empty response after retries). "
"Check the AI provider, allowed models, billing, and debug artifact "
"'chapter_structure_generation_response'."
)
# Parse the complete JSON response (looping system already handles completion)
extractedJson = self.services.utils.jsonExtractString(aiResponseJson)
parsedJson, parseError, cleanedJson = self.services.utils.jsonTryParse(extractedJson)
@ -215,7 +215,12 @@ CRITICAL:
raise ValueError(f"Failed to parse JSON structure after repair: {str(parseError)}")
else:
logger.error(f"Failed to repair JSON. Parse error: {str(parseError)}")
logger.error(f"Cleaned JSON preview (first 500 chars): {cleanedJson[:500]}")
raw_preview = (extractedJson or "")[:500]
logger.error(
"Raw extract preview (first 500 chars): %r",
raw_preview,
)
logger.error(f"Cleaned JSON preview (first 500 chars): {cleanedJson[:500]!r}")
raise ValueError(f"Failed to parse JSON structure: {str(parseError)}")
else:
structure = parsedJson

View file

@ -23,7 +23,11 @@ class ChatService:
from modules.interfaces.interfaceDbManagement import getInterface as getComponentInterface
from modules.interfaces.interfaceDbChat import getInterface as getChatInterface
self.interfaceDbApp = getAppInterface(context.user, mandateId=context.mandate_id)
self.interfaceDbComponent = getComponentInterface(context.user, mandateId=context.mandate_id)
self.interfaceDbComponent = getComponentInterface(
context.user,
mandateId=context.mandate_id,
featureInstanceId=context.feature_instance_id,
)
self.interfaceDbChat = getChatInterface(
context.user,
mandateId=context.mandate_id,
@ -199,13 +203,8 @@ class ChatService:
label = parts[1]
messageFound = None
for message in workflow.messages:
# Validate message belongs to this workflow
msgWorkflowId = getattr(message, 'workflowId', None)
if not msgWorkflowId or msgWorkflowId != workflowId:
if msgWorkflowId:
logger.warning(f"Message {message.id} has workflowId {msgWorkflowId} but belongs to workflow {workflowId}. Skipping.")
else:
logger.warning(f"Message {message.id} has no workflowId. Skipping.")
continue
msgLabel = getattr(message, 'documentsLabel', None)
@ -213,7 +212,6 @@ class ChatService:
messageFound = message
break
# If found, add documents
if messageFound and messageFound.documents:
allDocuments.extend(messageFound.documents)
else:
@ -419,7 +417,7 @@ class ChatService:
return None
def getFileInfo(self, fileId: str) -> Dict[str, Any]:
"""Get file information including new fields (tags, folderId, description, status)."""
"""Get file information including new fields (tags, description, status)."""
fileItem = self.interfaceDbComponent.getFile(fileId)
if fileItem:
return {
@ -430,7 +428,6 @@ class ChatService:
"fileHash": fileItem.fileHash,
"creationDate": fileItem.sysCreatedAt,
"tags": getattr(fileItem, "tags", None),
"folderId": getattr(fileItem, "folderId", None),
"description": getattr(fileItem, "description", None),
"status": getattr(fileItem, "status", None),
}
@ -449,14 +446,12 @@ class ChatService:
def listFiles(
self,
folderId: str = None,
tags: List[str] = None,
search: str = None,
) -> List[Dict[str, Any]]:
"""List files for the current user with optional filters.
Args:
folderId: Filter by folder (None = root / all).
tags: Filter by tags (any match).
search: Search in fileName and description.
@ -469,10 +464,6 @@ class ChatService:
allFiles = self.interfaceDbComponent.getAllFiles()
results = []
for fileItem in allFiles:
if folderId is not None:
if fileItem.get("folderId") != folderId:
continue
if tags:
itemTags = fileItem.get("tags") or []
if not any(t in itemTags for t in tags):
@ -492,27 +483,40 @@ class ChatService:
"fileSize": fileItem.get("fileSize"),
"creationDate": fileItem.get("sysCreatedAt"),
"tags": fileItem.get("tags"),
"folderId": fileItem.get("folderId"),
"description": fileItem.get("description"),
"status": fileItem.get("status"),
})
return results
def listFolders(self, parentId: str = None) -> List[Dict[str, Any]]:
"""List file folders for the current user.
Args:
parentId: Optional parent folder ID to filter by.
None = return ALL folders (for tree building).
Returns:
List of folder dicts.
"""
return self.interfaceDbComponent.listFolders(parentId=parentId)
def createFolder(self, name: str, parentId: str = None) -> Dict[str, Any]:
"""Create a new file folder with unique name validation."""
return self.interfaceDbComponent.createFolder(name=name, parentId=parentId)
def listGroups(self, contextKey: str = "files/list") -> list:
"""List all groups in the groupTree for the current context."""
try:
existing = self.interfaceDbApp.getTableGrouping(contextKey)
if not existing:
return []
def _flatten(nodes, depth=0):
result = []
for n in nodes:
nd = n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n))
result.append({"id": nd.get("id"), "name": nd.get("name"), "depth": depth, "itemCount": len(nd.get("itemIds", []))})
result.extend(_flatten(nd.get("subGroups", []), depth + 1))
return result
return _flatten(existing.rootGroups)
except Exception:
return []
def listFilesInGroup(self, groupId: str, contextKey: str = "files/list") -> list:
"""List file IDs in a specific group (recursive)."""
try:
from modules.routes.routeHelpers import _collectItemIds
existing = self.interfaceDbApp.getTableGrouping(contextKey)
if not existing:
return []
nodes = [n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n)) for n in existing.rootGroups]
ids = _collectItemIds(nodes, groupId)
return list(ids) if ids else []
except Exception:
return []
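To illustrate what listGroups returns, a hedged example against a hand-built groupTree; the node shape (id, name, itemIds, subGroups) mirrors the code above, but this standalone version handles plain dicts only (the real code also accepts pydantic models via model_dump) and the data is made up.

rootGroups = [
    {"id": "g1", "name": "Contracts", "itemIds": ["f1", "f2"], "subGroups": [
        {"id": "g2", "name": "2026", "itemIds": ["f3"], "subGroups": []},
    ]},
]

def _flatten(nodes, depth=0):
    result = []
    for n in nodes:
        result.append({"id": n["id"], "name": n["name"], "depth": depth,
                       "itemCount": len(n.get("itemIds", []))})
        result.extend(_flatten(n.get("subGroups", []), depth + 1))
    return result

assert _flatten(rootGroups) == [
    {"id": "g1", "name": "Contracts", "depth": 0, "itemCount": 2},
    {"id": "g2", "name": "2026", "depth": 1, "itemCount": 1},
]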
# ---- DataSource CRUD ----

View file

@ -14,6 +14,7 @@ from .subDocumentUtility import (
detectMimeTypeFromData,
convertDocumentDataToString
)
from .styleDefaults import resolveStyle
logger = logging.getLogger(__name__)
@ -382,7 +383,7 @@ class GenerationService:
'workflowId': 'unknown'
}
async def renderReport(self, extractedContent: Dict[str, Any], outputFormat: str, language: str, title: str, userPrompt: str = None, aiService=None, parentOperationId: Optional[str] = None) -> List[RenderedDocument]:
async def renderReport(self, extractedContent: Dict[str, Any], outputFormat: str, language: str, title: str, userPrompt: str = None, aiService=None, parentOperationId: Optional[str] = None, style: Optional[Dict[str, Any]] = None) -> List[RenderedDocument]:
"""
Render extracted JSON content to the specified output format.
Processes EACH document separately and calls renderer for each.
@ -399,12 +400,14 @@ class GenerationService:
userPrompt: User's original prompt for report generation
aiService: AI service instance for generation prompt creation
parentOperationId: Optional parent operation ID for hierarchical logging
style: Optional style overrides (deep-merged with DEFAULT_STYLE)
Returns:
List of RenderedDocument objects.
Each RenderedDocument represents one rendered file (main document or supporting file)
"""
try:
resolvedStyle = resolveStyle(style)
# Validate JSON input
if not isinstance(extractedContent, dict):
raise ValueError("extractedContent must be a JSON dictionary")
@ -469,7 +472,7 @@ class GenerationService:
docTitle = doc.get("title", title)
# Render this document (can return multiple files, e.g., HTML + images)
renderedDocs = await renderer.render(singleDocContent, docTitle, userPrompt, aiService)
renderedDocs = await renderer.render(singleDocContent, docTitle, userPrompt, aiService, style=resolvedStyle)
allRenderedDocuments.extend(renderedDocs)
logger.info(f"Rendered {len(documents)} document(s) into {len(allRenderedDocuments)} file(s)")

View file

@ -84,7 +84,7 @@ class BaseRenderer(ABC):
return list(supportedSectionTypes)
@abstractmethod
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]:
"""
Render extracted JSON content to multiple documents.
Each renderer must implement this method.
@ -95,6 +95,9 @@ class BaseRenderer(ABC):
title: Report title
userPrompt: Original user prompt for context
aiService: AI service instance for additional processing
style: Fully-resolved unified style dict from styleDefaults.resolveStyle().
When provided, renderers use these values instead of their
own defaults / AI-generated styles.
Returns:
List of RenderedDocument objects.
@ -102,6 +105,112 @@ class BaseRenderer(ABC):
Even if only one document is returned, it must be wrapped in a list.
"""
pass
def _convertUnifiedStyleToInternal(self, style: Dict[str, Any]) -> Dict[str, Any]:
"""Convert the unified resolvedStyle dict (from styleDefaults) into
the renderer-internal style-set format that all rendering methods already
consume. Override in subclasses for format-specific tweaks."""
h1 = style["headings"]["h1"]
h2 = style["headings"]["h2"]
h3 = style["headings"].get("h3", h2)
h4 = style["headings"].get("h4", h3)
tbl = style["table"]
para = style["paragraph"]
lst = style["list"]
cb = style["codeBlock"]
return {
"title": {
"font_size": h1["sizePt"], "color": h1["color"],
"bold": h1.get("weight") == "bold", "align": "left",
},
"heading1": {
"font_size": h1["sizePt"], "color": h1["color"],
"bold": h1.get("weight") == "bold", "align": "left",
},
"heading2": {
"font_size": h2["sizePt"], "color": h2["color"],
"bold": h2.get("weight") == "bold", "align": "left",
},
"heading3": {
"font_size": h3["sizePt"], "color": h3["color"],
"bold": h3.get("weight") == "bold", "align": "left",
},
"heading4": {
"font_size": h4["sizePt"], "color": h4["color"],
"bold": h4.get("weight") == "bold", "align": "left",
},
"paragraph": {
"font_size": para["sizePt"], "color": para["color"],
"bold": False, "align": "left",
},
"table_header": {
"background": tbl["headerBg"], "text_color": tbl["headerFg"],
"bold": True, "align": "center",
},
"table_cell": {
"background": tbl["rowBandingOdd"], "text_color": para["color"],
"bold": False, "align": "left",
},
"table_border": {
"style": "grid", "color": tbl["borderColor"],
},
"bullet_list": {
"font_size": lst["sizePt"], "color": para["color"],
"indent": lst["indentPt"],
},
"code_block": {
"font": style["fonts"]["monospace"],
"font_size": cb["fontSizePt"], "color": para["color"],
"background": cb["background"],
},
}
@staticmethod
def _inlineRunsFromContent(content: Dict[str, Any], *, itemsKey: str = None) -> Any:
"""Extract inline runs from new-format content, falling back to old format.
For paragraphs (itemsKey=None):
new: content["inlineRuns"] -> List[InlineRun]
old: content["text"] -> wrapped in [{"type":"text","value":text}]
For list items (itemsKey="items"):
new: content["items"] is List[List[InlineRun]]
old: content["items"] is List[str] or List[{"text":}]
Returns the items list (caller decides per-item conversion).
For table headers/cells:
new: each header/cell is List[InlineRun]
old: each header/cell is a plain str
Caller handles per-cell.
"""
if itemsKey:
return content.get(itemsKey, [])
inlineRuns = content.get("inlineRuns")
if inlineRuns:
return inlineRuns
text = content.get("text", "")
if text:
return [{"type": "text", "value": text}]
return []
@staticmethod
def _inlineRunsForCell(cell) -> list:
"""Normalize a single table header or cell value to List[InlineRun].
Accepts either a plain string or an already-correct list of run dicts."""
if isinstance(cell, list):
return cell
return [{"type": "text", "value": str(cell) if cell is not None else ""}]
@staticmethod
def _inlineRunsForListItem(item) -> list:
"""Normalize a single list item to List[InlineRun].
Accepts a plain string, a dict with 'text', or an already-correct list of run dicts."""
if isinstance(item, list):
return item
if isinstance(item, dict):
text = item.get("text", "")
return [{"type": "text", "value": text}]
return [{"type": "text", "value": str(item)}]
def _determineFilename(self, title: str, mimeType: str) -> str:
"""Determine filename from title and mimeType."""

View file

@ -79,7 +79,15 @@ class RendererCodeCsv(BaseCodeRenderer):
return renderedDocs
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
async def render(
self,
extractedContent: Dict[str, Any],
title: str,
userPrompt: str = None,
aiService=None,
*,
style: Dict[str, Any] = None,
) -> List[RenderedDocument]:
"""
Render method for document generation compatibility.
Delegates to document renderer if needed, or handles code files directly.
@ -94,7 +102,7 @@ class RendererCodeCsv(BaseCodeRenderer):
# Document generation path - delegate to document renderer
from .rendererCsv import RendererCsv
documentRenderer = RendererCsv(self.services)
return await documentRenderer.render(extractedContent, title, userPrompt, aiService)
return await documentRenderer.render(extractedContent, title, userPrompt, aiService, style=style)
def _validateAndFixCsv(self, content: str) -> str:
"""Validate CSV structure and fix common issues."""

View file

@ -91,7 +91,15 @@ class RendererCodeJson(BaseCodeRenderer):
return renderedDocs
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
async def render(
self,
extractedContent: Dict[str, Any],
title: str,
userPrompt: str = None,
aiService=None,
*,
style: Dict[str, Any] = None,
) -> List[RenderedDocument]:
"""
Render method for document generation compatibility.
Delegates to document renderer if needed, or handles code files directly.
@ -107,7 +115,7 @@ class RendererCodeJson(BaseCodeRenderer):
# Import here to avoid circular dependency
from .rendererJson import RendererJson
documentRenderer = RendererJson(self.services)
return await documentRenderer.render(extractedContent, title, userPrompt, aiService)
return await documentRenderer.render(extractedContent, title, userPrompt, aiService, style=style)
def _extractJsonStatistics(self, parsed: Any) -> Dict[str, Any]:
"""Extract JSON statistics for validation (object count, array count, key count)."""

View file

@ -78,11 +78,20 @@ class RendererCodeXml(BaseCodeRenderer):
return renderedDocs
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
async def render(
self,
extractedContent: Dict[str, Any],
title: str,
userPrompt: str = None,
aiService=None,
*,
style: Dict[str, Any] = None,
) -> List[RenderedDocument]:
"""
Render method for document generation compatibility.
For XML, we only support code generation (no document renderer exists yet).
"""
_ = style
# Check if this is code generation (has files array)
if "files" in extractedContent:
# Code generation path - use renderCodeFiles

View file

@ -39,8 +39,17 @@ class RendererCsv(BaseRenderer):
"""
return ["table", "code_block"]
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
async def render(
self,
extractedContent: Dict[str, Any],
title: str,
userPrompt: str = None,
aiService=None,
*,
style: Dict[str, Any] = None,
) -> List[RenderedDocument]:
"""Render extracted JSON content to CSV format. Produces one CSV file per table section."""
_ = style
try:
# Validate JSON structure
if not self._validateJsonStructure(extractedContent):

View file

@ -53,18 +53,17 @@ class RendererDocx(BaseRenderer):
from modules.datamodels.datamodelJson import supportedSectionTypes
return list(supportedSectionTypes)
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]:
"""Render extracted JSON content to DOCX format using AI-analyzed styling."""
self.services.utils.debugLogToFile(f"DOCX RENDER CALLED: title={title}, user_prompt={userPrompt[:50] if userPrompt else 'None'}...", "DOCX_RENDERER")
try:
if not DOCX_AVAILABLE:
# Fallback to HTML if python-docx not available
from .rendererHtml import RendererHtml
htmlRenderer = RendererHtml()
return await htmlRenderer.render(extractedContent, title, userPrompt, aiService)
return await htmlRenderer.render(extractedContent, title, userPrompt, aiService, style=style)
# Generate DOCX using AI-analyzed styling
docx_content = await self._generateDocxFromJson(extractedContent, title, userPrompt, aiService)
docx_content = await self._generateDocxFromJson(extractedContent, title, userPrompt, aiService, unifiedStyle=style)
# Extract metadata for document type and other info
metadata = extractedContent.get("metadata", {}) if extractedContent else {}
@ -114,23 +113,27 @@ class RendererDocx(BaseRenderer):
)
]
async def _generateDocxFromJson(self, json_content: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> str:
async def _generateDocxFromJson(self, json_content: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, unifiedStyle: Dict[str, Any] = None) -> str:
"""Generate DOCX content from structured JSON document."""
import time
start_time = time.time()
try:
self.logger.debug("_generateDocxFromJson: Starting document generation")
# Create new document
doc = Document()
self.logger.debug(f"_generateDocxFromJson: Document created in {time.time() - start_time:.2f}s")
# Get style set: use styles from metadata if available, otherwise enhance with AI
template_from_metadata = None
if json_content and isinstance(json_content.get("metadata"), dict):
template_from_metadata = json_content["metadata"].get("templateName")
# Phase 3: prefer unified style when provided
style_start = time.time()
self.logger.debug("_generateDocxFromJson: About to get style set")
styleSet = await self._getStyleSet(json_content, userPrompt, aiService, templateName=template_from_metadata)
if unifiedStyle:
styleSet = self._convertUnifiedStyleToInternal(unifiedStyle)
self._unifiedStyle = unifiedStyle
else:
template_from_metadata = None
if json_content and isinstance(json_content.get("metadata"), dict):
template_from_metadata = json_content["metadata"].get("templateName")
styleSet = await self._getStyleSet(json_content, userPrompt, aiService, templateName=template_from_metadata)
self._unifiedStyle = None
self.logger.debug(f"_generateDocxFromJson: Style set retrieved in {time.time() - style_start:.2f}s")
# Setup basic document styles and create all styles from style set
@ -298,11 +301,11 @@ class RendererDocx(BaseRenderer):
def _setupBasicDocumentStyles(self, doc: Document) -> None:
"""Set up basic document styles."""
try:
# Set default font
style = doc.styles['Normal']
font = style.font
font.name = 'Calibri'
font.size = Pt(11)
us = getattr(self, '_unifiedStyle', None)
font.name = us["fonts"]["primary"] if us else 'Calibri'
font.size = Pt(us["paragraph"]["sizePt"] if us else 11)
except Exception as e:
self.logger.warning(f"Could not set up basic document styles: {str(e)}")
@ -421,6 +424,8 @@ class RendererDocx(BaseRenderer):
def _addMarkdownInlineRuns(self, paragraph, text: str) -> None:
"""Parse markdown inline formatting and add corresponding Runs to a python-docx paragraph."""
pos = 0
us = getattr(self, '_unifiedStyle', None)
monoFont = us["fonts"]["monospace"] if us else "Courier New"
for m in self._MD_INLINE_RE.finditer(text):
if m.start() > pos:
paragraph.add_run(text[pos:m.start()])
@ -434,12 +439,45 @@ class RendererDocx(BaseRenderer):
paragraph.add_run(m.group(6)).italic = True
elif m.group(7):
run = paragraph.add_run(m.group(7))
run.font.name = "Courier New"
run.font.name = monoFont
run.font.size = Pt(9)
pos = m.end()
if pos < len(text):
paragraph.add_run(text[pos:])
def _renderInlineRuns(self, runs: list, paragraph, styleSet: Dict[str, Any]) -> None:
"""Process a list of InlineRun dicts into python-docx Runs on a paragraph."""
us = getattr(self, '_unifiedStyle', None)
monoFont = us["fonts"]["monospace"] if us else "Courier New"
for run in runs:
runType = run.get("type", "text")
value = run.get("value", "")
if runType == "text":
paragraph.add_run(value)
elif runType == "bold":
paragraph.add_run(value).bold = True
elif runType == "italic":
paragraph.add_run(value).italic = True
elif runType == "code":
r = paragraph.add_run(value)
r.font.name = monoFont
r.font.size = Pt(9)
elif runType == "link":
r = paragraph.add_run(value)
r.font.underline = True
r.font.color.rgb = RGBColor(0x29, 0x80, 0xB9)
elif runType == "image":
b64 = run.get("base64Data", "")
if b64:
try:
imgBytes = base64.b64decode(b64)
imgStream = io.BytesIO(imgBytes)
paragraph.add_run().add_picture(imgStream, width=Inches(2))
except Exception:
paragraph.add_run(f"[Image: {run.get('altText', '')}]")
else:
paragraph.add_run(value)
def _renderJsonTable(self, doc: Document, table_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
"""
Render a JSON table to DOCX using AI-generated styles.
@ -485,7 +523,7 @@ class RendererDocx(BaseRenderer):
except Exception as e:
self.logger.error(f"Error rendering table: {str(e)}", exc_info=True)
def _renderTableFastXml(self, doc: Document, headers: List[str], rows: List[List[Any]], styles: Dict[str, Any]) -> None:
def _renderTableFastXml(self, doc: Document, headers: list, rows: list, styles: Dict[str, Any]) -> None:
"""
High-performance table rendering using direct XML manipulation.
@ -546,24 +584,34 @@ class RendererDocx(BaseRenderer):
# Build all rows using fast XML
rows_start = time.time()
# Header row
headerRow = self._createTableRowXml(headers, isHeader=True)
# Resolve header style colors
tableStyle = styles.get("table_header", {})
headerBg = tableStyle.get("background", "")
headerFg = tableStyle.get("text_color", "")
# Flatten inline-run headers to plain strings for fast XML path
flatHeaders = []
for h in headers:
runs = self._inlineRunsForCell(h)
flatHeaders.append("".join(r.get("value", "") for r in runs))
headerRow = self._createTableRowXml(flatHeaders, isHeader=True, headerBgHex=headerBg or None, headerFgHex=headerFg or None)
tbl.append(headerRow)
header_time = time.time() - rows_start
self.logger.debug(f"_renderTableFastXml: Header row created in {header_time:.3f}s")
# Data rows - batch process for performance
data_start = time.time()
rowCount = len(rows)
for idx, rowData in enumerate(rows):
# Convert all cells to strings
cellTexts = [str(cell) if cell is not None else '' for cell in rowData]
# Pad if needed
while len(cellTexts) < len(headers):
cellTexts = []
for cell in rowData:
runs = self._inlineRunsForCell(cell)
cellTexts.append("".join(r.get("value", "") for r in runs))
while len(cellTexts) < len(flatHeaders):
cellTexts.append('')
row = self._createTableRowXml(cellTexts, isHeader=False)
tbl.append(row)
@ -641,74 +689,64 @@ class RendererDocx(BaseRenderer):
return tblBorders
def _createTableRowXml(self, cells: List[str], isHeader: bool = False) -> Any:
"""
Create a table row XML element with cells.
This is the core fast-path: builds the row XML directly without
going through python-docx's slow cell.text assignment.
"""
def _createTableRowXml(self, cells: list, isHeader: bool = False, headerBgHex: str = None, headerFgHex: str = None) -> Any:
"""Create a table row XML element with cells.
Fast-path: builds row XML directly via lxml."""
from docx.oxml.shared import OxmlElement, qn
if headerBgHex is None:
us = getattr(self, '_unifiedStyle', None)
headerBgHex = us["table"]["headerBg"].lstrip('#') if us else '1F3864'
else:
headerBgHex = headerBgHex.lstrip('#')
if headerFgHex is None:
us = getattr(self, '_unifiedStyle', None)
headerFgHex = us["table"]["headerFg"].lstrip('#') if us else 'FFFFFF'
else:
headerFgHex = headerFgHex.lstrip('#')
tr = OxmlElement('w:tr')
# Row properties for header
if isHeader:
trPr = OxmlElement('w:trPr')
tblHeader = OxmlElement('w:tblHeader')
trPr.append(tblHeader)
trPr.append(OxmlElement('w:tblHeader'))
tr.append(trPr)
for cellText in cells:
# Create cell
tc = OxmlElement('w:tc')
# Cell properties
tcPr = OxmlElement('w:tcPr')
tcW = OxmlElement('w:tcW')
tcW.set(qn('w:type'), 'auto')
tcW.set(qn('w:w'), '0')
tcPr.append(tcW)
# Header cell styling - light blue background
if isHeader:
shd = OxmlElement('w:shd')
shd.set(qn('w:val'), 'clear')
shd.set(qn('w:color'), 'auto')
shd.set(qn('w:fill'), '4472C4') # Professional blue
shd.set(qn('w:fill'), headerBgHex)
tcPr.append(shd)
tc.append(tcPr)
# Paragraph with text
p = OxmlElement('w:p')
# Add run with text
r = OxmlElement('w:r')
# Header text styling - bold and white
if isHeader:
rPr = OxmlElement('w:rPr')
b = OxmlElement('w:b')
rPr.append(b)
# White text color
rPr.append(OxmlElement('w:b'))
color = OxmlElement('w:color')
color.set(qn('w:val'), 'FFFFFF')
color.set(qn('w:val'), headerFgHex)
rPr.append(color)
r.append(rPr)
# Text element
t = OxmlElement('w:t')
# Preserve spaces if text starts/ends with whitespace
if cellText and (cellText[0] == ' ' or cellText[-1] == ' '):
t.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
t.text = cellText
r.append(t)
p.append(r)
tc.append(p)
tr.append(tc)
return tr
def _applyHorizontalBordersOnly(self, table) -> None:
@ -836,47 +874,37 @@ class RendererDocx(BaseRenderer):
def _renderJsonBulletList(self, doc: Document, list_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
"""Render a JSON bullet list to DOCX using AI-generated styles - OPTIMIZED for performance."""
try:
# Extract from nested content structure
content = list_data.get("content", {})
if not isinstance(content, dict):
return
items = content.get("items", [])
bullet_style = styles.get("bullet_list", {})
# Pre-calculate and cache style objects to avoid repeated parsing
font_size_pt = None
font_size_pt = Pt(bullet_style["font_size"]) if bullet_style.get("font_size") else None
text_color_rgb = None
if bullet_style:
if "font_size" in bullet_style:
font_size_pt = Pt(bullet_style["font_size"])
if "color" in bullet_style:
color_hex = bullet_style["color"].lstrip('#')
text_color_rgb = RGBColor(int(color_hex[0:2], 16), int(color_hex[2:4], 16), int(color_hex[4:6], 16))
if bullet_style.get("color"):
color_hex = bullet_style["color"].lstrip('#')
text_color_rgb = RGBColor(int(color_hex[0:2], 16), int(color_hex[2:4], 16), int(color_hex[4:6], 16))
for item in items:
itemText = item if isinstance(item, str) else (item.get("text", "") if isinstance(item, dict) else "")
if not itemText:
itemRuns = self._inlineRunsForListItem(item)
if not itemRuns or not any(r.get("value") for r in itemRuns):
continue
para = doc.add_paragraph(style='List Bullet')
self._addMarkdownInlineRuns(para, itemText)
# Apply bullet list styling from style set - use cached objects
if bullet_style and para.runs:
# Use direct access instead of iterating
if len(para.runs) > 0:
run = para.runs[0]
if font_size_pt:
run.font.size = font_size_pt
if text_color_rgb:
run.font.color.rgb = text_color_rgb
else:
# Create run if none exists
run = para.add_run()
if font_size_pt:
run.font.size = font_size_pt
if text_color_rgb:
run.font.color.rgb = text_color_rgb
isNewRunFormat = isinstance(item, list)
if isNewRunFormat:
self._renderInlineRuns(itemRuns, para, styles)
else:
itemText = "".join(r.get("value", "") for r in itemRuns)
self._addMarkdownInlineRuns(para, itemText)
if bullet_style and para.runs and len(para.runs) > 0:
run = para.runs[0]
if font_size_pt:
run.font.size = font_size_pt
if text_color_rgb:
run.font.color.rgb = text_color_rgb
except Exception as e:
self.logger.warning(f"Error rendering bullet list: {str(e)}")
@ -905,90 +933,79 @@ class RendererDocx(BaseRenderer):
def _renderJsonParagraph(self, doc: Document, paragraph_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
"""Render a JSON paragraph to DOCX using AI-generated styles."""
try:
# Extract from nested content structure
content = paragraph_data.get("content", {})
if isinstance(content, dict):
text = content.get("text", "")
inlineRuns = self._inlineRunsFromContent(content)
elif isinstance(content, str):
text = content
inlineRuns = [{"type": "text", "value": content}]
else:
text = ""
# CRITICAL: Prevent rendering base64 image data as text
# Base64 image data typically starts with /9j/ (JPEG) or iVBORw0KGgo (PNG)
if text and (text.startswith("/9j/") or text.startswith("iVBORw0KGgo") or
(len(text) > 100 and all(c in "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=" for c in text[:100]))):
# This looks like base64 data - don't render as text
self.logger.warning(f"Skipping rendering of what appears to be base64 data in paragraph (length: {len(text)})")
inlineRuns = []
if not inlineRuns:
return
plainText = "".join(r.get("value", "") for r in inlineRuns)
if plainText and (plainText.startswith("/9j/") or plainText.startswith("iVBORw0KGgo") or
(len(plainText) > 100 and all(c in "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=" for c in plainText[:100]))):
self.logger.warning(f"Skipping rendering of what appears to be base64 data in paragraph (length: {len(plainText)})")
para = doc.add_paragraph("[Error: Image data found in text content - image embedding may have failed]")
if para.runs:
para.runs[0].font.color.rgb = RGBColor(255, 0, 0) # Red color for error
para.runs[0].font.color.rgb = RGBColor(255, 0, 0)
return
if text:
para = doc.add_paragraph()
self._addMarkdownInlineRuns(para, text)
paragraph_style = styles.get("paragraph", {})
if paragraph_style:
# Pre-calculate and cache style objects
font_size_pt = None
text_color_rgb = None
if "font_size" in paragraph_style:
font_size_pt = Pt(paragraph_style["font_size"])
if "color" in paragraph_style:
color_hex = paragraph_style["color"].lstrip('#')
text_color_rgb = RGBColor(int(color_hex[0:2], 16), int(color_hex[2:4], 16), int(color_hex[4:6], 16))
bold = paragraph_style.get("bold", False)
# Use direct access instead of iterating
if len(para.runs) > 0:
run = para.runs[0]
if font_size_pt:
run.font.size = font_size_pt
run.font.bold = bold
if text_color_rgb:
run.font.color.rgb = text_color_rgb
para = doc.add_paragraph()
hasNewRuns = content.get("inlineRuns") if isinstance(content, dict) else None
if hasNewRuns:
self._renderInlineRuns(inlineRuns, para, styles)
else:
self._addMarkdownInlineRuns(para, plainText)
paragraph_style = styles.get("paragraph", {})
if paragraph_style:
font_size_pt = Pt(paragraph_style["font_size"]) if "font_size" in paragraph_style else None
text_color_rgb = None
if "color" in paragraph_style:
color_hex = paragraph_style["color"].lstrip('#')
text_color_rgb = RGBColor(int(color_hex[0:2], 16), int(color_hex[2:4], 16), int(color_hex[4:6], 16))
bold = paragraph_style.get("bold", False)
if len(para.runs) > 0:
run = para.runs[0]
if font_size_pt:
run.font.size = font_size_pt
run.font.bold = bold
if text_color_rgb:
run.font.color.rgb = text_color_rgb
if "align" in paragraph_style:
align = paragraph_style["align"]
if align == "center":
para.alignment = WD_ALIGN_PARAGRAPH.CENTER
elif align == "right":
para.alignment = WD_ALIGN_PARAGRAPH.RIGHT
else:
# Create run if none exists
run = para.add_run()
if font_size_pt:
run.font.size = font_size_pt
run.font.bold = bold
if text_color_rgb:
run.font.color.rgb = text_color_rgb
if "align" in paragraph_style:
align = paragraph_style["align"]
if align == "center":
para.alignment = WD_ALIGN_PARAGRAPH.CENTER
elif align == "right":
para.alignment = WD_ALIGN_PARAGRAPH.RIGHT
else:
para.alignment = WD_ALIGN_PARAGRAPH.LEFT
para.alignment = WD_ALIGN_PARAGRAPH.LEFT
except Exception as e:
self.logger.warning(f"Error rendering paragraph: {str(e)}")
def _renderJsonCodeBlock(self, doc: Document, code_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
"""Render a JSON code block to DOCX using AI-generated styles."""
try:
# Extract from nested content structure
content = code_data.get("content", {})
if not isinstance(content, dict):
return
code = content.get("code", "")
language = content.get("language", "")
code_style = styles.get("code_block", {})
us = getattr(self, '_unifiedStyle', None)
if code:
if language:
lang_para = doc.add_paragraph(f"Code ({language}):")
if len(lang_para.runs) > 0:
lang_para.runs[0].bold = True
# Pre-calculate and cache style objects
code_font_name = code_style.get("font", "Courier New")
code_font_size_pt = Pt(code_style.get("font_size", 9))
code_font_name = code_style.get("font", us["fonts"]["monospace"] if us else "Courier New")
code_font_size_pt = Pt(code_style.get("font_size", us["codeBlock"]["fontSizePt"] if us else 9))
code_text_color_rgb = None
if "color" in code_style:
color_hex = code_style["color"].lstrip('#')
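The DOCX paths above repeat the same hex-to-RGBColor conversion inline. A small hypothetical helper (not in the diff) would factor it out; python-docx's RGBColor takes three 0-255 ints:

from docx.shared import RGBColor

def hexToRgbColor(hexColor: str) -> RGBColor:
    """Convert '#1F3864' (or '1F3864') into a python-docx RGBColor."""
    h = hexColor.lstrip('#')
    return RGBColor(int(h[0:2], 16), int(h[2:4], 16), int(h[4:6], 16))

# hexToRgbColor("#FF0000") == RGBColor(255, 0, 0)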

View file

@ -40,7 +40,7 @@ class RendererHtml(BaseRenderer):
from modules.datamodels.datamodelJson import supportedSectionTypes
return list(supportedSectionTypes)
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]:
"""
Render HTML document with images as separate files.
Returns list of documents: [HTML document, image1, image2, ...]
@ -54,7 +54,7 @@ class RendererHtml(BaseRenderer):
self._renderedImages = images
# Generate HTML using AI-analyzed styling
htmlContent = await self._generateHtmlFromJson(extractedContent, title, userPrompt, aiService)
htmlContent = await self._generateHtmlFromJson(extractedContent, title, userPrompt, aiService, style=style)
# Replace base64 data URIs with relative file paths if images exist
if images:
@ -107,11 +107,16 @@ class RendererHtml(BaseRenderer):
return resultDocuments
async def _generateHtmlFromJson(self, jsonContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> str:
async def _generateHtmlFromJson(self, jsonContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> str:
"""Generate HTML content from structured JSON document using AI-generated styling."""
try:
# Get style set: use styles from metadata if available, otherwise enhance with AI
styles = await self._getStyleSet(jsonContent, userPrompt, aiService)
# Use unified style when provided, otherwise fall back to existing flow
if style:
styles = self._convertUnifiedStyleToInternal(style)
self._unifiedStyle = style
else:
styles = await self._getStyleSet(jsonContent, userPrompt, aiService)
self._unifiedStyle = None
# Validate JSON structure
if not self._validateJsonStructure(jsonContent):
@ -272,6 +277,10 @@ class RendererHtml(BaseRenderer):
def _generateCssStyles(self, styles: Dict[str, Any]) -> str:
"""Generate CSS from style definitions."""
# When unified style is available, generate CSS directly from it
if getattr(self, "_unifiedStyle", None):
return self._generateCssFromUnifiedStyle(self._unifiedStyle)
css_parts = []
# Body styles
@ -368,6 +377,164 @@ class RendererHtml(BaseRenderer):
return '\n'.join(css_parts)
def _generateCssFromUnifiedStyle(self, style: Dict[str, Any]) -> str:
"""Generate CSS directly from unified style dict."""
fonts = style.get("fonts", {})
colors = style.get("colors", {})
headings = style.get("headings", {})
para = style.get("paragraph", {})
tbl = style.get("table", {})
lst = style.get("list", {})
cb = style.get("codeBlock", {})
page = style.get("page", {})
primaryFont = fonts.get("primary", "Arial, sans-serif")
monoFont = fonts.get("monospace", "Courier New, monospace")
bgColor = colors.get("background", "#FFFFFF")
primaryColor = colors.get("primary", "#1F3864")
paraColor = para.get("color", "#333333")
paraSizePt = para.get("sizePt", 11)
lineSpacing = para.get("lineSpacing", 1.15)
css_parts = []
# Body
css_parts.append("body {")
css_parts.append(f" font-family: {primaryFont};")
css_parts.append(f" background: {bgColor};")
css_parts.append(f" color: {paraColor};")
css_parts.append(f" font-size: {paraSizePt}pt;")
css_parts.append(f" line-height: {lineSpacing};")
margins = page.get("marginsPt", {})
if margins:
css_parts.append(f" margin: {margins.get('top', 60)}pt {margins.get('right', 60)}pt {margins.get('bottom', 60)}pt {margins.get('left', 60)}pt;")
else:
css_parts.append(" margin: 0; padding: 20px;")
css_parts.append("}")
# Document title (uses h1 style)
h1 = headings.get("h1", {})
css_parts.append(".document-title {")
css_parts.append(f" font-size: {h1.get('sizePt', 24)}pt;")
css_parts.append(f" color: {h1.get('color', primaryColor)};")
css_parts.append(f" font-weight: {h1.get('weight', 'bold')};")
css_parts.append(" margin: 0 0 1em 0;")
css_parts.append("}")
# Headings h1-h4
for level in range(1, 5):
key = f"h{level}"
h = headings.get(key, h1 if level == 1 else headings.get(f"h{level-1}", {}))
css_parts.append(f"h{level} {{")
css_parts.append(f" font-size: {h.get('sizePt', max(24 - (level-1)*4, 12))}pt;")
css_parts.append(f" color: {h.get('color', primaryColor)};")
css_parts.append(f" font-weight: {h.get('weight', 'bold')};")
css_parts.append(f" margin: 1.2em 0 0.4em 0;")
css_parts.append("}")
# Paragraphs
css_parts.append("p {")
css_parts.append(f" font-size: {paraSizePt}pt;")
css_parts.append(f" color: {paraColor};")
css_parts.append(f" line-height: {lineSpacing};")
css_parts.append(" margin: 0 0 1em 0;")
css_parts.append("}")
# Tables
borderColor = tbl.get("borderColor", "#DEE2E6")
css_parts.append("table {")
css_parts.append(f" border-collapse: collapse;")
css_parts.append(f" width: 100%;")
css_parts.append(f" margin: 1em 0;")
css_parts.append(f" border: 1px solid {borderColor};")
css_parts.append("}")
# Table headers
css_parts.append("th {")
css_parts.append(f" background: {tbl.get('headerBg', '#1F3864')};")
css_parts.append(f" color: {tbl.get('headerFg', '#FFFFFF')};")
css_parts.append(" font-weight: bold;")
css_parts.append(" text-align: center;")
css_parts.append(f" padding: 10px;")
css_parts.append(f" border: 1px solid {borderColor};")
css_parts.append("}")
# Table cells
css_parts.append("td {")
css_parts.append(f" color: {paraColor};")
css_parts.append(" padding: 8px;")
css_parts.append(f" border: 1px solid {borderColor};")
css_parts.append("}")
# Lists
css_parts.append("ul {")
css_parts.append(f" font-size: {lst.get('sizePt', paraSizePt)}pt;")
css_parts.append(f" color: {paraColor};")
css_parts.append(f" padding-left: {lst.get('indentPt', 18)}pt;")
css_parts.append(" margin: 0 0 1em 0;")
css_parts.append("}")
# Code blocks
css_parts.append("pre {")
css_parts.append(f" font-family: {monoFont};")
css_parts.append(f" font-size: {cb.get('fontSizePt', 9)}pt;")
css_parts.append(f" color: {paraColor};")
css_parts.append(f" background: {cb.get('background', '#F8F9FA')};")
css_parts.append(f" border: 1px solid {cb.get('borderColor', '#E2E8F0')};")
css_parts.append(" border-radius: 4px;")
css_parts.append(" padding: 1em;")
css_parts.append(" margin: 1em 0;")
css_parts.append(" overflow-x: auto;")
css_parts.append("}")
# Images
css_parts.append("img {")
css_parts.append(" max-width: 100%;")
css_parts.append(" height: auto;")
css_parts.append(" margin: 1em 0;")
css_parts.append(" border-radius: 4px;")
css_parts.append("}")
# Generated info
css_parts.append(".generated-info {")
css_parts.append(" font-size: 0.9em;")
css_parts.append(" color: #666;")
css_parts.append(" text-align: center;")
css_parts.append(" margin-top: 2em;")
css_parts.append(" padding-top: 1em;")
css_parts.append(" border-top: 1px solid #ddd;")
css_parts.append("}")
return '\n'.join(css_parts)
def _renderInlineRuns(self, runs: list) -> str:
"""Convert inline runs to HTML markup."""
import html as htmlLib
parts = []
for run in runs:
runType = run.get("type", "text")
value = htmlLib.escape(run.get("value", ""))
if runType == "text":
parts.append(value)
elif runType == "bold":
parts.append(f"<strong>{value}</strong>")
elif runType == "italic":
parts.append(f"<em>{value}</em>")
elif runType == "code":
parts.append(f"<code>{value}</code>")
elif runType == "link":
href = htmlLib.escape(run.get("href", ""))
parts.append(f'<a href="{href}">{value}</a>')
elif runType == "image":
b64 = run.get("base64Data", "")
mime = run.get("mimeType", "image/png")
alt = value
if b64:
parts.append(f'<img src="data:{mime};base64,{b64}" alt="{alt}" style="max-width:100%;height:auto;">')
else:
parts.append(value)
return "".join(parts)
def _renderJsonSection(self, section: Dict[str, Any], styles: Dict[str, Any]) -> str:
"""Render a single JSON section to HTML using AI-generated styles.
Supports three content formats: reference, object (base64), extracted_text.
@ -419,6 +586,11 @@ class RendererHtml(BaseRenderer):
# Regular paragraph element - extract from nested content structure (standard JSON format)
content = element.get("content", {})
if isinstance(content, dict):
# New format: inlineRuns
inlineRuns = content.get("inlineRuns")
if inlineRuns and isinstance(inlineRuns, list):
htmlParts.append(f'<p>{self._renderInlineRuns(inlineRuns)}</p>')
continue
text = content.get("text", "")
elif isinstance(content, str):
text = content
@ -495,7 +667,8 @@ class RendererHtml(BaseRenderer):
# Table header
htmlParts.append('<thead><tr>')
for header in headers:
htmlParts.append(f'<th>{header}</th>')
runs = self._inlineRunsForCell(header)
htmlParts.append(f'<th>{self._renderInlineRuns(runs)}</th>')
htmlParts.append('</tr></thead>')
# Table body
@ -503,7 +676,8 @@ class RendererHtml(BaseRenderer):
for row in rows:
htmlParts.append('<tr>')
for cellData in row:
htmlParts.append(f'<td>{cellData}</td>')
runs = self._inlineRunsForCell(cellData)
htmlParts.append(f'<td>{self._renderInlineRuns(runs)}</td>')
htmlParts.append('</tr>')
htmlParts.append('</tbody>')
@ -528,10 +702,8 @@ class RendererHtml(BaseRenderer):
htmlParts = ['<ul>']
for item in items:
if isinstance(item, str):
htmlParts.append(f'<li>{item}</li>')
elif isinstance(item, dict) and "text" in item:
htmlParts.append(f'<li>{item["text"]}</li>')
runs = self._inlineRunsForListItem(item)
htmlParts.append(f'<li>{self._renderInlineRuns(runs)}</li>')
htmlParts.append('</ul>')
return '\n'.join(htmlParts)
@ -571,6 +743,11 @@ class RendererHtml(BaseRenderer):
if isinstance(el, dict):
content = el.get("content", {})
if isinstance(content, dict):
# New format: inlineRuns
inlineRuns = content.get("inlineRuns")
if inlineRuns and isinstance(inlineRuns, list):
texts.append(self._renderInlineRuns(inlineRuns))
continue
text = content.get("text", "")
elif isinstance(content, str):
text = content
@ -581,16 +758,18 @@ class RendererHtml(BaseRenderer):
elif isinstance(el, str):
texts.append(el)
if texts:
# Join multiple paragraphs with <p> tags
return '\n'.join(f'<p>{text}</p>' for text in texts)
return ""
elif isinstance(paragraphData, str):
return f'<p>{paragraphData}</p>'
elif isinstance(paragraphData, dict):
# Handle nested content structure: element.content vs element.text
# Extract from nested content structure
content = paragraphData.get("content", {})
if isinstance(content, dict):
# New format: inlineRuns
inlineRuns = content.get("inlineRuns")
if inlineRuns and isinstance(inlineRuns, list):
return f'<p>{self._renderInlineRuns(inlineRuns)}</p>'
text = content.get("text", "")
elif isinstance(content, str):
text = content
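What the HTML inline-run conversion produces, restated as a standalone sketch of the text/bold/link branches; values are HTML-escaped before wrapping, matching the renderer above:

import html

def renderInlineRunsHtml(runs: list) -> str:
    """Standalone re-statement of the renderer's text/bold/link branches."""
    parts = []
    for run in runs:
        value = html.escape(run.get("value", ""))
        kind = run.get("type", "text")
        if kind == "bold":
            parts.append(f"<strong>{value}</strong>")
        elif kind == "link":
            parts.append(f'<a href="{html.escape(run.get("href", ""))}">{value}</a>')
        else:
            parts.append(value)
    return "".join(parts)

print(renderInlineRunsHtml([{"type": "bold", "value": "CHF 1200"}]))
# -> <strong>CHF 1200</strong>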

View file

@ -43,8 +43,17 @@ class RendererImage(BaseRenderer):
"""
return ["image"]
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
async def render(
self,
extractedContent: Dict[str, Any],
title: str,
userPrompt: str = None,
aiService=None,
*,
style: Dict[str, Any] = None,
) -> List[RenderedDocument]:
"""Render extracted JSON content to image format using AI image generation."""
_ = style
try:
# Generate AI image from content
imageContent = await self._generateAiImage(extractedContent, title, userPrompt, aiService)

View file

@ -42,8 +42,17 @@ class RendererJson(BaseRenderer):
# Return all types except image
return [st for st in supportedSectionTypes if st != "image"]
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
async def render(
self,
extractedContent: Dict[str, Any],
title: str,
userPrompt: str = None,
aiService=None,
*,
style: Dict[str, Any] = None,
) -> List[RenderedDocument]:
"""Render extracted JSON content to JSON format."""
_ = style
try:
# The extracted content should already be JSON from the AI
# Just validate and format it

View file

@ -40,8 +40,17 @@ class RendererMarkdown(BaseRenderer):
from modules.datamodels.datamodelJson import supportedSectionTypes
return [st for st in supportedSectionTypes if st != "image"]
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
async def render(
self,
extractedContent: Dict[str, Any],
title: str,
userPrompt: str = None,
aiService=None,
*,
style: Dict[str, Any] = None,
) -> List[RenderedDocument]:
"""Render extracted JSON content to Markdown format."""
_ = style
try:
# Generate markdown from JSON structure
markdownContent = self._generateMarkdownFromJson(extractedContent, title)

View file

@ -106,17 +106,17 @@ class RendererPdf(BaseRenderer):
from modules.datamodels.datamodelJson import supportedSectionTypes
return list(supportedSectionTypes)
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]:
"""Render extracted JSON content to PDF format using AI-analyzed styling."""
try:
if not REPORTLAB_AVAILABLE:
# Fallback to HTML if reportlab not available
from .rendererHtml import RendererHtml
html_renderer = RendererHtml()
return await html_renderer.render(extractedContent, title, userPrompt, aiService)
return await html_renderer.render(extractedContent, title, userPrompt, aiService, style=style)
# Generate PDF using AI-analyzed styling
pdf_content = await self._generatePdfFromJson(extractedContent, title, userPrompt, aiService)
pdf_content = await self._generatePdfFromJson(extractedContent, title, userPrompt, aiService, unifiedStyle=style)
# Extract metadata for document type and other info
metadata = extractedContent.get("metadata", {}) if extractedContent else {}
@ -163,11 +163,28 @@ class RendererPdf(BaseRenderer):
)
]
async def _generatePdfFromJson(self, json_content: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> str:
async def _generatePdfFromJson(self, json_content: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, unifiedStyle: Dict[str, Any] = None) -> str:
"""Generate PDF content from structured JSON document using AI-generated styling."""
try:
# Get style set: use styles from metadata if available, otherwise enhance with AI
styles = await self._getStyleSet(json_content, userPrompt, aiService)
# Get style set from unified style or legacy approach
if unifiedStyle:
styles = self._convertUnifiedStyleToInternal(unifiedStyle)
self._unifiedStyle = unifiedStyle
for level in range(1, 7):
hKey = f"heading{level}"
if hKey not in styles:
styles[hKey] = self._defaultHeadingStyleDef(level)
else:
styles[hKey].setdefault("space_after", 12)
styles[hKey].setdefault("space_before", 12)
styles["paragraph"].setdefault("space_after", 6)
styles["paragraph"].setdefault("line_height", unifiedStyle["paragraph"].get("lineSpacing", 1.2))
styles["bullet_list"].setdefault("space_after", 3)
styles["code_block"].setdefault("space_after", 6)
styles["code_block"].setdefault("align", "left")
else:
styles = await self._getStyleSet(json_content, userPrompt, aiService)
self._unifiedStyle = None
# Validate JSON structure
if not self._validateJsonStructure(json_content):
@ -179,15 +196,13 @@ class RendererPdf(BaseRenderer):
# Create a buffer to hold the PDF
buffer = io.BytesIO()
# Create PDF document
doc = SimpleDocTemplate(
buffer,
pagesize=A4,
rightMargin=72,
leftMargin=72,
topMargin=72,
bottomMargin=18
)
# Create PDF document with unified page margins or defaults
pageCfg = unifiedStyle["page"] if unifiedStyle else None
if pageCfg:
m = pageCfg["marginsPt"]
doc = SimpleDocTemplate(buffer, pagesize=A4, rightMargin=m["right"], leftMargin=m["left"], topMargin=m["top"], bottomMargin=m["bottom"])
else:
doc = SimpleDocTemplate(buffer, pagesize=A4, rightMargin=72, leftMargin=72, topMargin=72, bottomMargin=18)
# Build PDF content (no cover page — body starts on page 1; filename still uses `title`)
story = []
@ -232,13 +247,28 @@ class RendererPdf(BaseRenderer):
removed = False
for idx, flowable in enumerate(story):
fRepr = repr(flowable)
if "Image" in fRepr and hasattr(flowable, 'drawWidth') and hasattr(flowable, 'drawHeight'):
from reportlab.platypus import Image as ReportLabImage
if isinstance(flowable, ReportLabImage):
frameH = 650.0
frameW = 450.0
if flowable.drawHeight > frameH or flowable.drawWidth > frameW:
scaleW = frameW / flowable.drawWidth if flowable.drawWidth > frameW else 1.0
scaleH = frameH / flowable.drawHeight if flowable.drawHeight > frameH else 1.0
s = min(scaleW, scaleH) * 0.9
flowable.drawWidth = flowable.drawWidth * s
flowable.drawHeight = flowable.drawHeight * s
flowable._width = flowable.drawWidth
flowable._height = flowable.drawHeight
removed = True
break
if "Table" in fRepr and hasattr(flowable, '_cellvalues'):
try:
nRows = len(flowable._cellvalues)
nCols = len(flowable._cellvalues[0]) if flowable._cellvalues else 0
if nRows == 1 and nCols == 1:
errPara = Paragraph(
"[Code block omitted — content too large for PDF page]",
"[Code block omitted - content too large for PDF page]",
self._createNormalStyle({}),
)
story[idx] = errPara
@ -609,6 +639,31 @@ class RendererPdf(BaseRenderer):
.replace(">", "&gt;")
)
def _renderInlineRunsToPdfXml(self, runs: list) -> str:
"""Convert inline runs to ReportLab Paragraph XML."""
parts = []
us = getattr(self, '_unifiedStyle', None)
monoFont = us["fonts"]["monospace"] if us else "Courier"
for run in runs:
runType = run.get("type", "text")
value = self._escapeReportlabXml(run.get("value", ""))
if runType == "text":
parts.append(value)
elif runType == "bold":
parts.append(f"<b>{value}</b>")
elif runType == "italic":
parts.append(f"<i>{value}</i>")
elif runType == "code":
parts.append(f'<font name="{monoFont}">{value}</font>')
elif runType == "link":
href = self._escapeReportlabXml(run.get("href", ""))
parts.append(f'<a href="{href}">{value}</a>')
elif runType == "image":
parts.append(f"[Image: {value}]")
else:
parts.append(value)
return "".join(parts)
def _applyInlineMarkdownToEscapedPlain(self, text: str) -> str:
"""Escape XML then apply bold/italic to a segment with no `code` spans (code is handled separately)."""
if not text:
@ -744,10 +799,10 @@ class RendererPdf(BaseRenderer):
return []
headers = content.get("headers", [])
rows = content.get("rows", [])
if not headers or not rows:
return []
numCols = len(headers)
colWidth = _PDF_CONTENT_WIDTH_PT / max(numCols, 1)
colWidths = [colWidth] * numCols
@ -755,8 +810,12 @@ class RendererPdf(BaseRenderer):
hdrPs = self._createTableCellParagraphStyle(styles, header=True, tableStyleKey="table_header")
cellPs = self._createTableCellParagraphStyle(styles, header=False, tableStyleKey="table_cell")
def _cellPara(val, ps):
return self._paragraphFromInlineMarkdown(str(val) if val is not None else "", ps)
def _cellPara(cell, ps):
runs = self._inlineRunsForCell(cell)
if isinstance(cell, list):
xml = self._renderInlineRunsToPdfXml(runs)
return Paragraph(_wrapEmojiSpansInXml(xml), ps)
return self._paragraphFromInlineMarkdown(str(cell) if cell is not None else "", ps)
headerRow = [_cellPara(h, hdrPs) for h in headers]
bodyRows = []
@ -786,7 +845,7 @@ class RendererPdf(BaseRenderer):
]
table.setStyle(TableStyle(table_style))
return [table, Spacer(1, 12)]
except Exception as e:
self.logger.warning(f"Error rendering table: {str(e)}")
return []
@ -794,32 +853,29 @@ class RendererPdf(BaseRenderer):
def _renderJsonBulletList(self, list_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
"""Render a JSON bullet list to PDF elements using AI-generated styles."""
try:
# Extract from nested content structure
content = list_data.get("content", {})
if not isinstance(content, dict):
return []
items = content.get("items", [])
bullet_style_def = styles.get("bullet_list", {})
bulletStyleDef = styles.get("bullet_list", {})
normalStyle = self._createNormalStyle(styles)
elements = []
for item in items:
if isinstance(item, str):
elements.append(
Paragraph(f"{self._markdownInlineToReportlabXml(item)}", self._createNormalStyle(styles))
)
runs = self._inlineRunsForListItem(item)
if isinstance(item, list):
xml = self._renderInlineRunsToPdfXml(runs)
elements.append(Paragraph(f"\u2022 {_wrapEmojiSpansInXml(xml)}", normalStyle))
elif isinstance(item, str):
elements.append(Paragraph(f"\u2022 {self._markdownInlineToReportlabXml(item)}", normalStyle))
elif isinstance(item, dict) and "text" in item:
elements.append(
Paragraph(
f"{self._markdownInlineToReportlabXml(item['text'])}",
self._createNormalStyle(styles),
)
)
elements.append(Paragraph(f"\u2022 {self._markdownInlineToReportlabXml(item['text'])}", normalStyle))
if elements:
elements.append(Spacer(1, bullet_style_def.get("space_after", 3)))
elements.append(Spacer(1, bulletStyleDef.get("space_after", 3)))
return elements
except Exception as e:
self.logger.warning(f"Error rendering bullet list: {str(e)}")
return []
@ -848,20 +904,27 @@ class RendererPdf(BaseRenderer):
def _renderJsonParagraph(self, paragraph_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]:
"""Render a JSON paragraph to PDF elements using AI-generated styles."""
try:
# Extract from nested content structure
content = paragraph_data.get("content", {})
if isinstance(content, dict):
text = content.get("text", "")
elif isinstance(content, str):
text = content
else:
text = ""
if isinstance(content, str):
content = {"text": content}
if not isinstance(content, dict):
return []
normalStyle = self._createNormalStyle(styles)
if "inlineRuns" in content:
runs = self._inlineRunsFromContent(content)
xml = self._renderInlineRunsToPdfXml(runs)
if xml:
return [Paragraph(_wrapEmojiSpansInXml(xml), normalStyle)]
return []
text = content.get("text", "")
if text:
return [self._paragraphFromInlineMarkdown(text, self._createNormalStyle(styles))]
return [self._paragraphFromInlineMarkdown(text, normalStyle)]
return []
except Exception as e:
self.logger.warning(f"Error rendering paragraph: {str(e)}")
return []
@ -1030,20 +1093,18 @@ class RendererPdf(BaseRenderer):
pilImage = PILImage.open(imageStream)
originalWidth, originalHeight = pilImage.size
# Calculate available page dimensions (A4 with margins: 72pt left/right, 72pt top, 18pt bottom)
pageWidth = A4[0] # 595.27 points
pageHeight = A4[1] # 841.89 points
leftMargin = 72
rightMargin = 72
topMargin = 72
bottomMargin = 18
# Use actual frame dimensions from SimpleDocTemplate
# Frame is smaller than page minus margins due to internal spacing
# From error message: frame is 439.27559055118115 x 739.8897637795277
# Use conservative values with safety margin
availableWidth = 430.0 # Slightly smaller than frame width for safety
availableHeight = 730.0 # Slightly smaller than frame height for safety
# Use page dimensions minus margins with generous safety buffer
# A4 = 595.27 x 841.89 pt; frame = page - margins - internal padding
_us = getattr(self, '_unifiedStyle', None) or {}
_pageMgn = (_us.get('page') or {}).get('marginsPt') or {}
marginTop = _pageMgn.get('top', 60)
marginBottom = _pageMgn.get('bottom', 60)
marginLeft = _pageMgn.get('left', 60)
marginRight = _pageMgn.get('right', 60)
availableWidth = pageWidth - marginLeft - marginRight - 20 # 20pt safety
availableHeight = pageHeight - marginTop - marginBottom - 80 # 80pt safety for header/footer
# Convert original image size from pixels to points
# PIL provides size in pixels, need to convert to points
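The margin-aware image sizing above reduces to simple fit math. A sketch with the pixel-to-point conversion assumed at 72 dpi and the same 20pt/80pt safety buffers as the diff:

A4_WIDTH_PT, A4_HEIGHT_PT = 595.27, 841.89

def fitImageToFrame(widthPx: int, heightPx: int, margins: dict, dpi: float = 72.0):
    """Scale a pixel-sized image to fit the printable A4 frame, preserving aspect ratio."""
    widthPt = widthPx * 72.0 / dpi
    heightPt = heightPx * 72.0 / dpi
    availW = A4_WIDTH_PT - margins.get("left", 60) - margins.get("right", 60) - 20   # 20pt safety
    availH = A4_HEIGHT_PT - margins.get("top", 60) - margins.get("bottom", 60) - 80  # 80pt safety for header/footer
    scale = min(availW / widthPt, availH / heightPt, 1.0)  # never upscale
    return widthPt * scale, heightPt * scale

# fitImageToFrame(2480, 3508, {"top": 60, "bottom": 60, "left": 60, "right": 60})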

View file

@ -59,7 +59,7 @@ class RendererPptx(BaseRenderer):
from modules.datamodels.datamodelJson import supportedSectionTypes
return list(supportedSectionTypes)
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]:
"""
Render content as PowerPoint presentation from JSON data.
@ -68,7 +68,7 @@ class RendererPptx(BaseRenderer):
title: Title for the presentation
userPrompt: User prompt for AI styling
aiService: AI service for styling
**kwargs: Additional rendering options
style: Unified style dict from pipeline (preferred over AI-generated styles)
Returns:
Base64-encoded PowerPoint presentation as string
@ -81,8 +81,19 @@ class RendererPptx(BaseRenderer):
from pptx.dml.color import RGBColor
import re
# Get style set: use styles from metadata if available, otherwise enhance with AI
styles = await self._getStyleSet(extractedContent, userPrompt, aiService)
# Get style set: prefer unified style, then metadata, then AI-enhanced
if style:
internalStyle = self._convertUnifiedStyleToInternal(style)
defaultPptx = self._getDefaultStyleSet()
for key in ("slide_size", "content_per_slide", "design_theme", "color_scheme", "background_style", "accent_colors", "professional_grade", "executive_ready"):
internalStyle[key] = defaultPptx.get(key)
internalStyle["heading"] = internalStyle["heading1"]
internalStyle["subheading"] = internalStyle["heading2"]
styles = internalStyle
self._unifiedStyle = style
else:
styles = await self._getStyleSet(extractedContent, userPrompt, aiService)
self._unifiedStyle = None
# Create new presentation
prs = Presentation()
@ -910,15 +921,17 @@ JSON ONLY. NO OTHER TEXT."""
# Extract from nested content structure
content = paragraph_data.get("content", {})
if isinstance(content, dict):
text = content.get("text", "")
if content.get("inlineRuns"):
text = "".join(r.get("value", "") for r in content["inlineRuns"])
else:
text = content.get("text", "")
elif isinstance(content, str):
text = content
else:
text = ""
if text:
# Limit paragraph length based on content density
max_length = 200 # Default limit
max_length = 200
if len(text) > max_length:
text = text[:max_length] + "..."
@ -1303,6 +1316,32 @@ JSON ONLY. NO OTHER TEXT."""
r.text = text[pos:]
_applyBase(r)
def _renderInlineRunsPptx(self, runs, paragraph, fontSize=None, fontColor=None):
"""Process InlineRun dicts into pptx text runs."""
from pptx.util import Pt
paragraph.text = ""
us = getattr(self, '_unifiedStyle', None)
monoFont = us["fonts"]["monospace"] if us else "Courier New"
for run in runs:
runType = run.get("type", "text")
value = run.get("value", "")
r = paragraph.add_run()
r.text = value
if fontSize:
r.font.size = fontSize
if fontColor:
r.font.color.rgb = fontColor
if runType == "bold":
r.font.bold = True
elif runType == "italic":
r.font.italic = True
elif runType == "code":
r.font.name = monoFont
if fontSize and hasattr(fontSize, 'pt'):
r.font.size = Pt(max(8, int(fontSize.pt * 0.85)))
elif runType == "link":
r.font.underline = True
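For illustration, the run dicts this helper consumes have the following shape (values hypothetical; the schema matches _parseInlineRuns in the markdown converter further below):

runs = [
    {"type": "text", "value": "See "},
    {"type": "bold", "value": "chapter 2"},
    {"type": "code", "value": "renderReport()"},
    {"type": "link", "value": "docs", "href": "https://example.com"},
]
# self._renderInlineRunsPptx(runs, paragraph, fontSize=Pt(14), fontColor=RGBColor(0x2F, 0x2F, 0x2F))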
def _addTableToSlide(self, slide, element: Dict[str, Any], styles: Dict[str, Any], top: float = None, max_width: float = None) -> None:
"""Add a PowerPoint table to slide."""
try:
@ -1374,7 +1413,8 @@ JSON ONLY. NO OTHER TEXT."""
cell = table.cell(0, col_idx)
# Clear existing text and set new text
cell.text_frame.clear()
header_text = str(header) if header else ""
cellRuns = self._inlineRunsForCell(header)
header_text = "".join(r.get("value", "") for r in cellRuns)
cell.text = header_text
# Ensure paragraph exists
@ -1420,7 +1460,8 @@ JSON ONLY. NO OTHER TEXT."""
cell = table.cell(row_idx, col_idx)
# Clear existing text and set new text
cell.text_frame.clear()
cell_text = str(cell_data) if cell_data is not None else ""
cellRuns = self._inlineRunsForCell(cell_data)
cell_text = "".join(r.get("value", "") for r in cellRuns)
cell.text = cell_text
# Ensure paragraph exists
@ -1462,9 +1503,8 @@ JSON ONLY. NO OTHER TEXT."""
fontColor = RGBColor(*self._getSafeColor(listStyle.get("color", (47, 47, 47))))
for item in items:
itemText = item.get("text", "") if isinstance(item, dict) else str(item)
if not itemText or not itemText.strip():
continue
runs = self._inlineRunsForListItem(item)
isNewFormat = isinstance(item, list)
p = text_frame.add_paragraph()
p.level = 0
@ -1472,21 +1512,33 @@ JSON ONLY. NO OTHER TEXT."""
p.space_before = Pt(2)
p.space_after = Pt(2)
# Consistent bullet prefix
self._addMarkdownInlineRuns(p, f"{itemText}", fontSize=fontSize, fontColor=fontColor, fontBold=False)
if isNewFormat:
bulletRuns = [{"type": "text", "value": " \u2022 "}] + runs
self._renderInlineRunsPptx(bulletRuns, p, fontSize=fontSize, fontColor=fontColor)
else:
itemText = item.get("text", "") if isinstance(item, dict) else str(item)
if not itemText or not itemText.strip():
continue
self._addMarkdownInlineRuns(p, f" \u2022 {itemText}", fontSize=fontSize, fontColor=fontColor, fontBold=False)
# Subitems
# Subitems (only for dict-style items)
if isinstance(item, dict):
for sub in item.get("subitems", []):
subText = sub.get("text", "") if isinstance(sub, dict) else str(sub)
if not subText:
continue
subRuns = self._inlineRunsForListItem(sub)
isSubNew = isinstance(sub, list)
sp = text_frame.add_paragraph()
sp.level = 0
sp.alignment = PP_ALIGN.LEFT
sp.space_before = Pt(1)
sp.space_after = Pt(1)
self._addMarkdownInlineRuns(sp, f" {subText}", fontSize=fontSize, fontColor=fontColor, fontBold=False)
if isSubNew:
subBulletRuns = [{"type": "text", "value": " \u2013 "}] + subRuns
self._renderInlineRunsPptx(subBulletRuns, sp, fontSize=fontSize, fontColor=fontColor)
else:
subText = sub.get("text", "") if isinstance(sub, dict) else str(sub)
if not subText:
continue
self._addMarkdownInlineRuns(sp, f" \u2013 {subText}", fontSize=fontSize, fontColor=fontColor, fontBold=False)
except Exception as e:
logger.warning(f"Error adding bullet list to slide: {str(e)}")
@ -1540,42 +1592,53 @@ JSON ONLY. NO OTHER TEXT."""
# Extract from nested content structure
content = element.get("content", {})
if isinstance(content, dict):
inlineRuns = self._inlineRunsFromContent(content)
hasInlineRuns = content.get("inlineRuns") is not None
text = content.get("text", "")
elif isinstance(content, str):
text = content
inlineRuns = [{"type": "text", "value": text}] if text else []
hasInlineRuns = False
else:
text = ""
inlineRuns = []
hasInlineRuns = False
if text:
p = text_frame.add_paragraph()
p.level = 0
try:
if hasattr(p, 'paragraph_format'):
p.paragraph_format.bullet.type = None
except (AttributeError, TypeError):
pass
paragraph_style = styles.get("paragraph", {})
base_font_size = paragraph_style.get("font_size", 14)
calculated_size = max(10, int(base_font_size * font_size_multiplier))
fSize = Pt(calculated_size)
fColor = RGBColor(*self._getSafeColor(paragraph_style.get("color", (47, 47, 47))))
fBold = paragraph_style.get("bold", False)
if not inlineRuns and not text:
return
p = text_frame.add_paragraph()
p.level = 0
try:
if hasattr(p, 'paragraph_format'):
p.paragraph_format.bullet.type = None
except (AttributeError, TypeError):
pass
paragraph_style = styles.get("paragraph", {})
base_font_size = paragraph_style.get("font_size", 14)
calculated_size = max(10, int(base_font_size * font_size_multiplier))
fSize = Pt(calculated_size)
fColor = RGBColor(*self._getSafeColor(paragraph_style.get("color", (47, 47, 47))))
fBold = paragraph_style.get("bold", False)
if hasInlineRuns:
self._renderInlineRunsPptx(inlineRuns, p, fontSize=fSize, fontColor=fColor)
else:
self._addMarkdownInlineRuns(p, text, fontSize=fSize, fontColor=fColor, fontBold=fBold)
# Add proper spacing
p.space_before = Pt(6) # Space before paragraph
p.space_after = Pt(6) # Space after paragraph
p.line_spacing = 1.2 # Line spacing for readability
align = paragraph_style.get("align", "left")
if align == "center":
p.alignment = PP_ALIGN.CENTER
elif align == "right":
p.alignment = PP_ALIGN.RIGHT
else:
p.alignment = PP_ALIGN.LEFT
p.space_before = Pt(6)
p.space_after = Pt(6)
p.line_spacing = 1.2
align = paragraph_style.get("align", "left")
if align == "center":
p.alignment = PP_ALIGN.CENTER
elif align == "right":
p.alignment = PP_ALIGN.RIGHT
else:
p.alignment = PP_ALIGN.LEFT
except Exception as e:
logger.warning(f"Error adding paragraph to slide: {str(e)}")

View file

@ -8,7 +8,7 @@ import re
from .documentRendererBaseTemplate import BaseRenderer
from modules.datamodels.datamodelDocument import RenderedDocument
from typing import Dict, Any, List, Optional
from typing import Dict, Any, List, Optional, Union
class RendererText(BaseRenderer):
"""Renders content to plain text format with format-specific extraction."""
@ -76,8 +76,17 @@ class RendererText(BaseRenderer):
# Text renderer accepts all types except images
return [st for st in supportedSectionTypes if st != "image"]
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
async def render(
self,
extractedContent: Dict[str, Any],
title: str,
userPrompt: str = None,
aiService=None,
*,
style: Dict[str, Any] = None,
) -> List[RenderedDocument]:
"""Render extracted JSON content to plain text format."""
_ = style # unified style from renderReport; plain text ignores formatting hints
try:
# Generate text from JSON structure
textContent = self._generateTextFromJson(extractedContent, title)
@ -263,16 +272,16 @@ class RendererText(BaseRenderer):
textParts = []
# Create table header
headerLine = " | ".join(str(header) for header in headers)
headerLine = " | ".join(self._tableCellToPlainText(h) for h in headers)
textParts.append(headerLine)
# Add separator line
separatorLine = " | ".join("-" * len(str(header)) for header in headers)
separatorLine = " | ".join("-" * len(self._tableCellToPlainText(h)) for h in headers)
textParts.append(separatorLine)
# Add data rows
for row in rows:
rowLine = " | ".join(str(cellData) for cellData in row)
rowLine = " | ".join(self._tableCellToPlainText(cellData) for cellData in row)
textParts.append(rowLine)
return '\n'.join(textParts)
@ -299,6 +308,9 @@ class RendererText(BaseRenderer):
textParts.append(f"- {self._stripMarkdownForPlainText(item)}")
elif isinstance(item, dict) and "text" in item:
textParts.append(f"- {self._stripMarkdownForPlainText(item['text'])}")
elif isinstance(item, list):
# markdownToDocumentJson: each item is List[InlineRun]
textParts.append(f"- {self._inlineRunsToPlainText(item)}")
return '\n'.join(textParts)
@ -345,12 +357,54 @@ class RendererText(BaseRenderer):
text = re.sub(r'`([^`]+)`', r'\1', text)
return text.strip()
def _inlineRunsToPlainText(self, runs: Union[List[Any], Any]) -> str:
"""Flatten InlineRun dicts (from markdownToDocumentJson) to a single string."""
if runs is None:
return ""
if isinstance(runs, dict):
runs = [runs]
if not isinstance(runs, list):
return self._stripMarkdownForPlainText(str(runs))
parts: List[str] = []
for run in runs:
if not isinstance(run, dict):
parts.append(str(run))
continue
t = run.get("type") or "text"
val = run.get("value", "")
if t == "text":
parts.append(str(val))
elif t in ("bold", "italic", "code"):
parts.append(str(val))
elif t == "link":
parts.append(str(val))
elif t == "image":
parts.append(f"[{val}]")
else:
parts.append(str(val))
return "".join(parts)
def _tableCellToPlainText(self, cell: Any) -> str:
"""Table header/cell: plain str, legacy dict, or List[InlineRun]."""
if cell is None:
return ""
if isinstance(cell, str):
return self._stripMarkdownForPlainText(cell)
if isinstance(cell, list):
return self._inlineRunsToPlainText(cell)
if isinstance(cell, dict) and "text" in cell:
return self._stripMarkdownForPlainText(str(cell["text"]))
return self._stripMarkdownForPlainText(str(cell))
def _renderJsonParagraph(self, paragraphData: Dict[str, Any]) -> str:
"""Render a JSON paragraph to text. Strips markdown for plain text output."""
try:
# Extract from nested content structure
content = paragraphData.get("content", {})
if isinstance(content, dict):
runs = self._inlineRunsFromContent(content)
if runs:
return self._stripMarkdownForPlainText(self._inlineRunsToPlainText(runs))
text = content.get("text", "")
elif isinstance(content, str):
text = content

View file

@ -68,17 +68,17 @@ class RendererXlsx(BaseRenderer):
from modules.datamodels.datamodelJson import supportedSectionTypes
return list(supportedSectionTypes)
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]:
"""Render extracted JSON content to Excel format using AI-analyzed styling."""
try:
if not OPENPYXL_AVAILABLE:
# Fallback to CSV if openpyxl not available
from .rendererCsv import RendererCsv
csvRenderer = RendererCsv()
return await csvRenderer.render(extractedContent, title, userPrompt, aiService)
return await csvRenderer.render(extractedContent, title, userPrompt, aiService, style=style)
# Generate Excel using AI-analyzed styling
excelContent = await self._generateExcelFromJson(extractedContent, title, userPrompt, aiService)
excelContent = await self._generateExcelFromJson(extractedContent, title, userPrompt, aiService, style=style)
# Extract metadata for document type and other info
metadata = extractedContent.get("metadata", {}) if extractedContent else {}
@ -298,15 +298,22 @@ class RendererXlsx(BaseRenderer):
except Exception as e:
self.logger.warning(f"Could not populate analysis sheet: {str(e)}")
async def _generateExcelFromJson(self, jsonContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> str:
async def _generateExcelFromJson(self, jsonContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> str:
"""Generate Excel content from structured JSON document using AI-generated styling."""
try:
# Debug output
self.services.utils.debugLogToFile(f"EXCEL JSON CONTENT TYPE: {type(jsonContent)}", "EXCEL_RENDERER")
self.services.utils.debugLogToFile(f"EXCEL JSON CONTENT KEYS: {list(jsonContent.keys()) if isinstance(jsonContent, dict) else 'Not a dict'}", "EXCEL_RENDERER")
# Get style set: use styles from metadata if available, otherwise enhance with AI
styles = await self._getStyleSet(jsonContent, userPrompt, aiService)
# Store unified style for use by inline-run helpers
self._unifiedStyle = style
# Get style set: prefer unified style, fall back to legacy approach
if style:
styles = self._convertUnifiedStyleToInternal(style)
styles = self._convertColorsFormat(styles)
else:
styles = await self._getStyleSet(jsonContent, userPrompt, aiService)
# Validate JSON structure (standardized schema: {metadata: {...}, documents: [{sections: [...]}]})
if not self._validateJsonStructure(jsonContent):
@ -511,6 +518,10 @@ class RendererXlsx(BaseRenderer):
"code_block": {"font": "Courier New", "font_size": 10, "color": "FF2F2F2F", "background": "FFF5F5F5"}
}
def _renderInlineRuns(self, runs: list) -> str:
"""Flatten inline runs to plain text for Excel cells."""
return "".join(r.get("value", "") for r in runs)
async def _getAiStylesWithExcelColors(self, aiService, styleTemplate: str, defaultStyles: Dict[str, Any]) -> Dict[str, Any]:
"""Get AI styles with proper Excel color conversion."""
if not aiService:
@ -1206,7 +1217,9 @@ class RendererXlsx(BaseRenderer):
# Add headers with formatting - OPTIMIZED: use cached style objects
for col, header in enumerate(headers, 1):
sanitized_header = self._sanitizeCellValue(header)
runs = self._inlineRunsForCell(header)
headerText = self._renderInlineRuns(runs)
sanitized_header = self._sanitizeCellValue(headerText)
cell = sheet.cell(row=headerRow, column=col, value=sanitized_header)
# Apply styling with fallbacks - use pre-calculated objects
@ -1272,7 +1285,9 @@ class RendererXlsx(BaseRenderer):
cell_values = cell_values[:header_count]
for col, cell_value in enumerate(cell_values, 1):
sanitized_value = self._sanitizeCellValue(cell_value)
runs = self._inlineRunsForCell(cell_value)
cellText = self._renderInlineRuns(runs)
sanitized_value = self._sanitizeCellValue(cellText)
cell = sheet.cell(row=startRow, column=col, value=sanitized_value)
# Apply styling with fallbacks - use pre-calculated objects
@ -1311,20 +1326,20 @@ class RendererXlsx(BaseRenderer):
def _addListToExcel(self, sheet, element: Dict[str, Any], styles: Dict[str, Any], startRow: int) -> int:
"""Add a list element to Excel sheet. Expects nested content structure."""
try:
# Extract from nested content structure
content = element.get("content", {})
if not isinstance(content, dict):
return startRow
list_items = content.get("items") or []
# Ensure list_items is a list
if not isinstance(list_items, list):
list_items = []
listItems = content.get("items") or []
if not isinstance(listItems, list):
listItems = []
list_style = styles.get("bullet_list", {})
for item in list_items:
sheet.cell(row=startRow, column=1, value=f"{item}")
if list_style.get("color"):
sheet.cell(row=startRow, column=1).font = Font(color=self._getSafeColor(list_style["color"]))
listStyle = styles.get("bullet_list", {})
for item in listItems:
runs = self._inlineRunsForListItem(item)
text = self._renderInlineRuns(runs)
sheet.cell(row=startRow, column=1, value=f"\u2022 {text}")
if listStyle.get("color"):
sheet.cell(row=startRow, column=1).font = Font(color=self._getSafeColor(listStyle["color"]))
startRow += 1
return startRow
@ -1336,10 +1351,10 @@ class RendererXlsx(BaseRenderer):
def _addParagraphToExcel(self, sheet, element: Dict[str, Any], styles: Dict[str, Any], startRow: int) -> int:
"""Add a paragraph element to Excel sheet. Expects nested content structure."""
try:
# Extract from nested content structure
content = element.get("content", {})
if isinstance(content, dict):
text = content.get("text", "")
runs = self._inlineRunsFromContent(content)
text = self._renderInlineRuns(runs)
elif isinstance(content, str):
text = content
else:

View file

@ -0,0 +1,75 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Default style definitions and style resolution for document rendering."""
from typing import Any, Dict
DEFAULT_STYLE: Dict[str, Any] = {
"fonts": {
"primary": "Calibri",
"monospace": "Consolas",
},
"colors": {
"primary": "#1F3864",
"secondary": "#2C3E50",
"accent": "#2980B9",
"background": "#FFFFFF",
},
"headings": {
"h1": {"sizePt": 24, "weight": "bold", "color": "#1F3864", "spaceBeforePt": 12, "spaceAfterPt": 6},
"h2": {"sizePt": 18, "weight": "bold", "color": "#1F3864", "spaceBeforePt": 10, "spaceAfterPt": 4},
"h3": {"sizePt": 14, "weight": "bold", "color": "#2C3E50", "spaceBeforePt": 8, "spaceAfterPt": 3},
"h4": {"sizePt": 12, "weight": "bold", "color": "#2C3E50", "spaceBeforePt": 6, "spaceAfterPt": 2},
},
"paragraph": {"sizePt": 11, "lineSpacing": 1.15, "color": "#333333"},
"table": {
"headerBg": "#1F3864",
"headerFg": "#FFFFFF",
"headerSizePt": 10,
"bodySizePt": 10,
"rowBandingEven": "#F2F6FC",
"rowBandingOdd": "#FFFFFF",
"borderColor": "#CBD5E1",
"borderWidthPt": 0.5,
},
"list": {"bulletChar": "\u2022", "indentPt": 18, "sizePt": 11},
"image": {"defaultWidthPt": 480, "maxWidthPt": 800, "alignment": "center"},
"codeBlock": {"fontSizePt": 9, "background": "#F8F9FA", "borderColor": "#E2E8F0"},
"page": {
"format": "A4",
"marginsPt": {"top": 60, "bottom": 60, "left": 60, "right": 60},
"showPageNumbers": True,
"headerHeight": 30,
"footerHeight": 30,
"headerLogo": None,
"headerText": "",
"footerText": "",
},
}
def _deepMerge(base: Dict[str, Any], override: Dict[str, Any]) -> Dict[str, Any]:
"""Recursively merge override into base. Both dicts left unchanged; returns new dict."""
result = {}
for key in base:
if key in override:
baseVal = base[key]
overVal = override[key]
if isinstance(baseVal, dict) and isinstance(overVal, dict):
result[key] = _deepMerge(baseVal, overVal)
else:
result[key] = overVal
else:
result[key] = base[key]
for key in override:
if key not in base:
result[key] = override[key]
return result
def resolveStyle(agentStyle: dict | None) -> Dict[str, Any]:
"""Deep-merge DEFAULT_STYLE <- agentStyle. Returns fully resolved style dict."""
if not agentStyle:
return dict(DEFAULT_STYLE)
return _deepMerge(DEFAULT_STYLE, agentStyle)
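A usage sketch of the merge semantics (override values hypothetical):

agentStyle = {"colors": {"accent": "#E74C3C"}, "page": {"marginsPt": {"top": 40}}}
resolved = resolveStyle(agentStyle)
assert resolved["colors"]["accent"] == "#E74C3C"        # overridden
assert resolved["colors"]["primary"] == "#1F3864"       # default preserved
assert resolved["page"]["marginsPt"]["top"] == 40       # nested override
assert resolved["page"]["marginsPt"]["bottom"] == 60    # nested default preserved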

View file

@ -9,11 +9,70 @@ from typing import Any, Dict
logger = logging.getLogger(__name__)
def _parseInlineRuns(text: str) -> list:
"""
Parse inline markdown formatting into a list of InlineRun dicts.
Handles: images, links, bold, italic, inline code, plain text.
Uses a regex-based tokenizer that processes tokens left-to-right.
"""
if not text:
return [{"type": "text", "value": ""}]
# Pattern order matters: images before links, bold before italic
_TOKEN_RE = re.compile(
r'!\[(?P<imgAlt>[^\]]*)\]\((?P<imgSrc>[^)"]+)(?:\s+"(?P<imgWidth>\d+)pt")?\)' # image
r'|\[(?P<linkText>[^\]]+)\]\((?P<linkHref>[^)]+)\)' # link
r'|`(?P<code>[^`]+)`' # inline code
r'|\*\*(?P<bold>.+?)\*\*' # bold
r'|(?<!\w)\*(?P<italic1>.+?)\*(?!\w)' # italic *x*
r'|(?<!\w)_(?P<italic2>.+?)_(?!\w)' # italic _x_
)
runs = []
lastEnd = 0
for m in _TOKEN_RE.finditer(text):
# Plain text before this match
if m.start() > lastEnd:
runs.append({"type": "text", "value": text[lastEnd:m.start()]})
if m.group("imgAlt") is not None or m.group("imgSrc") is not None:
alt = (m.group("imgAlt") or "").strip() or "Image"
src = (m.group("imgSrc") or "").strip()
widthStr = m.group("imgWidth")
run = {"type": "image", "value": alt}
if src.startswith("file:"):
run["fileId"] = src[5:]
else:
run["href"] = src
if widthStr:
run["widthPt"] = int(widthStr)
runs.append(run)
elif m.group("linkText") is not None:
runs.append({"type": "link", "value": m.group("linkText"), "href": m.group("linkHref")})
elif m.group("code") is not None:
runs.append({"type": "code", "value": m.group("code")})
elif m.group("bold") is not None:
runs.append({"type": "bold", "value": m.group("bold")})
elif m.group("italic1") is not None:
runs.append({"type": "italic", "value": m.group("italic1")})
elif m.group("italic2") is not None:
runs.append({"type": "italic", "value": m.group("italic2")})
lastEnd = m.end()
# Trailing plain text
if lastEnd < len(text):
runs.append({"type": "text", "value": text[lastEnd:]})
return runs if runs else [{"type": "text", "value": text}]
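For example, a mixed-format line tokenizes as:

# _parseInlineRuns('See **bold** and `code`, plus [docs](https://x.y)')
# -> [{"type": "text", "value": "See "},
#     {"type": "bold", "value": "bold"},
#     {"type": "text", "value": " and "},
#     {"type": "code", "value": "code"},
#     {"type": "text", "value": ", plus "},
#     {"type": "link", "value": "docs", "href": "https://x.y"}]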
def markdownToDocumentJson(markdown: str, title: str, language: str = "de") -> Dict[str, Any]:
"""
Convert markdown content to the standard document JSON format expected by renderReport.
Supports headings, code blocks, tables, lists, images (file: refs), paragraphs.
For plain text: wraps entire content in a single paragraph section.
Convert markdown content to the standard document JSON format with Inline-Run model.
Sections use inlineRuns (list of run dicts) instead of plain text strings.
Supports headings, code blocks, tables, lists, images, paragraphs.
"""
if not isinstance(markdown, str):
markdown = str(markdown) if markdown else ""
@ -31,7 +90,7 @@ def markdownToDocumentJson(markdown: str, title: str, language: str = "de") -> D
while i < len(lines):
line = lines[i]
# Headings
# Headings (plain text, no inline formatting)
headingMatch = re.match(r"^(#{1,6})\s+(.+)", line)
if headingMatch:
level = len(headingMatch.group(1))
@ -43,7 +102,7 @@ def markdownToDocumentJson(markdown: str, title: str, language: str = "de") -> D
i += 1
continue
# Fenced code blocks
# Fenced code blocks (no inline formatting)
codeMatch = re.match(r"^```(\w*)", line)
if codeMatch:
lang = codeMatch.group(1) or "text"
@ -59,14 +118,14 @@ def markdownToDocumentJson(markdown: str, title: str, language: str = "de") -> D
})
continue
# Tables
# Tables - cells are List[InlineRun]
tableMatch = re.match(r"^\|(.+)\|$", line)
if tableMatch and (i + 1) < len(lines) and re.match(r"^\|[\s\-:|]+\|$", lines[i + 1]):
headerCells = [c.strip() for c in tableMatch.group(1).split("|")]
headerCells = [_parseInlineRuns(c.strip()) for c in tableMatch.group(1).split("|")]
i += 2
rows = []
while i < len(lines) and re.match(r"^\|(.+)\|$", lines[i]):
rowCells = [c.strip() for c in lines[i][1:-1].split("|")]
rowCells = [_parseInlineRuns(c.strip()) for c in lines[i][1:-1].split("|")]
rows.append(rowCells)
i += 1
sections.append({
@ -75,14 +134,14 @@ def markdownToDocumentJson(markdown: str, title: str, language: str = "de") -> D
})
continue
# Bullet / numbered lists
# Bullet / numbered lists - items are List[List[InlineRun]]
listMatch = re.match(r"^(\s*)([-*+]|\d+[.)]) (.+)", line)
if listMatch:
isNumbered = bool(re.match(r"\d+[.)]", listMatch.group(2)))
items = []
while i < len(lines) and re.match(r"^(\s*)([-*+]|\d+[.)]) (.+)", lines[i]):
m = re.match(r"^(\s*)([-*+]|\d+[.)]) (.+)", lines[i])
items.append({"text": m.group(3).strip()})
items.append(_parseInlineRuns(m.group(3).strip()))
i += 1
sections.append({
"id": _nextId(), "content_type": "bullet_list", "order": order,
@ -95,46 +154,50 @@ def markdownToDocumentJson(markdown: str, title: str, language: str = "de") -> D
i += 1
continue
# Images (simplified: store as paragraph with ref for now - full resolution needs Knowledge Store)
imgMatch = re.match(r"^!\[([^\]]*)\]\(([^)]+)\)", line)
# Standalone image on its own line -> block-level image section
imgMatch = re.match(r"^!\[([^\]]*)\]\(([^)\"]+)(?:\s+\"(\d+)pt\")?\)\s*$", line)
if imgMatch:
altText = imgMatch.group(1).strip() or "Image"
src = imgMatch.group(2).strip()
widthStr = imgMatch.group(3)
fileId = src[5:] if src.startswith("file:") else ""
content = {
"altText": altText,
"base64Data": "",
"_fileRef": fileId,
"_srcUrl": src if not fileId else "",
}
if widthStr:
content["widthPt"] = int(widthStr)
sections.append({
"id": _nextId(), "content_type": "image", "order": order,
"elements": [{
"content": {
"altText": altText,
"base64Data": "",
"_fileRef": fileId,
"_srcUrl": src if not fileId else "",
}
}],
"elements": [{"content": content}],
})
i += 1
continue
# Paragraph
# Paragraph - produces inlineRuns
paraLines = []
while i < len(lines) and lines[i].strip() and not re.match(
r"^(#{1,6}\s|```|\|.+\||!\[|(\s*)([-*+]|\d+[.)]) )", lines[i]
r"^(#{1,6}\s|```|\|.+\||!\[[^\]]*\]\([^)]+\)\s*$|(\s*)([-*+]|\d+[.)]) )", lines[i]
):
paraLines.append(lines[i])
i += 1
if paraLines:
combinedText = " ".join(paraLines)
sections.append({
"id": _nextId(), "content_type": "paragraph", "order": order,
"elements": [{"content": {"text": " ".join(paraLines)}}],
"elements": [{"content": {"inlineRuns": _parseInlineRuns(combinedText)}}],
})
continue
i += 1
if not sections:
fallbackText = markdown.strip() or "(empty)"
sections.append({
"id": _nextId(), "content_type": "paragraph", "order": order,
"elements": [{"content": {"text": markdown.strip() or "(empty)"}}],
"elements": [{"content": {"inlineRuns": _parseInlineRuns(fallbackText)}}],
})
return {

View file

@ -2,9 +2,13 @@
# All rights reserved.
"""Knowledge service: 3-tier RAG with indexing, semantic search, and context building."""
import hashlib
import json
import logging
import re
from typing import Any, Callable, Dict, List, Optional
import time
from dataclasses import dataclass, field
from typing import Any, Callable, Dict, List, Optional, Union
from modules.datamodels.datamodelKnowledge import (
FileContentIndex, ContentChunk, WorkflowMemory,
@ -20,6 +24,68 @@ DEFAULT_CHUNK_TOKENS = 400
DEFAULT_CONTEXT_BUDGET = 12000
# =============================================================================
# Ingestion façade (P0 of unified-knowledge-indexing concept)
# =============================================================================
@dataclass
class IngestionJob:
"""One request to add or refresh content in the unified knowledge store.
Callers from any lane (routes, feature hooks, agent tools, connector sync)
describe the work they want done via this object; idempotency, scope
resolution, and embedding are handled by KnowledgeService.requestIngestion.
"""
sourceKind: str
sourceId: str
fileName: str
mimeType: str
userId: str
contentObjects: List[Dict[str, Any]] = field(default_factory=list)
featureInstanceId: str = ""
mandateId: str = ""
structure: Optional[Dict[str, Any]] = None
containerPath: Optional[str] = None
contentVersion: Optional[str] = None
provenance: Optional[Dict[str, Any]] = None
# Connector-driven neutralization: True when the user opted in via §2.6 preferences.
# For sourceKind == "file", _indexFileInternal resolves this from FileItem.neutralize instead.
neutralize: bool = False
@dataclass
class IngestionHandle:
"""Result of requestIngestion. Stable across in-process and future queue impls."""
jobId: str
status: str
contentHash: str
fileId: str
index: Optional[FileContentIndex] = None
error: Optional[str] = None
def _computeIngestionHash(contentObjects: List[Dict[str, Any]]) -> str:
"""Deterministic SHA256 over (contentType, data) tuples in extractor order.
`contentObjectId` is intentionally excluded because extractors generate
fresh UUIDs per run (`uuid.uuid4()`), which would make the hash unstable
across re-extractions of the same source, defeating idempotency.
Order is preserved (no sort) because two different documents can share the
same multiset of parts but differ in arrangement (e.g. swapped pages).
Text whitespace is preserved intentionally because chunk boundaries
depend on it.
"""
normalized = [
(
str(o.get("contentType", "text") or "text"),
o.get("data", "") or "",
)
for o in (contentObjects or [])
]
payload = json.dumps(normalized, ensure_ascii=False, separators=(",", ":"))
return hashlib.sha256(payload.encode("utf-8")).hexdigest()
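A quick check of the stability property described in the docstring (ids hypothetical):

a = [{"contentObjectId": "uuid-run-1", "contentType": "text", "data": "Page 1"}]
b = [{"contentObjectId": "uuid-run-2", "contentType": "text", "data": "Page 1"}]
assert _computeIngestionHash(a) == _computeIngestionHash(b)  # fresh uuid4 per run is ignored
assert _computeIngestionHash(a) != _computeIngestionHash(
    [{"contentType": "text", "data": "Page 1 edited"}]
)  # content changes still flip the hash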
class KnowledgeService:
"""Service for Knowledge Store operations: indexing, retrieval, and context building."""
@ -46,6 +112,224 @@ class KnowledgeService:
results = await self._embed([text])
return results[0] if results else []
# =========================================================================
# Ingestion façade (single entry point for all lanes)
# =========================================================================
async def requestIngestion(self, job: IngestionJob) -> IngestionHandle:
"""Unified entry point for filling the knowledge corpus.
Applies idempotency based on a content hash (or caller-supplied
`contentVersion`) persisted in `FileContentIndex.structure._ingestion`.
Re-runs indexing only when the hash differs or the previous run did
not reach `indexed` state. Runs embedding synchronously for now
(callers already schedule background tasks where needed).
"""
jobId = f"{job.sourceKind}:{job.sourceId}"
startMs = time.time()
contentHash = job.contentVersion or _computeIngestionHash(job.contentObjects)
# 1. Check for duplicate via existing FileContentIndex row.
existing = None
try:
existing = self._knowledgeDb.getFileContentIndex(job.sourceId)
except Exception:
existing = None
if existing:
existingStructure = (
existing.get("structure") if isinstance(existing, dict)
else getattr(existing, "structure", {})
) or {}
existingMeta = existingStructure.get("_ingestion", {}) or {}
existingStatus = (
existing.get("status") if isinstance(existing, dict)
else getattr(existing, "status", "")
) or ""
if existingMeta.get("hash") == contentHash and existingStatus == "indexed":
logger.info(
"ingestion.skipped.duplicate sourceKind=%s sourceId=%s hash=%s",
job.sourceKind, job.sourceId, contentHash[:12],
extra={
"event": "ingestion.skipped.duplicate",
"jobId": jobId,
"sourceKind": job.sourceKind,
"sourceId": job.sourceId,
"hash": contentHash,
"durationMs": int((time.time() - startMs) * 1000),
},
)
return IngestionHandle(
jobId=jobId,
status="duplicate",
contentHash=contentHash,
fileId=job.sourceId,
index=None,
)
# 2. Prepare ingestion metadata; stays in structure._ingestion so
# later connector revoke/purge can filter chunks by sourceKind /
# provenance.connectionId without a schema migration.
ingestionMeta = {
"hash": contentHash,
"sourceKind": job.sourceKind,
"sourceId": job.sourceId,
"contentVersion": job.contentVersion,
"indexedAt": getUtcTimestamp(),
"provenance": dict(job.provenance or {}),
}
structure = dict(job.structure or {})
structure["_ingestion"] = ingestionMeta
logger.info(
"ingestion.queued sourceKind=%s sourceId=%s objects=%d hash=%s",
job.sourceKind, job.sourceId, len(job.contentObjects or []), contentHash[:12],
extra={
"event": "ingestion.queued",
"jobId": jobId,
"sourceKind": job.sourceKind,
"sourceId": job.sourceId,
"hash": contentHash,
"objectCount": len(job.contentObjects or []),
},
)
# 3. Run real indexing.
try:
index = await self._indexFileInternal(
fileId=job.sourceId,
fileName=job.fileName,
mimeType=job.mimeType,
userId=job.userId,
featureInstanceId=job.featureInstanceId,
mandateId=job.mandateId,
contentObjects=job.contentObjects or [],
structure=structure,
containerPath=job.containerPath,
sourceKind=job.sourceKind,
connectionId=(job.provenance or {}).get("connectionId"),
neutralize=job.neutralize,
)
except Exception as exc:
logger.error(
"ingestion.failed sourceKind=%s sourceId=%s error=%s",
job.sourceKind, job.sourceId, exc,
exc_info=True,
extra={
"event": "ingestion.failed",
"jobId": jobId,
"sourceKind": job.sourceKind,
"sourceId": job.sourceId,
"hash": contentHash,
"error": str(exc),
"durationMs": int((time.time() - startMs) * 1000),
},
)
try:
self._knowledgeDb.updateFileStatus(job.sourceId, "failed")
except Exception:
pass
return IngestionHandle(
jobId=jobId,
status="failed",
contentHash=contentHash,
fileId=job.sourceId,
index=None,
error=str(exc),
)
logger.info(
"ingestion.indexed sourceKind=%s sourceId=%s objects=%d durationMs=%d",
job.sourceKind, job.sourceId, len(job.contentObjects or []),
int((time.time() - startMs) * 1000),
extra={
"event": "ingestion.indexed",
"jobId": jobId,
"sourceKind": job.sourceKind,
"sourceId": job.sourceId,
"hash": contentHash,
"objectCount": len(job.contentObjects or []),
"durationMs": int((time.time() - startMs) * 1000),
},
)
return IngestionHandle(
jobId=jobId,
status="indexed",
contentHash=contentHash,
fileId=job.sourceId,
index=index,
)
def purgeConnection(self, connectionId: str) -> Dict[str, int]:
"""Delete every FileContentIndex + ContentChunk linked to a UserConnection.
Called on `connection.revoked` events so the knowledge corpus never
holds chunks the user has withdrawn access to. Returns deletion counts
for observability.
"""
if not connectionId:
return {"indexRows": 0, "chunks": 0}
startMs = time.time()
result = self._knowledgeDb.deleteFileContentIndexByConnectionId(connectionId)
logger.info(
"ingestion.connection.purged connectionId=%s rows=%d chunks=%d durationMs=%d",
connectionId, result["indexRows"], result["chunks"],
int((time.time() - startMs) * 1000),
extra={
"event": "ingestion.connection.purged",
"connectionId": connectionId,
"indexRows": result["indexRows"],
"chunks": result["chunks"],
"durationMs": int((time.time() - startMs) * 1000),
},
)
return result
def getIngestionStatus(
self, handleOrJobId: Union[IngestionHandle, str]
) -> Dict[str, Any]:
"""Map a handle or `sourceKind:sourceId` jobId to a status snapshot."""
if isinstance(handleOrJobId, IngestionHandle):
sourceId = handleOrJobId.fileId
jobId = handleOrJobId.jobId
elif isinstance(handleOrJobId, str) and ":" in handleOrJobId:
jobId = handleOrJobId
sourceId = handleOrJobId.split(":", 1)[1]
else:
jobId = str(handleOrJobId)
sourceId = str(handleOrJobId)
row = None
try:
row = self._knowledgeDb.getFileContentIndex(sourceId)
except Exception:
row = None
if not row:
return {
"jobId": jobId,
"sourceId": sourceId,
"status": "unknown",
"contentHash": None,
}
structure = (
row.get("structure") if isinstance(row, dict)
else getattr(row, "structure", {})
) or {}
meta = structure.get("_ingestion", {}) or {}
status = (
row.get("status") if isinstance(row, dict)
else getattr(row, "status", "")
) or "unknown"
return {
"jobId": jobId,
"sourceId": sourceId,
"status": status,
"contentHash": meta.get("hash"),
"sourceKind": meta.get("sourceKind"),
"indexedAt": meta.get("indexedAt"),
}
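Putting the façade together, a caller from a connector lane might look like this (a sketch; `knowledgeService` stands for whatever KnowledgeService instance the caller resolves, and all ids are hypothetical):

# inside an async caller:
job = IngestionJob(
    sourceKind="sharepoint_item",
    sourceId="sp:abc123",
    fileName="Q4-report.docx",
    mimeType="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    userId="user-1",
    contentObjects=[{"contentType": "text", "data": "Quarterly figures ..."}],
    provenance={"connectionId": "conn-9"},
)
handle = await knowledgeService.requestIngestion(job)
if handle.status == "duplicate":
    pass  # nothing re-embedded; the prior index row stays authoritative
snapshot = knowledgeService.getIngestionStatus(handle)  # or getIngestionStatus("sharepoint_item:sp:abc123")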
# =========================================================================
# File Indexing (called after extraction, before embedding)
# =========================================================================
@ -61,6 +345,57 @@ class KnowledgeService:
contentObjects: List[Dict[str, Any]] = None,
structure: Dict[str, Any] = None,
containerPath: str = None,
) -> Optional[FileContentIndex]:
"""Backward-compatible wrapper delegating to requestIngestion.
Existing callers that still invoke `indexFile` directly automatically
participate in the idempotency/metrics layer. New callers should
prefer `requestIngestion` so they can pass `sourceKind` and
`provenance` for connector revoke/purge later.
"""
job = IngestionJob(
sourceKind="file",
sourceId=fileId,
fileName=fileName,
mimeType=mimeType,
userId=userId,
featureInstanceId=featureInstanceId,
mandateId=mandateId,
contentObjects=list(contentObjects or []),
structure=structure,
containerPath=containerPath,
)
handle = await self.requestIngestion(job)
if handle.index is not None:
return handle.index
if handle.status == "duplicate":
row = None
try:
row = self._knowledgeDb.getFileContentIndex(fileId)
except Exception:
row = None
if isinstance(row, dict):
try:
return FileContentIndex(**row)
except Exception:
return None
return row
return None
async def _indexFileInternal(
self,
fileId: str,
fileName: str,
mimeType: str,
userId: str,
featureInstanceId: str = "",
mandateId: str = "",
contentObjects: List[Dict[str, Any]] = None,
structure: Dict[str, Any] = None,
containerPath: str = None,
sourceKind: str = "file",
connectionId: Optional[str] = None,
neutralize: bool = False,
) -> FileContentIndex:
"""Index a file's content objects and create embeddings for text chunks.
@ -83,39 +418,41 @@ class KnowledgeService:
"""
contentObjects = contentObjects or []
# 1. Resolve scope fields from FileItem (Single Source of Truth)
# FileItem lives in poweron_management; its scope/mandateId/featureInstanceId
# are authoritative and must be mirrored onto the FileContentIndex.
# 1. Resolve scope fields from FileItem (Single Source of Truth) for
# uploaded files. Connector-sourced ingestion (sharepoint_item,
# outlook_message, ...) has no FileItem row — trust the caller's
# scope + ids directly.
resolvedScope = "personal"
resolvedMandateId = mandateId
resolvedFeatureInstanceId = featureInstanceId
resolvedUserId = userId
_shouldNeutralize = False
try:
from modules.datamodels.datamodelFiles import FileItem as _FileItem
_dbComponent = getattr(self._context, "interfaceDbComponent", None)
_fileRecords = _dbComponent.getRecordset(_FileItem, recordFilter={"id": fileId}) if _dbComponent else []
if not _fileRecords:
from modules.interfaces.interfaceDbManagement import ComponentObjects
_row = ComponentObjects().db._loadRecord(_FileItem, fileId)
if _row:
_fileRecords = [_row]
if _fileRecords:
_fileRecord = _fileRecords[0]
_get = (lambda k, d=None: _fileRecord.get(k, d)) if isinstance(_fileRecord, dict) else (lambda k, d=None: getattr(_fileRecord, k, d))
_shouldNeutralize = bool(_get("neutralize", False))
_fileScope = _get("scope")
if _fileScope:
resolvedScope = _fileScope
if not resolvedMandateId:
resolvedMandateId = str(_get("mandateId", "") or "")
if not resolvedFeatureInstanceId:
resolvedFeatureInstanceId = str(_get("featureInstanceId", "") or "")
_fileCreatedBy = _get("sysCreatedBy")
if _fileCreatedBy:
resolvedUserId = str(_fileCreatedBy)
except Exception:
pass
_shouldNeutralize = neutralize # caller-supplied flag (connector prefs / IngestionJob)
if sourceKind == "file":
try:
from modules.datamodels.datamodelFiles import FileItem as _FileItem
_dbComponent = getattr(self._context, "interfaceDbComponent", None)
_fileRecords = _dbComponent.getRecordset(_FileItem, recordFilter={"id": fileId}) if _dbComponent else []
if not _fileRecords:
from modules.interfaces.interfaceDbManagement import ComponentObjects
_row = ComponentObjects().db._loadRecord(_FileItem, fileId)
if _row:
_fileRecords = [_row]
if _fileRecords:
_fileRecord = _fileRecords[0]
_get = (lambda k, d=None: _fileRecord.get(k, d)) if isinstance(_fileRecord, dict) else (lambda k, d=None: getattr(_fileRecord, k, d))
_shouldNeutralize = bool(_get("neutralize", False)) # FileItem is authoritative for uploads
_fileScope = _get("scope")
if _fileScope:
resolvedScope = _fileScope
if not resolvedMandateId:
resolvedMandateId = str(_get("mandateId", "") or "")
if not resolvedFeatureInstanceId:
resolvedFeatureInstanceId = str(_get("featureInstanceId", "") or "")
_fileCreatedBy = _get("sysCreatedBy")
if _fileCreatedBy:
resolvedUserId = str(_fileCreatedBy)
except Exception:
pass
# 2. Create FileContentIndex with correct scope from the start
index = FileContentIndex(
@ -124,6 +461,8 @@ class KnowledgeService:
featureInstanceId=resolvedFeatureInstanceId,
mandateId=resolvedMandateId,
scope=resolvedScope,
sourceKind=sourceKind,
connectionId=connectionId,
fileName=fileName,
mimeType=mimeType,
containerPath=containerPath,
@ -300,7 +639,12 @@ class KnowledgeService:
Formatted context string for injection into the agent's system prompt.
"""
queryVector = await self._embedSingle(currentPrompt)
logger.debug(
"buildAgentContext.start userId=%s featureInstanceId=%s mandateId=%s isSysAdmin=%s prompt=%r",
userId, featureInstanceId, mandateId, isSysAdmin, (currentPrompt or "")[:120],
)
if not queryVector:
logger.debug("buildAgentContext.abort reason=no_query_vector")
return ""
builder = _ContextBuilder(budget=contextBudget)
@ -327,9 +671,14 @@ class KnowledgeService:
featureInstanceId=featureInstanceId,
mandateId=mandateId,
limit=15,
minScore=0.65,
minScore=0.35,
isSysAdmin=isSysAdmin,
)
logger.debug(
"buildAgentContext.layer1 instanceChunks=%d top_scores=%s",
len(instanceChunks),
[round(float(c.get("_score", 0) or 0), 3) for c in (instanceChunks or [])[:3]],
)
if instanceChunks:
builder.add(priority=1, label="Relevant Documents", items=instanceChunks, maxChars=4000)
@ -338,7 +687,7 @@ class KnowledgeService:
queryVector=queryVector,
workflowId=workflowId,
limit=10,
minScore=0.55,
minScore=0.35,
)
if roundMemories:
memItems = []
@ -376,7 +725,7 @@ class KnowledgeService:
scope="mandate",
mandateId=mandateId,
limit=10,
minScore=0.7,
minScore=0.35,
isSysAdmin=isSysAdmin,
)
if mandateChunks:
@ -392,7 +741,12 @@ class KnowledgeService:
maxChars=500,
)
return builder.build()
_result = builder.build()
logger.debug(
"buildAgentContext.done totalChars=%d userId=%s",
len(_result), userId,
)
return _result
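For context on the lowered thresholds: the `_score` values logged above are similarity scores between the query embedding and chunk embeddings, and the unified 0.35 floor admits the score range the current embedding model actually produces. A toy cosine-similarity sketch (illustrative only, not the retrieval implementation):

import math

def cosine(a: list, b: list) -> float:
    dot = sum(x * y for x, y in zip(a, b))
    na = math.sqrt(sum(x * x for x in a))
    nb = math.sqrt(sum(y * y for y in b))
    return dot / (na * nb) if na and nb else 0.0

# a chunk passes the layer-1 filter only if its score >= minScore (now 0.35)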
# =========================================================================
# Workflow Memory

View file

@ -0,0 +1,334 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Connection-lifecycle consumer bridging OAuth events to ingestion jobs.
Subscribes to `connection.established` and `connection.revoked` callbacks
emitted by the OAuth callback and connection-management routes, and dispatches:
- `connection.established` -> enqueue a `connection.bootstrap` BackgroundJob
that walks the connector and ingests all reachable items via
KnowledgeService.requestIngestion (file-like or virtual documents).
- `connection.revoked` -> run `KnowledgeService.purgeConnection` synchronously
so the knowledge corpus releases the data before the UI confirms the revoke.
The consumer is registered once at process boot (see `app.py` lifespan).
It intentionally does NOT hold a per-user service context; each callback
creates whatever context it needs from the UserConnection row itself.
"""
from __future__ import annotations
import asyncio
import logging
from typing import Any, Dict, Optional
from modules.interfaces.interfaceDbKnowledge import getInterface as getKnowledgeInterface
from modules.shared.callbackRegistry import callbackRegistry
from modules.serviceCenter.services.serviceBackgroundJobs import (
registerJobHandler,
startJob,
)
logger = logging.getLogger(__name__)
BOOTSTRAP_JOB_TYPE = "connection.bootstrap"
_registered = False
def _onConnectionEstablished(
*,
connectionId: str,
authority: str,
userId: Optional[str] = None,
**kwargs: Any,
) -> None:
"""Fire-and-forget bootstrap enqueue for a freshly connected UserConnection."""
if not connectionId:
logger.warning("connection.established without connectionId; ignoring")
return
payload: Dict[str, Any] = {
"connectionId": connectionId,
"authority": (authority or "").lower(),
"userId": userId,
}
logger.info(
"ingestion.connection.bootstrap.queued connectionId=%s authority=%s",
connectionId, authority,
extra={
"event": "ingestion.connection.bootstrap.queued",
"connectionId": connectionId,
"authority": authority,
},
)
async def _enqueue() -> None:
try:
await startJob(
BOOTSTRAP_JOB_TYPE,
payload,
triggeredBy=userId,
)
except Exception as exc:
logger.error(
"ingestion.connection.bootstrap.enqueue_failed connectionId=%s error=%s",
connectionId, exc, exc_info=True,
)
try:
    # Prefer scheduling on the already-running event loop; otherwise drive the
    # coroutine to completion synchronously (the callback may fire during boot).
    loop = asyncio.get_event_loop()
    if loop.is_running():
        loop.create_task(_enqueue())
    else:
        loop.run_until_complete(_enqueue())
except RuntimeError:
    asyncio.run(_enqueue())
def _onConnectionRevoked(
*,
connectionId: str,
authority: Optional[str] = None,
userId: Optional[str] = None,
reason: Optional[str] = None,
**kwargs: Any,
) -> None:
"""Run the knowledge purge synchronously so UI feedback is authoritative."""
if not connectionId:
logger.warning("connection.revoked without connectionId; ignoring")
return
try:
# Purge lives on the DB interface to avoid ServiceCenter/user-context
# plumbing here; the service method is a thin wrapper on top of this.
result = getKnowledgeInterface(None).deleteFileContentIndexByConnectionId(connectionId)
except Exception as exc:
logger.error(
"ingestion.connection.purged.failed connectionId=%s error=%s",
connectionId, exc, exc_info=True,
)
return
logger.info(
"ingestion.connection.purged connectionId=%s authority=%s reason=%s rows=%d chunks=%d",
connectionId, authority, reason,
result.get("indexRows", 0), result.get("chunks", 0),
extra={
"event": "ingestion.connection.purged",
"connectionId": connectionId,
"authority": authority,
"reason": reason,
"indexRows": result.get("indexRows", 0),
"chunks": result.get("chunks", 0),
},
)
async def _bootstrapJobHandler(
job: Dict[str, Any],
progressCb,
) -> Dict[str, Any]:
"""Dispatch bootstrap by authority. Each authority runs its own sub-bootstraps."""
payload = job.get("payload") or {}
connectionId = payload.get("connectionId")
authority = (payload.get("authority") or "").lower()
if not connectionId:
raise ValueError("connection.bootstrap requires payload.connectionId")
progressCb(5, f"resolving {authority} connection")
# Defensive consent check: if the connection has since disabled knowledge ingestion
# (e.g. user toggled setting after the job was enqueued), skip all walkers.
try:
from modules.interfaces.interfaceDbApp import getRootInterface
_root = getRootInterface()
_conn = _root.getUserConnectionById(connectionId)
if _conn and not getattr(_conn, "knowledgeIngestionEnabled", True):
logger.info(
"ingestion.connection.bootstrap.skipped — consent disabled connectionId=%s",
connectionId,
extra={
"event": "ingestion.connection.bootstrap.skipped",
"connectionId": connectionId,
"authority": authority,
"reason": "consent_disabled",
},
)
return {"connectionId": connectionId, "authority": authority, "skipped": True, "reason": "consent_disabled"}
except Exception as _guardErr:
logger.debug("Could not load connection for consent guard: %s", _guardErr)
def _normalize(res: Any, label: str) -> Dict[str, Any]:
if isinstance(res, Exception):
logger.error(
"ingestion.connection.bootstrap.failed part=%s connectionId=%s error=%s",
label, connectionId, res, exc_info=res,
)
return {"error": str(res)}
return res or {}
if authority == "msft":
from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncSharepoint import (
bootstrapSharepoint,
)
from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncOutlook import (
bootstrapOutlook,
)
progressCb(10, "sharepoint + outlook")
spResult, olResult = await asyncio.gather(
bootstrapSharepoint(connectionId=connectionId, progressCb=progressCb),
bootstrapOutlook(connectionId=connectionId, progressCb=progressCb),
return_exceptions=True,
)
return {
"connectionId": connectionId,
"authority": authority,
"sharepoint": _normalize(spResult, "sharepoint"),
"outlook": _normalize(olResult, "outlook"),
}
if authority == "google":
from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGdrive import (
bootstrapGdrive,
)
from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGmail import (
bootstrapGmail,
)
progressCb(10, "drive + gmail")
gdResult, gmResult = await asyncio.gather(
bootstrapGdrive(connectionId=connectionId, progressCb=progressCb),
bootstrapGmail(connectionId=connectionId, progressCb=progressCb),
return_exceptions=True,
)
return {
"connectionId": connectionId,
"authority": authority,
"drive": _normalize(gdResult, "gdrive"),
"gmail": _normalize(gmResult, "gmail"),
}
if authority == "clickup":
from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncClickup import (
bootstrapClickup,
)
progressCb(10, "clickup tasks")
cuResult = await bootstrapClickup(connectionId=connectionId, progressCb=progressCb)
return {
"connectionId": connectionId,
"authority": authority,
"clickup": _normalize(cuResult, "clickup"),
}
logger.info(
"ingestion.connection.bootstrap.skipped reason=unsupported_authority authority=%s connectionId=%s",
authority, connectionId,
extra={
"event": "ingestion.connection.bootstrap.skipped",
"authority": authority,
"connectionId": connectionId,
"reason": "unsupported_authority",
},
)
return {
"connectionId": connectionId,
"authority": authority,
"skipped": True,
"reason": "unsupported_authority",
}
async def _scheduledDailyResync() -> None:
"""Enqueue a connection.bootstrap job for every active knowledge connection.
Runs once per day (default 2 AM Europe/Zurich). Each job re-walks the
connector and hands new / changed items to KnowledgeService.requestIngestion.
Unchanged items are deduplicated by content-hash and skipped automatically.
"""
try:
from modules.interfaces.interfaceDbApp import getRootInterface
rootInterface = getRootInterface()
connections = rootInterface.getActiveKnowledgeConnections()
except Exception as exc:
logger.error("knowledge.daily_resync: could not load connections: %s", exc, exc_info=True)
return
if not connections:
logger.info("knowledge.daily_resync: no active knowledge connections — nothing to do")
return
logger.info(
"knowledge.daily_resync: enqueuing bootstrap for %d connection(s)",
len(connections),
extra={"event": "knowledge.daily_resync.started", "count": len(connections)},
)
enqueued = 0
skipped = 0
for conn in connections:
connectionId = str(conn.id)
authority = conn.authority.value if hasattr(conn.authority, "value") else str(conn.authority)
userId = str(conn.userId)
payload: Dict[str, Any] = {
"connectionId": connectionId,
"authority": authority.lower(),
"userId": userId,
}
try:
await startJob(
BOOTSTRAP_JOB_TYPE,
payload,
triggeredBy="scheduler.daily_resync",
)
enqueued += 1
logger.debug(
"knowledge.daily_resync: queued connectionId=%s authority=%s",
connectionId, authority,
)
except Exception as exc:
skipped += 1
logger.error(
"knowledge.daily_resync: failed to enqueue connectionId=%s: %s",
connectionId, exc,
)
logger.info(
"knowledge.daily_resync: done — enqueued=%d skipped=%d",
enqueued, skipped,
extra={"event": "knowledge.daily_resync.done", "enqueued": enqueued, "skipped": skipped},
)
def registerDailyResyncScheduler(*, hour: int = 2, minute: int = 0) -> None:
"""Register the daily knowledge re-sync cron job. Idempotent.
Args:
hour: Hour of day to run (0-23; default 2 = 2 AM Europe/Zurich).
minute: Minute within the hour (default 0).
"""
try:
from modules.shared.eventManagement import eventManager
eventManager.registerCron(
jobId="knowledge.daily_resync",
func=_scheduledDailyResync,
cronKwargs={"hour": str(hour), "minute": str(minute)},
)
logger.info(
"knowledge.daily_resync scheduler registered (daily %02d:%02d Europe/Zurich)",
hour, minute,
)
except Exception as exc:
logger.warning("knowledge.daily_resync scheduler registration failed (non-critical): %s", exc)
def registerKnowledgeIngestionConsumer() -> None:
"""Register callback subscribers + background job handler. Idempotent."""
global _registered
if _registered:
return
callbackRegistry.register("connection.established", _onConnectionEstablished)
callbackRegistry.register("connection.revoked", _onConnectionRevoked)
registerJobHandler(BOOTSTRAP_JOB_TYPE, _bootstrapJobHandler)
registerDailyResyncScheduler()
_registered = True
logger.info("KnowledgeIngestionConsumer registered (established/revoked + %s handler + daily resync)", BOOTSTRAP_JOB_TYPE)

View file

@ -0,0 +1,101 @@
"""Per-connection knowledge ingestion preference helpers.
Walkers call `loadConnectionPrefs(connectionId)` once at bootstrap start and
receive a `ConnectionIngestionPrefs` dataclass they can pass down into their
inner loops. All fields have safe defaults so walkers stay backward-compatible
with connections that predate the §2.6 preference schema (knowledgePreferences
is None).
"""
from __future__ import annotations
import logging
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)
_DEFAULT_MAX_AGE_DAYS = 90
_DEFAULT_MAIL_DEPTH = "full"
_DEFAULT_CLICKUP_SCOPE = "title_description"
@dataclass
class ConnectionIngestionPrefs:
"""Parsed per-connection preferences for knowledge ingestion walkers."""
# PII
neutralizeBeforeEmbed: bool = False
# Mail (Outlook + Gmail)
mailContentDepth: str = _DEFAULT_MAIL_DEPTH # "metadata" | "snippet" | "full"
mailIndexAttachments: bool = False
# Files (Drive / SharePoint / OneDrive)
filesIndexBinaries: bool = True
mimeAllowlist: List[str] = field(default_factory=list) # empty = all allowed
# ClickUp
clickupScope: str = _DEFAULT_CLICKUP_SCOPE # "titles" | "title_description" | "with_comments"
clickupIndexAttachments: bool = False
# Per-authority surface toggles (default everything on)
gmailEnabled: bool = True
driveEnabled: bool = True
sharepointEnabled: bool = True
outlookEnabled: bool = True
# Time window
maxAgeDays: int = _DEFAULT_MAX_AGE_DAYS # 0 = no limit
def loadConnectionPrefs(connectionId: str) -> ConnectionIngestionPrefs:
"""Load and parse per-connection preferences from the database.
Returns safe defaults for any missing or unparseable values so walkers
never fail due to missing preference data.
"""
try:
from modules.interfaces.interfaceDbApp import getRootInterface
root = getRootInterface()
conn = root.getUserConnectionById(connectionId)
if not conn:
logger.debug("loadConnectionPrefs: connection %s not found, using defaults", connectionId)
return ConnectionIngestionPrefs()
raw: Optional[Dict[str, Any]] = getattr(conn, "knowledgePreferences", None)
if not raw or not isinstance(raw, dict):
return ConnectionIngestionPrefs()
def _bool(key: str, default: bool) -> bool:
v = raw.get(key)
return bool(v) if isinstance(v, bool) else default
def _str(key: str, allowed: List[str], default: str) -> str:
v = raw.get(key)
return v if v in allowed else default
def _int(key: str, default: int) -> int:
v = raw.get(key)
return int(v) if isinstance(v, int) else default
surface = raw.get("surfaceToggles") or {}
google_surf = surface.get("google") or {}
msft_surf = surface.get("msft") or {}
return ConnectionIngestionPrefs(
neutralizeBeforeEmbed=_bool("neutralizeBeforeEmbed", False),
mailContentDepth=_str("mailContentDepth", ["metadata", "snippet", "full"], _DEFAULT_MAIL_DEPTH),
mailIndexAttachments=_bool("mailIndexAttachments", False),
filesIndexBinaries=_bool("filesIndexBinaries", True),
mimeAllowlist=list(raw.get("mimeAllowlist") or []),
clickupScope=_str("clickupScope", ["titles", "title_description", "with_comments"], _DEFAULT_CLICKUP_SCOPE),
clickupIndexAttachments=_bool("clickupIndexAttachments", False),
gmailEnabled=bool(google_surf.get("gmail", True)),
driveEnabled=bool(google_surf.get("drive", True)),
sharepointEnabled=bool(msft_surf.get("sharepoint", True)),
outlookEnabled=bool(msft_surf.get("outlook", True)),
maxAgeDays=_int("maxAgeDays", _DEFAULT_MAX_AGE_DAYS),
)
except Exception as exc:
logger.warning("loadConnectionPrefs failed for %s, using defaults: %s", connectionId, exc)
return ConnectionIngestionPrefs()
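A walker would consume these prefs roughly as follows (a sketch; `shouldIndexMessage` and its arguments are hypothetical, only the dataclass fields come from this module):

def shouldIndexMessage(prefs: ConnectionIngestionPrefs, messageAgeDays: int) -> bool:
    """Per-message guard an Outlook walker could apply."""
    if not prefs.outlookEnabled:
        return False  # user switched the Outlook surface off
    if prefs.maxAgeDays and messageAgeDays > prefs.maxAgeDays:
        return False  # outside the configured time window
    return True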

View file

@ -0,0 +1,512 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""ClickUp bootstrap for the unified knowledge ingestion lane.
ClickUp tasks are ingested as *virtual documents*; we never download file
bytes. Each task becomes a `sourceKind="clickup_task"` IngestionJob whose
`contentObjects` carry a summary header (name + status + metadata) and the
task description / text content so retrieval finds them without a live API
call.
Hierarchy traversal: workspace (team) → spaces → folders / folderless lists →
tasks. We cap the fan-out with `maxWorkspaces` / `maxListsPerWorkspace` /
`maxTasks` and skip tasks older than `maxAgeDays` (default 180 d).
Idempotency: `date_updated` from the ClickUp task payload is a millisecond
timestamp and strictly monotonic per revision; it is used as `contentVersion`.
"""
from __future__ import annotations
import hashlib
import logging
import time
from dataclasses import dataclass, field
from datetime import datetime, timedelta, timezone
from typing import Any, Callable, Dict, List, Optional
logger = logging.getLogger(__name__)
MAX_TASKS_DEFAULT = 500
MAX_WORKSPACES_DEFAULT = 3
MAX_LISTS_PER_WORKSPACE_DEFAULT = 20
MAX_DESCRIPTION_CHARS_DEFAULT = 8000
MAX_AGE_DAYS_DEFAULT = 180
@dataclass
class ClickupBootstrapLimits:
maxTasks: int = MAX_TASKS_DEFAULT
maxWorkspaces: int = MAX_WORKSPACES_DEFAULT
maxListsPerWorkspace: int = MAX_LISTS_PER_WORKSPACE_DEFAULT
maxDescriptionChars: int = MAX_DESCRIPTION_CHARS_DEFAULT
# Only ingest tasks updated within the last N days. None disables filter.
maxAgeDays: Optional[int] = MAX_AGE_DAYS_DEFAULT
# Include closed/archived tasks if they still meet the recency filter.
# ClickUp `closed` tasks often carry the most useful RAG context
# ("why was this shipped the way it was?").
includeClosed: bool = True
# Pass-through to IngestionJob.neutralize
neutralize: bool = False
# Content scope: "titles" | "title_description" | "with_comments"
clickupScope: str = "title_description"
@dataclass
class ClickupBootstrapResult:
connectionId: str
indexed: int = 0
skippedDuplicate: int = 0
skippedPolicy: int = 0
failed: int = 0
workspaces: int = 0
lists: int = 0
errors: List[str] = field(default_factory=list)
def _syntheticTaskId(connectionId: str, taskId: str) -> str:
token = hashlib.sha256(f"{connectionId}:{taskId}".encode("utf-8")).hexdigest()[:16]
return f"cu:{connectionId[:8]}:{token}"
def _truncate(value: Any, limit: int) -> str:
text = str(value or "").strip()
if not text:
return ""
if len(text) <= limit:
return text
return text[:limit].rstrip() + "\n[truncated]"
def _isRecent(dateUpdatedMs: Any, maxAgeDays: Optional[int]) -> bool:
if not maxAgeDays:
return True
if not dateUpdatedMs:
return True
try:
ts = datetime.fromtimestamp(int(dateUpdatedMs) / 1000.0, tz=timezone.utc)
except Exception:
return True
cutoff = datetime.now(timezone.utc) - timedelta(days=maxAgeDays)
return ts >= cutoff
def _buildContentObjects(task: Dict[str, Any], limits: ClickupBootstrapLimits) -> List[Dict[str, Any]]:
"""Header (name/status/metadata) + optional description + text_content.
`limits.clickupScope` controls how much is embedded:
- "titles": task name + status metadata only
- "title_description": header + description / text_content (default)
- "with_comments": header + description + text_content
(comments themselves are not yet fetched in v1)
"""
name = task.get("name") or f"Task {task.get('id', '')}"
status = ((task.get("status") or {}).get("status")) or ""
assignees = ", ".join(
filter(None, [
(a.get("username") or a.get("email") or "")
for a in (task.get("assignees") or [])
])
)
tags = ", ".join(filter(None, [t.get("name", "") for t in (task.get("tags") or [])]))
listInfo = task.get("list") or {}
folderInfo = task.get("folder") or {}
spaceInfo = task.get("space") or {}
dueMs = task.get("due_date")
dueIso = ""
if dueMs:
try:
dueIso = datetime.fromtimestamp(int(dueMs) / 1000.0, tz=timezone.utc).strftime("%Y-%m-%d")
except Exception:
dueIso = ""
headerLines = [
f"Task: {name}",
f"Status: {status}" if status else "",
f"List: {listInfo.get('name', '')}" if listInfo else "",
f"Folder: {folderInfo.get('name', '')}" if folderInfo else "",
f"Space: {spaceInfo.get('name', '')}" if spaceInfo else "",
f"Assignees: {assignees}" if assignees else "",
f"Tags: {tags}" if tags else "",
f"Due: {dueIso}" if dueIso else "",
f"Url: {task.get('url', '')}" if task.get("url") else "",
]
header = "\n".join(line for line in headerLines if line)
parts: List[Dict[str, Any]] = [{
"contentObjectId": "header",
"contentType": "text",
"data": header,
"contextRef": {"part": "header"},
}]
scope = getattr(limits, "clickupScope", "title_description")
if scope in ("title_description", "with_comments"):
description = _truncate(task.get("description"), limits.maxDescriptionChars)
if description:
parts.append({
"contentObjectId": "description",
"contentType": "text",
"data": description,
"contextRef": {"part": "description"},
})
# text_content is ClickUp's rendered-markdown version; include if it adds
# something beyond the plain description (common for bullet lists, checklists).
textContent = _truncate(task.get("text_content"), limits.maxDescriptionChars)
if textContent and textContent != description:
parts.append({
"contentObjectId": "text_content",
"contentType": "text",
"data": textContent,
"contextRef": {"part": "text_content"},
})
return parts
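# Illustrative sketch (hypothetical task payload): with the default
# clickupScope="title_description", a task like
#   {"id": "1", "name": "Ship RAG", "status": {"status": "open"},
#    "description": "Design notes ..."}
# yields roughly two content objects:
#   [{"contentObjectId": "header", "data": "Task: Ship RAG\nStatus: open", ...},
#    {"contentObjectId": "description", "data": "Design notes ...", ...}]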
async def bootstrapClickup(
connectionId: str,
*,
progressCb: Optional[Callable[[int, Optional[str]], None]] = None,
adapter: Any = None,
connection: Any = None,
knowledgeService: Any = None,
limits: Optional[ClickupBootstrapLimits] = None,
) -> Dict[str, Any]:
"""Walk workspaces → lists → tasks and ingest each task as a virtual doc."""
from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs
prefs = loadConnectionPrefs(connectionId)
if not limits:
limits = ClickupBootstrapLimits(
maxAgeDays=prefs.maxAgeDays if prefs.maxAgeDays > 0 else None,
neutralize=prefs.neutralizeBeforeEmbed,
clickupScope=prefs.clickupScope,
)
startMs = time.time()
result = ClickupBootstrapResult(connectionId=connectionId)
logger.info(
"ingestion.connection.bootstrap.started part=clickup connectionId=%s",
connectionId,
extra={
"event": "ingestion.connection.bootstrap.started",
"part": "clickup",
"connectionId": connectionId,
},
)
if adapter is None or knowledgeService is None or connection is None:
adapter, connection, knowledgeService = await _resolveDependencies(connectionId)
mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else ""
userId = str(getattr(connection, "userId", "") or "") if connection is not None else ""
svc = getattr(adapter, "_svc", None)
if svc is None:
result.errors.append("adapter missing _svc instance")
return _finalizeResult(connectionId, result, startMs)
try:
teamsResp = await svc.getAuthorizedTeams()
except Exception as exc:
logger.error("clickup team discovery failed for %s: %s", connectionId, exc, exc_info=True)
result.errors.append(f"teams: {exc}")
return _finalizeResult(connectionId, result, startMs)
teams = (teamsResp or {}).get("teams") or []
for team in teams[: limits.maxWorkspaces]:
if result.indexed + result.skippedDuplicate >= limits.maxTasks:
break
teamId = str(team.get("id", "") or "")
if not teamId:
continue
result.workspaces += 1
try:
await _walkTeam(
svc=svc,
knowledgeService=knowledgeService,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
team=team,
limits=limits,
result=result,
progressCb=progressCb,
)
except Exception as exc:
logger.error("clickup team %s walk failed: %s", teamId, exc, exc_info=True)
result.errors.append(f"team({teamId}): {exc}")
return _finalizeResult(connectionId, result, startMs)
async def _resolveDependencies(connectionId: str):
from modules.interfaces.interfaceDbApp import getRootInterface
from modules.auth import TokenManager
from modules.connectors.providerClickup.connectorClickup import ClickupConnector
from modules.serviceCenter import getService
from modules.serviceCenter.context import ServiceCenterContext
from modules.security.rootAccess import getRootUser
rootInterface = getRootInterface()
connection = rootInterface.getUserConnectionById(connectionId)
if connection is None:
raise ValueError(f"UserConnection not found: {connectionId}")
token = TokenManager().getFreshToken(connectionId)
if not token or not token.tokenAccess:
raise ValueError(f"No valid token for connection {connectionId}")
provider = ClickupConnector(connection, token.tokenAccess)
adapter = provider.getServiceAdapter("clickup")
rootUser = getRootUser()
ctx = ServiceCenterContext(
user=rootUser,
mandate_id=str(getattr(connection, "mandateId", "") or ""),
)
knowledgeService = getService("knowledge", ctx)
return adapter, connection, knowledgeService
async def _walkTeam(
*,
svc,
knowledgeService,
connectionId: str,
mandateId: str,
userId: str,
team: Dict[str, Any],
limits: ClickupBootstrapLimits,
result: ClickupBootstrapResult,
progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
teamId = str(team.get("id", "") or "")
spacesResp = await svc.getSpaces(teamId)
spaces = (spacesResp or {}).get("spaces") or []
listsCollected: List[Dict[str, Any]] = []
for space in spaces:
if len(listsCollected) >= limits.maxListsPerWorkspace:
break
spaceId = str(space.get("id", "") or "")
if not spaceId:
continue
# Folderless lists directly under the space
folderless = await svc.getFolderlessLists(spaceId)
for lst in (folderless or {}).get("lists") or []:
if len(listsCollected) >= limits.maxListsPerWorkspace:
break
listsCollected.append({**lst, "_space": space})
# Lists inside folders
foldersResp = await svc.getFolders(spaceId)
for folder in (foldersResp or {}).get("folders") or []:
if len(listsCollected) >= limits.maxListsPerWorkspace:
break
folderId = str(folder.get("id", "") or "")
if not folderId:
continue
folderLists = await svc.getListsInFolder(folderId)
for lst in (folderLists or {}).get("lists") or []:
if len(listsCollected) >= limits.maxListsPerWorkspace:
break
listsCollected.append({**lst, "_space": space, "_folder": folder})
for lst in listsCollected:
if result.indexed + result.skippedDuplicate >= limits.maxTasks:
return
result.lists += 1
await _walkList(
svc=svc,
knowledgeService=knowledgeService,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
teamId=teamId,
lst=lst,
limits=limits,
result=result,
progressCb=progressCb,
)
async def _walkList(
*,
svc,
knowledgeService,
connectionId: str,
mandateId: str,
userId: str,
teamId: str,
lst: Dict[str, Any],
limits: ClickupBootstrapLimits,
result: ClickupBootstrapResult,
progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
listId = str(lst.get("id", "") or "")
if not listId:
return
page = 0
while result.indexed + result.skippedDuplicate < limits.maxTasks:
resp = await svc.getTasksInList(
listId,
page=page,
include_closed=limits.includeClosed,
subtasks=True,
)
if isinstance(resp, dict) and resp.get("error"):
logger.warning("clickup tasks list=%s page=%d error: %s", listId, page, resp.get("error"))
result.errors.append(f"list({listId}): {resp.get('error')}")
return
tasks = (resp or {}).get("tasks") or []
if not tasks:
return
for task in tasks:
if result.indexed + result.skippedDuplicate >= limits.maxTasks:
return
if not _isRecent(task.get("date_updated"), limits.maxAgeDays):
result.skippedPolicy += 1
continue
# Inject the list/folder/space metadata we already loaded.
task["list"] = task.get("list") or {"id": listId, "name": lst.get("name")}
task["folder"] = task.get("folder") or lst.get("_folder") or {}
task["space"] = task.get("space") or lst.get("_space") or {}
await _ingestTask(
knowledgeService=knowledgeService,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
teamId=teamId,
task=task,
limits=limits,
result=result,
progressCb=progressCb,
)
if len(tasks) < 100: # ClickUp page-size hint: fewer than 100 => last page
return
page += 1
async def _ingestTask(
*,
knowledgeService,
connectionId: str,
mandateId: str,
userId: str,
teamId: str,
task: Dict[str, Any],
limits: ClickupBootstrapLimits,
result: ClickupBootstrapResult,
progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
taskId = str(task.get("id", "") or "")
if not taskId:
result.skippedPolicy += 1
return
revision = str(task.get("date_updated") or task.get("date_created") or "")
name = task.get("name") or f"Task {taskId}"
syntheticId = _syntheticTaskId(connectionId, taskId)
fileName = f"{name[:80].strip() or taskId}.task.json"
contentObjects = _buildContentObjects(task, limits)
try:
handle = await knowledgeService.requestIngestion(
IngestionJob(
sourceKind="clickup_task",
sourceId=syntheticId,
fileName=fileName,
mimeType="application/vnd.clickup.task+json",
userId=userId,
mandateId=mandateId,
contentObjects=contentObjects,
contentVersion=revision or None,
neutralize=limits.neutralize,
provenance={
"connectionId": connectionId,
"authority": "clickup",
"service": "clickup",
"externalItemId": taskId,
"teamId": teamId,
"listId": ((task.get("list") or {}).get("id")),
"spaceId": ((task.get("space") or {}).get("id")),
"url": task.get("url"),
"status": ((task.get("status") or {}).get("status")),
"tier": limits.clickupScope,
},
)
)
except Exception as exc:
logger.error("clickup ingestion %s failed: %s", taskId, exc, exc_info=True)
result.failed += 1
result.errors.append(f"ingest({taskId}): {exc}")
return
if handle.status == "duplicate":
result.skippedDuplicate += 1
elif handle.status == "indexed":
result.indexed += 1
else:
result.failed += 1
if progressCb is not None and (result.indexed + result.skippedDuplicate) % 50 == 0:
processed = result.indexed + result.skippedDuplicate
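# Map processed/maxTasks linearly into the 10..90 progress band, clamped
# at 90; the outer bands are presumably reserved for the caller's own
# start/finish updates.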
try:
progressCb(
min(90, 10 + int(80 * processed / max(1, limits.maxTasks))),
f"clickup processed={processed}",
)
except Exception:
pass
logger.info(
"ingestion.connection.bootstrap.progress part=clickup processed=%d skippedDup=%d failed=%d",
processed, result.skippedDuplicate, result.failed,
extra={
"event": "ingestion.connection.bootstrap.progress",
"part": "clickup",
"connectionId": connectionId,
"processed": processed,
"skippedDup": result.skippedDuplicate,
"failed": result.failed,
},
)
def _finalizeResult(connectionId: str, result: ClickupBootstrapResult, startMs: float) -> Dict[str, Any]:
durationMs = int((time.time() - startMs) * 1000)
logger.info(
"ingestion.connection.bootstrap.done part=clickup connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d failed=%d workspaces=%d lists=%d durationMs=%d",
connectionId,
result.indexed, result.skippedDuplicate, result.skippedPolicy,
result.failed, result.workspaces, result.lists, durationMs,
extra={
"event": "ingestion.connection.bootstrap.done",
"part": "clickup",
"connectionId": connectionId,
"indexed": result.indexed,
"skippedDup": result.skippedDuplicate,
"skippedPolicy": result.skippedPolicy,
"failed": result.failed,
"workspaces": result.workspaces,
"lists": result.lists,
"durationMs": durationMs,
},
)
return {
"connectionId": result.connectionId,
"indexed": result.indexed,
"skippedDuplicate": result.skippedDuplicate,
"skippedPolicy": result.skippedPolicy,
"failed": result.failed,
"workspaces": result.workspaces,
"lists": result.lists,
"durationMs": durationMs,
"errors": result.errors[:20],
}


@@ -0,0 +1,443 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Google Drive bootstrap for the unified knowledge ingestion lane.
Mirrors the SharePoint pilot (see subConnectorSyncSharepoint.py). Walks the
user's *My Drive* tree from the virtual `root` folder, downloads each
file-like item via `DriveAdapter.download` (which handles native Google docs
via export), runs the standard extraction pipeline and routes results through
`KnowledgeService.requestIngestion` with `sourceKind="gdrive_item"` and
`contentVersion = modifiedTime` (monotonic per-revision).
"""
from __future__ import annotations
import hashlib
import logging
import time
from dataclasses import dataclass, field
from datetime import datetime, timedelta, timezone
from typing import Any, Callable, Dict, List, Optional
from modules.datamodels.datamodelExtraction import ExtractionOptions
logger = logging.getLogger(__name__)
MAX_ITEMS_DEFAULT = 500
MAX_BYTES_DEFAULT = 200 * 1024 * 1024
MAX_FILE_SIZE_DEFAULT = 25 * 1024 * 1024
SKIP_MIME_PREFIXES_DEFAULT = ("video/", "audio/")
MAX_DEPTH_DEFAULT = 4
MAX_AGE_DAYS_DEFAULT = 365
# Google Drive uses virtual mime-types for folders and non-downloadable assets.
FOLDER_MIME = "application/vnd.google-apps.folder"
@dataclass
class GdriveBootstrapLimits:
maxItems: int = MAX_ITEMS_DEFAULT
maxBytes: int = MAX_BYTES_DEFAULT
maxFileSize: int = MAX_FILE_SIZE_DEFAULT
skipMimePrefixes: tuple = SKIP_MIME_PREFIXES_DEFAULT
maxDepth: int = MAX_DEPTH_DEFAULT
# Only ingest files modified within the last N days. None disables filter.
maxAgeDays: Optional[int] = MAX_AGE_DAYS_DEFAULT
# Pass-through to IngestionJob.neutralize
neutralize: bool = False
# Whether to skip binary/non-text files
filesIndexBinaries: bool = True
@dataclass
class GdriveBootstrapResult:
connectionId: str
indexed: int = 0
skippedDuplicate: int = 0
skippedPolicy: int = 0
failed: int = 0
bytesProcessed: int = 0
errors: List[str] = field(default_factory=list)
def _syntheticFileId(connectionId: str, externalItemId: str) -> str:
token = hashlib.sha256(f"{connectionId}:{externalItemId}".encode("utf-8")).hexdigest()[:16]
return f"gd:{connectionId[:8]}:{token}"
def _toContentObjects(extracted, fileName: str) -> List[Dict[str, Any]]:
parts = getattr(extracted, "parts", None) or []
out: List[Dict[str, Any]] = []
for part in parts:
data = getattr(part, "data", None) or ""
if not data or not str(data).strip():
continue
typeGroup = getattr(part, "typeGroup", "text") or "text"
contentType = "text"
if typeGroup == "image":
contentType = "image"
elif typeGroup in ("binary", "container"):
contentType = "other"
out.append({
"contentObjectId": getattr(part, "id", ""),
"contentType": contentType,
"data": data,
"contextRef": {
"containerPath": fileName,
"location": getattr(part, "label", None) or "file",
**(getattr(part, "metadata", None) or {}),
},
})
return out
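# typeGroup -> contentType mapping above: "image" -> "image",
# "binary"/"container" -> "other", everything else -> "text";
# parts with empty or whitespace-only data are dropped.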
def _isRecent(modifiedIso: Optional[str], maxAgeDays: Optional[int]) -> bool:
if not maxAgeDays:
return True
if not modifiedIso:
# No timestamp -> be permissive (Drive native docs sometimes omit it on export).
return True
try:
# Google returns RFC 3339 with `Z` or an offset; the `Z` -> `+00:00`
# replace keeps `fromisoformat` working on pre-3.11 Pythons as well.
ts = datetime.fromisoformat(modifiedIso.replace("Z", "+00:00"))
except Exception:
return True
cutoff = datetime.now(timezone.utc) - timedelta(days=maxAgeDays)
if ts.tzinfo is None:
ts = ts.replace(tzinfo=timezone.utc)
return ts >= cutoff
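# Illustrative only: "2025-01-01T00:00:00Z" -> replace("Z", "+00:00") ->
# datetime(2025, 1, 1, tzinfo=timezone.utc); offset forms like
# "2025-01-01T01:00:00+01:00" parse unchanged.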
async def bootstrapGdrive(
connectionId: str,
*,
progressCb: Optional[Callable[[int, Optional[str]], None]] = None,
adapter: Any = None,
connection: Any = None,
knowledgeService: Any = None,
limits: Optional[GdriveBootstrapLimits] = None,
runExtractionFn: Optional[Callable[..., Any]] = None,
) -> Dict[str, Any]:
"""Walk My Drive starting from the virtual root folder."""
from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs
prefs = loadConnectionPrefs(connectionId)
if not limits:
limits = GdriveBootstrapLimits(
maxAgeDays=prefs.maxAgeDays if prefs.maxAgeDays > 0 else None,
neutralize=prefs.neutralizeBeforeEmbed,
filesIndexBinaries=prefs.filesIndexBinaries,
)
startMs = time.time()
result = GdriveBootstrapResult(connectionId=connectionId)
logger.info(
"ingestion.connection.bootstrap.started part=gdrive connectionId=%s",
connectionId,
extra={
"event": "ingestion.connection.bootstrap.started",
"part": "gdrive",
"connectionId": connectionId,
},
)
if adapter is None or knowledgeService is None or connection is None:
adapter, connection, knowledgeService = await _resolveDependencies(connectionId)
if runExtractionFn is None:
from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
from modules.serviceCenter.services.serviceExtraction.subRegistry import (
ExtractorRegistry, ChunkerRegistry,
)
extractorRegistry = ExtractorRegistry()
chunkerRegistry = ChunkerRegistry()
def runExtractionFn(bytesData, name, mime, options): # type: ignore[no-redef]
return runExtraction(extractorRegistry, chunkerRegistry, bytesData, name, mime, options)
mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else ""
userId = str(getattr(connection, "userId", "") or "") if connection is not None else ""
try:
await _walkFolder(
adapter=adapter,
knowledgeService=knowledgeService,
runExtractionFn=runExtractionFn,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
folderPath="/", # DriveAdapter.browse maps "" / "/" -> "root"
depth=0,
limits=limits,
result=result,
progressCb=progressCb,
)
except Exception as exc:
logger.error("gdrive walk failed for %s: %s", connectionId, exc, exc_info=True)
result.errors.append(f"walk: {exc}")
return _finalizeResult(connectionId, result, startMs)
async def _resolveDependencies(connectionId: str):
from modules.interfaces.interfaceDbApp import getRootInterface
from modules.auth import TokenManager
from modules.connectors.providerGoogle.connectorGoogle import GoogleConnector
from modules.serviceCenter import getService
from modules.serviceCenter.context import ServiceCenterContext
from modules.security.rootAccess import getRootUser
rootInterface = getRootInterface()
connection = rootInterface.getUserConnectionById(connectionId)
if connection is None:
raise ValueError(f"UserConnection not found: {connectionId}")
token = TokenManager().getFreshToken(connectionId)
if not token or not token.tokenAccess:
raise ValueError(f"No valid token for connection {connectionId}")
provider = GoogleConnector(connection, token.tokenAccess)
adapter = provider.getServiceAdapter("drive")
rootUser = getRootUser()
ctx = ServiceCenterContext(
user=rootUser,
mandate_id=str(getattr(connection, "mandateId", "") or ""),
)
knowledgeService = getService("knowledge", ctx)
return adapter, connection, knowledgeService
async def _walkFolder(
*,
adapter,
knowledgeService,
runExtractionFn,
connectionId: str,
mandateId: str,
userId: str,
folderPath: str,
depth: int,
limits: GdriveBootstrapLimits,
result: GdriveBootstrapResult,
progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
if depth > limits.maxDepth:
return
try:
entries = await adapter.browse(folderPath)
except Exception as exc:
logger.warning("gdrive browse %s failed: %s", folderPath, exc)
result.errors.append(f"browse({folderPath}): {exc}")
return
for entry in entries:
if result.indexed + result.skippedDuplicate >= limits.maxItems:
return
if result.bytesProcessed >= limits.maxBytes:
return
entryPath = getattr(entry, "path", "") or ""
metadata = getattr(entry, "metadata", {}) or {}
mimeType = getattr(entry, "mimeType", None) or metadata.get("mimeType")
if getattr(entry, "isFolder", False) or mimeType == FOLDER_MIME:
await _walkFolder(
adapter=adapter,
knowledgeService=knowledgeService,
runExtractionFn=runExtractionFn,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
folderPath=entryPath,
depth=depth + 1,
limits=limits,
result=result,
progressCb=progressCb,
)
continue
effectiveMime = mimeType or "application/octet-stream"
if any(effectiveMime.startswith(prefix) for prefix in limits.skipMimePrefixes):
result.skippedPolicy += 1
continue
size = int(getattr(entry, "size", 0) or 0)
if size and size > limits.maxFileSize:
result.skippedPolicy += 1
continue
modifiedTime = metadata.get("modifiedTime")
if not _isRecent(modifiedTime, limits.maxAgeDays):
result.skippedPolicy += 1
continue
externalItemId = metadata.get("id") or entryPath
revision = modifiedTime
await _ingestOne(
adapter=adapter,
knowledgeService=knowledgeService,
runExtractionFn=runExtractionFn,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
entry=entry,
entryPath=entryPath,
mimeType=effectiveMime,
externalItemId=externalItemId,
revision=revision,
limits=limits,
result=result,
progressCb=progressCb,
)
async def _ingestOne(
*,
adapter,
knowledgeService,
runExtractionFn,
connectionId: str,
mandateId: str,
userId: str,
entry,
entryPath: str,
mimeType: str,
externalItemId: str,
revision: Optional[str],
limits: GdriveBootstrapLimits,
result: GdriveBootstrapResult,
progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
syntheticFileId = _syntheticFileId(connectionId, externalItemId)
fileName = getattr(entry, "name", "") or externalItemId
try:
downloaded = await adapter.download(entryPath)
except Exception as exc:
logger.warning("gdrive download %s failed: %s", entryPath, exc)
result.failed += 1
result.errors.append(f"download({entryPath}): {exc}")
return
# Adapter.download returns raw bytes today; guard DownloadResult shape too.
fileBytes: bytes
if isinstance(downloaded, (bytes, bytearray)):
fileBytes = bytes(downloaded)
else:
fileBytes = bytes(getattr(downloaded, "data", b"") or b"")
if getattr(downloaded, "mimeType", None):
mimeType = downloaded.mimeType # export may have changed the type
if not fileBytes:
result.failed += 1
return
if len(fileBytes) > limits.maxFileSize:
result.skippedPolicy += 1
return
result.bytesProcessed += len(fileBytes)
try:
extracted = runExtractionFn(
fileBytes, fileName, mimeType,
ExtractionOptions(mergeStrategy=None),
)
except Exception as exc:
logger.warning("gdrive extraction %s failed: %s", entryPath, exc)
result.failed += 1
result.errors.append(f"extract({entryPath}): {exc}")
return
contentObjects = _toContentObjects(extracted, fileName)
if not contentObjects:
result.skippedPolicy += 1
return
try:
handle = await knowledgeService.requestIngestion(
IngestionJob(
sourceKind="gdrive_item",
sourceId=syntheticFileId,
fileName=fileName,
mimeType=mimeType,
userId=userId,
mandateId=mandateId,
contentObjects=contentObjects,
contentVersion=revision,
neutralize=limits.neutralize,
provenance={
"connectionId": connectionId,
"authority": "google",
"service": "drive",
"externalItemId": externalItemId,
"entryPath": entryPath,
"tier": "body",
},
)
)
except Exception as exc:
logger.error("gdrive ingestion %s failed: %s", entryPath, exc, exc_info=True)
result.failed += 1
result.errors.append(f"ingest({entryPath}): {exc}")
return
if handle.status == "duplicate":
result.skippedDuplicate += 1
elif handle.status == "indexed":
result.indexed += 1
else:
result.failed += 1
if progressCb is not None and (result.indexed + result.skippedDuplicate) % 50 == 0:
processed = result.indexed + result.skippedDuplicate
try:
progressCb(
min(90, 10 + int(80 * processed / max(1, limits.maxItems))),
f"gdrive processed={processed}",
)
except Exception:
pass
logger.info(
"ingestion.connection.bootstrap.progress part=gdrive processed=%d skippedDup=%d failed=%d",
processed, result.skippedDuplicate, result.failed,
extra={
"event": "ingestion.connection.bootstrap.progress",
"part": "gdrive",
"connectionId": connectionId,
"processed": processed,
"skippedDup": result.skippedDuplicate,
"failed": result.failed,
},
)
def _finalizeResult(connectionId: str, result: GdriveBootstrapResult, startMs: float) -> Dict[str, Any]:
durationMs = int((time.time() - startMs) * 1000)
logger.info(
"ingestion.connection.bootstrap.done part=gdrive connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d failed=%d bytes=%d durationMs=%d",
connectionId,
result.indexed, result.skippedDuplicate, result.skippedPolicy,
result.failed, result.bytesProcessed, durationMs,
extra={
"event": "ingestion.connection.bootstrap.done",
"part": "gdrive",
"connectionId": connectionId,
"indexed": result.indexed,
"skippedDup": result.skippedDuplicate,
"skippedPolicy": result.skippedPolicy,
"failed": result.failed,
"bytes": result.bytesProcessed,
"durationMs": durationMs,
},
)
return {
"connectionId": result.connectionId,
"indexed": result.indexed,
"skippedDuplicate": result.skippedDuplicate,
"skippedPolicy": result.skippedPolicy,
"failed": result.failed,
"bytesProcessed": result.bytesProcessed,
"durationMs": durationMs,
"errors": result.errors[:20],
}


@@ -0,0 +1,606 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Gmail bootstrap for the unified knowledge ingestion lane.
Mirrors the Outlook pilot (see subConnectorSyncOutlook.py) but talks to Google
Mail's REST API. Messages become `sourceKind="gmail_message"` virtual documents
with header / snippet / cleaned body content-objects; attachments are optional
child jobs with `sourceKind="gmail_attachment"`.
Idempotency: Gmail's stable `historyId` (or `internalDate` as fallback) is
passed as `contentVersion`, so rerunning the bootstrap yields
`ingestion.skipped.duplicate` for unchanged messages.
"""
from __future__ import annotations
import asyncio
import base64
import hashlib
import logging
import time
from dataclasses import dataclass, field
from datetime import datetime, timedelta, timezone
from typing import Any, Callable, Dict, List, Optional
from modules.serviceCenter.services.serviceKnowledge.subTextClean import cleanEmailBody
logger = logging.getLogger(__name__)
MAX_MESSAGES_DEFAULT = 500
MAX_BODY_CHARS_DEFAULT = 8000
MAX_ATTACHMENT_BYTES_DEFAULT = 10 * 1024 * 1024
DEFAULT_LABELS = ("INBOX", "SENT")
@dataclass
class GmailBootstrapLimits:
maxMessages: int = MAX_MESSAGES_DEFAULT
labels: tuple = DEFAULT_LABELS
maxBodyChars: int = MAX_BODY_CHARS_DEFAULT
includeAttachments: bool = False
maxAttachmentBytes: int = MAX_ATTACHMENT_BYTES_DEFAULT
# Only fetch messages newer than N days. None disables filter.
maxAgeDays: Optional[int] = 90
# Content depth: "metadata" | "snippet" | "full"
mailContentDepth: str = "full"
# Pass-through to IngestionJob.neutralize
neutralize: bool = False
@dataclass
class GmailBootstrapResult:
connectionId: str
indexed: int = 0
skippedDuplicate: int = 0
skippedPolicy: int = 0
failed: int = 0
attachmentsIndexed: int = 0
errors: List[str] = field(default_factory=list)
def _syntheticMessageId(connectionId: str, messageId: str) -> str:
token = hashlib.sha256(f"{connectionId}:{messageId}".encode("utf-8")).hexdigest()[:16]
return f"gm:{connectionId[:8]}:{token}"
def _syntheticAttachmentId(connectionId: str, messageId: str, attachmentId: str) -> str:
token = hashlib.sha256(
f"{connectionId}:{messageId}:{attachmentId}".encode("utf-8")
).hexdigest()[:16]
return f"ga:{connectionId[:8]}:{token}"
def _decodeBase64Url(data: str) -> bytes:
if not data:
return b""
# Gmail uses URL-safe base64 without padding.
padding = 4 - (len(data) % 4)
if padding != 4:
data = data + ("=" * padding)
try:
return base64.urlsafe_b64decode(data)
except Exception:
return b""
def _walkPayloadForBody(payload: Dict[str, Any]) -> Dict[str, str]:
"""Return {"text": ..., "html": ...} by walking MIME parts.
Gmail `payload` is a tree of parts. We prefer `text/plain` for the cleaned
body, but capture `text/html` as a fallback so `cleanEmailBody` can strip
markup if plain is missing.
"""
found: Dict[str, str] = {"text": "", "html": ""}
def _walk(part: Dict[str, Any]) -> None:
mime = (part.get("mimeType") or "").lower()
body = part.get("body") or {}
raw = body.get("data") or ""
if raw and mime.startswith("text/"):
decoded = _decodeBase64Url(raw).decode("utf-8", errors="replace")
key = "text" if mime == "text/plain" else ("html" if mime == "text/html" else "")
if key and not found[key]:
found[key] = decoded
for sub in part.get("parts") or []:
_walk(sub)
_walk(payload or {})
return found
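# Illustrative only (hypothetical payload): a multipart/alternative tree like
#   {"mimeType": "multipart/alternative", "parts": [
#       {"mimeType": "text/plain", "body": {"data": "<b64url>"}},
#       {"mimeType": "text/html",  "body": {"data": "<b64url>"}}]}
# fills both keys; only the first hit per key is kept, so duplicates deeper
# in the tree do not overwrite it.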
def _headerMap(payload: Dict[str, Any]) -> Dict[str, str]:
return {
(h.get("name") or "").lower(): (h.get("value") or "")
for h in (payload.get("headers") or [])
}
def _buildContentObjects(
message: Dict[str, Any],
maxBodyChars: int,
mailContentDepth: str = "full",
) -> List[Dict[str, Any]]:
"""Build content objects for a Gmail message.
`mailContentDepth` controls how much is embedded:
- "metadata": header only (subject, from, to, date)
- "snippet": header + Gmail snippet (~155 chars, no full body)
- "full": header + snippet + cleaned full body (default)
"""
payload = message.get("payload") or {}
headers = _headerMap(payload)
subject = headers.get("subject") or "(no subject)"
fromAddr = headers.get("from") or ""
toAddr = headers.get("to") or ""
ccAddr = headers.get("cc") or ""
date = headers.get("date") or ""
snippet = message.get("snippet") or ""
parts: List[Dict[str, Any]] = []
header = (
f"Subject: {subject}\n"
f"From: {fromAddr}\n"
f"To: {toAddr}\n"
+ (f"Cc: {ccAddr}\n" if ccAddr else "")
+ f"Date: {date}"
)
parts.append({
"contentObjectId": "header",
"contentType": "text",
"data": header,
"contextRef": {"part": "header"},
})
if mailContentDepth in ("snippet", "full") and snippet:
parts.append({
"contentObjectId": "snippet",
"contentType": "text",
"data": snippet,
"contextRef": {"part": "snippet"},
})
if mailContentDepth == "full":
bodies = _walkPayloadForBody(payload)
rawBody = bodies["text"] or bodies["html"]
cleanedBody = cleanEmailBody(rawBody, maxChars=maxBodyChars) if rawBody else ""
if cleanedBody:
parts.append({
"contentObjectId": "body",
"contentType": "text",
"data": cleanedBody,
"contextRef": {"part": "body"},
})
return parts
async def bootstrapGmail(
connectionId: str,
*,
progressCb: Optional[Callable[[int, Optional[str]], None]] = None,
adapter: Any = None,
connection: Any = None,
knowledgeService: Any = None,
limits: Optional[GmailBootstrapLimits] = None,
googleGetFn: Optional[Callable[..., Any]] = None,
) -> Dict[str, Any]:
"""Enumerate Gmail labels (INBOX + SENT default) and ingest messages."""
from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs
prefs = loadConnectionPrefs(connectionId)
if not limits:
limits = GmailBootstrapLimits(
includeAttachments=prefs.mailIndexAttachments,
maxAgeDays=prefs.maxAgeDays if prefs.maxAgeDays > 0 else None,
mailContentDepth=prefs.mailContentDepth,
neutralize=prefs.neutralizeBeforeEmbed,
)
startMs = time.time()
result = GmailBootstrapResult(connectionId=connectionId)
logger.info(
"ingestion.connection.bootstrap.started part=gmail connectionId=%s",
connectionId,
extra={
"event": "ingestion.connection.bootstrap.started",
"part": "gmail",
"connectionId": connectionId,
},
)
if adapter is None or knowledgeService is None or connection is None:
adapter, connection, knowledgeService = await _resolveDependencies(connectionId)
if googleGetFn is None:
from modules.connectors.providerGoogle.connectorGoogle import _googleGet as _defaultGet
token = getattr(adapter, "_token", "")
async def googleGetFn(url: str) -> Dict[str, Any]: # type: ignore[no-redef]
return await _defaultGet(token, url)
mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else ""
userId = str(getattr(connection, "userId", "") or "") if connection is not None else ""
for labelId in limits.labels:
if result.indexed + result.skippedDuplicate >= limits.maxMessages:
break
try:
await _ingestLabel(
googleGetFn=googleGetFn,
knowledgeService=knowledgeService,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
labelId=labelId,
limits=limits,
result=result,
progressCb=progressCb,
)
except Exception as exc:
logger.error("gmail ingestion label %s failed: %s", labelId, exc, exc_info=True)
result.errors.append(f"label({labelId}): {exc}")
return _finalizeResult(connectionId, result, startMs)
async def _resolveDependencies(connectionId: str):
from modules.interfaces.interfaceDbApp import getRootInterface
from modules.auth import TokenManager
from modules.connectors.providerGoogle.connectorGoogle import GoogleConnector
from modules.serviceCenter import getService
from modules.serviceCenter.context import ServiceCenterContext
from modules.security.rootAccess import getRootUser
rootInterface = getRootInterface()
connection = rootInterface.getUserConnectionById(connectionId)
if connection is None:
raise ValueError(f"UserConnection not found: {connectionId}")
token = TokenManager().getFreshToken(connectionId)
if not token or not token.tokenAccess:
raise ValueError(f"No valid token for connection {connectionId}")
provider = GoogleConnector(connection, token.tokenAccess)
adapter = provider.getServiceAdapter("gmail")
rootUser = getRootUser()
ctx = ServiceCenterContext(
user=rootUser,
mandate_id=str(getattr(connection, "mandateId", "") or ""),
)
knowledgeService = getService("knowledge", ctx)
return adapter, connection, knowledgeService
async def _ingestLabel(
*,
googleGetFn,
knowledgeService,
connectionId: str,
mandateId: str,
userId: str,
labelId: str,
limits: GmailBootstrapLimits,
result: GmailBootstrapResult,
progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
remaining = limits.maxMessages - (result.indexed + result.skippedDuplicate)
if remaining <= 0:
return
pageSize = min(100, remaining)
query = ""
if limits.maxAgeDays:
cutoff = datetime.now(timezone.utc) - timedelta(days=limits.maxAgeDays)
# Gmail uses YYYY/MM/DD.
query = f"after:{cutoff.strftime('%Y/%m/%d')}"
baseUrl = (
"https://gmail.googleapis.com/gmail/v1/users/me/messages"
f"?labelIds={labelId}&maxResults={pageSize}"
)
if query:
baseUrl = f"{baseUrl}&q={query}"
nextPageToken: Optional[str] = None
while (result.indexed + result.skippedDuplicate) < limits.maxMessages:
url = baseUrl if not nextPageToken else f"{baseUrl}&pageToken={nextPageToken}"
page = await googleGetFn(url)
if not isinstance(page, dict) or "error" in page:
err = (page or {}).get("error") if isinstance(page, dict) else "unknown"
logger.warning("gmail list page error for label %s: %s", labelId, err)
result.errors.append(f"list({labelId}): {err}")
return
messageStubs = page.get("messages") or []
for stub in messageStubs:
if result.indexed + result.skippedDuplicate >= limits.maxMessages:
break
msgId = stub.get("id")
if not msgId:
continue
detailUrl = (
f"https://gmail.googleapis.com/gmail/v1/users/me/messages/{msgId}?format=full"
)
detail = await googleGetFn(detailUrl)
if not isinstance(detail, dict) or "error" in detail:
result.failed += 1
continue
await _ingestMessage(
googleGetFn=googleGetFn,
knowledgeService=knowledgeService,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
labelId=labelId,
message=detail,
limits=limits,
result=result,
progressCb=progressCb,
)
nextPageToken = page.get("nextPageToken")
if not nextPageToken:
break
async def _ingestMessage(
*,
googleGetFn,
knowledgeService,
connectionId: str,
mandateId: str,
userId: str,
labelId: str,
message: Dict[str, Any],
limits: GmailBootstrapLimits,
result: GmailBootstrapResult,
progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
messageId = message.get("id")
if not messageId:
result.skippedPolicy += 1
return
revision = message.get("historyId") or message.get("internalDate")
headers = _headerMap(message.get("payload") or {})
subject = headers.get("subject") or "(no subject)"
syntheticId = _syntheticMessageId(connectionId, messageId)
fileName = f"{subject[:80].strip()}.eml" if subject else f"{messageId}.eml"
contentObjects = _buildContentObjects(
message, limits.maxBodyChars, mailContentDepth=limits.mailContentDepth
)
try:
handle = await knowledgeService.requestIngestion(
IngestionJob(
sourceKind="gmail_message",
sourceId=syntheticId,
fileName=fileName,
mimeType="message/rfc822",
userId=userId,
mandateId=mandateId,
contentObjects=contentObjects,
contentVersion=str(revision) if revision else None,
neutralize=limits.neutralize,
provenance={
"connectionId": connectionId,
"authority": "google",
"service": "gmail",
"externalItemId": messageId,
"label": labelId,
"threadId": message.get("threadId"),
"tier": limits.mailContentDepth,
},
)
)
except Exception as exc:
logger.error("gmail ingestion %s failed: %s", messageId, exc, exc_info=True)
result.failed += 1
result.errors.append(f"ingest({messageId}): {exc}")
return
if handle.status == "duplicate":
result.skippedDuplicate += 1
elif handle.status == "indexed":
result.indexed += 1
else:
result.failed += 1
if limits.includeAttachments:
try:
await _ingestAttachments(
googleGetFn=googleGetFn,
knowledgeService=knowledgeService,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
message=message,
parentSyntheticId=syntheticId,
limits=limits,
result=result,
)
except Exception as exc:
logger.warning("gmail attachments %s failed: %s", messageId, exc)
result.errors.append(f"attachments({messageId}): {exc}")
if progressCb is not None and (result.indexed + result.skippedDuplicate) % 50 == 0:
processed = result.indexed + result.skippedDuplicate
try:
progressCb(
min(90, 10 + int(80 * processed / max(1, limits.maxMessages))),
f"gmail processed={processed}",
)
except Exception:
pass
logger.info(
"ingestion.connection.bootstrap.progress part=gmail processed=%d skippedDup=%d failed=%d",
processed, result.skippedDuplicate, result.failed,
extra={
"event": "ingestion.connection.bootstrap.progress",
"part": "gmail",
"connectionId": connectionId,
"processed": processed,
"skippedDup": result.skippedDuplicate,
"failed": result.failed,
},
)
await asyncio.sleep(0)
async def _ingestAttachments(
*,
googleGetFn,
knowledgeService,
connectionId: str,
mandateId: str,
userId: str,
message: Dict[str, Any],
parentSyntheticId: str,
limits: GmailBootstrapLimits,
result: GmailBootstrapResult,
) -> None:
"""Child ingestion jobs for file attachments. Skips inline images (cid: refs)."""
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
from modules.datamodels.datamodelExtraction import ExtractionOptions
from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
from modules.serviceCenter.services.serviceExtraction.subRegistry import (
ExtractorRegistry, ChunkerRegistry,
)
messageId = message.get("id") or ""
def _collectAttachmentStubs(part: Dict[str, Any], acc: List[Dict[str, Any]]) -> None:
filename = part.get("filename") or ""
body = part.get("body") or {}
attId = body.get("attachmentId")
if filename and attId:
acc.append({
"filename": filename,
"mimeType": part.get("mimeType") or "application/octet-stream",
"attachmentId": attId,
"size": int(body.get("size") or 0),
})
for sub in part.get("parts") or []:
_collectAttachmentStubs(sub, acc)
stubs: List[Dict[str, Any]] = []
_collectAttachmentStubs(message.get("payload") or {}, stubs)
if not stubs:
return
extractorRegistry = ExtractorRegistry()
chunkerRegistry = ChunkerRegistry()
for stub in stubs:
if stub["size"] and stub["size"] > limits.maxAttachmentBytes:
result.skippedPolicy += 1
continue
attUrl = (
f"https://gmail.googleapis.com/gmail/v1/users/me/messages/{messageId}"
f"/attachments/{stub['attachmentId']}"
)
detail = await googleGetFn(attUrl)
if not isinstance(detail, dict) or "error" in detail:
result.failed += 1
continue
rawBytes = _decodeBase64Url(detail.get("data") or "")
if not rawBytes:
continue
fileName = stub["filename"]
mimeType = stub["mimeType"]
syntheticId = _syntheticAttachmentId(connectionId, messageId, stub["attachmentId"])
try:
extracted = runExtraction(
extractorRegistry, chunkerRegistry,
rawBytes, fileName, mimeType,
ExtractionOptions(mergeStrategy=None),
)
except Exception as exc:
logger.warning("gmail attachment extract %s failed: %s", stub["attachmentId"], exc)
result.failed += 1
continue
contentObjects: List[Dict[str, Any]] = []
for part in getattr(extracted, "parts", None) or []:
data = getattr(part, "data", None) or ""
if not data or not str(data).strip():
continue
typeGroup = getattr(part, "typeGroup", "text") or "text"
contentType = "text"
if typeGroup == "image":
contentType = "image"
elif typeGroup in ("binary", "container"):
contentType = "other"
contentObjects.append({
"contentObjectId": getattr(part, "id", ""),
"contentType": contentType,
"data": data,
"contextRef": {
"containerPath": fileName,
"location": getattr(part, "label", None) or "attachment",
**(getattr(part, "metadata", None) or {}),
},
})
if not contentObjects:
result.skippedPolicy += 1
continue
try:
await knowledgeService.requestIngestion(
IngestionJob(
sourceKind="gmail_attachment",
sourceId=syntheticId,
fileName=fileName,
mimeType=mimeType,
userId=userId,
mandateId=mandateId,
contentObjects=contentObjects,
provenance={
"connectionId": connectionId,
"authority": "google",
"service": "gmail",
"parentId": parentSyntheticId,
"externalItemId": stub["attachmentId"],
"parentMessageId": messageId,
},
)
)
result.attachmentsIndexed += 1
except Exception as exc:
logger.warning("gmail attachment ingest %s failed: %s", stub["attachmentId"], exc)
result.failed += 1
def _finalizeResult(connectionId: str, result: GmailBootstrapResult, startMs: float) -> Dict[str, Any]:
durationMs = int((time.time() - startMs) * 1000)
logger.info(
"ingestion.connection.bootstrap.done part=gmail connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d attachments=%d failed=%d durationMs=%d",
connectionId,
result.indexed, result.skippedDuplicate, result.skippedPolicy,
result.attachmentsIndexed, result.failed, durationMs,
extra={
"event": "ingestion.connection.bootstrap.done",
"part": "gmail",
"connectionId": connectionId,
"indexed": result.indexed,
"skippedDup": result.skippedDuplicate,
"skippedPolicy": result.skippedPolicy,
"attachmentsIndexed": result.attachmentsIndexed,
"failed": result.failed,
"durationMs": durationMs,
},
)
return {
"connectionId": result.connectionId,
"indexed": result.indexed,
"skippedDuplicate": result.skippedDuplicate,
"skippedPolicy": result.skippedPolicy,
"attachmentsIndexed": result.attachmentsIndexed,
"failed": result.failed,
"durationMs": durationMs,
"errors": result.errors[:20],
}


@@ -0,0 +1,576 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Outlook bootstrap for the unified knowledge ingestion lane.
Unlike SharePoint, Outlook messages are "virtual documents" we never persist
file bytes in the store. Each message becomes a `sourceKind="outlook_message"`
IngestionJob whose `contentObjects` carry the header, snippet and cleaned body
so retrieval can show a compact answer without fetching Graph again.
Attachments are optional (`includeAttachments` limit flag) and enqueued as
child jobs with `sourceKind="outlook_attachment"` + `provenance.parentId`.
"""
from __future__ import annotations
import asyncio
import hashlib
import logging
import time
from dataclasses import dataclass, field
from typing import Any, Callable, Dict, List, Optional
from modules.serviceCenter.services.serviceKnowledge.subTextClean import cleanEmailBody
logger = logging.getLogger(__name__)
MAX_MESSAGES_DEFAULT = 500
MAX_FOLDERS_DEFAULT = 5
MAX_BODY_CHARS_DEFAULT = 8000
MAX_ATTACHMENT_BYTES_DEFAULT = 10 * 1024 * 1024
WELL_KNOWN_FOLDERS = ("inbox", "sentitems")
@dataclass
class OutlookBootstrapLimits:
maxMessages: int = MAX_MESSAGES_DEFAULT
maxFolders: int = MAX_FOLDERS_DEFAULT
maxBodyChars: int = MAX_BODY_CHARS_DEFAULT
includeAttachments: bool = False
maxAttachmentBytes: int = MAX_ATTACHMENT_BYTES_DEFAULT
# Only fetch messages newer than N days. None disables filter.
maxAgeDays: Optional[int] = 90
# Content depth: "metadata" | "snippet" | "full"
mailContentDepth: str = "full"
# Pass-through to IngestionJob.neutralize
neutralize: bool = False
@dataclass
class OutlookBootstrapResult:
connectionId: str
indexed: int = 0
skippedDuplicate: int = 0
skippedPolicy: int = 0
failed: int = 0
attachmentsIndexed: int = 0
errors: List[str] = field(default_factory=list)
def _syntheticMessageId(connectionId: str, messageId: str) -> str:
token = hashlib.sha256(f"{connectionId}:{messageId}".encode("utf-8")).hexdigest()[:16]
return f"om:{connectionId[:8]}:{token}"
def _syntheticAttachmentId(connectionId: str, messageId: str, attachmentId: str) -> str:
token = hashlib.sha256(
f"{connectionId}:{messageId}:{attachmentId}".encode("utf-8")
).hexdigest()[:16]
return f"oa:{connectionId[:8]}:{token}"
def _extractRecipient(recipient: Dict[str, Any]) -> str:
email = (recipient or {}).get("emailAddress") or {}
name = email.get("name") or ""
addr = email.get("address") or ""
if name and addr:
return f"{name} <{addr}>"
return addr or name
def _joinRecipients(recipients: List[Dict[str, Any]]) -> str:
return ", ".join(filter(None, [_extractRecipient(r) for r in recipients or []]))
def _buildContentObjects(
message: Dict[str, Any],
maxBodyChars: int,
mailContentDepth: str = "full",
) -> List[Dict[str, Any]]:
"""Build content objects for an Outlook message.
`mailContentDepth` mirrors the Gmail walker:
- "metadata": header only
- "snippet": header + bodyPreview (~255 chars)
- "full": header + snippet + cleaned body (default)
"""
subject = message.get("subject") or "(no subject)"
fromAddr = _extractRecipient(message.get("from") or {})
toAddr = _joinRecipients(message.get("toRecipients") or [])
ccAddr = _joinRecipients(message.get("ccRecipients") or [])
received = message.get("receivedDateTime") or ""
snippet = message.get("bodyPreview") or ""
parts: List[Dict[str, Any]] = []
header = (
f"Subject: {subject}\n"
f"From: {fromAddr}\n"
f"To: {toAddr}\n"
+ (f"Cc: {ccAddr}\n" if ccAddr else "")
+ f"Date: {received}"
)
parts.append({
"contentObjectId": "header",
"contentType": "text",
"data": header,
"contextRef": {"part": "header"},
})
if mailContentDepth in ("snippet", "full") and snippet:
parts.append({
"contentObjectId": "snippet",
"contentType": "text",
"data": snippet,
"contextRef": {"part": "snippet"},
})
if mailContentDepth == "full":
body = message.get("body") or {}
bodyContent = body.get("content") or ""
cleanedBody = cleanEmailBody(bodyContent, maxChars=maxBodyChars) if bodyContent else ""
if cleanedBody:
parts.append({
"contentObjectId": "body",
"contentType": "text",
"data": cleanedBody,
"contextRef": {"part": "body"},
})
return parts
async def bootstrapOutlook(
connectionId: str,
*,
progressCb: Optional[Callable[[int, Optional[str]], None]] = None,
adapter: Any = None,
connection: Any = None,
knowledgeService: Any = None,
limits: Optional[OutlookBootstrapLimits] = None,
) -> Dict[str, Any]:
"""Enumerate Outlook folders (inbox + sent by default) and ingest messages."""
from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs
prefs = loadConnectionPrefs(connectionId)
if not limits:
limits = OutlookBootstrapLimits(
includeAttachments=prefs.mailIndexAttachments,
maxAgeDays=prefs.maxAgeDays if prefs.maxAgeDays > 0 else None,
mailContentDepth=prefs.mailContentDepth,
neutralize=prefs.neutralizeBeforeEmbed,
)
startMs = time.time()
result = OutlookBootstrapResult(connectionId=connectionId)
logger.info(
"ingestion.connection.bootstrap.started part=outlook connectionId=%s",
connectionId,
extra={
"event": "ingestion.connection.bootstrap.started",
"part": "outlook",
"connectionId": connectionId,
},
)
if adapter is None or knowledgeService is None or connection is None:
adapter, connection, knowledgeService = await _resolveDependencies(connectionId)
mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else ""
userId = str(getattr(connection, "userId", "") or "") if connection is not None else ""
folderIds = await _selectFolderIds(adapter, limits)
for folderId in folderIds:
if result.indexed + result.skippedDuplicate >= limits.maxMessages:
break
try:
await _ingestFolder(
adapter=adapter,
knowledgeService=knowledgeService,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
folderId=folderId,
limits=limits,
result=result,
progressCb=progressCb,
)
except Exception as exc:
logger.error("outlook ingestion folder %s failed: %s", folderId, exc, exc_info=True)
result.errors.append(f"folder({folderId}): {exc}")
return _finalizeResult(connectionId, result, startMs)
async def _resolveDependencies(connectionId: str):
from modules.interfaces.interfaceDbApp import getRootInterface
from modules.auth import TokenManager
from modules.connectors.providerMsft.connectorMsft import MsftConnector
from modules.serviceCenter import getService
from modules.serviceCenter.context import ServiceCenterContext
from modules.security.rootAccess import getRootUser
rootInterface = getRootInterface()
connection = rootInterface.getUserConnectionById(connectionId)
if connection is None:
raise ValueError(f"UserConnection not found: {connectionId}")
token = TokenManager().getFreshToken(connectionId)
if not token or not token.tokenAccess:
raise ValueError(f"No valid token for connection {connectionId}")
provider = MsftConnector(connection, token.tokenAccess)
adapter = provider.getServiceAdapter("outlook")
rootUser = getRootUser()
ctx = ServiceCenterContext(
user=rootUser,
mandate_id=str(getattr(connection, "mandateId", "") or ""),
)
knowledgeService = getService("knowledge", ctx)
return adapter, connection, knowledgeService
async def _selectFolderIds(adapter, limits: OutlookBootstrapLimits) -> List[str]:
"""Prefer well-known folders (inbox, sentitems); fall back to browse()."""
folderIds: List[str] = []
for wellKnown in WELL_KNOWN_FOLDERS:
if len(folderIds) >= limits.maxFolders:
break
try:
row = await adapter._graphGet(f"me/mailFolders/{wellKnown}")
except Exception:
row = None
if isinstance(row, dict) and "error" not in row and row.get("id"):
folderIds.append(row["id"])
if len(folderIds) < limits.maxFolders:
try:
entries = await adapter.browse("/")
except Exception:
entries = []
for entry in entries:
metadata = getattr(entry, "metadata", {}) or {}
fid = metadata.get("id")
if fid and fid not in folderIds:
folderIds.append(fid)
if len(folderIds) >= limits.maxFolders:
break
return folderIds
async def _ingestFolder(
*,
adapter,
knowledgeService,
connectionId: str,
mandateId: str,
userId: str,
folderId: str,
limits: OutlookBootstrapLimits,
result: OutlookBootstrapResult,
progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
remaining = limits.maxMessages - (result.indexed + result.skippedDuplicate)
if remaining <= 0:
return
pageSize = min(100, remaining)
select = (
"id,subject,from,toRecipients,ccRecipients,receivedDateTime,"
"bodyPreview,body,internetMessageId,hasAttachments,changeKey"
)
endpoint: Optional[str] = (
f"me/mailFolders/{folderId}/messages"
f"?$top={pageSize}&$orderby=receivedDateTime desc&$select={select}"
)
# Keep the age filter in Graph itself (server-side $filter) to avoid
# shipping ancient messages we'd only discard client-side.
if limits.maxAgeDays:
from datetime import datetime, timezone, timedelta
cutoff = datetime.now(timezone.utc) - timedelta(days=limits.maxAgeDays)
cutoffIso = cutoff.strftime("%Y-%m-%dT%H:%M:%SZ")
endpoint = f"{endpoint}&$filter=receivedDateTime ge {cutoffIso}"
while endpoint and (result.indexed + result.skippedDuplicate) < limits.maxMessages:
try:
page = await adapter._graphGet(endpoint)
except Exception as exc:
logger.warning("outlook graph page failed for folder %s: %s", folderId, exc)
result.errors.append(f"graph({folderId}): {exc}")
return
if not isinstance(page, dict) or "error" in page:
err = (page or {}).get("error") if isinstance(page, dict) else "unknown"
logger.warning("outlook graph page error for folder %s: %s", folderId, err)
result.errors.append(f"graph({folderId}): {err}")
return
for message in page.get("value", []) or []:
if result.indexed + result.skippedDuplicate >= limits.maxMessages:
break
await _ingestMessage(
adapter=adapter,
knowledgeService=knowledgeService,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
message=message,
limits=limits,
result=result,
progressCb=progressCb,
)
nextLink = page.get("@odata.nextLink")
if not nextLink:
break
# Strip Graph base so adapter._graphGet accepts the relative path.
from modules.connectors.providerMsft.connectorMsft import _stripGraphBase
endpoint = _stripGraphBase(nextLink)
async def _ingestMessage(
*,
adapter,
knowledgeService,
connectionId: str,
mandateId: str,
userId: str,
message: Dict[str, Any],
limits: OutlookBootstrapLimits,
result: OutlookBootstrapResult,
progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
messageId = message.get("id")
if not messageId:
result.skippedPolicy += 1
return
revision = message.get("changeKey") or message.get("internetMessageId")
subject = message.get("subject") or "(no subject)"
syntheticId = _syntheticMessageId(connectionId, messageId)
fileName = f"{subject[:80].strip()}.eml" if subject else f"{messageId}.eml"
contentObjects = _buildContentObjects(
message, limits.maxBodyChars, mailContentDepth=limits.mailContentDepth
)
# The header is always emitted, so `contentObjects` is never empty.
try:
handle = await knowledgeService.requestIngestion(
IngestionJob(
sourceKind="outlook_message",
sourceId=syntheticId,
fileName=fileName,
mimeType="message/rfc822",
userId=userId,
mandateId=mandateId,
contentObjects=contentObjects,
contentVersion=revision,
neutralize=limits.neutralize,
provenance={
"connectionId": connectionId,
"authority": "msft",
"service": "outlook",
"externalItemId": messageId,
"internetMessageId": message.get("internetMessageId"),
"tier": limits.mailContentDepth,
},
)
)
except Exception as exc:
logger.error("outlook ingestion %s failed: %s", messageId, exc, exc_info=True)
result.failed += 1
result.errors.append(f"ingest({messageId}): {exc}")
return
if handle.status == "duplicate":
result.skippedDuplicate += 1
elif handle.status == "indexed":
result.indexed += 1
else:
result.failed += 1
if limits.includeAttachments and message.get("hasAttachments"):
try:
await _ingestAttachments(
adapter=adapter,
knowledgeService=knowledgeService,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
messageId=messageId,
parentSyntheticId=syntheticId,
limits=limits,
result=result,
)
except Exception as exc:
logger.warning("outlook attachments %s failed: %s", messageId, exc)
result.errors.append(f"attachments({messageId}): {exc}")
if progressCb is not None and (result.indexed + result.skippedDuplicate) % 50 == 0:
processed = result.indexed + result.skippedDuplicate
try:
progressCb(
min(90, 10 + int(80 * processed / max(1, limits.maxMessages))),
f"outlook processed={processed}",
)
except Exception:
pass
logger.info(
"ingestion.connection.bootstrap.progress part=outlook processed=%d skippedDup=%d failed=%d",
processed, result.skippedDuplicate, result.failed,
extra={
"event": "ingestion.connection.bootstrap.progress",
"part": "outlook",
"connectionId": connectionId,
"processed": processed,
"skippedDup": result.skippedDuplicate,
"failed": result.failed,
},
)
await asyncio.sleep(0)
async def _ingestAttachments(
*,
adapter,
knowledgeService,
connectionId: str,
mandateId: str,
userId: str,
messageId: str,
parentSyntheticId: str,
limits: OutlookBootstrapLimits,
result: OutlookBootstrapResult,
) -> None:
"""Child ingestion jobs for file attachments (skip inline & oversized)."""
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
from modules.datamodels.datamodelExtraction import ExtractionOptions
from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
from modules.serviceCenter.services.serviceExtraction.subRegistry import (
ExtractorRegistry, ChunkerRegistry,
)
import base64
page = await adapter._graphGet(f"me/messages/{messageId}/attachments")
if not isinstance(page, dict) or "error" in page:
return
extractorRegistry = ExtractorRegistry()
chunkerRegistry = ChunkerRegistry()
for attachment in page.get("value", []) or []:
if attachment.get("@odata.type") != "#microsoft.graph.fileAttachment":
continue
if attachment.get("isInline"):
continue
size = int(attachment.get("size") or 0)
if size and size > limits.maxAttachmentBytes:
result.skippedPolicy += 1
continue
contentBytesB64 = attachment.get("contentBytes")
if not contentBytesB64:
continue
try:
rawBytes = base64.b64decode(contentBytesB64)
except Exception:
result.skippedPolicy += 1
continue
fileName = attachment.get("name") or "attachment"
mimeType = attachment.get("contentType") or "application/octet-stream"
attachmentId = attachment.get("id") or fileName
syntheticId = _syntheticAttachmentId(connectionId, messageId, attachmentId)
try:
extracted = runExtraction(
extractorRegistry, chunkerRegistry,
rawBytes, fileName, mimeType,
ExtractionOptions(mergeStrategy=None),
)
except Exception as exc:
logger.warning("outlook attachment extract %s failed: %s", attachmentId, exc)
result.failed += 1
continue
contentObjects: List[Dict[str, Any]] = []
for part in getattr(extracted, "parts", None) or []:
data = getattr(part, "data", None) or ""
if not data or not str(data).strip():
continue
typeGroup = getattr(part, "typeGroup", "text") or "text"
contentType = "text"
if typeGroup == "image":
contentType = "image"
elif typeGroup in ("binary", "container"):
contentType = "other"
contentObjects.append({
"contentObjectId": getattr(part, "id", ""),
"contentType": contentType,
"data": data,
"contextRef": {
"containerPath": fileName,
"location": getattr(part, "label", None) or "attachment",
**(getattr(part, "metadata", None) or {}),
},
})
if not contentObjects:
result.skippedPolicy += 1
continue
try:
await knowledgeService.requestIngestion(
IngestionJob(
sourceKind="outlook_attachment",
sourceId=syntheticId,
fileName=fileName,
mimeType=mimeType,
userId=userId,
mandateId=mandateId,
contentObjects=contentObjects,
neutralize=limits.neutralize,
provenance={
"connectionId": connectionId,
"authority": "msft",
"service": "outlook",
"parentId": parentSyntheticId,
"externalItemId": attachmentId,
"parentMessageId": messageId,
},
)
)
result.attachmentsIndexed += 1
except Exception as exc:
logger.warning("outlook attachment ingest %s failed: %s", attachmentId, exc)
result.failed += 1
def _finalizeResult(connectionId: str, result: OutlookBootstrapResult, startMs: float) -> Dict[str, Any]:
durationMs = int((time.time() - startMs) * 1000)
logger.info(
"ingestion.connection.bootstrap.done part=outlook connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d attachments=%d failed=%d durationMs=%d",
connectionId,
result.indexed, result.skippedDuplicate, result.skippedPolicy,
result.attachmentsIndexed, result.failed, durationMs,
extra={
"event": "ingestion.connection.bootstrap.done",
"part": "outlook",
"connectionId": connectionId,
"indexed": result.indexed,
"skippedDup": result.skippedDuplicate,
"skippedPolicy": result.skippedPolicy,
"attachmentsIndexed": result.attachmentsIndexed,
"failed": result.failed,
"durationMs": durationMs,
},
)
return {
"connectionId": result.connectionId,
"indexed": result.indexed,
"skippedDuplicate": result.skippedDuplicate,
"skippedPolicy": result.skippedPolicy,
"attachmentsIndexed": result.attachmentsIndexed,
"failed": result.failed,
"durationMs": durationMs,
"errors": result.errors[:20],
}


@@ -0,0 +1,433 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""SharePoint bootstrap for the unified knowledge ingestion lane.
Walks the SharePoint drive(s) reachable via a UserConnection, downloads each
file-like item, runs the standard content extraction pipeline and hands the
result to `KnowledgeService.requestIngestion`. Idempotency is provided by the
ingestion façade itself; repeat bootstraps therefore produce
`ingestion.skipped.duplicate` for every unchanged item because we pass the
Graph `eTag` as `contentVersion`.
"""
from __future__ import annotations
import asyncio
import hashlib
import logging
import time
from dataclasses import dataclass, field
from typing import Any, Callable, Dict, List, Optional
from modules.datamodels.datamodelExtraction import ExtractionOptions
logger = logging.getLogger(__name__)
MAX_ITEMS_DEFAULT = 500
MAX_BYTES_DEFAULT = 200 * 1024 * 1024
MAX_FILE_SIZE_DEFAULT = 25 * 1024 * 1024
SKIP_MIME_PREFIXES_DEFAULT = ("video/", "audio/")
MAX_DEPTH_DEFAULT = 4
MAX_SITES_DEFAULT = 3
@dataclass
class SharepointBootstrapLimits:
maxItems: int = MAX_ITEMS_DEFAULT
maxBytes: int = MAX_BYTES_DEFAULT
maxFileSize: int = MAX_FILE_SIZE_DEFAULT
skipMimePrefixes: tuple = SKIP_MIME_PREFIXES_DEFAULT
maxDepth: int = MAX_DEPTH_DEFAULT
maxSites: int = MAX_SITES_DEFAULT
# Pass-through to IngestionJob.neutralize
neutralize: bool = False
@dataclass
class SharepointBootstrapResult:
connectionId: str
indexed: int = 0
skippedDuplicate: int = 0
skippedPolicy: int = 0
failed: int = 0
bytesProcessed: int = 0
errors: List[str] = field(default_factory=list)
def _syntheticFileId(connectionId: str, externalItemId: str) -> str:
"""Deterministic synthetic FileContentIndex id for a SharePoint item.
Stable across bootstraps → idempotency works; independent of file name so
moves/renames don't duplicate chunks.
"""
token = hashlib.sha256(f"{connectionId}:{externalItemId}".encode("utf-8")).hexdigest()[:16]
return f"sp:{connectionId[:8]}:{token}"
def _toContentObjects(extracted, fileName: str) -> List[Dict[str, Any]]:
"""Translate ExtractionResult → content objects accepted by requestIngestion."""
parts = getattr(extracted, "parts", None) or []
out: List[Dict[str, Any]] = []
for part in parts:
data = getattr(part, "data", None) or ""
if not data or not str(data).strip():
continue
typeGroup = getattr(part, "typeGroup", "text") or "text"
contentType = "text"
if typeGroup == "image":
contentType = "image"
elif typeGroup in ("binary", "container"):
contentType = "other"
out.append({
"contentObjectId": getattr(part, "id", ""),
"contentType": contentType,
"data": data,
"contextRef": {
"containerPath": fileName,
"location": getattr(part, "label", None) or "file",
**(getattr(part, "metadata", None) or {}),
},
})
return out
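A minimal illustration of the typeGroup-to-contentType mapping, using SimpleNamespace stand-ins for the extracted parts (attribute names follow the getattr calls above):

from types import SimpleNamespace

part = SimpleNamespace(id="p1", data="Hello", typeGroup="binary", label=None, metadata={"page": 1})
objs = _toContentObjects(SimpleNamespace(parts=[part]), "report.pdf")
# objs[0]["contentType"] == "other"  (binary/container both map to "other")
# objs[0]["contextRef"] == {"containerPath": "report.pdf", "location": "file", "page": 1}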
async def bootstrapSharepoint(
connectionId: str,
*,
progressCb: Optional[Callable[[int, Optional[str]], None]] = None,
adapter: Any = None,
connection: Any = None,
knowledgeService: Any = None,
limits: Optional[SharepointBootstrapLimits] = None,
runExtractionFn: Optional[Callable[..., Any]] = None,
) -> Dict[str, Any]:
"""Enumerate SharePoint drives and ingest every reachable file via the façade.
Parameters allow injection for tests; production callers pass only
`connectionId` (and optionally a progressCb) and everything else is
resolved against the registered services.
"""
from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs
prefs = loadConnectionPrefs(connectionId)
if not limits:
limits = SharepointBootstrapLimits(neutralize=prefs.neutralizeBeforeEmbed)
startMs = time.time()
result = SharepointBootstrapResult(connectionId=connectionId)
logger.info(
"ingestion.connection.bootstrap.started part=sharepoint connectionId=%s",
connectionId,
extra={
"event": "ingestion.connection.bootstrap.started",
"part": "sharepoint",
"connectionId": connectionId,
},
)
if adapter is None or knowledgeService is None or connection is None:
adapter, connection, knowledgeService = await _resolveDependencies(connectionId)
if runExtractionFn is None:
from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
from modules.serviceCenter.services.serviceExtraction.subRegistry import (
ExtractorRegistry, ChunkerRegistry,
)
extractorRegistry = ExtractorRegistry()
chunkerRegistry = ChunkerRegistry()
def runExtractionFn(bytesData, name, mime, options): # type: ignore[no-redef]
return runExtraction(extractorRegistry, chunkerRegistry, bytesData, name, mime, options)
mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else ""
userId = str(getattr(connection, "userId", "") or "") if connection is not None else ""
try:
sites = await adapter.browse("/", limit=limits.maxSites)
except Exception as exc:
logger.error("sharepoint site discovery failed for %s: %s", connectionId, exc, exc_info=True)
result.errors.append(f"site_discovery: {exc}")
return _finalizeResult(connectionId, result, startMs)
for site in sites[: limits.maxSites]:
if result.indexed + result.skippedDuplicate >= limits.maxItems:
break
sitePath = getattr(site, "path", "") or ""
try:
await _walkFolder(
adapter=adapter,
knowledgeService=knowledgeService,
runExtractionFn=runExtractionFn,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
folderPath=sitePath,
depth=0,
limits=limits,
result=result,
progressCb=progressCb,
)
except Exception as exc:
logger.error("sharepoint walk failed for site %s: %s", sitePath, exc, exc_info=True)
result.errors.append(f"walk({sitePath}): {exc}")
return _finalizeResult(connectionId, result, startMs)
async def _resolveDependencies(connectionId: str):
"""Load connection, instantiate SharepointAdapter, and build a KnowledgeService.
Runs with root privileges: bootstrap is a system operation triggered by an
authenticated user via callback; it must not be gated by a per-user
service-center context.
"""
from modules.interfaces.interfaceDbApp import getRootInterface
from modules.auth import TokenManager
from modules.connectors.providerMsft.connectorMsft import MsftConnector
from modules.serviceCenter import getService
from modules.serviceCenter.context import ServiceCenterContext
from modules.security.rootAccess import getRootUser
rootInterface = getRootInterface()
connection = rootInterface.getUserConnectionById(connectionId)
if connection is None:
raise ValueError(f"UserConnection not found: {connectionId}")
token = TokenManager().getFreshToken(connectionId)
if not token or not token.tokenAccess:
raise ValueError(f"No valid token for connection {connectionId}")
provider = MsftConnector(connection, token.tokenAccess)
adapter = provider.getServiceAdapter("sharepoint")
rootUser = getRootUser()
ctx = ServiceCenterContext(
user=rootUser,
mandate_id=str(getattr(connection, "mandateId", "") or ""),
)
knowledgeService = getService("knowledge", ctx)
return adapter, connection, knowledgeService
async def _walkFolder(
*,
adapter,
knowledgeService,
runExtractionFn,
connectionId: str,
mandateId: str,
userId: str,
folderPath: str,
depth: int,
limits: SharepointBootstrapLimits,
result: SharepointBootstrapResult,
progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
if depth > limits.maxDepth:
return
try:
entries = await adapter.browse(folderPath)
except Exception as exc:
logger.warning("sharepoint browse %s failed: %s", folderPath, exc)
result.errors.append(f"browse({folderPath}): {exc}")
return
for entry in entries:
if result.indexed + result.skippedDuplicate >= limits.maxItems:
return
if result.bytesProcessed >= limits.maxBytes:
return
entryPath = getattr(entry, "path", "") or ""
if getattr(entry, "isFolder", False):
await _walkFolder(
adapter=adapter,
knowledgeService=knowledgeService,
runExtractionFn=runExtractionFn,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
folderPath=entryPath,
depth=depth + 1,
limits=limits,
result=result,
progressCb=progressCb,
)
continue
mimeType = getattr(entry, "mimeType", None) or "application/octet-stream"
if any(mimeType.startswith(prefix) for prefix in limits.skipMimePrefixes):
result.skippedPolicy += 1
continue
size = int(getattr(entry, "size", 0) or 0)
if size and size > limits.maxFileSize:
result.skippedPolicy += 1
continue
metadata = getattr(entry, "metadata", {}) or {}
externalItemId = metadata.get("id") or entryPath
revision = metadata.get("revision") or metadata.get("lastModifiedDateTime")
await _ingestOne(
adapter=adapter,
knowledgeService=knowledgeService,
runExtractionFn=runExtractionFn,
connectionId=connectionId,
mandateId=mandateId,
userId=userId,
entry=entry,
entryPath=entryPath,
mimeType=mimeType,
externalItemId=externalItemId,
revision=revision,
limits=limits,
result=result,
progressCb=progressCb,
)
async def _ingestOne(
*,
adapter,
knowledgeService,
runExtractionFn,
connectionId: str,
mandateId: str,
userId: str,
entry,
entryPath: str,
mimeType: str,
externalItemId: str,
revision: Optional[str],
limits: SharepointBootstrapLimits,
result: SharepointBootstrapResult,
progressCb: Optional[Callable[[int, Optional[str]], None]],
) -> None:
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
syntheticFileId = _syntheticFileId(connectionId, externalItemId)
fileName = getattr(entry, "name", "") or externalItemId
try:
fileBytes = await adapter.download(entryPath)
except Exception as exc:
logger.warning("sharepoint download %s failed: %s", entryPath, exc)
result.failed += 1
result.errors.append(f"download({entryPath}): {exc}")
return
if not fileBytes:
result.failed += 1
return
result.bytesProcessed += len(fileBytes)
try:
extracted = runExtractionFn(
fileBytes, fileName, mimeType,
ExtractionOptions(mergeStrategy=None),
)
except Exception as exc:
logger.warning("sharepoint extraction %s failed: %s", entryPath, exc)
result.failed += 1
result.errors.append(f"extract({entryPath}): {exc}")
return
contentObjects = _toContentObjects(extracted, fileName)
if not contentObjects:
result.skippedPolicy += 1
return
provenance: Dict[str, Any] = {
"connectionId": connectionId,
"authority": "msft",
"service": "sharepoint",
"externalItemId": externalItemId,
"externalPath": entryPath,
"revision": revision,
}
try:
handle = await knowledgeService.requestIngestion(
IngestionJob(
sourceKind="sharepoint_item",
sourceId=syntheticFileId,
fileName=fileName,
mimeType=mimeType,
userId=userId,
mandateId=mandateId,
contentObjects=contentObjects,
contentVersion=revision,
neutralize=limits.neutralize,
provenance=provenance,
)
)
except Exception as exc:
logger.error("sharepoint ingestion %s failed: %s", entryPath, exc, exc_info=True)
result.failed += 1
result.errors.append(f"ingest({entryPath}): {exc}")
return
if handle.status == "duplicate":
result.skippedDuplicate += 1
elif handle.status == "indexed":
result.indexed += 1
else:
result.failed += 1
if handle.error:
result.errors.append(f"ingest({entryPath}): {handle.error}")
if progressCb is not None and (result.indexed + result.skippedDuplicate) % 50 == 0:
processed = result.indexed + result.skippedDuplicate
try:
progressCb(
min(90, 10 + int(80 * processed / max(1, limits.maxItems))),
f"sharepoint processed={processed}",
)
except Exception:
pass
logger.info(
"ingestion.connection.bootstrap.progress part=sharepoint processed=%d skippedDup=%d failed=%d",
processed, result.skippedDuplicate, result.failed,
extra={
"event": "ingestion.connection.bootstrap.progress",
"part": "sharepoint",
"connectionId": connectionId,
"processed": processed,
"skippedDup": result.skippedDuplicate,
"failed": result.failed,
},
)
# Yield so the event loop can interleave other tasks (download/extract are
# CPU-ish and extraction uses sync libs; cooperative scheduling prevents
# starving other workers).
await asyncio.sleep(0)
def _finalizeResult(connectionId: str, result: SharepointBootstrapResult, startMs: float) -> Dict[str, Any]:
durationMs = int((time.time() - startMs) * 1000)
logger.info(
"ingestion.connection.bootstrap.done part=sharepoint connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d failed=%d durationMs=%d",
connectionId,
result.indexed, result.skippedDuplicate, result.skippedPolicy, result.failed,
durationMs,
extra={
"event": "ingestion.connection.bootstrap.done",
"part": "sharepoint",
"connectionId": connectionId,
"indexed": result.indexed,
"skippedDup": result.skippedDuplicate,
"skippedPolicy": result.skippedPolicy,
"failed": result.failed,
"durationMs": durationMs,
},
)
return {
"connectionId": result.connectionId,
"indexed": result.indexed,
"skippedDuplicate": result.skippedDuplicate,
"skippedPolicy": result.skippedPolicy,
"failed": result.failed,
"bytesProcessed": result.bytesProcessed,
"durationMs": durationMs,
"errors": result.errors[:20],
}


@@ -0,0 +1,107 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Text normalisation utilities used by knowledge ingestion.
The email body cleaning logic is intentionally regex-based and works on plain
text after an HTML→text pass so we never store unsanitised HTML/JS in the
knowledge store and retrieval stays robust (no extraneous markup tokens
eating embedding budget).
"""
from __future__ import annotations
import re
from typing import Optional
DEFAULT_MAX_CHARS = 8000
_QUOTE_MARKER_PATTERNS = [
re.compile(r"^\s*(?:On\s.+?\swrote:)\s*$", re.MULTILINE | re.IGNORECASE),
re.compile(r"^\s*(?:Am\s.+?\sschrieb.+?:)\s*$", re.MULTILINE | re.IGNORECASE),
re.compile(r"^\s*-{2,}\s*Original\s*Message\s*-{2,}\s*$", re.MULTILINE | re.IGNORECASE),
re.compile(r"^\s*-{2,}\s*Urspr.+Nachricht\s*-{2,}\s*$", re.MULTILINE | re.IGNORECASE),
re.compile(r"^\s*From:\s+.+$", re.MULTILINE | re.IGNORECASE),
re.compile(r"^\s*Von:\s+.+$", re.MULTILINE | re.IGNORECASE),
re.compile(r"^\s*Sent:\s+.+$", re.MULTILINE | re.IGNORECASE),
re.compile(r"^\s*Gesendet:\s+.+$", re.MULTILINE | re.IGNORECASE),
]
_SIGNATURE_MARKERS = [
re.compile(r"^\s*-{2,}\s*$", re.MULTILINE),
re.compile(r"^\s*—\s*$", re.MULTILINE),
re.compile(r"^\s*Best regards\b.*$", re.MULTILINE | re.IGNORECASE),
re.compile(r"^\s*Kind regards\b.*$", re.MULTILINE | re.IGNORECASE),
re.compile(r"^\s*Mit freundlichen Gr[üu]ßen\b.*$", re.MULTILINE | re.IGNORECASE),
re.compile(r"^\s*Viele Gr[üu]ße\b.*$", re.MULTILINE | re.IGNORECASE),
re.compile(r"^\s*Best,\s*$", re.MULTILINE | re.IGNORECASE),
]
def _htmlToText(html: str) -> str:
"""Prefer BeautifulSoup when available, fall back to regex."""
try:
from bs4 import BeautifulSoup # type: ignore
soup = BeautifulSoup(html, "html.parser")
for tag in soup(["script", "style", "head"]):
tag.decompose()
for br in soup.find_all(["br"]):
br.replace_with("\n")
for p in soup.find_all(["p", "div", "li", "tr"]):
p.append("\n")
text = soup.get_text()
except Exception:
# Minimal fallback: strip tags crudely.
text = re.sub(r"<br\s*/?>", "\n", html, flags=re.IGNORECASE)
text = re.sub(r"</(?:p|div|li|tr)>", "\n", text, flags=re.IGNORECASE)
text = re.sub(r"<[^>]+>", "", text)
# Normalise non-breaking spaces to plain spaces and drop zero-width spaces.
text = text.replace("\u00a0", " ").replace("\u200b", "")
return text
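Illustrative behaviour of both paths; note the bs4 branch additionally decodes entities and drops script/style bodies, which the regex fallback does not:

text = _htmlToText("<div>Hello<br>world</div>")
# Both branches return "Hello\nworld\n": <br> becomes a newline, the
# closing block tag adds one, and the remaining tags are stripped.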
def _stripQuotedThread(text: str) -> str:
"""Remove reply-chain content so only the author's own contribution remains."""
earliest = len(text)
for pattern in _QUOTE_MARKER_PATTERNS:
match = pattern.search(text)
if match and match.start() < earliest:
earliest = match.start()
# Drop any block starting with "> " quoted lines (often Gmail/Thunderbird).
quotedBlock = re.search(r"^(?:\s*>.*\n?)+", text, re.MULTILINE)
if quotedBlock and quotedBlock.start() < earliest:
earliest = quotedBlock.start()
return text[:earliest].rstrip()
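For example, a reply sitting above an English quote marker keeps only the author's contribution (illustrative):

body = "Thanks, looks good!\nOn Mon, 1 Jan 2025, Bob wrote:\n> earlier message"
assert _stripQuotedThread(body) == "Thanks, looks good!"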
def _stripSignature(text: str) -> str:
earliest = len(text)
for pattern in _SIGNATURE_MARKERS:
match = pattern.search(text)
if match and match.start() < earliest:
earliest = match.start()
return text[:earliest].rstrip()
def _collapseWhitespace(text: str) -> str:
text = re.sub(r"[ \t]+", " ", text)
text = re.sub(r"\n{3,}", "\n\n", text)
return text.strip()
def cleanEmailBody(html: str, maxChars: Optional[int] = DEFAULT_MAX_CHARS) -> str:
"""Return a compact plain-text view of an email body suitable for embedding.
Steps: HTML → text, remove quoted reply chain, remove signature, collapse
whitespace, truncate to maxChars. Always returns a string (possibly empty).
"""
if not html:
return ""
text = _htmlToText(html) if "<" in html and ">" in html else html
text = _stripQuotedThread(text)
text = _stripSignature(text)
text = _collapseWhitespace(text)
if maxChars and len(text) > maxChars:
text = text[:maxChars].rstrip() + "…"
return text
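End to end, an HTML reply collapses to the author's own words (illustrative input; both HTML branches treat these simple tags identically):

html = "<div>Hi team,<br><br>Deal is signed.<br>Best regards<br>Alice</div>"
assert cleanEmailBody(html) == "Hi team,\n\nDeal is signed."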


@@ -302,6 +302,30 @@ async def _executeWithRetry(executor, node, context, maxRetries: int = 0, retryD
raise lastError
def _substituteFeatureInstancePlaceholders(
graph: Dict[str, Any],
targetFeatureInstanceId: str,
) -> Dict[str, Any]:
"""Replace ``{{featureInstanceId}}`` placeholders in the serialised graph.
Works on the full JSON representation so that placeholders inside nested
parameter dicts, prompt strings, etc. are all caught. Already-resolved
concrete UUIDs (pre-baked by ``_copyTemplateWorkflows``) are left untouched
because the placeholder literal ``{{featureInstanceId}}`` will not match.
"""
import json as _json
raw = _json.dumps(graph)
if "{{featureInstanceId}}" not in raw:
return graph
replaced = raw.replace("{{featureInstanceId}}", targetFeatureInstanceId)
logger.debug(
"_substituteFeatureInstancePlaceholders: resolved %d occurrence(s) -> %s",
raw.count("{{featureInstanceId}}"),
targetFeatureInstanceId,
)
return _json.loads(replaced)
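A quick round-trip illustration (UUID value hypothetical):

graph = {"nodes": [{"parameters": {"featureRef": "{{featureInstanceId}}"}}]}
out = _substituteFeatureInstancePlaceholders(graph, "3f2c9a1e-1111-2222-3333-444455556666")
assert out["nodes"][0]["parameters"]["featureRef"] == "3f2c9a1e-1111-2222-3333-444455556666"
# A graph without the literal placeholder comes back unchanged (same object),
# so the JSON round-trip cost is only paid when a substitution happens.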
async def executeGraph(
graph: Dict[str, Any],
services: Any,
@@ -315,6 +339,7 @@
runId: Optional[str] = None,
run_envelope: Optional[Dict[str, Any]] = None,
label: Optional[str] = None,
targetFeatureInstanceId: Optional[str] = None,
) -> Dict[str, Any]:
"""
Execute automation2 graph. Returns { success, nodeOutputs, error?, stopped? }.
@@ -322,14 +347,16 @@
pauses the run, and returns { success: False, paused: True, taskId, runId }.
For resume: pass initialNodeOutputs (with result for the human node) and startAfterNodeId.
For fresh runs: pass run_envelope (unified start payload for the start node); it is normalized together with userId into context.runEnvelope.
targetFeatureInstanceId: resolves {{featureInstanceId}} placeholders in the graph JSON before execution.
"""
logger.info(
"executeGraph start: instanceId=%s workflowId=%s userId=%s mandateId=%s resume=%s",
"executeGraph start: instanceId=%s workflowId=%s userId=%s mandateId=%s resume=%s targetInstance=%s",
instanceId,
workflowId,
userId,
mandateId,
startAfterNodeId is not None,
targetFeatureInstanceId,
)
from modules.workflows.processing.shared.methodDiscovery import discoverMethods
discoverMethods(services)
@@ -338,6 +365,9 @@
materializeFeatureInstanceRefs,
)
if targetFeatureInstanceId:
graph = _substituteFeatureInstancePlaceholders(graph, targetFeatureInstanceId)
# Phase-5 Schicht-4: typed-ref envelopes are materialized FIRST so the
# subsequent connection-ref pass and validation see the canonical shape.
graph = materializeFeatureInstanceRefs(graph)


@@ -326,11 +326,25 @@ class ActionNodeExecutor:
if isinstance(dumped, dict) and isinstance(rawData, bytes) and len(rawData) > 0:
try:
from modules.interfaces.interfaceDbManagement import getInterface as _getMgmtInterface
from modules.interfaces.interfaceDbApp import getInterface as _getAppInterface
from modules.security.rootAccess import getRootUser
_userId = context.get("userId")
_mandateId = context.get("mandateId")
_instanceId = context.get("instanceId")
_mgmt = _getMgmtInterface(getRootUser(), mandateId=_mandateId, featureInstanceId=_instanceId)
_owner = None
if _userId:
try:
_umap = _getAppInterface(getRootUser()).getUsersByIds([str(_userId)])
_owner = _umap.get(str(_userId))
except Exception as _ue:
logger.warning("Could not resolve workflow user for file persistence: %s", _ue)
if _owner is None:
_owner = getRootUser()
logger.debug(
"Persisting workflow document as root user (no resolved owner userId=%r)",
_userId,
)
_mgmt = _getMgmtInterface(_owner, mandateId=_mandateId, featureInstanceId=_instanceId)
_docName = dumped.get("documentName") or f"workflow-result-{nodeId}.bin"
_mimeType = dumped.get("mimeType") or "application/octet-stream"
_fileItem = _mgmt.createFile(_docName, _mimeType, rawData)
@@ -345,6 +359,20 @@
dumped["_hasBinaryData"] = True
docsList.append(dumped)
# Clean DocumentList shape for document nodes (match file.create: documents + count, no AiResult fields)
if outputSchema == "DocumentList" and nodeType in ("ai.generateDocument", "ai.convertDocument"):
if not result.success:
return _normalizeError(
RuntimeError(str(result.error or "document action failed")),
outputSchema,
)
list_out: Dict[str, Any] = {
"documents": docsList,
"count": len(docsList),
}
_attachConnectionProvenance(list_out, resolvedParams, outputSchema, chatService, self.services)
return normalizeToSchema(list_out, outputSchema)
extractedContext = ""
if result.documents:
doc = result.documents[0]


@@ -7,6 +7,50 @@ from typing import Dict, List, Any, Tuple, Set, Optional
logger = logging.getLogger(__name__)
def _ai_result_text_from_documents(d: Dict[str, Any]) -> Optional[str]:
"""Extract plain-text body from AiResult-style ``documents[0].documentData``."""
docs = d.get("documents")
if not isinstance(docs, list) or not docs:
return None
d0 = docs[0]
raw: Any = None
if isinstance(d0, dict):
raw = d0.get("documentData")
elif d0 is not None:
raw = getattr(d0, "documentData", None)
if raw is None:
return None
if isinstance(raw, bytes):
try:
t = raw.decode("utf-8").strip()
return t or None
except (UnicodeDecodeError, ValueError):
return None
if isinstance(raw, str):
s = raw.strip()
return s or None
return None
def _ref_coalesce_empty_ai_result_text(data: Any, path: List[Any], resolved: Any) -> Any:
"""If a ref targets AiResult text fields but resolves empty/missing, fall back to documents.
Needed when the optional ``responseData`` is absent (no synthetic ``{}``), when ``response``
is empty even though ``documents`` hold the model output, or when legacy graphs bind ``responseData`` only.
"""
if resolved not in (None, ""):
return resolved
if not isinstance(data, dict) or not path:
return resolved
head = path[0]
if head not in ("response", "responseData", "context"):
return resolved
if head == "context" and len(path) != 1:
return resolved
fb = _ai_result_text_from_documents(data)
return fb if fb is not None else resolved
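For instance, an AiResult whose response stayed empty while documents carry the model output (illustrative):

out = {"response": "", "documents": [{"documentData": "Generated summary."}]}
assert _ref_coalesce_empty_ai_result_text(out, ["response"], "") == "Generated summary."
# Non-empty resolutions pass through untouched:
assert _ref_coalesce_empty_ai_result_text(out, ["response"], "existing") == "existing"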
def parseGraph(graph: Dict[str, Any]) -> Tuple[List[Dict], List[Dict], Set[str]]:
"""
Parse graph into nodes, connections, and node IDs.
@@ -356,14 +400,15 @@ def resolveParameterReferences(value: Any, nodeOutputs: Dict[str, Any]) -> Any:
data = data.get("data", data)
plist = list(path)
resolved = _get_by_path(data, plist)
if (
resolved is None
and isinstance(data, dict)
and plist
and plist[0] == "payload"
and len(plist) > 1
):
resolved = _get_by_path(data, plist[1:])
if resolved is None and isinstance(data, dict) and plist:
if plist[0] == "payload" and len(plist) > 1:
# Strip explicit "payload" prefix (legacy DataPicker paths)
resolved = _get_by_path(data, plist[1:])
elif "payload" in data and isinstance(data["payload"], dict):
# Form nodes store fields under {"payload": {fieldName: …}}.
# DataPicker emits bare field paths like ["url"]; try under payload.
resolved = _get_by_path(data["payload"], plist)
resolved = _ref_coalesce_empty_ai_result_text(data, plist, resolved)
return resolveParameterReferences(resolved, nodeOutputs)
return value
if value.get("type") == "value":
@@ -386,17 +431,34 @@
if len(parts) < 2:
return json.dumps(data) if isinstance(data, (dict, list)) else str(data)
rest = ".".join(parts[1:])
if data is None:
def _walk(root, keys):
cur = root
for k in keys:
if isinstance(cur, dict) and k in cur:
cur = cur[k]
elif isinstance(cur, (list, tuple)) and k.isdigit():
cur = cur[int(k)]
else:
return None
return cur
keys = rest.split(".")
result = _walk(data, keys)
# Form nodes store fields under {"payload": {field: …}}.
# Fall back to looking under "payload" when the direct path misses.
if result is None and isinstance(data, dict) and "payload" in data:
result = _walk(data["payload"], keys)
if result is None:
return m.group(0)
for k in rest.split("."):
if isinstance(data, dict) and k in data:
data = data[k]
elif isinstance(data, (list, tuple)) and k.isdigit():
data = data[int(k)]
else:
return m.group(0)
return str(data) if data is not None else m.group(0)
return str(result) if not isinstance(result, (dict, list)) else json.dumps(result, ensure_ascii=False)
return re.sub(r"\{\{\s*([^}]+)\s*\}\}", repl, value)
if isinstance(value, list):
# contextBuilder: list where every item is a `{"type":"ref",...}` envelope.
# Resolve each ref and join the serialised parts into a single prompt string.
if value and all(isinstance(v, dict) and v.get("type") == "ref" for v in value):
from modules.workflows.methods.methodAi._common import serialize_context
parts = [serialize_context(resolveParameterReferences(v, nodeOutputs)) for v in value]
return "\n\n".join(p for p in parts if p)
return [resolveParameterReferences(v, nodeOutputs) for v in value]
return value
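The payload fallback for {{...}} templates is easiest to see in isolation; a standalone re-implementation of the walk, for illustration only (walkPath is hypothetical and mirrors the nested _walk above):

def walkPath(root, keys):
    cur = root
    for k in keys:
        if isinstance(cur, dict) and k in cur:
            cur = cur[k]
        elif isinstance(cur, (list, tuple)) and k.isdigit():
            cur = cur[int(k)]
        else:
            return None
    return cur

data = {"payload": {"url": "https://example.com"}}
assert walkPath(data, ["url"]) is None                              # direct path misses
assert walkPath(data["payload"], ["url"]) == "https://example.com"  # payload fallback hits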


@@ -0,0 +1,42 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Shared helpers for AI workflow actions."""
import json
from typing import Any
def serialize_context(val: Any) -> str:
"""Convert any context value to a readable string for use in AI prompts.
- None / empty string → ""
- empty dict (no keys) → "" (avoids literal "{}" in file.create / prompts)
- str → as-is
- dict / list → pretty-printed JSON
- anything else → str()
"""
if val is None or val == "" or val == []:
return ""
if isinstance(val, dict) and len(val) == 0:
return ""
if isinstance(val, str):
return val.strip()
try:
return json.dumps(val, ensure_ascii=False, indent=2)
except Exception:
return str(val)
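Behaviour at a glance, following directly from the branches above:

assert serialize_context(None) == ""
assert serialize_context({}) == ""        # no literal "{}" leaking into prompts
assert serialize_context("  hello ") == "hello"
assert serialize_context({"a": 1}) == '{\n  "a": 1\n}'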
def applyCommonAiParams(parameters: dict, request) -> None:
"""Apply common AI parameters (requireNeutralization, allowedModels) from node to request."""
requireNeutralization = parameters.get("requireNeutralization")
if requireNeutralization is not None:
request.requireNeutralization = bool(requireNeutralization)
allowedModels = parameters.get("allowedModels")
if allowedModels and isinstance(allowedModels, list):
if not request.options:
from modules.datamodels.datamodelAi import AiCallOptions
request.options = AiCallOptions()
request.options.allowedModels = allowedModels
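Usage sketch; the request object here is a hypothetical stand-in, while inside the codebase a real AI call request with these two attributes would be passed:

from types import SimpleNamespace

request = SimpleNamespace(requireNeutralization=False, options=None)
applyCommonAiParams({"requireNeutralization": True, "allowedModels": ["model-a"]}, request)
# request.requireNeutralization is now True and request.options is a fresh
# AiCallOptions whose allowedModels is ["model-a"].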


@@ -67,6 +67,8 @@ async def consolidate(self, parameters: Dict[str, Any]) -> ActionResult:
prompt=prompt,
options=AiCallOptions(operationType=OperationTypeEnum.DATA_ANALYSE),
)
from modules.workflows.methods.methodAi._common import applyCommonAiParams
applyCommonAiParams(parameters, req)
resp = await ai_service.callAi(req)
except (SubscriptionInactiveException, BillingContextError):
raise


@@ -36,6 +36,10 @@ async def convertDocument(self, parameters: Dict[str, Any]) -> ActionResult:
}
if parentOperationId:
processParams["parentOperationId"] = parentOperationId
if parameters.get("allowedModels"):
processParams["allowedModels"] = parameters["allowedModels"]
if parameters.get("requireNeutralization") is not None:
processParams["requireNeutralization"] = parameters["requireNeutralization"]
return await self.process(processParams)


@@ -14,8 +14,11 @@ from modules.serviceCenter.services.serviceBilling.mainServiceBilling import Bil
logger = logging.getLogger(__name__)
async def generateCode(self, parameters: Dict[str, Any]) -> ActionResult:
prompt = parameters.get("prompt")
if not prompt:
from modules.workflows.methods.methodAi._common import serialize_context
base_prompt = (parameters.get("prompt") or "").strip()
context_val = serialize_context(parameters.get("context"))
prompt = f"Kontext:\n{context_val}\n\n{base_prompt}" if context_val else base_prompt
if not prompt.strip():
return ActionResult.isFailure(error="prompt is required")
documentList = parameters.get("documentList", [])
@@ -55,6 +58,16 @@ async def generateCode(self, parameters: Dict[str, Any]) -> ActionResult:
processingMode=ProcessingModeEnum.DETAILED
)
# Apply node-level AI params
allowedModels = parameters.get("allowedModels")
if allowedModels and isinstance(allowedModels, list):
options.allowedModels = allowedModels
requireNeutralization = parameters.get("requireNeutralization")
if requireNeutralization is not None:
_ctx = getattr(self.services, '_context', None)
if _ctx:
_ctx.requireNeutralization = bool(requireNeutralization)
# outputFormat: Optional - if None, formats determined from prompt by AI
aiResponse: AiResponse = await self.services.ai.callAiContent(
prompt=prompt,


@@ -14,14 +14,19 @@ from modules.serviceCenter.services.serviceBilling.mainServiceBilling import Bil
logger = logging.getLogger(__name__)
async def generateDocument(self, parameters: Dict[str, Any]) -> ActionResult:
prompt = parameters.get("prompt")
if not prompt:
from modules.workflows.methods.methodAi._common import serialize_context
base_prompt = (parameters.get("prompt") or "").strip()
context_val = serialize_context(parameters.get("context"))
prompt = f"Kontext:\n{context_val}\n\n{base_prompt}" if context_val else base_prompt
if not prompt.strip():
return ActionResult.isFailure(error="prompt is required")
documentList = parameters.get("documentList", [])
documentType = parameters.get("documentType")
# Optional: if omitted, formats determined from prompt by AI
resultType = parameters.get("resultType")
# Prefer explicit outputFormat (flow UI); resultType remains for legacy / API callers.
resultType = parameters.get("outputFormat") or parameters.get("resultType")
if isinstance(resultType, str):
resultType = resultType.strip().lstrip(".").lower() or None
if not resultType:
logger.debug("resultType not provided - formats will be determined from prompt by AI")
@@ -46,8 +51,12 @@ async def generateDocument(self, parameters: Dict[str, Any]) -> ActionResult:
else:
docRefList = DocumentReferenceList(references=[])
# Prepare title
title = parameters.get("documentType") or "Generated Document"
title_raw = parameters.get("title")
title = (title_raw.strip() if isinstance(title_raw, str) else "") or None
if not title and isinstance(documentType, str) and documentType.strip():
title = documentType.strip()
if not title:
title = "Generated Document"
# Call AI service for document generation
# callAiContent handles documentList internally via Phases 5A-5E
@@ -59,6 +68,16 @@
compressContext=False
)
# Apply node-level AI params
allowedModels = parameters.get("allowedModels")
if allowedModels and isinstance(allowedModels, list):
options.allowedModels = allowedModels
requireNeutralization = parameters.get("requireNeutralization")
if requireNeutralization is not None:
_ctx = getattr(self.services, '_context', None)
if _ctx:
_ctx.requireNeutralization = bool(requireNeutralization)
# outputFormat: Optional - if None, formats determined from prompt by AI
aiResponse: AiResponse = await self.services.ai.callAiContent(
prompt=prompt,
@@ -85,6 +104,8 @@
"actionType": "ai.generateDocument",
"documentType": documentType,
"resultType": resultType,
"outputFormat": resultType,
"title": title,
}
))
@@ -106,14 +127,15 @@
docName = sanitized
# Determine mime type
rt = resultTypeFallback
mimeType = "text/plain"
if resultType == "html":
if rt == "html":
mimeType = "text/html"
elif resultType == "json":
elif rt == "json":
mimeType = "application/json"
elif resultType == "pdf":
elif rt == "pdf":
mimeType = "application/pdf"
elif resultType == "md":
elif rt == "md":
mimeType = "text/markdown"
documents.append(ActionDocument(
@@ -124,6 +146,8 @@
"actionType": "ai.generateDocument",
"documentType": documentType,
"resultType": resultType,
"outputFormat": resultType,
"title": title,
}
))


@@ -73,6 +73,47 @@ def _action_docs_to_content_parts(services, docs: List[Any]) -> List[ContentPart
logger.info(f"ai.process: Extracted {len(ec.parts)} parts from {name} (no persistence)")
return all_parts
def _resolve_file_refs_to_content_parts(services, fileIdRefs) -> List[ContentPart]:
"""Fetch files by ID from the file store and extract content.
Used for automation2 workflows where documents are file-store references,
not chat message attachments."""
from modules.datamodels.datamodelExtraction import ExtractionOptions, MergeStrategy
mgmt = getattr(services, 'interfaceDbComponent', None)
extraction = getattr(services, 'extraction', None)
if not mgmt or not extraction:
logger.warning("_resolve_file_refs_to_content_parts: missing interfaceDbComponent or extraction service")
return []
allParts: List[ContentPart] = []
opts = ExtractionOptions(prompt="", mergeStrategy=MergeStrategy())
for ref in fileIdRefs:
fileId = ref.documentId
fileMeta = mgmt.getFile(fileId)
if not fileMeta:
logger.warning(f"_resolve_file_refs_to_content_parts: file {fileId} not found")
continue
fileData = mgmt.getFileData(fileId)
if not fileData:
logger.warning(f"_resolve_file_refs_to_content_parts: no data for file {fileId}")
continue
fileName = getattr(fileMeta, 'fileName', fileId)
mimeType = getattr(fileMeta, 'mimeType', 'application/octet-stream')
ec = extraction.extractContentFromBytes(
documentBytes=fileData,
fileName=fileName,
mimeType=mimeType,
documentId=fileId,
options=opts,
)
for p in ec.parts:
if p.data or getattr(p, "typeGroup", "") == "image":
p.metadata.setdefault("originalFileName", fileName)
allParts.append(p)
logger.info(f"_resolve_file_refs_to_content_parts: extracted {len(ec.parts)} parts from {fileName}")
return allParts
async def process(self, parameters: Dict[str, Any]) -> ActionResult:
operationId = None
try:
@@ -129,6 +170,17 @@ async def process(self, parameters: Dict[str, Any]) -> ActionResult:
f"ai.process: Coerced documentList ({type(documentListParam).__name__}) "
f"to DocumentReferenceList with {len(documentList.references)} references"
)
# Resolve DocumentItemReferences (file-ID refs from automation2) directly
# from the file store. These cannot be resolved via chat messages.
from modules.datamodels.datamodelDocref import DocumentItemReference
fileIdRefs = [r for r in documentList.references if isinstance(r, DocumentItemReference)]
if fileIdRefs:
extractedParts = _resolve_file_refs_to_content_parts(self.services, fileIdRefs)
if extractedParts:
inline_content_parts = (inline_content_parts or []) + extractedParts
remaining = [r for r in documentList.references if not isinstance(r, DocumentItemReference)]
documentList = DocumentReferenceList(references=remaining)
# Optional: if omitted, formats determined from prompt. Default "txt" is validation fallback only.
resultType = parameters.get("resultType")
@@ -157,7 +209,14 @@ async def process(self, parameters: Dict[str, Any]) -> ActionResult:
mimeMap = {"txt": "text/plain", "json": "application/json", "html": "text/html", "md": "text/markdown", "csv": "text/csv", "xml": "application/xml"}
output_mime_type = mimeMap.get(normalized_result_type, "text/plain") if normalized_result_type else "text/plain"
# Normalize context: serialize any non-string value (dict/list/int/…) to text
from modules.workflows.methods.methodAi._common import serialize_context
paramContext = serialize_context(parameters.get("context"))
parameters["context"] = paramContext
if paramContext:
logger.info(f"ai.process: context serialized ({len(paramContext)} chars)")
# Phase 7.3: Pass documentList and/or contentParts to AI service
contentParts: Optional[List[ContentPart]] = inline_content_parts
if "contentParts" in parameters and not inline_content_parts:
@@ -183,7 +242,7 @@ async def process(self, parameters: Dict[str, Any]) -> ActionResult:
self.services.chat.progressLogUpdate(operationId, 0.6, "Calling AI (simple mode)")
context_parts = []
paramContext = parameters.get("context")
paramContext = parameters.get("context") # already serialized above
if paramContext and isinstance(paramContext, str) and paramContext.strip():
context_parts.append(paramContext.strip())
if documentList and len(documentList.references) > 0:
@@ -212,6 +271,9 @@ async def process(self, parameters: Dict[str, Any]) -> ActionResult:
)
)
from modules.workflows.methods.methodAi._common import applyCommonAiParams
applyCommonAiParams(parameters, request)
aiResponse_obj = await self.services.ai.callAi(request)
# Convert AiCallResponse to AiResponse format
@@ -243,6 +305,16 @@ async def process(self, parameters: Dict[str, Any]) -> ActionResult:
operationType=OperationTypeEnum.IMAGE_GENERATE if isImageGeneration else OperationTypeEnum.DATA_GENERATE
)
# Apply node-level AI params (allowedModels, requireNeutralization)
allowedModels = parameters.get("allowedModels")
if allowedModels and isinstance(allowedModels, list):
options.allowedModels = allowedModels
requireNeutralization = parameters.get("requireNeutralization")
if requireNeutralization is not None:
_ctx = getattr(self.services, '_context', None)
if _ctx:
_ctx.requireNeutralization = bool(requireNeutralization)
# Get generationIntent from parameters (required for DATA_GENERATE)
# Default to "document" if not provided (most common use case)
# For code generation, use ai.generateCode action or explicitly pass generationIntent="code"


@@ -39,6 +39,10 @@ async def summarizeDocument(self, parameters: Dict[str, Any]) -> ActionResult:
}
if parentOperationId:
processParams["parentOperationId"] = parentOperationId
if parameters.get("allowedModels"):
processParams["allowedModels"] = parameters["allowedModels"]
if parameters.get("requireNeutralization") is not None:
processParams["requireNeutralization"] = parameters["requireNeutralization"]
return await self.process(processParams)


@@ -41,6 +41,10 @@ async def translateDocument(self, parameters: Dict[str, Any]) -> ActionResult:
processParams["resultType"] = resultType
if parentOperationId:
processParams["parentOperationId"] = parentOperationId
if parameters.get("allowedModels"):
processParams["allowedModels"] = parameters["allowedModels"]
if parameters.get("requireNeutralization") is not None:
processParams["requireNeutralization"] = parameters["requireNeutralization"]
return await self.process(processParams)

Some files were not shown because too many files have changed in this diff.