From 9d82d3d353070a06b66825a4decf8b0716c24a86 Mon Sep 17 00:00:00 2001 From: Ida Date: Tue, 21 Apr 2026 11:27:44 +0200 Subject: [PATCH 01/18] P0: injection facade --- .../commcoach/serviceCommcoachIndexer.py | 28 +- modules/routes/routeDataFiles.py | 26 +- .../serviceAgent/coreTools/_documentTools.py | 18 +- .../serviceAgent/coreTools/_workspaceTools.py | 18 +- .../serviceKnowledge/mainServiceKnowledge.py | 303 +++++++++++++++++- tests/unit/serviceKnowledge/__init__.py | 0 .../serviceKnowledge/test_requestIngestion.py | 172 ++++++++++ 7 files changed, 536 insertions(+), 29 deletions(-) create mode 100644 tests/unit/serviceKnowledge/__init__.py create mode 100644 tests/unit/serviceKnowledge/test_requestIngestion.py diff --git a/modules/features/commcoach/serviceCommcoachIndexer.py b/modules/features/commcoach/serviceCommcoachIndexer.py index b43764a1..2f042795 100644 --- a/modules/features/commcoach/serviceCommcoachIndexer.py +++ b/modules/features/commcoach/serviceCommcoachIndexer.py @@ -174,14 +174,26 @@ async def indexSessionData( for c in chunks ] - await knowledgeService.indexFile( - fileId=syntheticFileId, - fileName=f"coaching-session-{sessionId[:8]}", - mimeType="application/x-coaching-session", - userId=userId, - featureInstanceId=featureInstanceId, - mandateId=mandateId, - contentObjects=contentObjects, + from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob + + await knowledgeService.requestIngestion( + IngestionJob( + sourceKind="coaching_session", + sourceId=syntheticFileId, + fileName=f"coaching-session-{sessionId[:8]}", + mimeType="application/x-coaching-session", + userId=userId, + featureInstanceId=featureInstanceId, + mandateId=mandateId, + contentObjects=contentObjects, + provenance={ + "lane": "feature", + "feature": "commcoach", + "sessionId": sessionId, + "contextId": contextId, + "messageCount": len(messages or []), + }, + ) ) logger.info(f"Successfully indexed coaching session {sessionId} ({len(chunks)} 
chunks)") except Exception as e: diff --git a/modules/routes/routeDataFiles.py b/modules/routes/routeDataFiles.py index 90431ba2..f281d15e 100644 --- a/modules/routes/routeDataFiles.py +++ b/modules/routes/routeDataFiles.py @@ -77,7 +77,7 @@ async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user): """Background task: pre-scan + extraction + knowledge indexing. Step 1: Structure Pre-Scan (AI-free) -> FileContentIndex (persisted) Step 2: Content extraction via runExtraction -> ContentParts - Step 3: KnowledgeService.indexFile -> chunking + embedding -> Knowledge Store""" + Step 3: KnowledgeService.requestIngestion -> idempotent chunking + embedding -> Knowledge Store""" userId = user.id if hasattr(user, "id") else str(user) try: mgmtInterface = interfaceDbManagement.getInterface(user) @@ -181,15 +181,21 @@ async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user): ) knowledgeService = getService("knowledge", ctx) - await knowledgeService.indexFile( - fileId=fileId, - fileName=fileName, - mimeType=mimeType, - userId=userId, - featureInstanceId=str(feature_instance_id) if feature_instance_id else "", - mandateId=str(mandate_id) if mandate_id else "", - contentObjects=contentObjects, - structure=contentIndex.structure, + from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob + + await knowledgeService.requestIngestion( + IngestionJob( + sourceKind="file", + sourceId=fileId, + fileName=fileName, + mimeType=mimeType, + userId=userId, + featureInstanceId=str(feature_instance_id) if feature_instance_id else "", + mandateId=str(mandate_id) if mandate_id else "", + contentObjects=contentObjects, + structure=contentIndex.structure, + provenance={"lane": "upload", "route": "routeDataFiles._autoIndexFile"}, + ) ) # Re-acquire interface after await to avoid stale user context from the singleton diff --git a/modules/serviceCenter/services/serviceAgent/coreTools/_documentTools.py 
b/modules/serviceCenter/services/serviceAgent/coreTools/_documentTools.py index a48e53b3..b9b00755 100644 --- a/modules/serviceCenter/services/serviceAgent/coreTools/_documentTools.py +++ b/modules/serviceCenter/services/serviceAgent/coreTools/_documentTools.py @@ -434,11 +434,19 @@ def _registerDocumentTools(registry: ToolRegistry, services): if contentObjects: _diFiId, _diMId = _resolveFileScope(fileId, context) - await knowledgeService.indexFile( - fileId=fileId, fileName=fileName, mimeType=fileMime, - userId=context.get("userId", ""), contentObjects=contentObjects, - featureInstanceId=_diFiId, - mandateId=_diMId, + from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob + await knowledgeService.requestIngestion( + IngestionJob( + sourceKind="agent_tool", + sourceId=fileId, + fileName=fileName, + mimeType=fileMime, + userId=context.get("userId", ""), + contentObjects=contentObjects, + featureInstanceId=_diFiId, + mandateId=_diMId, + provenance={"lane": "agent", "tool": "describeImage"}, + ) ) chunks = knowledgeService._knowledgeDb.getContentChunks(fileId) diff --git a/modules/serviceCenter/services/serviceAgent/coreTools/_workspaceTools.py b/modules/serviceCenter/services/serviceAgent/coreTools/_workspaceTools.py index 9a6af658..bb548081 100644 --- a/modules/serviceCenter/services/serviceAgent/coreTools/_workspaceTools.py +++ b/modules/serviceCenter/services/serviceAgent/coreTools/_workspaceTools.py @@ -132,11 +132,19 @@ def _registerWorkspaceTools(registry: ToolRegistry, services): try: userId = context.get("userId", "") _fiId, _mId = _resolveFileScope(fileId, context) - await knowledgeService.indexFile( - fileId=fileId, fileName=fileName, mimeType=mimeType, - userId=userId, contentObjects=contentObjects, - featureInstanceId=_fiId, - mandateId=_mId, + from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob + await knowledgeService.requestIngestion( + IngestionJob( + 
sourceKind="agent_tool", + sourceId=fileId, + fileName=fileName, + mimeType=mimeType, + userId=userId, + contentObjects=contentObjects, + featureInstanceId=_fiId, + mandateId=_mId, + provenance={"lane": "agent", "tool": "readFile"}, + ) ) except Exception as e: logger.warning(f"readFile: knowledge indexing failed for {fileId}: {e}") diff --git a/modules/serviceCenter/services/serviceKnowledge/mainServiceKnowledge.py b/modules/serviceCenter/services/serviceKnowledge/mainServiceKnowledge.py index dab8cc25..716ade31 100644 --- a/modules/serviceCenter/services/serviceKnowledge/mainServiceKnowledge.py +++ b/modules/serviceCenter/services/serviceKnowledge/mainServiceKnowledge.py @@ -2,9 +2,13 @@ # All rights reserved. """Knowledge service: 3-tier RAG with indexing, semantic search, and context building.""" +import hashlib +import json import logging import re -from typing import Any, Callable, Dict, List, Optional +import time +from dataclasses import dataclass, field +from typing import Any, Callable, Dict, List, Optional, Union from modules.datamodels.datamodelKnowledge import ( FileContentIndex, ContentChunk, WorkflowMemory, @@ -20,6 +24,65 @@ DEFAULT_CHUNK_TOKENS = 400 DEFAULT_CONTEXT_BUDGET = 12000 +# ============================================================================= +# Ingestion façade (P0 of unified-knowledge-indexing concept) +# ============================================================================= + +@dataclass +class IngestionJob: + """One request to add or refresh content in the unified knowledge store. + + Callers from any lane (routes, feature hooks, agent tools, connector sync) + describe the work they want done via this object; idempotency, scope + resolution, and embedding are handled by KnowledgeService.requestIngestion. 
+ """ + sourceKind: str + sourceId: str + fileName: str + mimeType: str + userId: str + contentObjects: List[Dict[str, Any]] = field(default_factory=list) + featureInstanceId: str = "" + mandateId: str = "" + structure: Optional[Dict[str, Any]] = None + containerPath: Optional[str] = None + contentVersion: Optional[str] = None + provenance: Optional[Dict[str, Any]] = None + + +@dataclass +class IngestionHandle: + """Result of requestIngestion. Stable across in-process and future queue impls.""" + jobId: str + status: str + contentHash: str + fileId: str + index: Optional[FileContentIndex] = None + error: Optional[str] = None + + +def _computeIngestionHash(contentObjects: List[Dict[str, Any]]) -> str: + """Deterministic SHA256 over (contentObjectId, contentType, data) tuples. + + Sorted by contentObjectId so re-ordering of extractor output does not + invalidate the cache; text whitespace is preserved intentionally because + chunk boundaries depend on it. + """ + normalized = sorted( + ( + ( + str(o.get("contentObjectId", "") or ""), + str(o.get("contentType", "text") or "text"), + o.get("data", "") or "", + ) + for o in (contentObjects or []) + ), + key=lambda t: t[0], + ) + payload = json.dumps(normalized, ensure_ascii=False, separators=(",", ":")) + return hashlib.sha256(payload.encode("utf-8")).hexdigest() + + class KnowledgeService: """Service for Knowledge Store operations: indexing, retrieval, and context building.""" @@ -46,6 +109,196 @@ class KnowledgeService: results = await self._embed([text]) return results[0] if results else [] + # ========================================================================= + # Ingestion façade (single entry point for all lanes) + # ========================================================================= + + async def requestIngestion(self, job: IngestionJob) -> IngestionHandle: + """Unified entry point for filling the knowledge corpus. 
+ + Applies idempotency based on a content hash (or caller-supplied + `contentVersion`) persisted in `FileContentIndex.structure._ingestion`. + Re-runs indexing only when the hash differs or the previous run did + not reach `indexed` state. Runs embedding synchronously for now + (callers already schedule background tasks where needed). + """ + jobId = f"{job.sourceKind}:{job.sourceId}" + startMs = time.time() + contentHash = job.contentVersion or _computeIngestionHash(job.contentObjects) + + # 1. Check for duplicate via existing FileContentIndex row. + existing = None + try: + existing = self._knowledgeDb.getFileContentIndex(job.sourceId) + except Exception: + existing = None + + if existing: + existingStructure = ( + existing.get("structure") if isinstance(existing, dict) + else getattr(existing, "structure", {}) + ) or {} + existingMeta = existingStructure.get("_ingestion", {}) or {} + existingStatus = ( + existing.get("status") if isinstance(existing, dict) + else getattr(existing, "status", "") + ) or "" + if existingMeta.get("hash") == contentHash and existingStatus == "indexed": + logger.info( + "ingestion.skipped.duplicate sourceKind=%s sourceId=%s hash=%s", + job.sourceKind, job.sourceId, contentHash[:12], + extra={ + "event": "ingestion.skipped.duplicate", + "jobId": jobId, + "sourceKind": job.sourceKind, + "sourceId": job.sourceId, + "hash": contentHash, + "durationMs": int((time.time() - startMs) * 1000), + }, + ) + return IngestionHandle( + jobId=jobId, + status="duplicate", + contentHash=contentHash, + fileId=job.sourceId, + index=None, + ) + + # 2. Prepare ingestion metadata; stays in structure._ingestion so + # later connector revoke/purge can filter chunks by sourceKind / + # provenance.connectionId without a schema migration. 
+ ingestionMeta = { + "hash": contentHash, + "sourceKind": job.sourceKind, + "sourceId": job.sourceId, + "contentVersion": job.contentVersion, + "indexedAt": getUtcTimestamp(), + "provenance": dict(job.provenance or {}), + } + structure = dict(job.structure or {}) + structure["_ingestion"] = ingestionMeta + + logger.info( + "ingestion.queued sourceKind=%s sourceId=%s objects=%d hash=%s", + job.sourceKind, job.sourceId, len(job.contentObjects or []), contentHash[:12], + extra={ + "event": "ingestion.queued", + "jobId": jobId, + "sourceKind": job.sourceKind, + "sourceId": job.sourceId, + "hash": contentHash, + "objectCount": len(job.contentObjects or []), + }, + ) + + # 3. Run real indexing. + try: + index = await self._indexFileInternal( + fileId=job.sourceId, + fileName=job.fileName, + mimeType=job.mimeType, + userId=job.userId, + featureInstanceId=job.featureInstanceId, + mandateId=job.mandateId, + contentObjects=job.contentObjects or [], + structure=structure, + containerPath=job.containerPath, + ) + except Exception as exc: + logger.error( + "ingestion.failed sourceKind=%s sourceId=%s error=%s", + job.sourceKind, job.sourceId, exc, + exc_info=True, + extra={ + "event": "ingestion.failed", + "jobId": jobId, + "sourceKind": job.sourceKind, + "sourceId": job.sourceId, + "hash": contentHash, + "error": str(exc), + "durationMs": int((time.time() - startMs) * 1000), + }, + ) + try: + self._knowledgeDb.updateFileStatus(job.sourceId, "failed") + except Exception: + pass + return IngestionHandle( + jobId=jobId, + status="failed", + contentHash=contentHash, + fileId=job.sourceId, + index=None, + error=str(exc), + ) + + logger.info( + "ingestion.indexed sourceKind=%s sourceId=%s objects=%d durationMs=%d", + job.sourceKind, job.sourceId, len(job.contentObjects or []), + int((time.time() - startMs) * 1000), + extra={ + "event": "ingestion.indexed", + "jobId": jobId, + "sourceKind": job.sourceKind, + "sourceId": job.sourceId, + "hash": contentHash, + "objectCount": 
len(job.contentObjects or []), + "durationMs": int((time.time() - startMs) * 1000), + }, + ) + return IngestionHandle( + jobId=jobId, + status="indexed", + contentHash=contentHash, + fileId=job.sourceId, + index=index, + ) + + def getIngestionStatus( + self, handleOrJobId: Union[IngestionHandle, str] + ) -> Dict[str, Any]: + """Map a handle or `sourceKind:sourceId` jobId to a status snapshot.""" + if isinstance(handleOrJobId, IngestionHandle): + sourceId = handleOrJobId.fileId + jobId = handleOrJobId.jobId + elif isinstance(handleOrJobId, str) and ":" in handleOrJobId: + jobId = handleOrJobId + sourceId = handleOrJobId.split(":", 1)[1] + else: + jobId = str(handleOrJobId) + sourceId = str(handleOrJobId) + + row = None + try: + row = self._knowledgeDb.getFileContentIndex(sourceId) + except Exception: + row = None + if not row: + return { + "jobId": jobId, + "sourceId": sourceId, + "status": "unknown", + "contentHash": None, + } + + structure = ( + row.get("structure") if isinstance(row, dict) + else getattr(row, "structure", {}) + ) or {} + meta = structure.get("_ingestion", {}) or {} + status = ( + row.get("status") if isinstance(row, dict) + else getattr(row, "status", "") + ) or "unknown" + return { + "jobId": jobId, + "sourceId": sourceId, + "status": status, + "contentHash": meta.get("hash"), + "sourceKind": meta.get("sourceKind"), + "indexedAt": meta.get("indexedAt"), + } + # ========================================================================= # File Indexing (called after extraction, before embedding) # ========================================================================= @@ -61,6 +314,54 @@ class KnowledgeService: contentObjects: List[Dict[str, Any]] = None, structure: Dict[str, Any] = None, containerPath: str = None, + ) -> Optional[FileContentIndex]: + """Backward-compatible wrapper delegating to requestIngestion. + + Existing callers that still invoke `indexFile` directly automatically + participate in the idempotency/metrics layer. 
New callers should + prefer `requestIngestion` so they can pass `sourceKind` and + `provenance` for connector revoke/purge later. + """ + job = IngestionJob( + sourceKind="file", + sourceId=fileId, + fileName=fileName, + mimeType=mimeType, + userId=userId, + featureInstanceId=featureInstanceId, + mandateId=mandateId, + contentObjects=list(contentObjects or []), + structure=structure, + containerPath=containerPath, + ) + handle = await self.requestIngestion(job) + if handle.index is not None: + return handle.index + if handle.status == "duplicate": + row = None + try: + row = self._knowledgeDb.getFileContentIndex(fileId) + except Exception: + row = None + if isinstance(row, dict): + try: + return FileContentIndex(**row) + except Exception: + return None + return row + return None + + async def _indexFileInternal( + self, + fileId: str, + fileName: str, + mimeType: str, + userId: str, + featureInstanceId: str = "", + mandateId: str = "", + contentObjects: List[Dict[str, Any]] = None, + structure: Dict[str, Any] = None, + containerPath: str = None, ) -> FileContentIndex: """Index a file's content objects and create embeddings for text chunks. diff --git a/tests/unit/serviceKnowledge/__init__.py b/tests/unit/serviceKnowledge/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/serviceKnowledge/test_requestIngestion.py b/tests/unit/serviceKnowledge/test_requestIngestion.py new file mode 100644 index 00000000..595faeff --- /dev/null +++ b/tests/unit/serviceKnowledge/test_requestIngestion.py @@ -0,0 +1,172 @@ +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +"""Unit tests for the P0 ingestion facade on KnowledgeService. + +Covers acceptance criteria AC4 (idempotent ingestion for unchanged content) +and hash stability. The knowledge DB interface and AI embedding service are +stubbed so the test runs without any external dependency. 
+""" + +from unittest.mock import MagicMock, patch + +import pytest + +from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import ( + IngestionJob, + KnowledgeService, + _computeIngestionHash, +) + + +class _StubKnowledgeDb: + """Minimal in-memory stand-in for interfaceDbKnowledge.""" + + def __init__(self): + self.index = None + self.upsertIndexCalls = 0 + self.upsertChunkCalls = 0 + + def upsertFileContentIndex(self, index): + self.index = index.model_dump() if hasattr(index, "model_dump") else dict(index) + self.upsertIndexCalls += 1 + + def upsertContentChunk(self, chunk): + self.upsertChunkCalls += 1 + + def updateFileStatus(self, fileId, status): + if self.index is not None: + self.index["status"] = status + + def getFileContentIndex(self, fileId): + return self.index + + +def _makeService(): + """Create a KnowledgeService with stubbed db and ai dependencies.""" + stubDb = _StubKnowledgeDb() + + aiService = MagicMock() + + async def _callEmbedding(texts): + return MagicMock( + errorCount=0, + content="", + metadata={"embeddings": [[0.0] * 4 for _ in texts]}, + ) + + aiService.callEmbedding = _callEmbedding + + def getService(name): + if name == "ai": + return aiService + raise KeyError(name) + + context = MagicMock() + context.user = MagicMock() + # Return a non-empty but empty-dict record so the FileItem lookup branch + # in _indexFileInternal resolves without touching a real DB. 
+ context.interfaceDbComponent = MagicMock() + context.interfaceDbComponent.getRecordset = MagicMock(return_value=[{}]) + + with patch( + "modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge.getKnowledgeInterface", + return_value=stubDb, + ): + service = KnowledgeService(context, getService) + + return service, stubDb + + +@pytest.mark.asyncio +async def test_duplicate_skipped(): + service, db = _makeService() + job = IngestionJob( + sourceKind="file", + sourceId="file-123", + fileName="a.txt", + mimeType="text/plain", + userId="u1", + contentObjects=[ + {"contentObjectId": "c1", "contentType": "text", "data": "hello world"} + ], + ) + + first = await service.requestIngestion(job) + assert first.status == "indexed" + chunksAfterFirst = db.upsertChunkCalls + assert chunksAfterFirst >= 1 + + second = await service.requestIngestion(job) + assert second.status == "duplicate" + assert second.contentHash == first.contentHash + # No additional embedding work. + assert db.upsertChunkCalls == chunksAfterFirst + + +@pytest.mark.asyncio +async def test_reindex_on_content_change(): + service, db = _makeService() + base = IngestionJob( + sourceKind="file", + sourceId="file-123", + fileName="a.txt", + mimeType="text/plain", + userId="u1", + contentObjects=[ + {"contentObjectId": "c1", "contentType": "text", "data": "hello world"} + ], + ) + first = await service.requestIngestion(base) + assert first.status == "indexed" + chunksAfterFirst = db.upsertChunkCalls + + changed = IngestionJob( + sourceKind="file", + sourceId="file-123", + fileName="a.txt", + mimeType="text/plain", + userId="u1", + contentObjects=[ + {"contentObjectId": "c1", "contentType": "text", "data": "hello universe"} + ], + ) + second = await service.requestIngestion(changed) + assert second.status == "indexed" + assert second.contentHash != first.contentHash + assert db.upsertChunkCalls > chunksAfterFirst + + +def test_hash_stable_under_reordering(): + a = [ + {"contentObjectId": "c1", 
"contentType": "text", "data": "alpha"}, + {"contentObjectId": "c2", "contentType": "text", "data": "beta"}, + ] + b = list(reversed(a)) + assert _computeIngestionHash(a) == _computeIngestionHash(b) + + +def test_hash_changes_on_data_edit(): + a = [{"contentObjectId": "c1", "contentType": "text", "data": "alpha"}] + b = [{"contentObjectId": "c1", "contentType": "text", "data": "alpha!"}] + assert _computeIngestionHash(a) != _computeIngestionHash(b) + + +@pytest.mark.asyncio +async def test_get_ingestion_status_after_index(): + service, _db = _makeService() + job = IngestionJob( + sourceKind="coaching_session", + sourceId="coaching-session:abc", + fileName="session", + mimeType="application/x-coaching-session", + userId="u1", + contentObjects=[ + {"contentObjectId": "m0", "contentType": "text", "data": "User: hi"} + ], + provenance={"lane": "feature", "feature": "commcoach"}, + ) + handle = await service.requestIngestion(job) + status = service.getIngestionStatus(handle) + assert status["status"] == "indexed" + assert status["sourceKind"] == "coaching_session" + assert status["contentHash"] == handle.contentHash From 078b4eaaaf375a66ecaa1665f02006199239c098 Mon Sep 17 00:00:00 2001 From: Ida Date: Tue, 21 Apr 2026 11:28:55 +0200 Subject: [PATCH 02/18] removed unnecessary test files --- tests/unit/serviceKnowledge/__init__.py | 0 .../serviceKnowledge/test_requestIngestion.py | 172 ------------------ 2 files changed, 172 deletions(-) delete mode 100644 tests/unit/serviceKnowledge/__init__.py delete mode 100644 tests/unit/serviceKnowledge/test_requestIngestion.py diff --git a/tests/unit/serviceKnowledge/__init__.py b/tests/unit/serviceKnowledge/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/unit/serviceKnowledge/test_requestIngestion.py b/tests/unit/serviceKnowledge/test_requestIngestion.py deleted file mode 100644 index 595faeff..00000000 --- a/tests/unit/serviceKnowledge/test_requestIngestion.py +++ /dev/null @@ -1,172 +0,0 @@ -# 
Copyright (c) 2025 Patrick Motsch -# All rights reserved. -"""Unit tests for the P0 ingestion facade on KnowledgeService. - -Covers acceptance criteria AC4 (idempotent ingestion for unchanged content) -and hash stability. The knowledge DB interface and AI embedding service are -stubbed so the test runs without any external dependency. -""" - -from unittest.mock import MagicMock, patch - -import pytest - -from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import ( - IngestionJob, - KnowledgeService, - _computeIngestionHash, -) - - -class _StubKnowledgeDb: - """Minimal in-memory stand-in for interfaceDbKnowledge.""" - - def __init__(self): - self.index = None - self.upsertIndexCalls = 0 - self.upsertChunkCalls = 0 - - def upsertFileContentIndex(self, index): - self.index = index.model_dump() if hasattr(index, "model_dump") else dict(index) - self.upsertIndexCalls += 1 - - def upsertContentChunk(self, chunk): - self.upsertChunkCalls += 1 - - def updateFileStatus(self, fileId, status): - if self.index is not None: - self.index["status"] = status - - def getFileContentIndex(self, fileId): - return self.index - - -def _makeService(): - """Create a KnowledgeService with stubbed db and ai dependencies.""" - stubDb = _StubKnowledgeDb() - - aiService = MagicMock() - - async def _callEmbedding(texts): - return MagicMock( - errorCount=0, - content="", - metadata={"embeddings": [[0.0] * 4 for _ in texts]}, - ) - - aiService.callEmbedding = _callEmbedding - - def getService(name): - if name == "ai": - return aiService - raise KeyError(name) - - context = MagicMock() - context.user = MagicMock() - # Return a non-empty but empty-dict record so the FileItem lookup branch - # in _indexFileInternal resolves without touching a real DB. 
- context.interfaceDbComponent = MagicMock() - context.interfaceDbComponent.getRecordset = MagicMock(return_value=[{}]) - - with patch( - "modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge.getKnowledgeInterface", - return_value=stubDb, - ): - service = KnowledgeService(context, getService) - - return service, stubDb - - -@pytest.mark.asyncio -async def test_duplicate_skipped(): - service, db = _makeService() - job = IngestionJob( - sourceKind="file", - sourceId="file-123", - fileName="a.txt", - mimeType="text/plain", - userId="u1", - contentObjects=[ - {"contentObjectId": "c1", "contentType": "text", "data": "hello world"} - ], - ) - - first = await service.requestIngestion(job) - assert first.status == "indexed" - chunksAfterFirst = db.upsertChunkCalls - assert chunksAfterFirst >= 1 - - second = await service.requestIngestion(job) - assert second.status == "duplicate" - assert second.contentHash == first.contentHash - # No additional embedding work. - assert db.upsertChunkCalls == chunksAfterFirst - - -@pytest.mark.asyncio -async def test_reindex_on_content_change(): - service, db = _makeService() - base = IngestionJob( - sourceKind="file", - sourceId="file-123", - fileName="a.txt", - mimeType="text/plain", - userId="u1", - contentObjects=[ - {"contentObjectId": "c1", "contentType": "text", "data": "hello world"} - ], - ) - first = await service.requestIngestion(base) - assert first.status == "indexed" - chunksAfterFirst = db.upsertChunkCalls - - changed = IngestionJob( - sourceKind="file", - sourceId="file-123", - fileName="a.txt", - mimeType="text/plain", - userId="u1", - contentObjects=[ - {"contentObjectId": "c1", "contentType": "text", "data": "hello universe"} - ], - ) - second = await service.requestIngestion(changed) - assert second.status == "indexed" - assert second.contentHash != first.contentHash - assert db.upsertChunkCalls > chunksAfterFirst - - -def test_hash_stable_under_reordering(): - a = [ - {"contentObjectId": "c1", 
"contentType": "text", "data": "alpha"}, - {"contentObjectId": "c2", "contentType": "text", "data": "beta"}, - ] - b = list(reversed(a)) - assert _computeIngestionHash(a) == _computeIngestionHash(b) - - -def test_hash_changes_on_data_edit(): - a = [{"contentObjectId": "c1", "contentType": "text", "data": "alpha"}] - b = [{"contentObjectId": "c1", "contentType": "text", "data": "alpha!"}] - assert _computeIngestionHash(a) != _computeIngestionHash(b) - - -@pytest.mark.asyncio -async def test_get_ingestion_status_after_index(): - service, _db = _makeService() - job = IngestionJob( - sourceKind="coaching_session", - sourceId="coaching-session:abc", - fileName="session", - mimeType="application/x-coaching-session", - userId="u1", - contentObjects=[ - {"contentObjectId": "m0", "contentType": "text", "data": "User: hi"} - ], - provenance={"lane": "feature", "feature": "commcoach"}, - ) - handle = await service.requestIngestion(job) - status = service.getIngestionStatus(handle) - assert status["status"] == "indexed" - assert status["sourceKind"] == "coaching_session" - assert status["contentHash"] == handle.contentHash From a7f4055130dae218baaca66394852a3fe30e458b Mon Sep 17 00:00:00 2001 From: Ida Date: Tue, 21 Apr 2026 12:41:50 +0200 Subject: [PATCH 03/18] fix(rag): preserve per-page granularity + remove on-demand extraction fallbacks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The default MergeStrategy concatenates every extracted text part into a single ContentPart, collapsing a 500-page PDF into one chunk with a blurred average embedding — RAG retrieval was effectively broken. - ExtractionOptions.mergeStrategy is now Optional[MergeStrategy]; passing None preserves per-part granularity. Default factory kept for backward compatibility. - routeDataFiles._autoIndexFile, _workspaceTools.readFile, and _documentTools.describeImage explicitly pass mergeStrategy=None. 
- Agent tools no longer carry redundant extraction + requestIngestion fallback paths: the unified ingestion lane owns all corpus writes, and readFile/describeImage are pure consumers of the knowledge store. - Unit test asserts runExtraction(mergeStrategy=None) keeps every part. --- modules/datamodels/datamodelExtraction.py | 9 +- modules/routes/routeDataFiles.py | 5 +- .../serviceAgent/coreTools/_documentTools.py | 3 +- .../serviceAgent/coreTools/_workspaceTools.py | 3 +- .../test_extraction_merge_strategy.py | 124 ++++++++++++++++++ 5 files changed, 140 insertions(+), 4 deletions(-) create mode 100644 tests/unit/services/test_extraction_merge_strategy.py diff --git a/modules/datamodels/datamodelExtraction.py b/modules/datamodels/datamodelExtraction.py index 0aaaffd8..38fd1d27 100644 --- a/modules/datamodels/datamodelExtraction.py +++ b/modules/datamodels/datamodelExtraction.py @@ -95,7 +95,14 @@ class ExtractionOptions(BaseModel): imageQuality: int = Field(default=85, ge=1, le=100, description="Image quality (1-100)") # Merging strategy - mergeStrategy: MergeStrategy = Field(default_factory=MergeStrategy, description="Strategy for merging extraction results") + mergeStrategy: Optional[MergeStrategy] = Field( + default_factory=MergeStrategy, + description=( + "Strategy for merging extraction results. Pass None to skip merging entirely " + "(required for per-chunk ingestion pipelines like RAG, where per-page/per-section " + "granularity must be preserved for embedding)." 
+ ), + ) # Optional chunking parameters (for backward compatibility) chunkAllowed: Optional[bool] = Field(default=None, description="Whether chunking is allowed") diff --git a/modules/routes/routeDataFiles.py b/modules/routes/routeDataFiles.py index f281d15e..26614ff0 100644 --- a/modules/routes/routeDataFiles.py +++ b/modules/routes/routeDataFiles.py @@ -134,7 +134,10 @@ async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user): extractorRegistry = ExtractorRegistry() chunkerRegistry = ChunkerRegistry() - options = ExtractionOptions() + # mergeStrategy=None: keep per-page / per-section granularity for RAG ingestion. + # The default MergeStrategy concatenates all text parts into a single blob, which + # collapses a 500-page PDF into one ContentChunk and destroys semantic retrieval. + options = ExtractionOptions(mergeStrategy=None) extracted = runExtraction( extractorRegistry, chunkerRegistry, diff --git a/modules/serviceCenter/services/serviceAgent/coreTools/_documentTools.py b/modules/serviceCenter/services/serviceAgent/coreTools/_documentTools.py index b9b00755..64b3a147 100644 --- a/modules/serviceCenter/services/serviceAgent/coreTools/_documentTools.py +++ b/modules/serviceCenter/services/serviceAgent/coreTools/_documentTools.py @@ -416,7 +416,8 @@ def _registerDocumentTools(registry: ToolRegistry, services): fileName = fileInfo.get("fileName", fileId) extracted = runExtraction( ExtractorRegistry(), None, - rawBytes, fileName, fileMime, ExtractionOptions(), + rawBytes, fileName, fileMime, + ExtractionOptions(mergeStrategy=None), ) contentObjects = [] diff --git a/modules/serviceCenter/services/serviceAgent/coreTools/_workspaceTools.py b/modules/serviceCenter/services/serviceAgent/coreTools/_workspaceTools.py index bb548081..aa337472 100644 --- a/modules/serviceCenter/services/serviceAgent/coreTools/_workspaceTools.py +++ b/modules/serviceCenter/services/serviceAgent/coreTools/_workspaceTools.py @@ -107,7 +107,8 @@ def 
_registerWorkspaceTools(registry: ToolRegistry, services): extracted = runExtraction( ExtractorRegistry(), ChunkerRegistry(), - rawBytes, fileName, mimeType, ExtractionOptions(), + rawBytes, fileName, mimeType, + ExtractionOptions(mergeStrategy=None), ) contentObjects = [] diff --git a/tests/unit/services/test_extraction_merge_strategy.py b/tests/unit/services/test_extraction_merge_strategy.py new file mode 100644 index 00000000..784bb783 --- /dev/null +++ b/tests/unit/services/test_extraction_merge_strategy.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +"""Test that runExtraction preserves per-part granularity when mergeStrategy=None. + +The default MergeStrategy concatenates all text parts into a single ContentPart, which +collapses multi-page documents into one blob. This destroys RAG retrieval because every +document ends up as a single ContentChunk with a "blurred average" embedding. + +Ingestion pipelines (requestIngestion callers) MUST pass mergeStrategy=None to preserve +per-page / per-section chunks. 
+""" + +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../..")) + +from modules.datamodels.datamodelExtraction import ( + ContentPart, + ExtractionOptions, + MergeStrategy, +) +from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction +from modules.serviceCenter.services.serviceExtraction.subRegistry import ( + ChunkerRegistry, + Extractor, + ExtractorRegistry, +) + + +class _FakeMultiPagePdfExtractor(Extractor): + """Emits one text ContentPart per simulated page.""" + + def __init__(self, pageCount: int = 10): + self.pageCount = pageCount + + def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool: + return mimeType == "application/pdf" + + def getSupportedExtensions(self): + return [".pdf"] + + def getSupportedMimeTypes(self): + return ["application/pdf"] + + def extract(self, fileBytes: bytes, context): + return [ + ContentPart( + id=f"page-{i}", + parentId=None, + label=f"page_{i + 1}", + typeGroup="text", + mimeType="text/plain", + data=f"Page {i + 1} content — distinct semantic anchor #{i}", + metadata={"pageIndex": i, "size": 64}, + ) + for i in range(self.pageCount) + ] + + +def _buildRegistry(pageCount: int) -> ExtractorRegistry: + registry = ExtractorRegistry() + fake = _FakeMultiPagePdfExtractor(pageCount) + registry.register("application/pdf", fake) + registry.register("pdf", fake) + return registry + + +def test_default_options_merge_all_text_parts_into_one(): + """Regression safeguard: default ExtractionOptions still merges (legacy behaviour). + + Non-ingestion callers (AI processing, summarization) rely on this default. 
+ """ + registry = _buildRegistry(pageCount=5) + extracted = runExtraction( + registry, ChunkerRegistry(), b"", "sample.pdf", "application/pdf", + ExtractionOptions(), + ) + textParts = [p for p in extracted.parts if p.typeGroup == "text"] + assert len(textParts) == 1, ( + f"Default options should merge all text parts into one, got {len(textParts)}" + ) + assert "Page 1" in textParts[0].data and "Page 5" in textParts[0].data, ( + "Merged text should contain content from all pages" + ) + print("test_default_options_merge_all_text_parts_into_one [PASS]") + + +def test_merge_none_preserves_all_text_parts(): + """Core fix: mergeStrategy=None preserves per-page granularity for RAG ingestion.""" + registry = _buildRegistry(pageCount=500) + extracted = runExtraction( + registry, ChunkerRegistry(), b"", "sample.pdf", "application/pdf", + ExtractionOptions(mergeStrategy=None), + ) + textParts = [p for p in extracted.parts if p.typeGroup == "text"] + assert len(textParts) == 500, ( + f"mergeStrategy=None should preserve all 500 text parts, got {len(textParts)}" + ) + assert textParts[0].label == "page_1" + assert textParts[-1].label == "page_500" + print("test_merge_none_preserves_all_text_parts [PASS]") + + +def test_explicit_merge_strategy_still_merges(): + """Callers can still opt in to merging by passing an explicit MergeStrategy.""" + registry = _buildRegistry(pageCount=3) + extracted = runExtraction( + registry, ChunkerRegistry(), b"", "sample.pdf", "application/pdf", + ExtractionOptions(mergeStrategy=MergeStrategy()), + ) + textParts = [p for p in extracted.parts if p.typeGroup == "text"] + assert len(textParts) == 1, ( + f"Explicit MergeStrategy should merge, got {len(textParts)} parts" + ) + print("test_explicit_merge_strategy_still_merges [PASS]") + + +if __name__ == "__main__": + test_default_options_merge_all_text_parts_into_one() + test_merge_none_preserves_all_text_parts() + test_explicit_merge_strategy_still_merges() + print("\nAll merge-strategy tests 
passed.") From dff3d418457d188fe3cd7058fbf785f5c3625e0e Mon Sep 17 00:00:00 2001 From: Ida Date: Tue, 21 Apr 2026 12:42:40 +0200 Subject: [PATCH 04/18] fix(rag): stable ingestion idempotency across re-extractions (AC4) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Re-indexing the same file always triggered a full embedding run — ingestion.skipped.duplicate never fired. Two independent causes: 1. _computeIngestionHash included contentObjectId in its payload, but extractors generate fresh uuid4() per run, making the hash a per-run nonce. Now hashed over (contentType, data) in extractor order — stable across re-extractions, sensitive to content, ordering, and type changes. 2. _autoIndexFile upserted the fresh pre-scan FileContentIndex before requestIngestion's duplicate check, wiping structure._ingestion and status=indexed from the prior run. The pre-upsert now merges the existing _ingestion metadata and preserves the indexed status. Verified end-to-end: second PATCH /scope on an already-indexed file logs and returns in ~2s with zero embedding API calls. Adds test_ingestion_hash_stability.py (5 cases). 
--- modules/routes/routeDataFiles.py | 23 ++++- .../serviceAgent/coreTools/_documentTools.py | 73 +------------- .../serviceAgent/coreTools/_workspaceTools.py | 95 ++----------------- .../serviceKnowledge/mainServiceKnowledge.py | 28 +++--- .../services/test_ingestion_hash_stability.py | 81 ++++++++++++++++ 5 files changed, 128 insertions(+), 172 deletions(-) create mode 100644 tests/unit/services/test_ingestion_hash_stability.py diff --git a/modules/routes/routeDataFiles.py b/modules/routes/routeDataFiles.py index 26614ff0..3abccdc4 100644 --- a/modules/routes/routeDataFiles.py +++ b/modules/routes/routeDataFiles.py @@ -122,9 +122,30 @@ async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user): f"{contentIndex.totalObjects} objects" ) - # Persist FileContentIndex immediately + # Persist FileContentIndex immediately. + # IMPORTANT: preserve `_ingestion` metadata and `status="indexed"` from any + # prior successful run — otherwise this upsert wipes the idempotency cache + # and requestIngestion cannot detect duplicates (AC4 breaks). 
from modules.interfaces.interfaceDbKnowledge import getInterface as getKnowledgeInterface knowledgeDb = getKnowledgeInterface() + try: + _existing = knowledgeDb.getFileContentIndex(fileId) + except Exception: + _existing = None + if _existing: + _existingStruct = ( + _existing.get("structure") if isinstance(_existing, dict) + else getattr(_existing, "structure", {}) + ) or {} + _existingStatus = ( + _existing.get("status") if isinstance(_existing, dict) + else getattr(_existing, "status", "") + ) or "" + if "_ingestion" in _existingStruct: + contentIndex.structure = dict(contentIndex.structure or {}) + contentIndex.structure["_ingestion"] = _existingStruct["_ingestion"] + if _existingStatus == "indexed": + contentIndex.status = "indexed" knowledgeDb.upsertFileContentIndex(contentIndex) # Step 2: Content extraction (AI-free, produces ContentParts) diff --git a/modules/serviceCenter/services/serviceAgent/coreTools/_documentTools.py b/modules/serviceCenter/services/serviceAgent/coreTools/_documentTools.py index 64b3a147..62413103 100644 --- a/modules/serviceCenter/services/serviceAgent/coreTools/_documentTools.py +++ b/modules/serviceCenter/services/serviceAgent/coreTools/_documentTools.py @@ -11,8 +11,6 @@ from modules.serviceCenter.services.serviceAgent.toolRegistry import ToolRegistr from modules.serviceCenter.services.serviceAgent.coreTools._helpers import ( _getOrCreateTempFolder, - _looksLikeBinary, - _resolveFileScope, _MAX_TOOL_RESULT_CHARS, ) @@ -392,74 +390,7 @@ def _registerDocumentTools(registry: ToolRegistry, services): if chunkMime: mimeType = chunkMime - # 2) File not yet indexed -> trigger extraction via ExtractionService, then retry - if not imageData and knowledgeService and not knowledgeService.isFileIndexed(fileId): - try: - chatService = services.chat - fileInfo = chatService.getFileInfo(fileId) - fileContent = chatService.getFileContent(fileId) - if fileContent and fileInfo: - rawData = fileContent.get("data", "") - if isinstance(rawData, str) 
and len(rawData) > 100: - rawBytes = _b64.b64decode(rawData) - elif isinstance(rawData, bytes): - rawBytes = rawData - else: - rawBytes = None - - if rawBytes: - from modules.serviceCenter.services.serviceExtraction.subRegistry import ExtractorRegistry - from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction - from modules.datamodels.datamodelExtraction import ExtractionOptions - - fileMime = fileInfo.get("mimeType", "application/octet-stream") - fileName = fileInfo.get("fileName", fileId) - extracted = runExtraction( - ExtractorRegistry(), None, - rawBytes, fileName, fileMime, - ExtractionOptions(mergeStrategy=None), - ) - - contentObjects = [] - for part in extracted.parts: - tg = (part.typeGroup or "").lower() - ct = "image" if tg == "image" else "text" - if not part.data or not part.data.strip(): - continue - contentObjects.append({ - "contentObjectId": part.id, - "contentType": ct, - "data": part.data, - "contextRef": {"containerPath": fileName, "location": part.label, **(part.metadata or {})}, - }) - - if contentObjects: - _diFiId, _diMId = _resolveFileScope(fileId, context) - from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob - await knowledgeService.requestIngestion( - IngestionJob( - sourceKind="agent_tool", - sourceId=fileId, - fileName=fileName, - mimeType=fileMime, - userId=context.get("userId", ""), - contentObjects=contentObjects, - featureInstanceId=_diFiId, - mandateId=_diMId, - provenance={"lane": "agent", "tool": "describeImage"}, - ) - ) - - chunks = knowledgeService._knowledgeDb.getContentChunks(fileId) - imageChunks = [c for c in (chunks or []) if c.get("contentType") == "image"] - if pageIndex is not None: - imageChunks = [c for c in imageChunks if c.get("contextRef", {}).get("pageIndex") == pageIndex] - if imageChunks: - imageData = imageChunks[0].get("data", "") - except Exception as extractErr: - logger.warning(f"describeImage: on-demand extraction failed: 
{extractErr}") - - # 3) Direct image file (not a container) - use raw file data + # 2) Direct image file (not a container) - use raw file data if not imageData: chatService = services.chat fileContent = chatService.getFileContent(fileId) @@ -469,7 +400,7 @@ def _registerDocumentTools(registry: ToolRegistry, services): imageData = fileContent.get("data", "") mimeType = fileMimeType - # 4) PDF page rendering: render the requested page as an image via PyMuPDF + # 3) PDF page rendering: render the requested page as an image via PyMuPDF if not imageData: chatService = services.chat fileInfo = chatService.getFileInfo(fileId) if hasattr(chatService, "getFileInfo") else None diff --git a/modules/serviceCenter/services/serviceAgent/coreTools/_workspaceTools.py b/modules/serviceCenter/services/serviceAgent/coreTools/_workspaceTools.py index aa337472..e413c3f0 100644 --- a/modules/serviceCenter/services/serviceAgent/coreTools/_workspaceTools.py +++ b/modules/serviceCenter/services/serviceAgent/coreTools/_workspaceTools.py @@ -14,7 +14,6 @@ from modules.serviceCenter.services.serviceAgent.coreTools._helpers import ( _getOrCreateInstanceFolder, _getOrCreateTempFolder, _looksLikeBinary, - _resolveFileScope, _MAX_TOOL_RESULT_CHARS, ) @@ -50,6 +49,7 @@ def _registerWorkspaceTools(registry: ToolRegistry, services): return ToolResult(toolCallId="", toolName="readFile", success=False, error="fileId is required") try: knowledgeService = services.getService("knowledge") if hasattr(services, "getService") else None + fileStatus = None # 1) Knowledge Store: return already-extracted text chunks if knowledgeService: @@ -77,7 +77,8 @@ def _registerWorkspaceTools(registry: ToolRegistry, services): data=f"[File {fileId} is currently being processed (status: {fileStatus}). 
Try again shortly.]", ) - # 2) Not indexed yet: try on-demand extraction + # 2) Not indexed yet: inspect file type to decide how to serve the agent + # (binary -> instruct agent to wait / re-upload; text -> decode raw bytes inline) chatService = services.chat fileInfo = chatService.getFileInfo(fileId) if not fileInfo: @@ -100,92 +101,14 @@ def _registerWorkspaceTools(registry: ToolRegistry, services): isBinary = _looksLikeBinary(rawBytes) if isBinary: - try: - from modules.serviceCenter.services.serviceExtraction.subRegistry import ExtractorRegistry, ChunkerRegistry - from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction - from modules.datamodels.datamodelExtraction import ExtractionOptions - - extracted = runExtraction( - ExtractorRegistry(), ChunkerRegistry(), - rawBytes, fileName, mimeType, - ExtractionOptions(mergeStrategy=None), - ) - - contentObjects = [] - for part in extracted.parts: - tg = (part.typeGroup or "").lower() - ct = "image" if tg == "image" else "text" - if not part.data or not part.data.strip(): - continue - contentObjects.append({ - "contentObjectId": part.id, - "contentType": ct, - "data": part.data, - "contextRef": { - "containerPath": fileName, - "location": part.label or "file", - **(part.metadata or {}), - }, - }) - - if contentObjects: - if knowledgeService: - try: - userId = context.get("userId", "") - _fiId, _mId = _resolveFileScope(fileId, context) - from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob - await knowledgeService.requestIngestion( - IngestionJob( - sourceKind="agent_tool", - sourceId=fileId, - fileName=fileName, - mimeType=mimeType, - userId=userId, - contentObjects=contentObjects, - featureInstanceId=_fiId, - mandateId=_mId, - provenance={"lane": "agent", "tool": "readFile"}, - ) - ) - except Exception as e: - logger.warning(f"readFile: knowledge indexing failed for {fileId}: {e}") - - joined = "" - if knowledgeService: - _chunks = 
knowledgeService._knowledgeDb.getContentChunks(fileId) - _textChunks = [ - c for c in (_chunks or []) - if c.get("contentType") != "image" and c.get("data") - ] - if _textChunks: - joined = "\n\n".join(c["data"] for c in _textChunks) - if not joined: - textParts = [o["data"] for o in contentObjects if o["contentType"] != "image"] - joined = "\n\n".join(textParts) if textParts else "" - if joined: - chunked = _applyOffsetLimit(joined, offset, limit) - if chunked is not None: - return ToolResult(toolCallId="", toolName="readFile", success=True, data=chunked) - if len(joined) > _MAX_TOOL_RESULT_CHARS: - joined = joined[:_MAX_TOOL_RESULT_CHARS] + f"\n\n[Truncated – showing first {_MAX_TOOL_RESULT_CHARS} chars of {len(joined)}. Use offset/limit to read specific sections.]" - return ToolResult( - toolCallId="", toolName="readFile", success=True, - data=joined, - ) - imgCount = sum(1 for o in contentObjects if o["contentType"] == "image") - return ToolResult( - toolCallId="", toolName="readFile", success=True, - data=f"[Extracted {len(contentObjects)} content objects from '{fileName}' " - f"({imgCount} images, no readable text). " - f"Use describeImage(fileId='{fileId}') to analyze visual content.]", - ) - except Exception as extractErr: - logger.warning(f"readFile extraction failed for {fileId} ({fileName}): {extractErr}") - return ToolResult( toolCallId="", toolName="readFile", success=True, - data=f"[Binary file: '{fileName}', type={mimeType}, size={len(rawBytes)} bytes. " - f"Text extraction not available. Use describeImage for images.]", + data=( + f"[File '{fileName}' ({mimeType}) is not yet indexed " + f"(status: {fileStatus or 'unknown'}). Indexing runs automatically " + f"on upload. Please wait a few seconds and retry, or re-upload the file. 
" + f"For visual content use describeImage(fileId='{fileId}').]" + ), ) # 3) Text file: decode raw bytes diff --git a/modules/serviceCenter/services/serviceKnowledge/mainServiceKnowledge.py b/modules/serviceCenter/services/serviceKnowledge/mainServiceKnowledge.py index 716ade31..57490eba 100644 --- a/modules/serviceCenter/services/serviceKnowledge/mainServiceKnowledge.py +++ b/modules/serviceCenter/services/serviceKnowledge/mainServiceKnowledge.py @@ -62,23 +62,23 @@ class IngestionHandle: def _computeIngestionHash(contentObjects: List[Dict[str, Any]]) -> str: - """Deterministic SHA256 over (contentObjectId, contentType, data) tuples. + """Deterministic SHA256 over (contentType, data) tuples in extractor order. - Sorted by contentObjectId so re-ordering of extractor output does not - invalidate the cache; text whitespace is preserved intentionally because - chunk boundaries depend on it. + `contentObjectId` is intentionally excluded because extractors generate + fresh UUIDs per run (`uuid.uuid4()`), which would make the hash unstable + across re-extractions of the same source — defeating idempotency. + Order is preserved (no sort) because two different documents can share the + same multiset of parts but differ in arrangement (e.g. swapped pages). + Text whitespace is preserved intentionally because chunk boundaries + depend on it. 
""" - normalized = sorted( + normalized = [ ( - ( - str(o.get("contentObjectId", "") or ""), - str(o.get("contentType", "text") or "text"), - o.get("data", "") or "", - ) - for o in (contentObjects or []) - ), - key=lambda t: t[0], - ) + str(o.get("contentType", "text") or "text"), + o.get("data", "") or "", + ) + for o in (contentObjects or []) + ] payload = json.dumps(normalized, ensure_ascii=False, separators=(",", ":")) return hashlib.sha256(payload.encode("utf-8")).hexdigest() diff --git a/tests/unit/services/test_ingestion_hash_stability.py b/tests/unit/services/test_ingestion_hash_stability.py new file mode 100644 index 00000000..df25a4f0 --- /dev/null +++ b/tests/unit/services/test_ingestion_hash_stability.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +"""Test that _computeIngestionHash is stable across re-extractions of the same source. + +Extractors generate fresh contentObjectIds (uuid.uuid4()) per run. The ingestion +hash MUST therefore be derived from content (contentType + data + order) only — +otherwise idempotency (AC4) silently fails: every re-extraction looks "new" and +triggers full re-embedding. 
+""" + +import os +import sys +import uuid + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../..")) + +from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import ( + _computeIngestionHash, +) + + +def _makeObjects(seed: str = "alpha"): + """Build a synthetic contentObjects list as routeDataFiles._autoIndexFile would.""" + return [ + { + "contentObjectId": str(uuid.uuid4()), + "contentType": "text", + "data": f"Page 1 of {seed}", + }, + { + "contentObjectId": str(uuid.uuid4()), + "contentType": "text", + "data": f"Page 2 of {seed}", + }, + { + "contentObjectId": str(uuid.uuid4()), + "contentType": "binary", + "data": "", + }, + ] + + +def test_hash_stable_across_uuid_regeneration(): + """Same content + different contentObjectIds → same hash.""" + a = _makeObjects("alpha") + b = _makeObjects("alpha") # identical data, fresh UUIDs + assert [o["contentObjectId"] for o in a] != [o["contentObjectId"] for o in b] + assert _computeIngestionHash(a) == _computeIngestionHash(b) + + +def test_hash_changes_when_data_changes(): + a = _makeObjects("alpha") + b = _makeObjects("beta") + assert _computeIngestionHash(a) != _computeIngestionHash(b) + + +def test_hash_is_order_sensitive(): + """Reordered pages produce a different hash (different document).""" + a = _makeObjects("alpha") + b = list(reversed(a)) + assert _computeIngestionHash(a) != _computeIngestionHash(b) + + +def test_hash_distinguishes_text_vs_binary_with_same_payload(): + a = [{"contentObjectId": "x", "contentType": "text", "data": "hello"}] + b = [{"contentObjectId": "x", "contentType": "binary", "data": "hello"}] + assert _computeIngestionHash(a) != _computeIngestionHash(b) + + +def test_hash_handles_empty_input(): + assert _computeIngestionHash([]) == _computeIngestionHash([]) + + +if __name__ == "__main__": + test_hash_stable_across_uuid_regeneration() + test_hash_changes_when_data_changes() + test_hash_is_order_sensitive() + 
test_hash_distinguishes_text_vs_binary_with_same_payload() + test_hash_handles_empty_input() + print("OK — all 5 ingestion-hash stability tests passed") From 6a5ff1ff7cc9d3aeaf4a014209150bb40cfc7537 Mon Sep 17 00:00:00 2001 From: Ida Date: Tue, 21 Apr 2026 13:44:21 +0200 Subject: [PATCH 05/18] feat(rag): P1 user-connection hooks + retrieval threshold fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - connection.established/revoked callbacks from OAuth routes and connection management endpoints - KnowledgeIngestionConsumer dispatches bootstrap job (established) and synchronous purge (revoked) - FileContentIndex: add connectionId + sourceKind columns - SharePoint bootstrap with @odata.nextLink pagination and eTag-based idempotency - Outlook bootstrap treats messages as virtual documents with cleanEmailBody for HTML/quote/signature stripping - fix(rag): lower buildAgentContext minScore thresholds from 0.55/0.65/0.70 to 0.35 — previous values blocked all real matches from text-embedding-3-small - 24 new unit tests covering purge, consumer dispatch, email cleaning and both bootstrap paths --- app.py | 10 + .../connectors/providerMsft/connectorMsft.py | 41 +- modules/datamodels/datamodelKnowledge.py | 10 + modules/interfaces/interfaceDbKnowledge.py | 40 ++ modules/routes/routeDataConnections.py | 38 +- modules/routes/routeSecurityClickup.py | 12 + modules/routes/routeSecurityGoogle.py | 12 + modules/routes/routeSecurityMsft.py | 12 + .../serviceKnowledge/mainServiceKnowledge.py | 112 +++- .../subConnectorIngestConsumer.py | 196 +++++++ .../subConnectorSyncOutlook.py | 551 ++++++++++++++++++ .../subConnectorSyncSharepoint.py | 425 ++++++++++++++ .../services/serviceKnowledge/subTextClean.py | 107 ++++ tests/unit/services/test_bootstrap_outlook.py | 190 ++++++ .../services/test_bootstrap_sharepoint.py | 209 +++++++ tests/unit/services/test_clean_email_body.py | 110 ++++ tests/unit/services/test_connection_purge.py | 119 ++++ 
.../test_knowledge_ingest_consumer.py | 172 ++++++ 18 files changed, 2323 insertions(+), 43 deletions(-) create mode 100644 modules/serviceCenter/services/serviceKnowledge/subConnectorIngestConsumer.py create mode 100644 modules/serviceCenter/services/serviceKnowledge/subConnectorSyncOutlook.py create mode 100644 modules/serviceCenter/services/serviceKnowledge/subConnectorSyncSharepoint.py create mode 100644 modules/serviceCenter/services/serviceKnowledge/subTextClean.py create mode 100644 tests/unit/services/test_bootstrap_outlook.py create mode 100644 tests/unit/services/test_bootstrap_sharepoint.py create mode 100644 tests/unit/services/test_clean_email_body.py create mode 100644 tests/unit/services/test_connection_purge.py create mode 100644 tests/unit/services/test_knowledge_ingest_consumer.py diff --git a/app.py b/app.py index 41271739..52e70982 100644 --- a/app.py +++ b/app.py @@ -405,6 +405,16 @@ async def lifespan(app: FastAPI): except Exception as e: logger.warning(f"BackgroundJob recovery failed (non-critical): {e}") + # Subscribe knowledge ingestion to connection lifecycle events so OAuth + # connect/disconnect reliably trigger bootstrap/purge. 
+ try: + from modules.serviceCenter.services.serviceKnowledge.subConnectorIngestConsumer import ( + registerKnowledgeIngestionConsumer, + ) + registerKnowledgeIngestionConsumer() + except Exception as e: + logger.warning(f"KnowledgeIngestionConsumer registration failed (non-critical): {e}") + yield # --- Stop Managers --- diff --git a/modules/connectors/providerMsft/connectorMsft.py b/modules/connectors/providerMsft/connectorMsft.py index bf290eca..49f6fdaa 100644 --- a/modules/connectors/providerMsft/connectorMsft.py +++ b/modules/connectors/providerMsft/connectorMsft.py @@ -126,6 +126,11 @@ def _stripGraphBase(url: str) -> str: def _graphItemToExternalEntry(item: Dict[str, Any], basePath: str = "") -> ExternalEntry: isFolder = "folder" in item + # Graph exposes the driveItem content hash as ``eTag`` (quoted) or + # ``cTag``; we normalise to a "revision" string so callers can use it as a + # stable ``contentVersion`` for idempotent ingestion without re-downloading + # file bytes. + revision = item.get("eTag") or item.get("cTag") return ExternalEntry( name=item.get("name", ""), path=f"{basePath}/{item.get('name', '')}" if basePath else item.get("name", ""), @@ -137,6 +142,9 @@ def _graphItemToExternalEntry(item: Dict[str, Any], basePath: str = "") -> Exter "id": item.get("id"), "webUrl": item.get("webUrl"), "childCount": item.get("folder", {}).get("childCount") if isFolder else None, + "revision": revision, + "lastModifiedDateTime": item.get("lastModifiedDateTime"), + "parentReference": item.get("parentReference", {}), }, ) @@ -167,21 +175,36 @@ class SharepointAdapter(_GraphApiMixin, ServiceAdapter): return await self._discoverSites() if not folderPath or folderPath == "/": - endpoint = f"sites/{siteId}/drive/root/children" + endpoint: Optional[str] = f"sites/{siteId}/drive/root/children?$top=200" else: cleanPath = folderPath.lstrip("/") - endpoint = f"sites/{siteId}/drive/root:/{cleanPath}:/children" + endpoint = 
f"sites/{siteId}/drive/root:/{cleanPath}:/children?$top=200" - result = await self._graphGet(endpoint) - if "error" in result: - logger.warning(f"SharePoint browse failed: {result['error']}") - return [] + # Follow @odata.nextLink until a hard cap is reached so large libraries + # are fully enumerated (required for bootstrap). Per-page size uses + # Graph's max supported value to minimise round-trips. + effectiveLimit = int(limit) if limit is not None else None + items: List[Dict[str, Any]] = [] + hardCap = 5000 + while endpoint and len(items) < hardCap: + result = await self._graphGet(endpoint) + if "error" in result: + logger.warning(f"SharePoint browse failed: {result['error']}") + break + for raw in result.get("value", []) or []: + items.append(raw) + if effectiveLimit is not None and len(items) >= effectiveLimit: + break + if effectiveLimit is not None and len(items) >= effectiveLimit: + break + nextLink = result.get("@odata.nextLink") + endpoint = _stripGraphBase(nextLink) if nextLink else None - entries = [_graphItemToExternalEntry(item, path) for item in result.get("value", [])] + entries = [_graphItemToExternalEntry(item, path) for item in items] if filter: entries = [e for e in entries if _matchFilter(e, filter)] - if limit is not None: - entries = entries[: max(1, int(limit))] + if effectiveLimit is not None: + entries = entries[: max(1, effectiveLimit)] return entries async def _discoverSites(self) -> List[ExternalEntry]: diff --git a/modules/datamodels/datamodelKnowledge.py b/modules/datamodels/datamodelKnowledge.py index 163328a4..d0af2216 100644 --- a/modules/datamodels/datamodelKnowledge.py +++ b/modules/datamodels/datamodelKnowledge.py @@ -90,6 +90,16 @@ class FileContentIndex(PowerOnModel): description="Data visibility scope: personal, featureInstance, mandate, global", json_schema_extra={"label": "Sichtbarkeit"}, ) + sourceKind: str = Field( + default="file", + description="Origin of the indexed content: file, sharepoint_item, outlook_message, 
outlook_attachment, ...", + json_schema_extra={"label": "Quellenart"}, + ) + connectionId: Optional[str] = Field( + default=None, + description="UserConnection ID if this index entry originates from an external connector", + json_schema_extra={"label": "Connection-ID"}, + ) neutralizationStatus: Optional[str] = Field( default=None, description="Neutralization status: completed, failed, skipped, None = not required", diff --git a/modules/interfaces/interfaceDbKnowledge.py b/modules/interfaces/interfaceDbKnowledge.py index f819615e..e5a14147 100644 --- a/modules/interfaces/interfaceDbKnowledge.py +++ b/modules/interfaces/interfaceDbKnowledge.py @@ -93,6 +93,46 @@ class KnowledgeObjects: self.db.recordModify(FileContentIndex, fileId, {"status": status}) return True + def deleteFileContentIndexByConnectionId(self, connectionId: str) -> Dict[str, int]: + """Delete all FileContentIndex rows (and their ContentChunks) for a connection. + + Used when a UserConnection is revoked / disconnected so the knowledge corpus + no longer references data the user no longer grants access to. Returns a dict + with counts to support observability logs. 
+ """ + if not connectionId: + return {"indexRows": 0, "chunks": 0} + + rows = self.db.getRecordset( + FileContentIndex, recordFilter={"connectionId": connectionId} + ) + mandateIds: set = set() + chunkCount = 0 + indexCount = 0 + for row in rows: + fid = row.get("id") if isinstance(row, dict) else getattr(row, "id", None) + mid = row.get("mandateId") if isinstance(row, dict) else getattr(row, "mandateId", "") + if not fid: + continue + chunks = self.db.getRecordset(ContentChunk, recordFilter={"fileId": fid}) + for chunk in chunks: + if self.db.recordDelete(ContentChunk, chunk["id"]): + chunkCount += 1 + if self.db.recordDelete(FileContentIndex, fid): + indexCount += 1 + if mid: + mandateIds.add(str(mid)) + + for mid in mandateIds: + try: + from modules.interfaces.interfaceDbBilling import _getRootInterface + + _getRootInterface().reconcileMandateStorageBilling(mid) + except Exception as ex: + logger.warning("reconcileMandateStorageBilling after connection purge failed: %s", ex) + + return {"indexRows": indexCount, "chunks": chunkCount} + def deleteFileContentIndex(self, fileId: str) -> bool: """Delete a FileContentIndex and all associated ContentChunks.""" existing = self.getFileContentIndex(fileId) diff --git a/modules/routes/routeDataConnections.py b/modules/routes/routeDataConnections.py index 8e7a730d..b8ccf4bf 100644 --- a/modules/routes/routeDataConnections.py +++ b/modules/routes/routeDataConnections.py @@ -586,8 +586,25 @@ def disconnect_service( detail=routeApiMsg("Connection not found") ) - # Update connection status - connection.status = ConnectionStatus.INACTIVE + # Fire revoked event BEFORE DB status change so knowledge purge and + # status mutation form one logical step; subscribers see the + # connection as it was. INACTIVE does not exist on the enum — REVOKED + # is the correct terminal-but-retained state (deleted rows are + # handled in DELETE /{id}). 
+ try: + from modules.shared.callbackRegistry import callbackRegistry + + callbackRegistry.trigger( + "connection.revoked", + connectionId=connectionId, + authority=str(getattr(connection.authority, "value", connection.authority) or ""), + userId=str(currentUser.id), + reason="disconnected", + ) + except Exception as _cbErr: + logger.warning("connection.revoked callback failed for %s: %s", connectionId, _cbErr) + + connection.status = ConnectionStatus.REVOKED connection.lastChecked = getUtcTimestamp() # Update connection record - models now handle timestamp serialization automatically @@ -636,6 +653,23 @@ def delete_connection( detail=routeApiMsg("Connection not found") ) + # Fire revoked event BEFORE the row disappears so consumers still + # have authority/connection context for observability; purge itself + # targets FileContentIndex rows by connectionId which are unaffected + # by the UserConnection delete. + try: + from modules.shared.callbackRegistry import callbackRegistry + + callbackRegistry.trigger( + "connection.revoked", + connectionId=connectionId, + authority=str(getattr(connection.authority, "value", connection.authority) or ""), + userId=str(currentUser.id), + reason="deleted", + ) + except Exception as _cbErr: + logger.warning("connection.revoked callback failed for %s: %s", connectionId, _cbErr) + # Remove the connection - only need connectionId since permissions are verified interface.removeUserConnection(connectionId) diff --git a/modules/routes/routeSecurityClickup.py b/modules/routes/routeSecurityClickup.py index ca787391..698e3ca1 100644 --- a/modules/routes/routeSecurityClickup.py +++ b/modules/routes/routeSecurityClickup.py @@ -241,6 +241,18 @@ async def auth_connect_callback( ) interface.saveConnectionToken(token) + try: + from modules.shared.callbackRegistry import callbackRegistry + + callbackRegistry.trigger( + "connection.established", + connectionId=connection.id, + authority=str(getattr(connection.authority, "value", 
connection.authority) or "clickup"), + userId=str(user.id), + ) + except Exception as _cbErr: + logger.warning("connection.established callback failed for %s: %s", connection.id, _cbErr) + return HTMLResponse( content=f""" diff --git a/modules/routes/routeSecurityGoogle.py b/modules/routes/routeSecurityGoogle.py index 523523ee..2b6a70f5 100644 --- a/modules/routes/routeSecurityGoogle.py +++ b/modules/routes/routeSecurityGoogle.py @@ -479,6 +479,18 @@ async def auth_connect_callback( ) interface.saveConnectionToken(token) + try: + from modules.shared.callbackRegistry import callbackRegistry + + callbackRegistry.trigger( + "connection.established", + connectionId=connection.id, + authority=str(getattr(connection.authority, "value", connection.authority) or "google"), + userId=str(user.id), + ) + except Exception as _cbErr: + logger.warning("connection.established callback failed for %s: %s", connection.id, _cbErr) + return HTMLResponse( content=f""" diff --git a/modules/routes/routeSecurityMsft.py b/modules/routes/routeSecurityMsft.py index cc4cb87b..e087a44c 100644 --- a/modules/routes/routeSecurityMsft.py +++ b/modules/routes/routeSecurityMsft.py @@ -420,6 +420,18 @@ async def auth_connect_callback( ) interface.saveConnectionToken(token) + try: + from modules.shared.callbackRegistry import callbackRegistry + + callbackRegistry.trigger( + "connection.established", + connectionId=connection.id, + authority=str(getattr(connection.authority, "value", connection.authority) or "msft"), + userId=str(user.id), + ) + except Exception as _cbErr: + logger.warning("connection.established callback failed for %s: %s", connection.id, _cbErr) + return HTMLResponse( content=f""" diff --git a/modules/serviceCenter/services/serviceKnowledge/mainServiceKnowledge.py b/modules/serviceCenter/services/serviceKnowledge/mainServiceKnowledge.py index 57490eba..0267e2fd 100644 --- a/modules/serviceCenter/services/serviceKnowledge/mainServiceKnowledge.py +++ 
b/modules/serviceCenter/services/serviceKnowledge/mainServiceKnowledge.py @@ -203,6 +203,8 @@ class KnowledgeService: contentObjects=job.contentObjects or [], structure=structure, containerPath=job.containerPath, + sourceKind=job.sourceKind, + connectionId=(job.provenance or {}).get("connectionId"), ) except Exception as exc: logger.error( @@ -254,6 +256,31 @@ class KnowledgeService: index=index, ) + def purgeConnection(self, connectionId: str) -> Dict[str, int]: + """Delete every FileContentIndex + ContentChunk linked to a UserConnection. + + Called on `connection.revoked` events so the knowledge corpus never + holds chunks the user has withdrawn access to. Returns deletion counts + for observability. + """ + if not connectionId: + return {"indexRows": 0, "chunks": 0} + startMs = time.time() + result = self._knowledgeDb.deleteFileContentIndexByConnectionId(connectionId) + logger.info( + "ingestion.connection.purged connectionId=%s rows=%d chunks=%d durationMs=%d", + connectionId, result["indexRows"], result["chunks"], + int((time.time() - startMs) * 1000), + extra={ + "event": "ingestion.connection.purged", + "connectionId": connectionId, + "indexRows": result["indexRows"], + "chunks": result["chunks"], + "durationMs": int((time.time() - startMs) * 1000), + }, + ) + return result + def getIngestionStatus( self, handleOrJobId: Union[IngestionHandle, str] ) -> Dict[str, Any]: @@ -362,6 +389,8 @@ class KnowledgeService: contentObjects: List[Dict[str, Any]] = None, structure: Dict[str, Any] = None, containerPath: str = None, + sourceKind: str = "file", + connectionId: Optional[str] = None, ) -> FileContentIndex: """Index a file's content objects and create embeddings for text chunks. @@ -384,39 +413,41 @@ class KnowledgeService: """ contentObjects = contentObjects or [] - # 1. 
Resolve scope fields from FileItem (Single Source of Truth) - # FileItem lives in poweron_management; its scope/mandateId/featureInstanceId - # are authoritative and must be mirrored onto the FileContentIndex. + # 1. Resolve scope fields from FileItem (Single Source of Truth) for + # uploaded files. Connector-sourced ingestion (sharepoint_item, + # outlook_message, ...) has no FileItem row — trust the caller's + # scope + ids directly. resolvedScope = "personal" resolvedMandateId = mandateId resolvedFeatureInstanceId = featureInstanceId resolvedUserId = userId _shouldNeutralize = False - try: - from modules.datamodels.datamodelFiles import FileItem as _FileItem - _dbComponent = getattr(self._context, "interfaceDbComponent", None) - _fileRecords = _dbComponent.getRecordset(_FileItem, recordFilter={"id": fileId}) if _dbComponent else [] - if not _fileRecords: - from modules.interfaces.interfaceDbManagement import ComponentObjects - _row = ComponentObjects().db._loadRecord(_FileItem, fileId) - if _row: - _fileRecords = [_row] - if _fileRecords: - _fileRecord = _fileRecords[0] - _get = (lambda k, d=None: _fileRecord.get(k, d)) if isinstance(_fileRecord, dict) else (lambda k, d=None: getattr(_fileRecord, k, d)) - _shouldNeutralize = bool(_get("neutralize", False)) - _fileScope = _get("scope") - if _fileScope: - resolvedScope = _fileScope - if not resolvedMandateId: - resolvedMandateId = str(_get("mandateId", "") or "") - if not resolvedFeatureInstanceId: - resolvedFeatureInstanceId = str(_get("featureInstanceId", "") or "") - _fileCreatedBy = _get("sysCreatedBy") - if _fileCreatedBy: - resolvedUserId = str(_fileCreatedBy) - except Exception: - pass + if sourceKind == "file": + try: + from modules.datamodels.datamodelFiles import FileItem as _FileItem + _dbComponent = getattr(self._context, "interfaceDbComponent", None) + _fileRecords = _dbComponent.getRecordset(_FileItem, recordFilter={"id": fileId}) if _dbComponent else [] + if not _fileRecords: + from 
modules.interfaces.interfaceDbManagement import ComponentObjects + _row = ComponentObjects().db._loadRecord(_FileItem, fileId) + if _row: + _fileRecords = [_row] + if _fileRecords: + _fileRecord = _fileRecords[0] + _get = (lambda k, d=None: _fileRecord.get(k, d)) if isinstance(_fileRecord, dict) else (lambda k, d=None: getattr(_fileRecord, k, d)) + _shouldNeutralize = bool(_get("neutralize", False)) + _fileScope = _get("scope") + if _fileScope: + resolvedScope = _fileScope + if not resolvedMandateId: + resolvedMandateId = str(_get("mandateId", "") or "") + if not resolvedFeatureInstanceId: + resolvedFeatureInstanceId = str(_get("featureInstanceId", "") or "") + _fileCreatedBy = _get("sysCreatedBy") + if _fileCreatedBy: + resolvedUserId = str(_fileCreatedBy) + except Exception: + pass # 2. Create FileContentIndex with correct scope from the start index = FileContentIndex( @@ -425,6 +456,8 @@ class KnowledgeService: featureInstanceId=resolvedFeatureInstanceId, mandateId=resolvedMandateId, scope=resolvedScope, + sourceKind=sourceKind, + connectionId=connectionId, fileName=fileName, mimeType=mimeType, containerPath=containerPath, @@ -601,7 +634,12 @@ class KnowledgeService: Formatted context string for injection into the agent's system prompt. 
""" queryVector = await self._embedSingle(currentPrompt) + logger.debug( + "buildAgentContext.start userId=%s featureInstanceId=%s mandateId=%s isSysAdmin=%s prompt=%r", + userId, featureInstanceId, mandateId, isSysAdmin, (currentPrompt or "")[:120], + ) if not queryVector: + logger.debug("buildAgentContext.abort reason=no_query_vector") return "" builder = _ContextBuilder(budget=contextBudget) @@ -628,9 +666,14 @@ class KnowledgeService: featureInstanceId=featureInstanceId, mandateId=mandateId, limit=15, - minScore=0.65, + minScore=0.35, isSysAdmin=isSysAdmin, ) + logger.debug( + "buildAgentContext.layer1 instanceChunks=%d top_scores=%s", + len(instanceChunks), + [round(float(c.get("_score", 0) or 0), 3) for c in (instanceChunks or [])[:3]], + ) if instanceChunks: builder.add(priority=1, label="Relevant Documents", items=instanceChunks, maxChars=4000) @@ -639,7 +682,7 @@ class KnowledgeService: queryVector=queryVector, workflowId=workflowId, limit=10, - minScore=0.55, + minScore=0.35, ) if roundMemories: memItems = [] @@ -677,7 +720,7 @@ class KnowledgeService: scope="mandate", mandateId=mandateId, limit=10, - minScore=0.7, + minScore=0.35, isSysAdmin=isSysAdmin, ) if mandateChunks: @@ -693,7 +736,12 @@ class KnowledgeService: maxChars=500, ) - return builder.build() + _result = builder.build() + logger.debug( + "buildAgentContext.done totalChars=%d userId=%s", + len(_result), userId, + ) + return _result # ========================================================================= # Workflow Memory diff --git a/modules/serviceCenter/services/serviceKnowledge/subConnectorIngestConsumer.py b/modules/serviceCenter/services/serviceKnowledge/subConnectorIngestConsumer.py new file mode 100644 index 00000000..51acb71c --- /dev/null +++ b/modules/serviceCenter/services/serviceKnowledge/subConnectorIngestConsumer.py @@ -0,0 +1,196 @@ +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +"""Connection-lifecycle consumer bridging OAuth events to ingestion jobs. 
+ +Subscribes to `connection.established` and `connection.revoked` callbacks +emitted by the OAuth callbacks / connection management routes and dispatches: + +- `connection.established` -> enqueue a `connection.bootstrap` BackgroundJob + that walks the connector and ingests all reachable items via + KnowledgeService.requestIngestion (file-like or virtual documents). +- `connection.revoked` -> run `KnowledgeService.purgeConnection` synchronously + so the knowledge corpus releases the data before the UI confirms the revoke. + +The consumer is registered once at process boot (see `app.py` lifespan). +It intentionally does NOT hold a per-user service context; each callback +creates whatever context it needs from the UserConnection row itself. +""" + +from __future__ import annotations + +import asyncio +import logging +from typing import Any, Dict, Optional + +from modules.interfaces.interfaceDbKnowledge import getInterface as getKnowledgeInterface +from modules.shared.callbackRegistry import callbackRegistry +from modules.serviceCenter.services.serviceBackgroundJobs import ( + registerJobHandler, + startJob, +) + +logger = logging.getLogger(__name__) + +BOOTSTRAP_JOB_TYPE = "connection.bootstrap" + +_registered = False + + +def _onConnectionEstablished( + *, + connectionId: str, + authority: str, + userId: Optional[str] = None, + **kwargs: Any, +) -> None: + """Fire-and-forget bootstrap enqueue for a freshly connected UserConnection.""" + if not connectionId: + logger.warning("connection.established without connectionId; ignoring") + return + payload: Dict[str, Any] = { + "connectionId": connectionId, + "authority": (authority or "").lower(), + "userId": userId, + } + logger.info( + "ingestion.connection.bootstrap.queued connectionId=%s authority=%s", + connectionId, authority, + extra={ + "event": "ingestion.connection.bootstrap.queued", + "connectionId": connectionId, + "authority": authority, + }, + ) + + async def _enqueue() -> None: + try: + await startJob( + 
BOOTSTRAP_JOB_TYPE, + payload, + triggeredBy=userId, + ) + except Exception as exc: + logger.error( + "ingestion.connection.bootstrap.enqueue_failed connectionId=%s error=%s", + connectionId, exc, exc_info=True, + ) + + try: + loop = asyncio.get_event_loop() + if loop.is_running(): + loop.create_task(_enqueue()) + else: + loop.run_until_complete(_enqueue()) + except RuntimeError: + asyncio.run(_enqueue()) + + +def _onConnectionRevoked( + *, + connectionId: str, + authority: Optional[str] = None, + userId: Optional[str] = None, + reason: Optional[str] = None, + **kwargs: Any, +) -> None: + """Run the knowledge purge synchronously so UI feedback is authoritative.""" + if not connectionId: + logger.warning("connection.revoked without connectionId; ignoring") + return + try: + # Purge lives on the DB interface to avoid ServiceCenter/user-context + # plumbing here; the service method is a thin wrapper on top of this. + result = getKnowledgeInterface(None).deleteFileContentIndexByConnectionId(connectionId) + except Exception as exc: + logger.error( + "ingestion.connection.purged.failed connectionId=%s error=%s", + connectionId, exc, exc_info=True, + ) + return + logger.info( + "ingestion.connection.purged connectionId=%s authority=%s reason=%s rows=%d chunks=%d", + connectionId, authority, reason, + result.get("indexRows", 0), result.get("chunks", 0), + extra={ + "event": "ingestion.connection.purged", + "connectionId": connectionId, + "authority": authority, + "reason": reason, + "indexRows": result.get("indexRows", 0), + "chunks": result.get("chunks", 0), + }, + ) + + +async def _bootstrapJobHandler( + job: Dict[str, Any], + progressCb, +) -> Dict[str, Any]: + """Dispatch bootstrap by authority. 
Each authority runs its own sub-bootstraps.""" + payload = job.get("payload") or {} + connectionId = payload.get("connectionId") + authority = (payload.get("authority") or "").lower() + if not connectionId: + raise ValueError("connection.bootstrap requires payload.connectionId") + + progressCb(5, f"resolving {authority} connection") + + if authority == "msft": + from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncSharepoint import ( + bootstrapSharepoint, + ) + from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncOutlook import ( + bootstrapOutlook, + ) + + progressCb(10, "sharepoint + outlook") + spResult, olResult = await asyncio.gather( + bootstrapSharepoint(connectionId=connectionId, progressCb=progressCb), + bootstrapOutlook(connectionId=connectionId, progressCb=progressCb), + return_exceptions=True, + ) + + def _normalize(res: Any, label: str) -> Dict[str, Any]: + if isinstance(res, Exception): + logger.error( + "ingestion.connection.bootstrap.failed part=%s connectionId=%s error=%s", + label, connectionId, res, exc_info=res, + ) + return {"error": str(res)} + return res or {} + + return { + "connectionId": connectionId, + "authority": authority, + "sharepoint": _normalize(spResult, "sharepoint"), + "outlook": _normalize(olResult, "outlook"), + } + + logger.info( + "ingestion.connection.bootstrap.skipped reason=P1_pilot_scope authority=%s connectionId=%s", + authority, connectionId, + extra={ + "event": "ingestion.connection.bootstrap.skipped", + "authority": authority, + "connectionId": connectionId, + "reason": "P1_pilot_scope", + }, + ) + return { + "connectionId": connectionId, + "authority": authority, + "skipped": True, + "reason": "P1_pilot_scope", + } + + +def registerKnowledgeIngestionConsumer() -> None: + """Register callback subscribers + background job handler. 
Idempotent.""" + global _registered + if _registered: + return + callbackRegistry.register("connection.established", _onConnectionEstablished) + callbackRegistry.register("connection.revoked", _onConnectionRevoked) + registerJobHandler(BOOTSTRAP_JOB_TYPE, _bootstrapJobHandler) + _registered = True + logger.info("KnowledgeIngestionConsumer registered (established/revoked + %s handler)", BOOTSTRAP_JOB_TYPE) diff --git a/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncOutlook.py b/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncOutlook.py new file mode 100644 index 00000000..b3f425ac --- /dev/null +++ b/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncOutlook.py @@ -0,0 +1,551 @@ +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +"""Outlook bootstrap for the unified knowledge ingestion lane. + +Unlike SharePoint, Outlook messages are "virtual documents" — we never persist +file bytes in the store. Each message becomes a `sourceKind="outlook_message"` +IngestionJob whose `contentObjects` carry the header, snippet and cleaned body +so retrieval can show a compact answer without fetching Graph again. + +Attachments are optional (`includeAttachments` limit flag) and enqueued as +child jobs with `sourceKind="outlook_attachment"` + `provenance.parentId`. 
+""" + +from __future__ import annotations + +import asyncio +import hashlib +import logging +import time +from dataclasses import dataclass, field +from typing import Any, Callable, Dict, List, Optional + +from modules.serviceCenter.services.serviceKnowledge.subTextClean import cleanEmailBody + +logger = logging.getLogger(__name__) + +MAX_MESSAGES_DEFAULT = 500 +MAX_FOLDERS_DEFAULT = 5 +MAX_BODY_CHARS_DEFAULT = 8000 +MAX_ATTACHMENT_BYTES_DEFAULT = 10 * 1024 * 1024 +WELL_KNOWN_FOLDERS = ("inbox", "sentitems") + + +@dataclass +class OutlookBootstrapLimits: + maxMessages: int = MAX_MESSAGES_DEFAULT + maxFolders: int = MAX_FOLDERS_DEFAULT + maxBodyChars: int = MAX_BODY_CHARS_DEFAULT + includeAttachments: bool = False + maxAttachmentBytes: int = MAX_ATTACHMENT_BYTES_DEFAULT + # Only fetch messages newer than N days. None disables filter. + maxAgeDays: Optional[int] = 90 + + +@dataclass +class OutlookBootstrapResult: + connectionId: str + indexed: int = 0 + skippedDuplicate: int = 0 + skippedPolicy: int = 0 + failed: int = 0 + attachmentsIndexed: int = 0 + errors: List[str] = field(default_factory=list) + + +def _syntheticMessageId(connectionId: str, messageId: str) -> str: + token = hashlib.sha256(f"{connectionId}:{messageId}".encode("utf-8")).hexdigest()[:16] + return f"om:{connectionId[:8]}:{token}" + + +def _syntheticAttachmentId(connectionId: str, messageId: str, attachmentId: str) -> str: + token = hashlib.sha256( + f"{connectionId}:{messageId}:{attachmentId}".encode("utf-8") + ).hexdigest()[:16] + return f"oa:{connectionId[:8]}:{token}" + + +def _extractRecipient(recipient: Dict[str, Any]) -> str: + email = (recipient or {}).get("emailAddress") or {} + name = email.get("name") or "" + addr = email.get("address") or "" + if name and addr: + return f"{name} <{addr}>" + return addr or name + + +def _joinRecipients(recipients: List[Dict[str, Any]]) -> str: + return ", ".join(filter(None, [_extractRecipient(r) for r in recipients or []])) + + +def 
_buildContentObjects(message: Dict[str, Any], maxBodyChars: int) -> List[Dict[str, Any]]: + subject = message.get("subject") or "(no subject)" + fromAddr = _extractRecipient(message.get("from") or {}) + toAddr = _joinRecipients(message.get("toRecipients") or []) + ccAddr = _joinRecipients(message.get("ccRecipients") or []) + received = message.get("receivedDateTime") or "" + snippet = message.get("bodyPreview") or "" + + body = message.get("body") or {} + bodyContent = body.get("content") or "" + bodyType = (body.get("contentType") or "").lower() + if bodyType == "html" or (bodyContent and "<" in bodyContent and ">" in bodyContent): + cleanedBody = cleanEmailBody(bodyContent, maxChars=maxBodyChars) + else: + cleanedBody = cleanEmailBody(bodyContent, maxChars=maxBodyChars) if bodyContent else "" + + parts: List[Dict[str, Any]] = [] + header = ( + f"Subject: {subject}\n" + f"From: {fromAddr}\n" + f"To: {toAddr}\n" + + (f"Cc: {ccAddr}\n" if ccAddr else "") + + f"Date: {received}" + ) + parts.append({ + "contentObjectId": "header", + "contentType": "text", + "data": header, + "contextRef": {"part": "header"}, + }) + if snippet: + parts.append({ + "contentObjectId": "snippet", + "contentType": "text", + "data": snippet, + "contextRef": {"part": "snippet"}, + }) + if cleanedBody: + parts.append({ + "contentObjectId": "body", + "contentType": "text", + "data": cleanedBody, + "contextRef": {"part": "body"}, + }) + return parts + + +async def bootstrapOutlook( + connectionId: str, + *, + progressCb: Optional[Callable[[int, Optional[str]], None]] = None, + adapter: Any = None, + connection: Any = None, + knowledgeService: Any = None, + limits: Optional[OutlookBootstrapLimits] = None, +) -> Dict[str, Any]: + """Enumerate Outlook folders (inbox + sent by default) and ingest messages.""" + limits = limits or OutlookBootstrapLimits() + startMs = time.time() + result = OutlookBootstrapResult(connectionId=connectionId) + + logger.info( + "ingestion.connection.bootstrap.started 
part=outlook connectionId=%s", + connectionId, + extra={ + "event": "ingestion.connection.bootstrap.started", + "part": "outlook", + "connectionId": connectionId, + }, + ) + + if adapter is None or knowledgeService is None or connection is None: + adapter, connection, knowledgeService = await _resolveDependencies(connectionId) + + mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else "" + userId = str(getattr(connection, "userId", "") or "") if connection is not None else "" + + folderIds = await _selectFolderIds(adapter, limits) + for folderId in folderIds: + if result.indexed + result.skippedDuplicate >= limits.maxMessages: + break + try: + await _ingestFolder( + adapter=adapter, + knowledgeService=knowledgeService, + connectionId=connectionId, + mandateId=mandateId, + userId=userId, + folderId=folderId, + limits=limits, + result=result, + progressCb=progressCb, + ) + except Exception as exc: + logger.error("outlook ingestion folder %s failed: %s", folderId, exc, exc_info=True) + result.errors.append(f"folder({folderId}): {exc}") + + return _finalizeResult(connectionId, result, startMs) + + +async def _resolveDependencies(connectionId: str): + from modules.interfaces.interfaceDbApp import getRootInterface + from modules.auth import TokenManager + from modules.connectors.providerMsft.connectorMsft import MsftConnector + from modules.serviceCenter import getService + from modules.serviceCenter.context import ServiceCenterContext + from modules.security.rootAccess import getRootUser + + rootInterface = getRootInterface() + connection = rootInterface.getUserConnectionById(connectionId) + if connection is None: + raise ValueError(f"UserConnection not found: {connectionId}") + + token = TokenManager().getFreshToken(connectionId) + if not token or not token.tokenAccess: + raise ValueError(f"No valid token for connection {connectionId}") + + provider = MsftConnector(connection, token.tokenAccess) + adapter = 
provider.getServiceAdapter("outlook") + + rootUser = getRootUser() + ctx = ServiceCenterContext( + user=rootUser, + mandate_id=str(getattr(connection, "mandateId", "") or ""), + ) + knowledgeService = getService("knowledge", ctx) + return adapter, connection, knowledgeService + + +async def _selectFolderIds(adapter, limits: OutlookBootstrapLimits) -> List[str]: + """Prefer well-known folders (inbox, sentitems); fall back to browse().""" + folderIds: List[str] = [] + for wellKnown in WELL_KNOWN_FOLDERS: + if len(folderIds) >= limits.maxFolders: + break + try: + row = await adapter._graphGet(f"me/mailFolders/{wellKnown}") + except Exception: + row = None + if isinstance(row, dict) and "error" not in row and row.get("id"): + folderIds.append(row["id"]) + + if len(folderIds) < limits.maxFolders: + try: + entries = await adapter.browse("/") + except Exception: + entries = [] + for entry in entries: + metadata = getattr(entry, "metadata", {}) or {} + fid = metadata.get("id") + if fid and fid not in folderIds: + folderIds.append(fid) + if len(folderIds) >= limits.maxFolders: + break + return folderIds + + +async def _ingestFolder( + *, + adapter, + knowledgeService, + connectionId: str, + mandateId: str, + userId: str, + folderId: str, + limits: OutlookBootstrapLimits, + result: OutlookBootstrapResult, + progressCb: Optional[Callable[[int, Optional[str]], None]], +) -> None: + remaining = limits.maxMessages - (result.indexed + result.skippedDuplicate) + if remaining <= 0: + return + + pageSize = min(100, remaining) + select = ( + "id,subject,from,toRecipients,ccRecipients,receivedDateTime," + "bodyPreview,body,internetMessageId,hasAttachments,changeKey" + ) + endpoint: Optional[str] = ( + f"me/mailFolders/{folderId}/messages" + f"?$top={pageSize}&$orderby=receivedDateTime desc&$select={select}" + ) + + # Keep header-based age filter in Graph itself to avoid shipping ancient + # messages we'd discard client-side. 
+ if limits.maxAgeDays: + from datetime import datetime, timezone, timedelta + + cutoff = datetime.now(timezone.utc) - timedelta(days=limits.maxAgeDays) + cutoffIso = cutoff.strftime("%Y-%m-%dT%H:%M:%SZ") + endpoint = f"{endpoint}&$filter=receivedDateTime ge {cutoffIso}" + + while endpoint and (result.indexed + result.skippedDuplicate) < limits.maxMessages: + try: + page = await adapter._graphGet(endpoint) + except Exception as exc: + logger.warning("outlook graph page failed for folder %s: %s", folderId, exc) + result.errors.append(f"graph({folderId}): {exc}") + return + if not isinstance(page, dict) or "error" in page: + err = (page or {}).get("error") if isinstance(page, dict) else "unknown" + logger.warning("outlook graph page error for folder %s: %s", folderId, err) + result.errors.append(f"graph({folderId}): {err}") + return + + for message in page.get("value", []) or []: + if result.indexed + result.skippedDuplicate >= limits.maxMessages: + break + await _ingestMessage( + adapter=adapter, + knowledgeService=knowledgeService, + connectionId=connectionId, + mandateId=mandateId, + userId=userId, + message=message, + limits=limits, + result=result, + progressCb=progressCb, + ) + + nextLink = page.get("@odata.nextLink") + if not nextLink: + break + # Strip Graph base so adapter._graphGet accepts the relative path. 
+ from modules.connectors.providerMsft.connectorMsft import _stripGraphBase + + endpoint = _stripGraphBase(nextLink) + + +async def _ingestMessage( + *, + adapter, + knowledgeService, + connectionId: str, + mandateId: str, + userId: str, + message: Dict[str, Any], + limits: OutlookBootstrapLimits, + result: OutlookBootstrapResult, + progressCb: Optional[Callable[[int, Optional[str]], None]], +) -> None: + from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob + + messageId = message.get("id") + if not messageId: + result.skippedPolicy += 1 + return + revision = message.get("changeKey") or message.get("internetMessageId") + subject = message.get("subject") or "(no subject)" + syntheticId = _syntheticMessageId(connectionId, messageId) + fileName = f"{subject[:80].strip()}.eml" if subject else f"{messageId}.eml" + + contentObjects = _buildContentObjects(message, limits.maxBodyChars) + # Always at least the header is emitted, so `contentObjects` is non-empty. 
+ try: + handle = await knowledgeService.requestIngestion( + IngestionJob( + sourceKind="outlook_message", + sourceId=syntheticId, + fileName=fileName, + mimeType="message/rfc822", + userId=userId, + mandateId=mandateId, + contentObjects=contentObjects, + contentVersion=revision, + provenance={ + "connectionId": connectionId, + "authority": "msft", + "service": "outlook", + "externalItemId": messageId, + "internetMessageId": message.get("internetMessageId"), + "tier": "body", + }, + ) + ) + except Exception as exc: + logger.error("outlook ingestion %s failed: %s", messageId, exc, exc_info=True) + result.failed += 1 + result.errors.append(f"ingest({messageId}): {exc}") + return + + if handle.status == "duplicate": + result.skippedDuplicate += 1 + elif handle.status == "indexed": + result.indexed += 1 + else: + result.failed += 1 + + if limits.includeAttachments and message.get("hasAttachments"): + try: + await _ingestAttachments( + adapter=adapter, + knowledgeService=knowledgeService, + connectionId=connectionId, + mandateId=mandateId, + userId=userId, + messageId=messageId, + parentSyntheticId=syntheticId, + limits=limits, + result=result, + ) + except Exception as exc: + logger.warning("outlook attachments %s failed: %s", messageId, exc) + result.errors.append(f"attachments({messageId}): {exc}") + + if progressCb is not None and (result.indexed + result.skippedDuplicate) % 50 == 0: + processed = result.indexed + result.skippedDuplicate + try: + progressCb( + min(90, 10 + int(80 * processed / max(1, limits.maxMessages))), + f"outlook processed={processed}", + ) + except Exception: + pass + logger.info( + "ingestion.connection.bootstrap.progress part=outlook processed=%d skippedDup=%d failed=%d", + processed, result.skippedDuplicate, result.failed, + extra={ + "event": "ingestion.connection.bootstrap.progress", + "part": "outlook", + "connectionId": connectionId, + "processed": processed, + "skippedDup": result.skippedDuplicate, + "failed": result.failed, + }, + ) + 
+ await asyncio.sleep(0) + + +async def _ingestAttachments( + *, + adapter, + knowledgeService, + connectionId: str, + mandateId: str, + userId: str, + messageId: str, + parentSyntheticId: str, + limits: OutlookBootstrapLimits, + result: OutlookBootstrapResult, +) -> None: + """Child ingestion jobs for file attachments (skip inline & oversized).""" + from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob + from modules.datamodels.datamodelExtraction import ExtractionOptions + from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction + from modules.serviceCenter.services.serviceExtraction.subRegistry import ( + ExtractorRegistry, ChunkerRegistry, + ) + import base64 + + page = await adapter._graphGet(f"me/messages/{messageId}/attachments") + if not isinstance(page, dict) or "error" in page: + return + + extractorRegistry = ExtractorRegistry() + chunkerRegistry = ChunkerRegistry() + + for attachment in page.get("value", []) or []: + if attachment.get("@odata.type") != "#microsoft.graph.fileAttachment": + continue + if attachment.get("isInline"): + continue + size = int(attachment.get("size") or 0) + if size and size > limits.maxAttachmentBytes: + result.skippedPolicy += 1 + continue + contentBytesB64 = attachment.get("contentBytes") + if not contentBytesB64: + continue + try: + rawBytes = base64.b64decode(contentBytesB64) + except Exception: + result.skippedPolicy += 1 + continue + fileName = attachment.get("name") or "attachment" + mimeType = attachment.get("contentType") or "application/octet-stream" + attachmentId = attachment.get("id") or fileName + syntheticId = _syntheticAttachmentId(connectionId, messageId, attachmentId) + + try: + extracted = runExtraction( + extractorRegistry, chunkerRegistry, + rawBytes, fileName, mimeType, + ExtractionOptions(mergeStrategy=None), + ) + except Exception as exc: + logger.warning("outlook attachment extract %s failed: %s", attachmentId, exc) + result.failed 
+= 1 + continue + + contentObjects: List[Dict[str, Any]] = [] + for part in getattr(extracted, "parts", None) or []: + data = getattr(part, "data", None) or "" + if not data or not str(data).strip(): + continue + typeGroup = getattr(part, "typeGroup", "text") or "text" + contentType = "text" + if typeGroup == "image": + contentType = "image" + elif typeGroup in ("binary", "container"): + contentType = "other" + contentObjects.append({ + "contentObjectId": getattr(part, "id", ""), + "contentType": contentType, + "data": data, + "contextRef": { + "containerPath": fileName, + "location": getattr(part, "label", None) or "attachment", + **(getattr(part, "metadata", None) or {}), + }, + }) + if not contentObjects: + result.skippedPolicy += 1 + continue + + try: + await knowledgeService.requestIngestion( + IngestionJob( + sourceKind="outlook_attachment", + sourceId=syntheticId, + fileName=fileName, + mimeType=mimeType, + userId=userId, + mandateId=mandateId, + contentObjects=contentObjects, + provenance={ + "connectionId": connectionId, + "authority": "msft", + "service": "outlook", + "parentId": parentSyntheticId, + "externalItemId": attachmentId, + "parentMessageId": messageId, + }, + ) + ) + result.attachmentsIndexed += 1 + except Exception as exc: + logger.warning("outlook attachment ingest %s failed: %s", attachmentId, exc) + result.failed += 1 + + +def _finalizeResult(connectionId: str, result: OutlookBootstrapResult, startMs: float) -> Dict[str, Any]: + durationMs = int((time.time() - startMs) * 1000) + logger.info( + "ingestion.connection.bootstrap.done part=outlook connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d attachments=%d failed=%d durationMs=%d", + connectionId, + result.indexed, result.skippedDuplicate, result.skippedPolicy, + result.attachmentsIndexed, result.failed, durationMs, + extra={ + "event": "ingestion.connection.bootstrap.done", + "part": "outlook", + "connectionId": connectionId, + "indexed": result.indexed, + "skippedDup": 
result.skippedDuplicate, + "skippedPolicy": result.skippedPolicy, + "attachmentsIndexed": result.attachmentsIndexed, + "failed": result.failed, + "durationMs": durationMs, + }, + ) + return { + "connectionId": result.connectionId, + "indexed": result.indexed, + "skippedDuplicate": result.skippedDuplicate, + "skippedPolicy": result.skippedPolicy, + "attachmentsIndexed": result.attachmentsIndexed, + "failed": result.failed, + "durationMs": durationMs, + "errors": result.errors[:20], + } diff --git a/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncSharepoint.py b/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncSharepoint.py new file mode 100644 index 00000000..0bceecac --- /dev/null +++ b/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncSharepoint.py @@ -0,0 +1,425 @@ +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +"""SharePoint bootstrap for the unified knowledge ingestion lane. + +Walks the SharePoint drive(s) reachable via a UserConnection, downloads each +file-like item, runs the standard content extraction pipeline and hands the +result to `KnowledgeService.requestIngestion`. Idempotency is provided by the +ingestion façade itself; repeat bootstraps therefore produce +`ingestion.skipped.duplicate` for every unchanged item because we pass the +Graph `eTag` as `contentVersion`. 
+""" + +from __future__ import annotations + +import asyncio +import hashlib +import logging +import time +from dataclasses import dataclass, field +from typing import Any, Callable, Dict, List, Optional + +from modules.datamodels.datamodelExtraction import ExtractionOptions + +logger = logging.getLogger(__name__) + +MAX_ITEMS_DEFAULT = 500 +MAX_BYTES_DEFAULT = 200 * 1024 * 1024 +MAX_FILE_SIZE_DEFAULT = 25 * 1024 * 1024 +SKIP_MIME_PREFIXES_DEFAULT = ("video/", "audio/") +MAX_DEPTH_DEFAULT = 4 +MAX_SITES_DEFAULT = 3 + + +@dataclass +class SharepointBootstrapLimits: + maxItems: int = MAX_ITEMS_DEFAULT + maxBytes: int = MAX_BYTES_DEFAULT + maxFileSize: int = MAX_FILE_SIZE_DEFAULT + skipMimePrefixes: tuple = SKIP_MIME_PREFIXES_DEFAULT + maxDepth: int = MAX_DEPTH_DEFAULT + maxSites: int = MAX_SITES_DEFAULT + + +@dataclass +class SharepointBootstrapResult: + connectionId: str + indexed: int = 0 + skippedDuplicate: int = 0 + skippedPolicy: int = 0 + failed: int = 0 + bytesProcessed: int = 0 + errors: List[str] = field(default_factory=list) + + +def _syntheticFileId(connectionId: str, externalItemId: str) -> str: + """Deterministic synthetic FileContentIndex id for a SharePoint item. + + Stable across bootstraps → idempotency works; independent of file name so + moves/renames don't duplicate chunks. 
+ """ + token = hashlib.sha256(f"{connectionId}:{externalItemId}".encode("utf-8")).hexdigest()[:16] + return f"sp:{connectionId[:8]}:{token}" + + +def _toContentObjects(extracted, fileName: str) -> List[Dict[str, Any]]: + """Translate ExtractionResult → content objects accepted by requestIngestion.""" + parts = getattr(extracted, "parts", None) or [] + out: List[Dict[str, Any]] = [] + for part in parts: + data = getattr(part, "data", None) or "" + if not data or not str(data).strip(): + continue + typeGroup = getattr(part, "typeGroup", "text") or "text" + contentType = "text" + if typeGroup == "image": + contentType = "image" + elif typeGroup in ("binary", "container"): + contentType = "other" + out.append({ + "contentObjectId": getattr(part, "id", ""), + "contentType": contentType, + "data": data, + "contextRef": { + "containerPath": fileName, + "location": getattr(part, "label", None) or "file", + **(getattr(part, "metadata", None) or {}), + }, + }) + return out + + +async def bootstrapSharepoint( + connectionId: str, + *, + progressCb: Optional[Callable[[int, Optional[str]], None]] = None, + adapter: Any = None, + connection: Any = None, + knowledgeService: Any = None, + limits: Optional[SharepointBootstrapLimits] = None, + runExtractionFn: Optional[Callable[..., Any]] = None, +) -> Dict[str, Any]: + """Enumerate SharePoint drives and ingest every reachable file via the façade. + + Parameters allow injection for tests; production callers pass only + `connectionId` (and optionally a progressCb) and everything else is + resolved against the registered services. 
+ """ + limits = limits or SharepointBootstrapLimits() + startMs = time.time() + result = SharepointBootstrapResult(connectionId=connectionId) + + logger.info( + "ingestion.connection.bootstrap.started part=sharepoint connectionId=%s", + connectionId, + extra={ + "event": "ingestion.connection.bootstrap.started", + "part": "sharepoint", + "connectionId": connectionId, + }, + ) + + if adapter is None or knowledgeService is None or connection is None: + adapter, connection, knowledgeService = await _resolveDependencies(connectionId) + if runExtractionFn is None: + from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction + from modules.serviceCenter.services.serviceExtraction.subRegistry import ( + ExtractorRegistry, ChunkerRegistry, + ) + extractorRegistry = ExtractorRegistry() + chunkerRegistry = ChunkerRegistry() + + def runExtractionFn(bytesData, name, mime, options): # type: ignore[no-redef] + return runExtraction(extractorRegistry, chunkerRegistry, bytesData, name, mime, options) + + mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else "" + userId = str(getattr(connection, "userId", "") or "") if connection is not None else "" + + try: + sites = await adapter.browse("/", limit=limits.maxSites) + except Exception as exc: + logger.error("sharepoint site discovery failed for %s: %s", connectionId, exc, exc_info=True) + result.errors.append(f"site_discovery: {exc}") + return _finalizeResult(connectionId, result, startMs) + + for site in sites[: limits.maxSites]: + if result.indexed + result.skippedDuplicate >= limits.maxItems: + break + sitePath = getattr(site, "path", "") or "" + try: + await _walkFolder( + adapter=adapter, + knowledgeService=knowledgeService, + runExtractionFn=runExtractionFn, + connectionId=connectionId, + mandateId=mandateId, + userId=userId, + folderPath=sitePath, + depth=0, + limits=limits, + result=result, + progressCb=progressCb, + ) + except Exception as exc: + 
logger.error("sharepoint walk failed for site %s: %s", sitePath, exc, exc_info=True) + result.errors.append(f"walk({sitePath}): {exc}") + + return _finalizeResult(connectionId, result, startMs) + + +async def _resolveDependencies(connectionId: str): + """Load connection, instantiate SharepointAdapter, and build a KnowledgeService. + + Runs with root privileges: bootstrap is a system operation triggered by an + authenticated user via callback; it must not be gated by a per-user + service-center context. + """ + from modules.interfaces.interfaceDbApp import getRootInterface + from modules.auth import TokenManager + from modules.connectors.providerMsft.connectorMsft import MsftConnector + from modules.serviceCenter import getService + from modules.serviceCenter.context import ServiceCenterContext + from modules.security.rootAccess import getRootUser + + rootInterface = getRootInterface() + connection = rootInterface.getUserConnectionById(connectionId) + if connection is None: + raise ValueError(f"UserConnection not found: {connectionId}") + + token = TokenManager().getFreshToken(connectionId) + if not token or not token.tokenAccess: + raise ValueError(f"No valid token for connection {connectionId}") + + provider = MsftConnector(connection, token.tokenAccess) + adapter = provider.getServiceAdapter("sharepoint") + + rootUser = getRootUser() + ctx = ServiceCenterContext( + user=rootUser, + mandate_id=str(getattr(connection, "mandateId", "") or ""), + ) + knowledgeService = getService("knowledge", ctx) + return adapter, connection, knowledgeService + + +async def _walkFolder( + *, + adapter, + knowledgeService, + runExtractionFn, + connectionId: str, + mandateId: str, + userId: str, + folderPath: str, + depth: int, + limits: SharepointBootstrapLimits, + result: SharepointBootstrapResult, + progressCb: Optional[Callable[[int, Optional[str]], None]], +) -> None: + if depth > limits.maxDepth: + return + try: + entries = await adapter.browse(folderPath) + except Exception as 
exc: + logger.warning("sharepoint browse %s failed: %s", folderPath, exc) + result.errors.append(f"browse({folderPath}): {exc}") + return + + for entry in entries: + if result.indexed + result.skippedDuplicate >= limits.maxItems: + return + if result.bytesProcessed >= limits.maxBytes: + return + + entryPath = getattr(entry, "path", "") or "" + if getattr(entry, "isFolder", False): + await _walkFolder( + adapter=adapter, + knowledgeService=knowledgeService, + runExtractionFn=runExtractionFn, + connectionId=connectionId, + mandateId=mandateId, + userId=userId, + folderPath=entryPath, + depth=depth + 1, + limits=limits, + result=result, + progressCb=progressCb, + ) + continue + + mimeType = getattr(entry, "mimeType", None) or "application/octet-stream" + if any(mimeType.startswith(prefix) for prefix in limits.skipMimePrefixes): + result.skippedPolicy += 1 + continue + size = int(getattr(entry, "size", 0) or 0) + if size and size > limits.maxFileSize: + result.skippedPolicy += 1 + continue + + metadata = getattr(entry, "metadata", {}) or {} + externalItemId = metadata.get("id") or entryPath + revision = metadata.get("revision") or metadata.get("lastModifiedDateTime") + + await _ingestOne( + adapter=adapter, + knowledgeService=knowledgeService, + runExtractionFn=runExtractionFn, + connectionId=connectionId, + mandateId=mandateId, + userId=userId, + entry=entry, + entryPath=entryPath, + mimeType=mimeType, + externalItemId=externalItemId, + revision=revision, + limits=limits, + result=result, + progressCb=progressCb, + ) + + +async def _ingestOne( + *, + adapter, + knowledgeService, + runExtractionFn, + connectionId: str, + mandateId: str, + userId: str, + entry, + entryPath: str, + mimeType: str, + externalItemId: str, + revision: Optional[str], + limits: SharepointBootstrapLimits, + result: SharepointBootstrapResult, + progressCb: Optional[Callable[[int, Optional[str]], None]], +) -> None: + from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge 
import IngestionJob + + syntheticFileId = _syntheticFileId(connectionId, externalItemId) + fileName = getattr(entry, "name", "") or externalItemId + + try: + fileBytes = await adapter.download(entryPath) + except Exception as exc: + logger.warning("sharepoint download %s failed: %s", entryPath, exc) + result.failed += 1 + result.errors.append(f"download({entryPath}): {exc}") + return + if not fileBytes: + result.failed += 1 + return + + result.bytesProcessed += len(fileBytes) + + try: + extracted = runExtractionFn( + fileBytes, fileName, mimeType, + ExtractionOptions(mergeStrategy=None), + ) + except Exception as exc: + logger.warning("sharepoint extraction %s failed: %s", entryPath, exc) + result.failed += 1 + result.errors.append(f"extract({entryPath}): {exc}") + return + + contentObjects = _toContentObjects(extracted, fileName) + if not contentObjects: + result.skippedPolicy += 1 + return + + provenance: Dict[str, Any] = { + "connectionId": connectionId, + "authority": "msft", + "service": "sharepoint", + "externalItemId": externalItemId, + "externalPath": entryPath, + "revision": revision, + } + try: + handle = await knowledgeService.requestIngestion( + IngestionJob( + sourceKind="sharepoint_item", + sourceId=syntheticFileId, + fileName=fileName, + mimeType=mimeType, + userId=userId, + mandateId=mandateId, + contentObjects=contentObjects, + contentVersion=revision, + provenance=provenance, + ) + ) + except Exception as exc: + logger.error("sharepoint ingestion %s failed: %s", entryPath, exc, exc_info=True) + result.failed += 1 + result.errors.append(f"ingest({entryPath}): {exc}") + return + + if handle.status == "duplicate": + result.skippedDuplicate += 1 + elif handle.status == "indexed": + result.indexed += 1 + else: + result.failed += 1 + if handle.error: + result.errors.append(f"ingest({entryPath}): {handle.error}") + + if progressCb is not None and (result.indexed + result.skippedDuplicate) % 50 == 0: + processed = result.indexed + result.skippedDuplicate 
+ try: + progressCb( + min(90, 10 + int(80 * processed / max(1, limits.maxItems))), + f"sharepoint processed={processed}", + ) + except Exception: + pass + logger.info( + "ingestion.connection.bootstrap.progress part=sharepoint processed=%d skippedDup=%d failed=%d", + processed, result.skippedDuplicate, result.failed, + extra={ + "event": "ingestion.connection.bootstrap.progress", + "part": "sharepoint", + "connectionId": connectionId, + "processed": processed, + "skippedDup": result.skippedDuplicate, + "failed": result.failed, + }, + ) + + # Yield so the event loop can interleave other tasks (download/extract are + # CPU-ish and extraction uses sync libs; cooperative scheduling prevents + # starving other workers). + await asyncio.sleep(0) + + +def _finalizeResult(connectionId: str, result: SharepointBootstrapResult, startMs: float) -> Dict[str, Any]: + durationMs = int((time.time() - startMs) * 1000) + logger.info( + "ingestion.connection.bootstrap.done part=sharepoint connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d failed=%d durationMs=%d", + connectionId, + result.indexed, result.skippedDuplicate, result.skippedPolicy, result.failed, + durationMs, + extra={ + "event": "ingestion.connection.bootstrap.done", + "part": "sharepoint", + "connectionId": connectionId, + "indexed": result.indexed, + "skippedDup": result.skippedDuplicate, + "skippedPolicy": result.skippedPolicy, + "failed": result.failed, + "durationMs": durationMs, + }, + ) + return { + "connectionId": result.connectionId, + "indexed": result.indexed, + "skippedDuplicate": result.skippedDuplicate, + "skippedPolicy": result.skippedPolicy, + "failed": result.failed, + "bytesProcessed": result.bytesProcessed, + "durationMs": durationMs, + "errors": result.errors[:20], + } diff --git a/modules/serviceCenter/services/serviceKnowledge/subTextClean.py b/modules/serviceCenter/services/serviceKnowledge/subTextClean.py new file mode 100644 index 00000000..2d352cfa --- /dev/null +++ 
b/modules/serviceCenter/services/serviceKnowledge/subTextClean.py @@ -0,0 +1,107 @@ +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +"""Text normalisation utilities used by knowledge ingestion. + +The email body cleaning logic is intentionally regex-based and works on plain +text after an HTML→text pass so we never store unsanitised HTML/JS in the +knowledge store and retrieval stays robust (no extraneous markup tokens +eating embedding budget). +""" + +from __future__ import annotations + +import re +from typing import Optional + +DEFAULT_MAX_CHARS = 8000 + + +_QUOTE_MARKER_PATTERNS = [ + re.compile(r"^\s*(?:On\s.+?\swrote:)\s*$", re.MULTILINE | re.IGNORECASE), + re.compile(r"^\s*(?:Am\s.+?\sschrieb.+?:)\s*$", re.MULTILINE | re.IGNORECASE), + re.compile(r"^\s*-{2,}\s*Original\s*Message\s*-{2,}\s*$", re.MULTILINE | re.IGNORECASE), + re.compile(r"^\s*-{2,}\s*Urspr.+Nachricht\s*-{2,}\s*$", re.MULTILINE | re.IGNORECASE), + re.compile(r"^\s*From:\s+.+$", re.MULTILINE | re.IGNORECASE), + re.compile(r"^\s*Von:\s+.+$", re.MULTILINE | re.IGNORECASE), + re.compile(r"^\s*Sent:\s+.+$", re.MULTILINE | re.IGNORECASE), + re.compile(r"^\s*Gesendet:\s+.+$", re.MULTILINE | re.IGNORECASE), +] + +_SIGNATURE_MARKERS = [ + re.compile(r"^\s*-{2,}\s*$", re.MULTILINE), + re.compile(r"^\s*—\s*$", re.MULTILINE), + re.compile(r"^\s*Best regards\b.*$", re.MULTILINE | re.IGNORECASE), + re.compile(r"^\s*Kind regards\b.*$", re.MULTILINE | re.IGNORECASE), + re.compile(r"^\s*Mit freundlichen Gr[üu]ßen\b.*$", re.MULTILINE | re.IGNORECASE), + re.compile(r"^\s*Viele Gr[üu]ße\b.*$", re.MULTILINE | re.IGNORECASE), + re.compile(r"^\s*Best,\s*$", re.MULTILINE | re.IGNORECASE), +] + + +def _htmlToText(html: str) -> str: + """Prefer BeautifulSoup when available, fall back to regex.""" + try: + from bs4 import BeautifulSoup # type: ignore + + soup = BeautifulSoup(html, "html.parser") + for tag in soup(["script", "style", "head"]): + tag.decompose() + for br in soup.find_all(["br"]): + 
br.replace_with("\n") + for p in soup.find_all(["p", "div", "li", "tr"]): + p.append("\n") + text = soup.get_text() + except Exception: + # Minimal fallback: strip tags crudely. + text = re.sub(r"", "\n", html, flags=re.IGNORECASE) + text = re.sub(r"", "\n", text, flags=re.IGNORECASE) + text = re.sub(r"<[^>]+>", "", text) + # Collapse non-breaking + zero-width whitespace. + text = text.replace("\u00a0", " ").replace("\u200b", "") + return text + + +def _stripQuotedThread(text: str) -> str: + """Remove reply-chain content so only the author's own contribution remains.""" + earliest = len(text) + for pattern in _QUOTE_MARKER_PATTERNS: + match = pattern.search(text) + if match and match.start() < earliest: + earliest = match.start() + # Drop any block starting with "> " quoted lines (often Gmail/Thunderbird). + quotedBlock = re.search(r"^(?:\s*>.*\n?)+", text, re.MULTILINE) + if quotedBlock and quotedBlock.start() < earliest: + earliest = quotedBlock.start() + return text[:earliest].rstrip() + + +def _stripSignature(text: str) -> str: + earliest = len(text) + for pattern in _SIGNATURE_MARKERS: + match = pattern.search(text) + if match and match.start() < earliest: + earliest = match.start() + return text[:earliest].rstrip() + + +def _collapseWhitespace(text: str) -> str: + text = re.sub(r"[ \t]+", " ", text) + text = re.sub(r"\n{3,}", "\n\n", text) + return text.strip() + + +def cleanEmailBody(html: str, maxChars: Optional[int] = DEFAULT_MAX_CHARS) -> str: + """Return a compact plain-text view of an email body suitable for embedding. + + Steps: HTML → text, remove quoted reply chain, remove signature, collapse + whitespace, truncate to maxChars. Always returns a string (possibly empty). 
+ """ + if not html: + return "" + text = _htmlToText(html) if "<" in html and ">" in html else html + text = _stripQuotedThread(text) + text = _stripSignature(text) + text = _collapseWhitespace(text) + if maxChars and len(text) > maxChars: + text = text[:maxChars].rstrip() + "…" + return text diff --git a/tests/unit/services/test_bootstrap_outlook.py b/tests/unit/services/test_bootstrap_outlook.py new file mode 100644 index 00000000..26664eaa --- /dev/null +++ b/tests/unit/services/test_bootstrap_outlook.py @@ -0,0 +1,190 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +"""Bootstrap Outlook tests with a fake adapter + knowledge service. + +Verifies: +- Well-known folders (inbox, sentitems) are discovered via Graph. +- Each message produces a `requestIngestion` call with sourceKind=outlook_message + and structured contentObjects (header / snippet / body). +- Pagination via `@odata.nextLink` is followed. +- changeKey is forwarded as contentVersion → idempotency. 
+""" + +import asyncio +import os +import sys +from types import SimpleNamespace + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../..")) + +from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncOutlook import ( + bootstrapOutlook, + OutlookBootstrapLimits, + _syntheticMessageId, + _buildContentObjects, +) + + +class _FakeOutlookAdapter: + def __init__(self, messages_by_folder, paginated_folder=None, page2=None): + self._folders = {"inbox": "INBOX-ID", "sentitems": "SENT-ID"} + self._messages = messages_by_folder + self._paginated_folder = paginated_folder + self._page2 = page2 or [] + self.requested_endpoints = [] + + async def _graphGet(self, endpoint: str): + self.requested_endpoints.append(endpoint) + if endpoint.startswith("me/mailFolders/") and "/messages" not in endpoint: + wellKnown = endpoint.split("/")[-1] + fid = self._folders.get(wellKnown) + if not fid: + return {"error": "not found"} + return {"id": fid, "displayName": wellKnown} + # message page request: e.g. me/mailFolders/INBOX-ID/messages?... 
+ for fid, messages in self._messages.items(): + if f"me/mailFolders/{fid}/messages" in endpoint: + page = {"value": messages} + if fid == self._paginated_folder and "skiptoken" not in endpoint: + page["@odata.nextLink"] = ( + "https://graph.microsoft.com/v1.0/" + f"me/mailFolders/{fid}/messages?$skiptoken=abc" + ) + elif fid == self._paginated_folder and "skiptoken" in endpoint: + page = {"value": self._page2} + return page + return {"value": []} + + async def browse(self, path): + return [] + + +class _FakeKnowledgeService: + def __init__(self, duplicateIds=None): + self.calls = [] + self._duplicates = duplicateIds or set() + + async def requestIngestion(self, job): + self.calls.append(job) + status = "duplicate" if job.sourceId in self._duplicates else "indexed" + return SimpleNamespace( + jobId=job.sourceId, status=status, contentHash="h", + fileId=job.sourceId, index=None, error=None, + ) + + +def _msg(mid: str, subject: str = "Hi", change: str = "ck1"): + return { + "id": mid, + "subject": subject, + "from": {"emailAddress": {"name": "Alice", "address": "a@x.com"}}, + "toRecipients": [{"emailAddress": {"name": "Bob", "address": "b@x.com"}}], + "ccRecipients": [], + "receivedDateTime": "2025-01-01T10:00:00Z", + "bodyPreview": "Hello world", + "body": {"contentType": "text", "content": "Hello world\nThis is the body."}, + "internetMessageId": f"<{mid}@local>", + "hasAttachments": False, + "changeKey": change, + } + + +def test_buildContentObjects_emits_header_snippet_body(): + parts = _buildContentObjects(_msg("m1"), maxBodyChars=8000) + ids = [p["contentObjectId"] for p in parts] + assert ids == ["header", "snippet", "body"] + header = parts[0]["data"] + assert "Subject: Hi" in header + assert "From: Alice " in header + assert "To: Bob " in header + + +def test_bootstrap_outlook_indexes_messages_from_inbox_and_sent(): + adapter = _FakeOutlookAdapter({ + "INBOX-ID": [_msg("m1"), _msg("m2")], + "SENT-ID": [_msg("m3")], + }) + knowledge = _FakeKnowledgeService() 
+ connection = SimpleNamespace(mandateId="m1", userId="u1") + + async def _run(): + return await bootstrapOutlook( + connectionId="c1", + adapter=adapter, + connection=connection, + knowledgeService=knowledge, + limits=OutlookBootstrapLimits(maxAgeDays=None), + ) + + result = asyncio.run(_run()) + assert result["indexed"] == 3 + sourceIds = {c.sourceId for c in knowledge.calls} + assert sourceIds == { + _syntheticMessageId("c1", "m1"), + _syntheticMessageId("c1", "m2"), + _syntheticMessageId("c1", "m3"), + } + for job in knowledge.calls: + assert job.sourceKind == "outlook_message" + assert job.mimeType == "message/rfc822" + assert job.provenance["connectionId"] == "c1" + assert job.provenance["service"] == "outlook" + assert job.contentVersion == "ck1" + assert any(co["contentObjectId"] == "header" for co in job.contentObjects) + + +def test_bootstrap_outlook_follows_pagination(): + adapter = _FakeOutlookAdapter( + messages_by_folder={"INBOX-ID": [_msg("m1")], "SENT-ID": []}, + paginated_folder="INBOX-ID", + page2=[_msg("m2"), _msg("m3")], + ) + knowledge = _FakeKnowledgeService() + connection = SimpleNamespace(mandateId="m1", userId="u1") + + async def _run(): + return await bootstrapOutlook( + connectionId="c1", + adapter=adapter, + connection=connection, + knowledgeService=knowledge, + limits=OutlookBootstrapLimits(maxAgeDays=None), + ) + + result = asyncio.run(_run()) + assert result["indexed"] == 3 + + +def test_bootstrap_outlook_reports_duplicates(): + adapter = _FakeOutlookAdapter({ + "INBOX-ID": [_msg("m1"), _msg("m2")], + "SENT-ID": [], + }) + duplicates = { + _syntheticMessageId("c1", "m1"), + _syntheticMessageId("c1", "m2"), + } + knowledge = _FakeKnowledgeService(duplicateIds=duplicates) + connection = SimpleNamespace(mandateId="m1", userId="u1") + + async def _run(): + return await bootstrapOutlook( + connectionId="c1", + adapter=adapter, + connection=connection, + knowledgeService=knowledge, + limits=OutlookBootstrapLimits(maxAgeDays=None), + ) + + 
result = asyncio.run(_run()) + assert result["indexed"] == 0 + assert result["skippedDuplicate"] == 2 + + +if __name__ == "__main__": + test_buildContentObjects_emits_header_snippet_body() + test_bootstrap_outlook_indexes_messages_from_inbox_and_sent() + test_bootstrap_outlook_follows_pagination() + test_bootstrap_outlook_reports_duplicates() + print("OK — bootstrapOutlook tests passed") diff --git a/tests/unit/services/test_bootstrap_sharepoint.py b/tests/unit/services/test_bootstrap_sharepoint.py new file mode 100644 index 00000000..8b011357 --- /dev/null +++ b/tests/unit/services/test_bootstrap_sharepoint.py @@ -0,0 +1,209 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +"""Bootstrap SharePoint tests with a fake adapter + knowledge service. + +Verifies: +- Every discovered file triggers `requestIngestion`. +- Duplicate runs (same eTag revisions) report `skippedDuplicate`. +- Synthetic fileIds are stable across runs so idempotency works end-to-end. +""" + +import asyncio +import os +import sys +from dataclasses import dataclass +from types import SimpleNamespace +from typing import Any, Dict, List, Optional + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../..")) + +from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncSharepoint import ( + bootstrapSharepoint, + _syntheticFileId, +) + + +@dataclass +class _ExtEntry: + name: str + path: str + isFolder: bool = False + size: Optional[int] = None + mimeType: Optional[str] = None + metadata: Dict[str, Any] = None + + +class _FakeSpAdapter: + """Minimal SharepointAdapter stand-in. 
+ + Layout: + "/" → 1 site + "/sites/site-1" → 2 files (f1, f2) + 1 folder (sub) + "/sites/site-1/sub" → 1 file (f3) + """ + + def __init__(self): + self.downloaded: List[str] = [] + + async def browse(self, path: str, filter=None, limit=None): + if path == "/": + return [ + _ExtEntry( + name="Site 1", + path="/sites/site-1", + isFolder=True, + metadata={"id": "site-1"}, + ), + ] + if path == "/sites/site-1": + return [ + _ExtEntry( + name="f1.txt", path="/sites/site-1/f1.txt", + mimeType="text/plain", size=20, + metadata={"id": "f1", "revision": "etag-f1"}, + ), + _ExtEntry( + name="f2.txt", path="/sites/site-1/f2.txt", + mimeType="text/plain", size=20, + metadata={"id": "f2", "revision": "etag-f2"}, + ), + _ExtEntry( + name="sub", path="/sites/site-1/sub", + isFolder=True, metadata={"id": "sub"}, + ), + ] + if path == "/sites/site-1/sub": + return [ + _ExtEntry( + name="f3.txt", path="/sites/site-1/sub/f3.txt", + mimeType="text/plain", size=20, + metadata={"id": "f3", "revision": "etag-f3"}, + ), + ] + return [] + + async def download(self, path: str) -> bytes: + self.downloaded.append(path) + return path.encode("utf-8") + + +class _FakeKnowledgeService: + """Records requestIngestion calls and returns the scripted handles.""" + + def __init__(self, duplicateIds=None): + self.calls: List[SimpleNamespace] = [] + self._duplicateIds = duplicateIds or set() + + async def requestIngestion(self, job): + self.calls.append(job) + status = "duplicate" if job.sourceId in self._duplicateIds else "indexed" + return SimpleNamespace( + jobId=f"{job.sourceKind}:{job.sourceId}", + status=status, + contentHash="h", + fileId=job.sourceId, + index=None, + error=None, + ) + + +def _fakeRunExtraction(data, name, mime, options): + """Produce a single synthetic text part so `_toContentObjects` returns one.""" + return SimpleNamespace( + parts=[ + SimpleNamespace( + id="p1", + data=data.decode("utf-8") if isinstance(data, bytes) else str(data), + typeGroup="text", + label="page:1", + 
metadata={"pageIndex": 0}, + ) + ] + ) + + +def test_bootstrap_walks_sites_and_subfolders(): + adapter = _FakeSpAdapter() + knowledge = _FakeKnowledgeService() + connection = SimpleNamespace(mandateId="m1", userId="u1") + + async def _run(): + return await bootstrapSharepoint( + connectionId="c1", + adapter=adapter, + connection=connection, + knowledgeService=knowledge, + runExtractionFn=_fakeRunExtraction, + ) + + result = asyncio.run(_run()) + assert len(knowledge.calls) == 3 + sourceIds = {c.sourceId for c in knowledge.calls} + assert sourceIds == { + _syntheticFileId("c1", "f1"), + _syntheticFileId("c1", "f2"), + _syntheticFileId("c1", "f3"), + } + assert result["indexed"] == 3 + assert result["skippedDuplicate"] == 0 + assert adapter.downloaded == [ + "/sites/site-1/f1.txt", + "/sites/site-1/f2.txt", + "/sites/site-1/sub/f3.txt", + ] + + +def test_bootstrap_reports_duplicates_on_second_run(): + adapter = _FakeSpAdapter() + duplicateIds = { + _syntheticFileId("c1", "f1"), + _syntheticFileId("c1", "f2"), + _syntheticFileId("c1", "f3"), + } + knowledge = _FakeKnowledgeService(duplicateIds=duplicateIds) + connection = SimpleNamespace(mandateId="m1", userId="u1") + + async def _run(): + return await bootstrapSharepoint( + connectionId="c1", + adapter=adapter, + connection=connection, + knowledgeService=knowledge, + runExtractionFn=_fakeRunExtraction, + ) + + result = asyncio.run(_run()) + assert result["indexed"] == 0 + assert result["skippedDuplicate"] == 3 + + +def test_bootstrap_passes_connection_provenance(): + adapter = _FakeSpAdapter() + knowledge = _FakeKnowledgeService() + connection = SimpleNamespace(mandateId="m1", userId="u1") + + async def _run(): + return await bootstrapSharepoint( + connectionId="c1", + adapter=adapter, + connection=connection, + knowledgeService=knowledge, + runExtractionFn=_fakeRunExtraction, + ) + + asyncio.run(_run()) + for job in knowledge.calls: + assert job.sourceKind == "sharepoint_item" + assert job.mandateId == "m1" + assert 
job.provenance["connectionId"] == "c1" + assert job.provenance["authority"] == "msft" + assert job.provenance["service"] == "sharepoint" + assert job.contentVersion and job.contentVersion.startswith("etag-") + + +if __name__ == "__main__": + test_bootstrap_walks_sites_and_subfolders() + test_bootstrap_reports_duplicates_on_second_run() + test_bootstrap_passes_connection_provenance() + print("OK — bootstrapSharepoint tests passed") diff --git a/tests/unit/services/test_clean_email_body.py b/tests/unit/services/test_clean_email_body.py new file mode 100644 index 00000000..a3ee01df --- /dev/null +++ b/tests/unit/services/test_clean_email_body.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +"""Unit tests for cleanEmailBody. + +Covers: HTML→text normalisation, quoted-reply removal, signature removal, +whitespace collapse and truncation. The utility is used during Outlook +bootstrap; buggy cleaning would leak quoted threads / signatures into every +embedding. +""" + +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../..")) + +from modules.serviceCenter.services.serviceKnowledge.subTextClean import ( + cleanEmailBody, +) + + +def test_strips_html_tags_and_scripts(): + html = ( + "" + "

Hello world

" + "" + ) + cleaned = cleanEmailBody(html) + assert "Hello" in cleaned + assert "world" in cleaned + assert "<" not in cleaned + assert "alert" not in cleaned + + +def test_strips_quoted_reply_english(): + body = ( + "Actual answer from me.\n\n" + "On Mon, 1 Jan 2024 at 10:00, Someone wrote:\n" + "> Original question?\n" + "> Second line.\n" + ) + cleaned = cleanEmailBody(body) + assert "Actual answer" in cleaned + assert "Original question" not in cleaned + assert "wrote:" not in cleaned + + +def test_strips_quoted_reply_german(): + body = ( + "Meine Antwort.\n\n" + "Am 1. Januar 2024 um 10:00 schrieb Max Muster :\n" + "> Ursprüngliche Frage?\n" + ) + cleaned = cleanEmailBody(body) + assert "Meine Antwort" in cleaned + assert "Ursprüngliche Frage" not in cleaned + + +def test_strips_signature_after_dashes(): + body = ( + "Kurze Nachricht.\n" + "\n" + "--\n" + "Max Muster\n" + "Vorstand, Beispiel GmbH\n" + ) + cleaned = cleanEmailBody(body) + assert "Kurze Nachricht" in cleaned + assert "Beispiel GmbH" not in cleaned + + +def test_strips_signature_salutation_de(): + body = ( + "Die eigentliche Information steht hier.\n\n" + "Mit freundlichen Grüßen\n" + "Max Muster" + ) + cleaned = cleanEmailBody(body) + assert "eigentliche Information" in cleaned + assert "Max Muster" not in cleaned + + +def test_truncate_to_max_chars(): + body = "abc " * 5000 + cleaned = cleanEmailBody(body, maxChars=200) + assert len(cleaned) <= 201 # includes trailing ellipsis + + +def test_empty_input_returns_empty_string(): + assert cleanEmailBody("") == "" + assert cleanEmailBody(None) == "" # type: ignore[arg-type] + + +def test_collapses_whitespace(): + body = "A lot of spaces\n\n\n\nand blank lines" + cleaned = cleanEmailBody(body) + assert " " not in cleaned + assert "\n\n\n" not in cleaned + + +if __name__ == "__main__": + test_strips_html_tags_and_scripts() + test_strips_quoted_reply_english() + test_strips_quoted_reply_german() + test_strips_signature_after_dashes() + 
test_strips_signature_salutation_de() + test_truncate_to_max_chars() + test_empty_input_returns_empty_string() + test_collapses_whitespace() + print("OK — cleanEmailBody tests passed") diff --git a/tests/unit/services/test_connection_purge.py b/tests/unit/services/test_connection_purge.py new file mode 100644 index 00000000..c32cb5b3 --- /dev/null +++ b/tests/unit/services/test_connection_purge.py @@ -0,0 +1,119 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +"""Purge tests for KnowledgeObjects.deleteFileContentIndexByConnectionId. + +Ensures that a `connection.revoked` event wipes every FileContentIndex + chunk +linked to the given connectionId while leaving entries from other connections +(or upload-files with connectionId=None) intact. +""" + +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../..")) + +from modules.datamodels.datamodelKnowledge import FileContentIndex, ContentChunk +from modules.interfaces.interfaceDbKnowledge import KnowledgeObjects + + +class _FakeDb: + """Minimal in-memory stand-in for ``KnowledgeObjects.db``. + + Supports just the subset of APIs that deleteFileContentIndexByConnectionId + touches: getRecordset(FileContentIndex|ContentChunk, ...) + recordDelete. 
+ """ + + def __init__(self): + self.indexRows: dict = {} + self.chunks: dict = {} + + def addIndex(self, row: dict) -> None: + self.indexRows[row["id"]] = row + + def addChunk(self, row: dict) -> None: + self.chunks[row["id"]] = row + + def getRecordset(self, modelClass, recordFilter=None, **_): + filter_ = recordFilter or {} + if modelClass is FileContentIndex: + rows = list(self.indexRows.values()) + elif modelClass is ContentChunk: + rows = list(self.chunks.values()) + else: + return [] + + def match(row): + for k, v in filter_.items(): + if row.get(k) != v: + return False + return True + + return [r for r in rows if match(r)] + + def recordDelete(self, modelClass, recordId): + if modelClass is FileContentIndex: + return self.indexRows.pop(recordId, None) is not None + if modelClass is ContentChunk: + return self.chunks.pop(recordId, None) is not None + return False + + +def _buildKnowledge(): + """Instantiate KnowledgeObjects without triggering the real DB bootstrap.""" + ko = KnowledgeObjects.__new__(KnowledgeObjects) + ko.currentUser = None + ko.userId = None + ko._scopeCache = {} + ko.db = _FakeDb() + return ko + + +def test_purge_by_connection_removes_only_matching_rows(): + ko = _buildKnowledge() + ko.db.addIndex({"id": "sp1", "connectionId": "cx", "mandateId": "m1", "sourceKind": "sharepoint_item"}) + ko.db.addIndex({"id": "sp2", "connectionId": "cx", "mandateId": "m1", "sourceKind": "sharepoint_item"}) + ko.db.addIndex({"id": "upload", "connectionId": None, "mandateId": "m1", "sourceKind": "file"}) + ko.db.addIndex({"id": "other", "connectionId": "cy", "mandateId": "m1", "sourceKind": "outlook_message"}) + ko.db.addChunk({"id": "c1", "fileId": "sp1"}) + ko.db.addChunk({"id": "c2", "fileId": "sp1"}) + ko.db.addChunk({"id": "c3", "fileId": "sp2"}) + ko.db.addChunk({"id": "c4", "fileId": "upload"}) + ko.db.addChunk({"id": "c5", "fileId": "other"}) + + result = ko.deleteFileContentIndexByConnectionId("cx") + + assert result == {"indexRows": 2, "chunks": 3} 
+ assert "sp1" not in ko.db.indexRows + assert "sp2" not in ko.db.indexRows + assert "upload" in ko.db.indexRows + assert "other" in ko.db.indexRows + assert set(ko.db.chunks.keys()) == {"c4", "c5"} + + +def test_purge_with_empty_connection_id_is_a_noop(): + ko = _buildKnowledge() + ko.db.addIndex({"id": "sp1", "connectionId": "cx"}) + ko.db.addChunk({"id": "c1", "fileId": "sp1"}) + + result = ko.deleteFileContentIndexByConnectionId("") + + assert result == {"indexRows": 0, "chunks": 0} + assert "sp1" in ko.db.indexRows + + +def test_purge_unknown_connection_returns_zero(): + ko = _buildKnowledge() + ko.db.addIndex({"id": "sp1", "connectionId": "cx"}) + + result = ko.deleteFileContentIndexByConnectionId("nope") + + assert result == {"indexRows": 0, "chunks": 0} + assert "sp1" in ko.db.indexRows + + +if __name__ == "__main__": + test_purge_by_connection_removes_only_matching_rows() + test_purge_with_empty_connection_id_is_a_noop() + test_purge_unknown_connection_returns_zero() + print("OK — connection-purge tests passed") diff --git a/tests/unit/services/test_knowledge_ingest_consumer.py b/tests/unit/services/test_knowledge_ingest_consumer.py new file mode 100644 index 00000000..760e1ed6 --- /dev/null +++ b/tests/unit/services/test_knowledge_ingest_consumer.py @@ -0,0 +1,172 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +"""Unit tests for KnowledgeIngestionConsumer event dispatch. + +- `connection.established` → enqueue a `connection.bootstrap` job. +- `connection.revoked` → synchronous purge via KnowledgeObjects. 
+""" + +import asyncio +import os +import sys +import types + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../..")) + +from modules.serviceCenter.services.serviceKnowledge import subConnectorIngestConsumer as consumer + + +def _resetRegistration(monkeypatch): + """Force the module-level guard to register fresh in each test.""" + monkeypatch.setattr(consumer, "_registered", False) + + +def test_onConnectionEstablished_enqueues_bootstrap(monkeypatch): + startedJobs = [] + + async def _fakeStartJob(jobType, payload, **kwargs): + startedJobs.append({"jobType": jobType, "payload": payload, "kwargs": kwargs}) + return "job-1" + + monkeypatch.setattr(consumer, "startJob", _fakeStartJob) + consumer._onConnectionEstablished( + connectionId="c1", authority="msft", userId="u1" + ) + # Drain pending tasks created by the consumer. + loop = asyncio.new_event_loop() + try: + asyncio.set_event_loop(loop) + # If the consumer created a Task on a closed loop the fake startJob + # was still called synchronously via asyncio.run — in either case we + # check the recorded call. 
+ finally: + loop.close() + + assert len(startedJobs) == 1 + assert startedJobs[0]["jobType"] == consumer.BOOTSTRAP_JOB_TYPE + assert startedJobs[0]["payload"]["connectionId"] == "c1" + assert startedJobs[0]["payload"]["authority"] == "msft" + assert startedJobs[0]["kwargs"]["triggeredBy"] == "u1" + + +def test_onConnectionEstablished_ignores_missing_id(monkeypatch): + called = [] + + async def _fakeStartJob(*a, **kw): + called.append(1) + return "x" + + monkeypatch.setattr(consumer, "startJob", _fakeStartJob) + consumer._onConnectionEstablished(connectionId="", authority="msft") + assert called == [] + + +def test_onConnectionRevoked_runs_sync_purge(monkeypatch): + class _FakeKnowledge: + def __init__(self): + self.calls = [] + + def deleteFileContentIndexByConnectionId(self, cid): + self.calls.append(cid) + return {"indexRows": 2, "chunks": 5} + + fakeKnow = _FakeKnowledge() + + def _fakeGetInterface(_user=None): + return fakeKnow + + monkeypatch.setattr(consumer, "getKnowledgeInterface", _fakeGetInterface) + consumer._onConnectionRevoked( + connectionId="c1", authority="msft", userId="u1", reason="disconnected" + ) + assert fakeKnow.calls == ["c1"] + + +def test_onConnectionRevoked_ignores_missing_id(monkeypatch): + seen = [] + + def _fakeGetInterface(_user=None): + class _K: + def deleteFileContentIndexByConnectionId(self, cid): + seen.append(cid) + return {"indexRows": 0, "chunks": 0} + + return _K() + + monkeypatch.setattr(consumer, "getKnowledgeInterface", _fakeGetInterface) + consumer._onConnectionRevoked(connectionId="") + assert seen == [] + + +def test_bootstrap_job_skips_non_pilot_authority(monkeypatch): + async def _run(): + result = await consumer._bootstrapJobHandler( + {"payload": {"connectionId": "c1", "authority": "google"}}, + lambda *_: None, + ) + return result + + result = asyncio.run(_run()) + assert result["skipped"] is True + assert result["authority"] == "google" + + +def test_bootstrap_job_dispatches_msft_parts(monkeypatch): + calls = 
{"sp": 0, "ol": 0} + + async def _fakeSp(connectionId, progressCb=None): + calls["sp"] += 1 + return {"indexed": 1} + + async def _fakeOl(connectionId, progressCb=None): + calls["ol"] += 1 + return {"indexed": 2} + + # subConnectorSync* are lazy-imported inside the handler; install fake + # modules before invoking. + fakeSharepoint = types.ModuleType("subConnectorSyncSharepoint") + fakeSharepoint.bootstrapSharepoint = _fakeSp + fakeOutlook = types.ModuleType("subConnectorSyncOutlook") + fakeOutlook.bootstrapOutlook = _fakeOl + monkeypatch.setitem( + sys.modules, + "modules.serviceCenter.services.serviceKnowledge.subConnectorSyncSharepoint", + fakeSharepoint, + ) + monkeypatch.setitem( + sys.modules, + "modules.serviceCenter.services.serviceKnowledge.subConnectorSyncOutlook", + fakeOutlook, + ) + + async def _run(): + return await consumer._bootstrapJobHandler( + {"payload": {"connectionId": "c1", "authority": "msft"}}, + lambda *_: None, + ) + + result = asyncio.run(_run()) + assert calls == {"sp": 1, "ol": 1} + assert result["sharepoint"] == {"indexed": 1} + assert result["outlook"] == {"indexed": 2} + + +if __name__ == "__main__": + # Usable without pytest fixtures for a quick smoke run. 
+ class _MP: + def __init__(self): + self.undos = [] + + def setattr(self, target, name_or_value, value=None): + if value is None: + # target is an object, name_or_value is value → no, original signature + raise SystemExit("use pytest monkeypatch in CLI") + self.undos.append((target, name_or_value, getattr(target, name_or_value))) + setattr(target, name_or_value, value) + + def setitem(self, mapping, key, value): + self.undos.append((mapping, key, mapping.get(key))) + mapping[key] = value + + print("Run via pytest: pytest tests/unit/services/test_knowledge_ingest_consumer.py") From 3add5c9a80b42d023f875fefe849705e7f6b6029 Mon Sep 17 00:00:00 2001 From: Ida Date: Wed, 22 Apr 2026 11:55:41 +0200 Subject: [PATCH 06/18] commit before rebase --- .../subConnectorIngestConsumer.py | 59 +- .../subConnectorSyncClickup.py | 489 +++++++++++++++ .../subConnectorSyncGdrive.py | 429 +++++++++++++ .../serviceKnowledge/subConnectorSyncGmail.py | 578 ++++++++++++++++++ tests/unit/services/test_bootstrap_clickup.py | 203 ++++++ tests/unit/services/test_bootstrap_gdrive.py | 225 +++++++ tests/unit/services/test_bootstrap_gmail.py | 240 ++++++++ .../test_knowledge_ingest_consumer.py | 73 ++- 8 files changed, 2278 insertions(+), 18 deletions(-) create mode 100644 modules/serviceCenter/services/serviceKnowledge/subConnectorSyncClickup.py create mode 100644 modules/serviceCenter/services/serviceKnowledge/subConnectorSyncGdrive.py create mode 100644 modules/serviceCenter/services/serviceKnowledge/subConnectorSyncGmail.py create mode 100644 tests/unit/services/test_bootstrap_clickup.py create mode 100644 tests/unit/services/test_bootstrap_gdrive.py create mode 100644 tests/unit/services/test_bootstrap_gmail.py diff --git a/modules/serviceCenter/services/serviceKnowledge/subConnectorIngestConsumer.py b/modules/serviceCenter/services/serviceKnowledge/subConnectorIngestConsumer.py index 51acb71c..f9b3533d 100644 --- 
a/modules/serviceCenter/services/serviceKnowledge/subConnectorIngestConsumer.py +++ b/modules/serviceCenter/services/serviceKnowledge/subConnectorIngestConsumer.py @@ -135,6 +135,15 @@ async def _bootstrapJobHandler( progressCb(5, f"resolving {authority} connection") + def _normalize(res: Any, label: str) -> Dict[str, Any]: + if isinstance(res, Exception): + logger.error( + "ingestion.connection.bootstrap.failed part=%s connectionId=%s error=%s", + label, connectionId, res, exc_info=res, + ) + return {"error": str(res)} + return res or {} + if authority == "msft": from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncSharepoint import ( bootstrapSharepoint, @@ -149,16 +158,6 @@ async def _bootstrapJobHandler( bootstrapOutlook(connectionId=connectionId, progressCb=progressCb), return_exceptions=True, ) - - def _normalize(res: Any, label: str) -> Dict[str, Any]: - if isinstance(res, Exception): - logger.error( - "ingestion.connection.bootstrap.failed part=%s connectionId=%s error=%s", - label, connectionId, res, exc_info=res, - ) - return {"error": str(res)} - return res or {} - return { "connectionId": connectionId, "authority": authority, @@ -166,21 +165,55 @@ async def _bootstrapJobHandler( "outlook": _normalize(olResult, "outlook"), } + if authority == "google": + from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGdrive import ( + bootstrapGdrive, + ) + from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGmail import ( + bootstrapGmail, + ) + + progressCb(10, "drive + gmail") + gdResult, gmResult = await asyncio.gather( + bootstrapGdrive(connectionId=connectionId, progressCb=progressCb), + bootstrapGmail(connectionId=connectionId, progressCb=progressCb), + return_exceptions=True, + ) + return { + "connectionId": connectionId, + "authority": authority, + "drive": _normalize(gdResult, "gdrive"), + "gmail": _normalize(gmResult, "gmail"), + } + + if authority == "clickup": + from 
modules.serviceCenter.services.serviceKnowledge.subConnectorSyncClickup import ( + bootstrapClickup, + ) + + progressCb(10, "clickup tasks") + cuResult = await bootstrapClickup(connectionId=connectionId, progressCb=progressCb) + return { + "connectionId": connectionId, + "authority": authority, + "clickup": _normalize(cuResult, "clickup"), + } + logger.info( - "ingestion.connection.bootstrap.skipped reason=P1_pilot_scope authority=%s connectionId=%s", + "ingestion.connection.bootstrap.skipped reason=unsupported_authority authority=%s connectionId=%s", authority, connectionId, extra={ "event": "ingestion.connection.bootstrap.skipped", "authority": authority, "connectionId": connectionId, - "reason": "P1_pilot_scope", + "reason": "unsupported_authority", }, ) return { "connectionId": connectionId, "authority": authority, "skipped": True, - "reason": "P1_pilot_scope", + "reason": "unsupported_authority", } diff --git a/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncClickup.py b/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncClickup.py new file mode 100644 index 00000000..16e94e59 --- /dev/null +++ b/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncClickup.py @@ -0,0 +1,489 @@ +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +"""ClickUp bootstrap for the unified knowledge ingestion lane. + +ClickUp tasks are ingested as *virtual documents* — we never download file +bytes. Each task becomes a `sourceKind="clickup_task"` IngestionJob whose +`contentObjects` carry a summary header (name + status + metadata) and the +task description / text content so retrieval finds them without a live API +call. + +Hierarchy traversal: workspace (team) → spaces → folders / folderless lists → +tasks. We cap the fan-out with `maxWorkspaces` / `maxListsPerWorkspace` / +`maxTasks` and skip tasks older than `maxAgeDays` (default 180 d). 
+ +Idempotency: `date_updated` from the ClickUp task payload is a millisecond +timestamp and strictly monotonic per revision — used as `contentVersion`. +""" + +from __future__ import annotations + +import hashlib +import logging +import time +from dataclasses import dataclass, field +from datetime import datetime, timedelta, timezone +from typing import Any, Callable, Dict, List, Optional + +logger = logging.getLogger(__name__) + +MAX_TASKS_DEFAULT = 500 +MAX_WORKSPACES_DEFAULT = 3 +MAX_LISTS_PER_WORKSPACE_DEFAULT = 20 +MAX_DESCRIPTION_CHARS_DEFAULT = 8000 +MAX_AGE_DAYS_DEFAULT = 180 + + +@dataclass +class ClickupBootstrapLimits: + maxTasks: int = MAX_TASKS_DEFAULT + maxWorkspaces: int = MAX_WORKSPACES_DEFAULT + maxListsPerWorkspace: int = MAX_LISTS_PER_WORKSPACE_DEFAULT + maxDescriptionChars: int = MAX_DESCRIPTION_CHARS_DEFAULT + # Only ingest tasks updated within the last N days. None disables filter. + maxAgeDays: Optional[int] = MAX_AGE_DAYS_DEFAULT + # Include closed/archived tasks if they still meet the recency filter. + # ClickUp `closed` tasks often carry the most useful RAG context + # ("why was this shipped the way it was?"). 
+ includeClosed: bool = True + + +@dataclass +class ClickupBootstrapResult: + connectionId: str + indexed: int = 0 + skippedDuplicate: int = 0 + skippedPolicy: int = 0 + failed: int = 0 + workspaces: int = 0 + lists: int = 0 + errors: List[str] = field(default_factory=list) + + +def _syntheticTaskId(connectionId: str, taskId: str) -> str: + token = hashlib.sha256(f"{connectionId}:{taskId}".encode("utf-8")).hexdigest()[:16] + return f"cu:{connectionId[:8]}:{token}" + + +def _truncate(value: Any, limit: int) -> str: + text = str(value or "").strip() + if not text: + return "" + if len(text) <= limit: + return text + return text[:limit].rstrip() + "\n[truncated]" + + +def _isRecent(dateUpdatedMs: Any, maxAgeDays: Optional[int]) -> bool: + if not maxAgeDays: + return True + if not dateUpdatedMs: + return True + try: + ts = datetime.fromtimestamp(int(dateUpdatedMs) / 1000.0, tz=timezone.utc) + except Exception: + return True + cutoff = datetime.now(timezone.utc) - timedelta(days=maxAgeDays) + return ts >= cutoff + + +def _buildContentObjects(task: Dict[str, Any], limits: ClickupBootstrapLimits) -> List[Dict[str, Any]]: + """Header (name/status/metadata) + description + text_content, all text.""" + name = task.get("name") or f"Task {task.get('id', '')}" + status = ((task.get("status") or {}).get("status")) or "" + assignees = ", ".join( + filter(None, [ + (a.get("username") or a.get("email") or "") + for a in (task.get("assignees") or []) + ]) + ) + tags = ", ".join(filter(None, [t.get("name", "") for t in (task.get("tags") or [])])) + listInfo = task.get("list") or {} + folderInfo = task.get("folder") or {} + spaceInfo = task.get("space") or {} + dueMs = task.get("due_date") + dueIso = "" + if dueMs: + try: + dueIso = datetime.fromtimestamp(int(dueMs) / 1000.0, tz=timezone.utc).strftime("%Y-%m-%d") + except Exception: + dueIso = "" + + headerLines = [ + f"Task: {name}", + f"Status: {status}" if status else "", + f"List: {listInfo.get('name', '')}" if listInfo else "", + 
f"Folder: {folderInfo.get('name', '')}" if folderInfo else "", + f"Space: {spaceInfo.get('name', '')}" if spaceInfo else "", + f"Assignees: {assignees}" if assignees else "", + f"Tags: {tags}" if tags else "", + f"Due: {dueIso}" if dueIso else "", + f"Url: {task.get('url', '')}" if task.get("url") else "", + ] + header = "\n".join(line for line in headerLines if line) + + parts: List[Dict[str, Any]] = [{ + "contentObjectId": "header", + "contentType": "text", + "data": header, + "contextRef": {"part": "header"}, + }] + + description = _truncate(task.get("description"), limits.maxDescriptionChars) + if description: + parts.append({ + "contentObjectId": "description", + "contentType": "text", + "data": description, + "contextRef": {"part": "description"}, + }) + # text_content is ClickUp's rendered-markdown version; include if it adds + # something beyond the plain description (common for bullet lists, checklists). + textContent = _truncate(task.get("text_content"), limits.maxDescriptionChars) + if textContent and textContent != description: + parts.append({ + "contentObjectId": "text_content", + "contentType": "text", + "data": textContent, + "contextRef": {"part": "text_content"}, + }) + return parts + + +async def bootstrapClickup( + connectionId: str, + *, + progressCb: Optional[Callable[[int, Optional[str]], None]] = None, + adapter: Any = None, + connection: Any = None, + knowledgeService: Any = None, + limits: Optional[ClickupBootstrapLimits] = None, +) -> Dict[str, Any]: + """Walk workspaces → lists → tasks and ingest each task as a virtual doc.""" + limits = limits or ClickupBootstrapLimits() + startMs = time.time() + result = ClickupBootstrapResult(connectionId=connectionId) + + logger.info( + "ingestion.connection.bootstrap.started part=clickup connectionId=%s", + connectionId, + extra={ + "event": "ingestion.connection.bootstrap.started", + "part": "clickup", + "connectionId": connectionId, + }, + ) + + if adapter is None or knowledgeService is None or 
connection is None: + adapter, connection, knowledgeService = await _resolveDependencies(connectionId) + + mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else "" + userId = str(getattr(connection, "userId", "") or "") if connection is not None else "" + + svc = getattr(adapter, "_svc", None) + if svc is None: + result.errors.append("adapter missing _svc instance") + return _finalizeResult(connectionId, result, startMs) + + try: + teamsResp = await svc.getAuthorizedTeams() + except Exception as exc: + logger.error("clickup team discovery failed for %s: %s", connectionId, exc, exc_info=True) + result.errors.append(f"teams: {exc}") + return _finalizeResult(connectionId, result, startMs) + + teams = (teamsResp or {}).get("teams") or [] + for team in teams[: limits.maxWorkspaces]: + if result.indexed + result.skippedDuplicate >= limits.maxTasks: + break + teamId = str(team.get("id", "") or "") + if not teamId: + continue + result.workspaces += 1 + try: + await _walkTeam( + svc=svc, + knowledgeService=knowledgeService, + connectionId=connectionId, + mandateId=mandateId, + userId=userId, + team=team, + limits=limits, + result=result, + progressCb=progressCb, + ) + except Exception as exc: + logger.error("clickup team %s walk failed: %s", teamId, exc, exc_info=True) + result.errors.append(f"team({teamId}): {exc}") + + return _finalizeResult(connectionId, result, startMs) + + +async def _resolveDependencies(connectionId: str): + from modules.interfaces.interfaceDbApp import getRootInterface + from modules.auth import TokenManager + from modules.connectors.providerClickup.connectorClickup import ClickupConnector + from modules.serviceCenter import getService + from modules.serviceCenter.context import ServiceCenterContext + from modules.security.rootAccess import getRootUser + + rootInterface = getRootInterface() + connection = rootInterface.getUserConnectionById(connectionId) + if connection is None: + raise ValueError(f"UserConnection 
not found: {connectionId}") + + token = TokenManager().getFreshToken(connectionId) + if not token or not token.tokenAccess: + raise ValueError(f"No valid token for connection {connectionId}") + + provider = ClickupConnector(connection, token.tokenAccess) + adapter = provider.getServiceAdapter("clickup") + + rootUser = getRootUser() + ctx = ServiceCenterContext( + user=rootUser, + mandate_id=str(getattr(connection, "mandateId", "") or ""), + ) + knowledgeService = getService("knowledge", ctx) + return adapter, connection, knowledgeService + + +async def _walkTeam( + *, + svc, + knowledgeService, + connectionId: str, + mandateId: str, + userId: str, + team: Dict[str, Any], + limits: ClickupBootstrapLimits, + result: ClickupBootstrapResult, + progressCb: Optional[Callable[[int, Optional[str]], None]], +) -> None: + teamId = str(team.get("id", "") or "") + spacesResp = await svc.getSpaces(teamId) + spaces = (spacesResp or {}).get("spaces") or [] + + listsCollected: List[Dict[str, Any]] = [] + for space in spaces: + if len(listsCollected) >= limits.maxListsPerWorkspace: + break + spaceId = str(space.get("id", "") or "") + if not spaceId: + continue + + # Folderless lists directly under the space + folderless = await svc.getFolderlessLists(spaceId) + for lst in (folderless or {}).get("lists") or []: + if len(listsCollected) >= limits.maxListsPerWorkspace: + break + listsCollected.append({**lst, "_space": space}) + + # Lists inside folders + foldersResp = await svc.getFolders(spaceId) + for folder in (foldersResp or {}).get("folders") or []: + if len(listsCollected) >= limits.maxListsPerWorkspace: + break + folderId = str(folder.get("id", "") or "") + if not folderId: + continue + folderLists = await svc.getListsInFolder(folderId) + for lst in (folderLists or {}).get("lists") or []: + if len(listsCollected) >= limits.maxListsPerWorkspace: + break + listsCollected.append({**lst, "_space": space, "_folder": folder}) + + for lst in listsCollected: + if result.indexed + 
result.skippedDuplicate >= limits.maxTasks: + return + result.lists += 1 + await _walkList( + svc=svc, + knowledgeService=knowledgeService, + connectionId=connectionId, + mandateId=mandateId, + userId=userId, + teamId=teamId, + lst=lst, + limits=limits, + result=result, + progressCb=progressCb, + ) + + +async def _walkList( + *, + svc, + knowledgeService, + connectionId: str, + mandateId: str, + userId: str, + teamId: str, + lst: Dict[str, Any], + limits: ClickupBootstrapLimits, + result: ClickupBootstrapResult, + progressCb: Optional[Callable[[int, Optional[str]], None]], +) -> None: + listId = str(lst.get("id", "") or "") + if not listId: + return + page = 0 + while result.indexed + result.skippedDuplicate < limits.maxTasks: + resp = await svc.getTasksInList( + listId, + page=page, + include_closed=limits.includeClosed, + subtasks=True, + ) + if isinstance(resp, dict) and resp.get("error"): + logger.warning("clickup tasks list=%s page=%d error: %s", listId, page, resp.get("error")) + result.errors.append(f"list({listId}): {resp.get('error')}") + return + tasks = (resp or {}).get("tasks") or [] + if not tasks: + return + + for task in tasks: + if result.indexed + result.skippedDuplicate >= limits.maxTasks: + return + if not _isRecent(task.get("date_updated"), limits.maxAgeDays): + result.skippedPolicy += 1 + continue + # Inject the list/folder/space metadata we already loaded. 
+ task["list"] = task.get("list") or {"id": listId, "name": lst.get("name")} + task["folder"] = task.get("folder") or lst.get("_folder") or {} + task["space"] = task.get("space") or lst.get("_space") or {} + await _ingestTask( + knowledgeService=knowledgeService, + connectionId=connectionId, + mandateId=mandateId, + userId=userId, + teamId=teamId, + task=task, + limits=limits, + result=result, + progressCb=progressCb, + ) + + if len(tasks) < 100: # ClickUp page-size hint: fewer than 100 => last page + return + page += 1 + + +async def _ingestTask( + *, + knowledgeService, + connectionId: str, + mandateId: str, + userId: str, + teamId: str, + task: Dict[str, Any], + limits: ClickupBootstrapLimits, + result: ClickupBootstrapResult, + progressCb: Optional[Callable[[int, Optional[str]], None]], +) -> None: + from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob + + taskId = str(task.get("id", "") or "") + if not taskId: + result.skippedPolicy += 1 + return + revision = str(task.get("date_updated") or task.get("date_created") or "") + name = task.get("name") or f"Task {taskId}" + syntheticId = _syntheticTaskId(connectionId, taskId) + fileName = f"{name[:80].strip() or taskId}.task.json" + + contentObjects = _buildContentObjects(task, limits) + + try: + handle = await knowledgeService.requestIngestion( + IngestionJob( + sourceKind="clickup_task", + sourceId=syntheticId, + fileName=fileName, + mimeType="application/vnd.clickup.task+json", + userId=userId, + mandateId=mandateId, + contentObjects=contentObjects, + contentVersion=revision or None, + provenance={ + "connectionId": connectionId, + "authority": "clickup", + "service": "clickup", + "externalItemId": taskId, + "teamId": teamId, + "listId": ((task.get("list") or {}).get("id")), + "spaceId": ((task.get("space") or {}).get("id")), + "url": task.get("url"), + "status": ((task.get("status") or {}).get("status")), + "tier": "body", + }, + ) + ) + except Exception as exc: + 
logger.error("clickup ingestion %s failed: %s", taskId, exc, exc_info=True) + result.failed += 1 + result.errors.append(f"ingest({taskId}): {exc}") + return + + if handle.status == "duplicate": + result.skippedDuplicate += 1 + elif handle.status == "indexed": + result.indexed += 1 + else: + result.failed += 1 + + if progressCb is not None and (result.indexed + result.skippedDuplicate) % 50 == 0: + processed = result.indexed + result.skippedDuplicate + try: + progressCb( + min(90, 10 + int(80 * processed / max(1, limits.maxTasks))), + f"clickup processed={processed}", + ) + except Exception: + pass + logger.info( + "ingestion.connection.bootstrap.progress part=clickup processed=%d skippedDup=%d failed=%d", + processed, result.skippedDuplicate, result.failed, + extra={ + "event": "ingestion.connection.bootstrap.progress", + "part": "clickup", + "connectionId": connectionId, + "processed": processed, + "skippedDup": result.skippedDuplicate, + "failed": result.failed, + }, + ) + + +def _finalizeResult(connectionId: str, result: ClickupBootstrapResult, startMs: float) -> Dict[str, Any]: + durationMs = int((time.time() - startMs) * 1000) + logger.info( + "ingestion.connection.bootstrap.done part=clickup connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d failed=%d workspaces=%d lists=%d durationMs=%d", + connectionId, + result.indexed, result.skippedDuplicate, result.skippedPolicy, + result.failed, result.workspaces, result.lists, durationMs, + extra={ + "event": "ingestion.connection.bootstrap.done", + "part": "clickup", + "connectionId": connectionId, + "indexed": result.indexed, + "skippedDup": result.skippedDuplicate, + "skippedPolicy": result.skippedPolicy, + "failed": result.failed, + "workspaces": result.workspaces, + "lists": result.lists, + "durationMs": durationMs, + }, + ) + return { + "connectionId": result.connectionId, + "indexed": result.indexed, + "skippedDuplicate": result.skippedDuplicate, + "skippedPolicy": result.skippedPolicy, + "failed": 
result.failed, + "workspaces": result.workspaces, + "lists": result.lists, + "durationMs": durationMs, + "errors": result.errors[:20], + } diff --git a/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncGdrive.py b/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncGdrive.py new file mode 100644 index 00000000..3e73a040 --- /dev/null +++ b/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncGdrive.py @@ -0,0 +1,429 @@ +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +"""Google Drive bootstrap for the unified knowledge ingestion lane. + +Mirrors the SharePoint pilot (see subConnectorSyncSharepoint.py). Walks the +user's *My Drive* tree from the virtual `root` folder, downloads each +file-like item via `DriveAdapter.download` (which handles native Google docs +via export), runs the standard extraction pipeline and routes results through +`KnowledgeService.requestIngestion` with `sourceKind="gdrive_item"` and +`contentVersion = modifiedTime` (monotonic per-revision). +""" + +from __future__ import annotations + +import hashlib +import logging +import time +from dataclasses import dataclass, field +from datetime import datetime, timedelta, timezone +from typing import Any, Callable, Dict, List, Optional + +from modules.datamodels.datamodelExtraction import ExtractionOptions + +logger = logging.getLogger(__name__) + +MAX_ITEMS_DEFAULT = 500 +MAX_BYTES_DEFAULT = 200 * 1024 * 1024 +MAX_FILE_SIZE_DEFAULT = 25 * 1024 * 1024 +SKIP_MIME_PREFIXES_DEFAULT = ("video/", "audio/") +MAX_DEPTH_DEFAULT = 4 +MAX_AGE_DAYS_DEFAULT = 365 + +# Google Drive uses virtual mime-types for folders and non-downloadable assets. 
+FOLDER_MIME = "application/vnd.google-apps.folder" + + +@dataclass +class GdriveBootstrapLimits: + maxItems: int = MAX_ITEMS_DEFAULT + maxBytes: int = MAX_BYTES_DEFAULT + maxFileSize: int = MAX_FILE_SIZE_DEFAULT + skipMimePrefixes: tuple = SKIP_MIME_PREFIXES_DEFAULT + maxDepth: int = MAX_DEPTH_DEFAULT + # Only ingest files modified within the last N days. None disables filter. + maxAgeDays: Optional[int] = MAX_AGE_DAYS_DEFAULT + + +@dataclass +class GdriveBootstrapResult: + connectionId: str + indexed: int = 0 + skippedDuplicate: int = 0 + skippedPolicy: int = 0 + failed: int = 0 + bytesProcessed: int = 0 + errors: List[str] = field(default_factory=list) + + +def _syntheticFileId(connectionId: str, externalItemId: str) -> str: + token = hashlib.sha256(f"{connectionId}:{externalItemId}".encode("utf-8")).hexdigest()[:16] + return f"gd:{connectionId[:8]}:{token}" + + +def _toContentObjects(extracted, fileName: str) -> List[Dict[str, Any]]: + parts = getattr(extracted, "parts", None) or [] + out: List[Dict[str, Any]] = [] + for part in parts: + data = getattr(part, "data", None) or "" + if not data or not str(data).strip(): + continue + typeGroup = getattr(part, "typeGroup", "text") or "text" + contentType = "text" + if typeGroup == "image": + contentType = "image" + elif typeGroup in ("binary", "container"): + contentType = "other" + out.append({ + "contentObjectId": getattr(part, "id", ""), + "contentType": contentType, + "data": data, + "contextRef": { + "containerPath": fileName, + "location": getattr(part, "label", None) or "file", + **(getattr(part, "metadata", None) or {}), + }, + }) + return out + + +def _isRecent(modifiedIso: Optional[str], maxAgeDays: Optional[int]) -> bool: + if not maxAgeDays: + return True + if not modifiedIso: + # No timestamp -> be permissive (Drive native docs sometimes omit it on export). + return True + try: + # Google returns RFC 3339 with `Z` or offset; python 3.11+ parses both. 
+ ts = datetime.fromisoformat(modifiedIso.replace("Z", "+00:00")) + except Exception: + return True + cutoff = datetime.now(timezone.utc) - timedelta(days=maxAgeDays) + if ts.tzinfo is None: + ts = ts.replace(tzinfo=timezone.utc) + return ts >= cutoff + + +async def bootstrapGdrive( + connectionId: str, + *, + progressCb: Optional[Callable[[int, Optional[str]], None]] = None, + adapter: Any = None, + connection: Any = None, + knowledgeService: Any = None, + limits: Optional[GdriveBootstrapLimits] = None, + runExtractionFn: Optional[Callable[..., Any]] = None, +) -> Dict[str, Any]: + """Walk My Drive starting from the virtual root folder.""" + limits = limits or GdriveBootstrapLimits() + startMs = time.time() + result = GdriveBootstrapResult(connectionId=connectionId) + + logger.info( + "ingestion.connection.bootstrap.started part=gdrive connectionId=%s", + connectionId, + extra={ + "event": "ingestion.connection.bootstrap.started", + "part": "gdrive", + "connectionId": connectionId, + }, + ) + + if adapter is None or knowledgeService is None or connection is None: + adapter, connection, knowledgeService = await _resolveDependencies(connectionId) + if runExtractionFn is None: + from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction + from modules.serviceCenter.services.serviceExtraction.subRegistry import ( + ExtractorRegistry, ChunkerRegistry, + ) + extractorRegistry = ExtractorRegistry() + chunkerRegistry = ChunkerRegistry() + + def runExtractionFn(bytesData, name, mime, options): # type: ignore[no-redef] + return runExtraction(extractorRegistry, chunkerRegistry, bytesData, name, mime, options) + + mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else "" + userId = str(getattr(connection, "userId", "") or "") if connection is not None else "" + + try: + await _walkFolder( + adapter=adapter, + knowledgeService=knowledgeService, + runExtractionFn=runExtractionFn, + connectionId=connectionId, + 
mandateId=mandateId, + userId=userId, + folderPath="/", # DriveAdapter.browse maps "" / "/" -> "root" + depth=0, + limits=limits, + result=result, + progressCb=progressCb, + ) + except Exception as exc: + logger.error("gdrive walk failed for %s: %s", connectionId, exc, exc_info=True) + result.errors.append(f"walk: {exc}") + + return _finalizeResult(connectionId, result, startMs) + + +async def _resolveDependencies(connectionId: str): + from modules.interfaces.interfaceDbApp import getRootInterface + from modules.auth import TokenManager + from modules.connectors.providerGoogle.connectorGoogle import GoogleConnector + from modules.serviceCenter import getService + from modules.serviceCenter.context import ServiceCenterContext + from modules.security.rootAccess import getRootUser + + rootInterface = getRootInterface() + connection = rootInterface.getUserConnectionById(connectionId) + if connection is None: + raise ValueError(f"UserConnection not found: {connectionId}") + + token = TokenManager().getFreshToken(connectionId) + if not token or not token.tokenAccess: + raise ValueError(f"No valid token for connection {connectionId}") + + provider = GoogleConnector(connection, token.tokenAccess) + adapter = provider.getServiceAdapter("drive") + + rootUser = getRootUser() + ctx = ServiceCenterContext( + user=rootUser, + mandate_id=str(getattr(connection, "mandateId", "") or ""), + ) + knowledgeService = getService("knowledge", ctx) + return adapter, connection, knowledgeService + + +async def _walkFolder( + *, + adapter, + knowledgeService, + runExtractionFn, + connectionId: str, + mandateId: str, + userId: str, + folderPath: str, + depth: int, + limits: GdriveBootstrapLimits, + result: GdriveBootstrapResult, + progressCb: Optional[Callable[[int, Optional[str]], None]], +) -> None: + if depth > limits.maxDepth: + return + try: + entries = await adapter.browse(folderPath) + except Exception as exc: + logger.warning("gdrive browse %s failed: %s", folderPath, exc) + 
result.errors.append(f"browse({folderPath}): {exc}") + return + + for entry in entries: + if result.indexed + result.skippedDuplicate >= limits.maxItems: + return + if result.bytesProcessed >= limits.maxBytes: + return + + entryPath = getattr(entry, "path", "") or "" + metadata = getattr(entry, "metadata", {}) or {} + mimeType = getattr(entry, "mimeType", None) or metadata.get("mimeType") + + if getattr(entry, "isFolder", False) or mimeType == FOLDER_MIME: + await _walkFolder( + adapter=adapter, + knowledgeService=knowledgeService, + runExtractionFn=runExtractionFn, + connectionId=connectionId, + mandateId=mandateId, + userId=userId, + folderPath=entryPath, + depth=depth + 1, + limits=limits, + result=result, + progressCb=progressCb, + ) + continue + + effectiveMime = mimeType or "application/octet-stream" + if any(effectiveMime.startswith(prefix) for prefix in limits.skipMimePrefixes): + result.skippedPolicy += 1 + continue + size = int(getattr(entry, "size", 0) or 0) + if size and size > limits.maxFileSize: + result.skippedPolicy += 1 + continue + modifiedTime = metadata.get("modifiedTime") + if not _isRecent(modifiedTime, limits.maxAgeDays): + result.skippedPolicy += 1 + continue + + externalItemId = metadata.get("id") or entryPath + revision = modifiedTime + + await _ingestOne( + adapter=adapter, + knowledgeService=knowledgeService, + runExtractionFn=runExtractionFn, + connectionId=connectionId, + mandateId=mandateId, + userId=userId, + entry=entry, + entryPath=entryPath, + mimeType=effectiveMime, + externalItemId=externalItemId, + revision=revision, + limits=limits, + result=result, + progressCb=progressCb, + ) + + +async def _ingestOne( + *, + adapter, + knowledgeService, + runExtractionFn, + connectionId: str, + mandateId: str, + userId: str, + entry, + entryPath: str, + mimeType: str, + externalItemId: str, + revision: Optional[str], + limits: GdriveBootstrapLimits, + result: GdriveBootstrapResult, + progressCb: Optional[Callable[[int, Optional[str]], 
None]], +) -> None: + from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob + + syntheticFileId = _syntheticFileId(connectionId, externalItemId) + fileName = getattr(entry, "name", "") or externalItemId + + try: + downloaded = await adapter.download(entryPath) + except Exception as exc: + logger.warning("gdrive download %s failed: %s", entryPath, exc) + result.failed += 1 + result.errors.append(f"download({entryPath}): {exc}") + return + + # Adapter.download returns raw bytes today; guard DownloadResult shape too. + fileBytes: bytes + if isinstance(downloaded, (bytes, bytearray)): + fileBytes = bytes(downloaded) + else: + fileBytes = bytes(getattr(downloaded, "data", b"") or b"") + if getattr(downloaded, "mimeType", None): + mimeType = downloaded.mimeType # export may have changed the type + if not fileBytes: + result.failed += 1 + return + if len(fileBytes) > limits.maxFileSize: + result.skippedPolicy += 1 + return + + result.bytesProcessed += len(fileBytes) + + try: + extracted = runExtractionFn( + fileBytes, fileName, mimeType, + ExtractionOptions(mergeStrategy=None), + ) + except Exception as exc: + logger.warning("gdrive extraction %s failed: %s", entryPath, exc) + result.failed += 1 + result.errors.append(f"extract({entryPath}): {exc}") + return + + contentObjects = _toContentObjects(extracted, fileName) + if not contentObjects: + result.skippedPolicy += 1 + return + + try: + handle = await knowledgeService.requestIngestion( + IngestionJob( + sourceKind="gdrive_item", + sourceId=syntheticFileId, + fileName=fileName, + mimeType=mimeType, + userId=userId, + mandateId=mandateId, + contentObjects=contentObjects, + contentVersion=revision, + provenance={ + "connectionId": connectionId, + "authority": "google", + "service": "drive", + "externalItemId": externalItemId, + "entryPath": entryPath, + "tier": "body", + }, + ) + ) + except Exception as exc: + logger.error("gdrive ingestion %s failed: %s", entryPath, exc, 
exc_info=True) + result.failed += 1 + result.errors.append(f"ingest({entryPath}): {exc}") + return + + if handle.status == "duplicate": + result.skippedDuplicate += 1 + elif handle.status == "indexed": + result.indexed += 1 + else: + result.failed += 1 + + if progressCb is not None and (result.indexed + result.skippedDuplicate) % 50 == 0: + processed = result.indexed + result.skippedDuplicate + try: + progressCb( + min(90, 10 + int(80 * processed / max(1, limits.maxItems))), + f"gdrive processed={processed}", + ) + except Exception: + pass + logger.info( + "ingestion.connection.bootstrap.progress part=gdrive processed=%d skippedDup=%d failed=%d", + processed, result.skippedDuplicate, result.failed, + extra={ + "event": "ingestion.connection.bootstrap.progress", + "part": "gdrive", + "connectionId": connectionId, + "processed": processed, + "skippedDup": result.skippedDuplicate, + "failed": result.failed, + }, + ) + + +def _finalizeResult(connectionId: str, result: GdriveBootstrapResult, startMs: float) -> Dict[str, Any]: + durationMs = int((time.time() - startMs) * 1000) + logger.info( + "ingestion.connection.bootstrap.done part=gdrive connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d failed=%d bytes=%d durationMs=%d", + connectionId, + result.indexed, result.skippedDuplicate, result.skippedPolicy, + result.failed, result.bytesProcessed, durationMs, + extra={ + "event": "ingestion.connection.bootstrap.done", + "part": "gdrive", + "connectionId": connectionId, + "indexed": result.indexed, + "skippedDup": result.skippedDuplicate, + "skippedPolicy": result.skippedPolicy, + "failed": result.failed, + "bytes": result.bytesProcessed, + "durationMs": durationMs, + }, + ) + return { + "connectionId": result.connectionId, + "indexed": result.indexed, + "skippedDuplicate": result.skippedDuplicate, + "skippedPolicy": result.skippedPolicy, + "failed": result.failed, + "bytesProcessed": result.bytesProcessed, + "durationMs": durationMs, + "errors": result.errors[:20], + 
} diff --git a/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncGmail.py b/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncGmail.py new file mode 100644 index 00000000..827add6b --- /dev/null +++ b/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncGmail.py @@ -0,0 +1,578 @@ +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +"""Gmail bootstrap for the unified knowledge ingestion lane. + +Mirrors the Outlook pilot (see subConnectorSyncOutlook.py) but talks to Google +Mail's REST API. Messages become `sourceKind="gmail_message"` virtual documents +with header / snippet / cleaned body content-objects; attachments are optional +child jobs with `sourceKind="gmail_attachment"`. + +Idempotency: Gmail's stable `historyId` (or `internalDate` as fallback) is +passed as `contentVersion`, so rerunning the bootstrap yields +`ingestion.skipped.duplicate` for unchanged messages. +""" + +from __future__ import annotations + +import asyncio +import base64 +import hashlib +import logging +import time +from dataclasses import dataclass, field +from datetime import datetime, timedelta, timezone +from typing import Any, Callable, Dict, List, Optional + +from modules.serviceCenter.services.serviceKnowledge.subTextClean import cleanEmailBody + +logger = logging.getLogger(__name__) + +MAX_MESSAGES_DEFAULT = 500 +MAX_BODY_CHARS_DEFAULT = 8000 +MAX_ATTACHMENT_BYTES_DEFAULT = 10 * 1024 * 1024 +DEFAULT_LABELS = ("INBOX", "SENT") + + +@dataclass +class GmailBootstrapLimits: + maxMessages: int = MAX_MESSAGES_DEFAULT + labels: tuple = DEFAULT_LABELS + maxBodyChars: int = MAX_BODY_CHARS_DEFAULT + includeAttachments: bool = False + maxAttachmentBytes: int = MAX_ATTACHMENT_BYTES_DEFAULT + # Only fetch messages newer than N days. None disables filter. 
+ maxAgeDays: Optional[int] = 90 + + +@dataclass +class GmailBootstrapResult: + connectionId: str + indexed: int = 0 + skippedDuplicate: int = 0 + skippedPolicy: int = 0 + failed: int = 0 + attachmentsIndexed: int = 0 + errors: List[str] = field(default_factory=list) + + +def _syntheticMessageId(connectionId: str, messageId: str) -> str: + token = hashlib.sha256(f"{connectionId}:{messageId}".encode("utf-8")).hexdigest()[:16] + return f"gm:{connectionId[:8]}:{token}" + + +def _syntheticAttachmentId(connectionId: str, messageId: str, attachmentId: str) -> str: + token = hashlib.sha256( + f"{connectionId}:{messageId}:{attachmentId}".encode("utf-8") + ).hexdigest()[:16] + return f"ga:{connectionId[:8]}:{token}" + + +def _decodeBase64Url(data: str) -> bytes: + if not data: + return b"" + # Gmail uses URL-safe base64 without padding. + padding = 4 - (len(data) % 4) + if padding != 4: + data = data + ("=" * padding) + try: + return base64.urlsafe_b64decode(data) + except Exception: + return b"" + + +def _walkPayloadForBody(payload: Dict[str, Any]) -> Dict[str, str]: + """Return {"text": ..., "html": ...} by walking MIME parts. + + Gmail `payload` is a tree of parts. We prefer `text/plain` for the cleaned + body, but capture `text/html` as a fallback so `cleanEmailBody` can strip + markup if plain is missing. 
+ """ + found: Dict[str, str] = {"text": "", "html": ""} + + def _walk(part: Dict[str, Any]) -> None: + mime = (part.get("mimeType") or "").lower() + body = part.get("body") or {} + raw = body.get("data") or "" + if raw and mime.startswith("text/"): + decoded = _decodeBase64Url(raw).decode("utf-8", errors="replace") + key = "text" if mime == "text/plain" else ("html" if mime == "text/html" else "") + if key and not found[key]: + found[key] = decoded + for sub in part.get("parts") or []: + _walk(sub) + + _walk(payload or {}) + return found + + +def _headerMap(payload: Dict[str, Any]) -> Dict[str, str]: + return { + (h.get("name") or "").lower(): (h.get("value") or "") + for h in (payload.get("headers") or []) + } + + +def _buildContentObjects(message: Dict[str, Any], maxBodyChars: int) -> List[Dict[str, Any]]: + payload = message.get("payload") or {} + headers = _headerMap(payload) + subject = headers.get("subject") or "(no subject)" + fromAddr = headers.get("from") or "" + toAddr = headers.get("to") or "" + ccAddr = headers.get("cc") or "" + date = headers.get("date") or "" + snippet = message.get("snippet") or "" + + bodies = _walkPayloadForBody(payload) + rawBody = bodies["text"] or bodies["html"] + cleanedBody = cleanEmailBody(rawBody, maxChars=maxBodyChars) if rawBody else "" + + parts: List[Dict[str, Any]] = [] + header = ( + f"Subject: {subject}\n" + f"From: {fromAddr}\n" + f"To: {toAddr}\n" + + (f"Cc: {ccAddr}\n" if ccAddr else "") + + f"Date: {date}" + ) + parts.append({ + "contentObjectId": "header", + "contentType": "text", + "data": header, + "contextRef": {"part": "header"}, + }) + if snippet: + parts.append({ + "contentObjectId": "snippet", + "contentType": "text", + "data": snippet, + "contextRef": {"part": "snippet"}, + }) + if cleanedBody: + parts.append({ + "contentObjectId": "body", + "contentType": "text", + "data": cleanedBody, + "contextRef": {"part": "body"}, + }) + return parts + + +async def bootstrapGmail( + connectionId: str, + *, + 
progressCb: Optional[Callable[[int, Optional[str]], None]] = None, + adapter: Any = None, + connection: Any = None, + knowledgeService: Any = None, + limits: Optional[GmailBootstrapLimits] = None, + googleGetFn: Optional[Callable[..., Any]] = None, +) -> Dict[str, Any]: + """Enumerate Gmail labels (INBOX + SENT default) and ingest messages.""" + limits = limits or GmailBootstrapLimits() + startMs = time.time() + result = GmailBootstrapResult(connectionId=connectionId) + + logger.info( + "ingestion.connection.bootstrap.started part=gmail connectionId=%s", + connectionId, + extra={ + "event": "ingestion.connection.bootstrap.started", + "part": "gmail", + "connectionId": connectionId, + }, + ) + + if adapter is None or knowledgeService is None or connection is None: + adapter, connection, knowledgeService = await _resolveDependencies(connectionId) + + if googleGetFn is None: + from modules.connectors.providerGoogle.connectorGoogle import _googleGet as _defaultGet + + token = getattr(adapter, "_token", "") + + async def googleGetFn(url: str) -> Dict[str, Any]: # type: ignore[no-redef] + return await _defaultGet(token, url) + + mandateId = str(getattr(connection, "mandateId", "") or "") if connection is not None else "" + userId = str(getattr(connection, "userId", "") or "") if connection is not None else "" + + for labelId in limits.labels: + if result.indexed + result.skippedDuplicate >= limits.maxMessages: + break + try: + await _ingestLabel( + googleGetFn=googleGetFn, + knowledgeService=knowledgeService, + connectionId=connectionId, + mandateId=mandateId, + userId=userId, + labelId=labelId, + limits=limits, + result=result, + progressCb=progressCb, + ) + except Exception as exc: + logger.error("gmail ingestion label %s failed: %s", labelId, exc, exc_info=True) + result.errors.append(f"label({labelId}): {exc}") + + return _finalizeResult(connectionId, result, startMs) + + +async def _resolveDependencies(connectionId: str): + from modules.interfaces.interfaceDbApp 
import getRootInterface + from modules.auth import TokenManager + from modules.connectors.providerGoogle.connectorGoogle import GoogleConnector + from modules.serviceCenter import getService + from modules.serviceCenter.context import ServiceCenterContext + from modules.security.rootAccess import getRootUser + + rootInterface = getRootInterface() + connection = rootInterface.getUserConnectionById(connectionId) + if connection is None: + raise ValueError(f"UserConnection not found: {connectionId}") + + token = TokenManager().getFreshToken(connectionId) + if not token or not token.tokenAccess: + raise ValueError(f"No valid token for connection {connectionId}") + + provider = GoogleConnector(connection, token.tokenAccess) + adapter = provider.getServiceAdapter("gmail") + + rootUser = getRootUser() + ctx = ServiceCenterContext( + user=rootUser, + mandate_id=str(getattr(connection, "mandateId", "") or ""), + ) + knowledgeService = getService("knowledge", ctx) + return adapter, connection, knowledgeService + + +async def _ingestLabel( + *, + googleGetFn, + knowledgeService, + connectionId: str, + mandateId: str, + userId: str, + labelId: str, + limits: GmailBootstrapLimits, + result: GmailBootstrapResult, + progressCb: Optional[Callable[[int, Optional[str]], None]], +) -> None: + remaining = limits.maxMessages - (result.indexed + result.skippedDuplicate) + if remaining <= 0: + return + + pageSize = min(100, remaining) + query = "" + if limits.maxAgeDays: + cutoff = datetime.now(timezone.utc) - timedelta(days=limits.maxAgeDays) + # Gmail uses YYYY/MM/DD. 
+ query = f"after:{cutoff.strftime('%Y/%m/%d')}" + + baseUrl = ( + "https://gmail.googleapis.com/gmail/v1/users/me/messages" + f"?labelIds={labelId}&maxResults={pageSize}" + ) + if query: + baseUrl = f"{baseUrl}&q={query}" + + nextPageToken: Optional[str] = None + while (result.indexed + result.skippedDuplicate) < limits.maxMessages: + url = baseUrl if not nextPageToken else f"{baseUrl}&pageToken={nextPageToken}" + page = await googleGetFn(url) + if not isinstance(page, dict) or "error" in page: + err = (page or {}).get("error") if isinstance(page, dict) else "unknown" + logger.warning("gmail list page error for label %s: %s", labelId, err) + result.errors.append(f"list({labelId}): {err}") + return + + messageStubs = page.get("messages") or [] + for stub in messageStubs: + if result.indexed + result.skippedDuplicate >= limits.maxMessages: + break + msgId = stub.get("id") + if not msgId: + continue + detailUrl = ( + f"https://gmail.googleapis.com/gmail/v1/users/me/messages/{msgId}?format=full" + ) + detail = await googleGetFn(detailUrl) + if not isinstance(detail, dict) or "error" in detail: + result.failed += 1 + continue + await _ingestMessage( + googleGetFn=googleGetFn, + knowledgeService=knowledgeService, + connectionId=connectionId, + mandateId=mandateId, + userId=userId, + labelId=labelId, + message=detail, + limits=limits, + result=result, + progressCb=progressCb, + ) + + nextPageToken = page.get("nextPageToken") + if not nextPageToken: + break + + +async def _ingestMessage( + *, + googleGetFn, + knowledgeService, + connectionId: str, + mandateId: str, + userId: str, + labelId: str, + message: Dict[str, Any], + limits: GmailBootstrapLimits, + result: GmailBootstrapResult, + progressCb: Optional[Callable[[int, Optional[str]], None]], +) -> None: + from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob + + messageId = message.get("id") + if not messageId: + result.skippedPolicy += 1 + return + revision = 
message.get("historyId") or message.get("internalDate") + headers = _headerMap(message.get("payload") or {}) + subject = headers.get("subject") or "(no subject)" + syntheticId = _syntheticMessageId(connectionId, messageId) + fileName = f"{subject[:80].strip()}.eml" if subject else f"{messageId}.eml" + + contentObjects = _buildContentObjects(message, limits.maxBodyChars) + try: + handle = await knowledgeService.requestIngestion( + IngestionJob( + sourceKind="gmail_message", + sourceId=syntheticId, + fileName=fileName, + mimeType="message/rfc822", + userId=userId, + mandateId=mandateId, + contentObjects=contentObjects, + contentVersion=str(revision) if revision else None, + provenance={ + "connectionId": connectionId, + "authority": "google", + "service": "gmail", + "externalItemId": messageId, + "label": labelId, + "threadId": message.get("threadId"), + "tier": "body", + }, + ) + ) + except Exception as exc: + logger.error("gmail ingestion %s failed: %s", messageId, exc, exc_info=True) + result.failed += 1 + result.errors.append(f"ingest({messageId}): {exc}") + return + + if handle.status == "duplicate": + result.skippedDuplicate += 1 + elif handle.status == "indexed": + result.indexed += 1 + else: + result.failed += 1 + + if limits.includeAttachments: + try: + await _ingestAttachments( + googleGetFn=googleGetFn, + knowledgeService=knowledgeService, + connectionId=connectionId, + mandateId=mandateId, + userId=userId, + message=message, + parentSyntheticId=syntheticId, + limits=limits, + result=result, + ) + except Exception as exc: + logger.warning("gmail attachments %s failed: %s", messageId, exc) + result.errors.append(f"attachments({messageId}): {exc}") + + if progressCb is not None and (result.indexed + result.skippedDuplicate) % 50 == 0: + processed = result.indexed + result.skippedDuplicate + try: + progressCb( + min(90, 10 + int(80 * processed / max(1, limits.maxMessages))), + f"gmail processed={processed}", + ) + except Exception: + pass + logger.info( + 
"ingestion.connection.bootstrap.progress part=gmail processed=%d skippedDup=%d failed=%d", + processed, result.skippedDuplicate, result.failed, + extra={ + "event": "ingestion.connection.bootstrap.progress", + "part": "gmail", + "connectionId": connectionId, + "processed": processed, + "skippedDup": result.skippedDuplicate, + "failed": result.failed, + }, + ) + + await asyncio.sleep(0) + + +async def _ingestAttachments( + *, + googleGetFn, + knowledgeService, + connectionId: str, + mandateId: str, + userId: str, + message: Dict[str, Any], + parentSyntheticId: str, + limits: GmailBootstrapLimits, + result: GmailBootstrapResult, +) -> None: + """Child ingestion jobs for file attachments. Skips inline images (cid: refs).""" + from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob + from modules.datamodels.datamodelExtraction import ExtractionOptions + from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction + from modules.serviceCenter.services.serviceExtraction.subRegistry import ( + ExtractorRegistry, ChunkerRegistry, + ) + + messageId = message.get("id") or "" + + def _collectAttachmentStubs(part: Dict[str, Any], acc: List[Dict[str, Any]]) -> None: + filename = part.get("filename") or "" + body = part.get("body") or {} + attId = body.get("attachmentId") + if filename and attId: + acc.append({ + "filename": filename, + "mimeType": part.get("mimeType") or "application/octet-stream", + "attachmentId": attId, + "size": int(body.get("size") or 0), + }) + for sub in part.get("parts") or []: + _collectAttachmentStubs(sub, acc) + + stubs: List[Dict[str, Any]] = [] + _collectAttachmentStubs(message.get("payload") or {}, stubs) + if not stubs: + return + + extractorRegistry = ExtractorRegistry() + chunkerRegistry = ChunkerRegistry() + + for stub in stubs: + if stub["size"] and stub["size"] > limits.maxAttachmentBytes: + result.skippedPolicy += 1 + continue + attUrl = ( + 
f"https://gmail.googleapis.com/gmail/v1/users/me/messages/{messageId}" + f"/attachments/{stub['attachmentId']}" + ) + detail = await googleGetFn(attUrl) + if not isinstance(detail, dict) or "error" in detail: + result.failed += 1 + continue + rawBytes = _decodeBase64Url(detail.get("data") or "") + if not rawBytes: + continue + fileName = stub["filename"] + mimeType = stub["mimeType"] + syntheticId = _syntheticAttachmentId(connectionId, messageId, stub["attachmentId"]) + + try: + extracted = runExtraction( + extractorRegistry, chunkerRegistry, + rawBytes, fileName, mimeType, + ExtractionOptions(mergeStrategy=None), + ) + except Exception as exc: + logger.warning("gmail attachment extract %s failed: %s", stub["attachmentId"], exc) + result.failed += 1 + continue + + contentObjects: List[Dict[str, Any]] = [] + for part in getattr(extracted, "parts", None) or []: + data = getattr(part, "data", None) or "" + if not data or not str(data).strip(): + continue + typeGroup = getattr(part, "typeGroup", "text") or "text" + contentType = "text" + if typeGroup == "image": + contentType = "image" + elif typeGroup in ("binary", "container"): + contentType = "other" + contentObjects.append({ + "contentObjectId": getattr(part, "id", ""), + "contentType": contentType, + "data": data, + "contextRef": { + "containerPath": fileName, + "location": getattr(part, "label", None) or "attachment", + **(getattr(part, "metadata", None) or {}), + }, + }) + if not contentObjects: + result.skippedPolicy += 1 + continue + + try: + await knowledgeService.requestIngestion( + IngestionJob( + sourceKind="gmail_attachment", + sourceId=syntheticId, + fileName=fileName, + mimeType=mimeType, + userId=userId, + mandateId=mandateId, + contentObjects=contentObjects, + provenance={ + "connectionId": connectionId, + "authority": "google", + "service": "gmail", + "parentId": parentSyntheticId, + "externalItemId": stub["attachmentId"], + "parentMessageId": messageId, + }, + ) + ) + result.attachmentsIndexed += 1 
+ except Exception as exc: + logger.warning("gmail attachment ingest %s failed: %s", stub["attachmentId"], exc) + result.failed += 1 + + +def _finalizeResult(connectionId: str, result: GmailBootstrapResult, startMs: float) -> Dict[str, Any]: + durationMs = int((time.time() - startMs) * 1000) + logger.info( + "ingestion.connection.bootstrap.done part=gmail connectionId=%s indexed=%d skippedDup=%d skippedPolicy=%d attachments=%d failed=%d durationMs=%d", + connectionId, + result.indexed, result.skippedDuplicate, result.skippedPolicy, + result.attachmentsIndexed, result.failed, durationMs, + extra={ + "event": "ingestion.connection.bootstrap.done", + "part": "gmail", + "connectionId": connectionId, + "indexed": result.indexed, + "skippedDup": result.skippedDuplicate, + "skippedPolicy": result.skippedPolicy, + "attachmentsIndexed": result.attachmentsIndexed, + "failed": result.failed, + "durationMs": durationMs, + }, + ) + return { + "connectionId": result.connectionId, + "indexed": result.indexed, + "skippedDuplicate": result.skippedDuplicate, + "skippedPolicy": result.skippedPolicy, + "attachmentsIndexed": result.attachmentsIndexed, + "failed": result.failed, + "durationMs": durationMs, + "errors": result.errors[:20], + } diff --git a/tests/unit/services/test_bootstrap_clickup.py b/tests/unit/services/test_bootstrap_clickup.py new file mode 100644 index 00000000..87c08c3d --- /dev/null +++ b/tests/unit/services/test_bootstrap_clickup.py @@ -0,0 +1,203 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +"""Bootstrap ClickUp tests with a fake service + knowledge service. + +Verifies: +- Teams → spaces → lists (folderless + folder-based) → tasks traversal. +- Each task produces a `requestIngestion` call with `sourceKind="clickup_task"` + and header + description content-objects. +- `date_updated` is forwarded as contentVersion → idempotency. +- Recency filter drops tasks older than `maxAgeDays`. 
+- maxWorkspaces / maxListsPerWorkspace / maxTasks caps are respected. +""" + +import asyncio +import os +import sys +import time +from types import SimpleNamespace + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../..")) + +from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncClickup import ( + bootstrapClickup, + ClickupBootstrapLimits, + _syntheticTaskId, +) + + +def _nowMs(offsetDays: int = 0) -> str: + return str(int((time.time() + offsetDays * 86400) * 1000)) + + +class _FakeClickupService: + """Records API calls; serves a canned 1-team / 1-space / 1-list / 2-task layout.""" + + def __init__(self, taskCount=2, oldTask=False): + self._taskCount = taskCount + self._oldTask = oldTask # when True, the second task is 400 days old + self.calls = [] + + async def getAuthorizedTeams(self): + self.calls.append(("getAuthorizedTeams",)) + return {"teams": [{"id": "team-1", "name": "Acme"}]} + + async def getSpaces(self, team_id: str): + self.calls.append(("getSpaces", team_id)) + return {"spaces": [{"id": "space-1", "name": "Engineering"}]} + + async def getFolderlessLists(self, space_id: str): + self.calls.append(("getFolderlessLists", space_id)) + return {"lists": [{"id": "list-1", "name": "Sprint 1"}]} + + async def getFolders(self, space_id: str): + self.calls.append(("getFolders", space_id)) + return {"folders": [{"id": "folder-1", "name": "Subproject"}]} + + async def getListsInFolder(self, folder_id: str): + self.calls.append(("getListsInFolder", folder_id)) + return {"lists": [{"id": "list-2", "name": "Sub-tasks"}]} + + async def getTasksInList(self, list_id: str, *, page=0, include_closed=False, subtasks=True): + self.calls.append(("getTasksInList", list_id, page, include_closed)) + if page > 0: + return {"tasks": []} + tasks = [] + for i in range(self._taskCount): + tid = f"{list_id}-task-{i}" + offsetDays = -400 if (self._oldTask and i == 1) else 0 + tasks.append({ + "id": tid, + "name": f"Task {i} of {list_id}", + 
"description": f"Plain description for task {i}", + "text_content": f"Rich content for task {i}", + "status": {"status": "open" if i == 0 else "closed"}, + "assignees": [{"username": "alice"}], + "tags": [{"name": "urgent"}], + "date_updated": _nowMs(offsetDays), + "date_created": _nowMs(-1), + "url": f"https://app.clickup.com/t/{tid}", + }) + return {"tasks": tasks} + + +class _FakeKnowledgeService: + def __init__(self, duplicateIds=None): + self.calls = [] + self._duplicates = duplicateIds or set() + + async def requestIngestion(self, job): + self.calls.append(job) + status = "duplicate" if job.sourceId in self._duplicates else "indexed" + return SimpleNamespace( + jobId=job.sourceId, status=status, contentHash="h", + fileId=job.sourceId, index=None, error=None, + ) + + +def _adapter(svc): + return SimpleNamespace(_svc=svc) + + +def test_bootstrap_walks_team_space_lists_and_tasks(): + svc = _FakeClickupService(taskCount=2) + knowledge = _FakeKnowledgeService() + connection = SimpleNamespace(mandateId="m1", userId="u1") + + async def _run(): + return await bootstrapClickup( + connectionId="c1", + adapter=_adapter(svc), + connection=connection, + knowledgeService=knowledge, + limits=ClickupBootstrapLimits(maxAgeDays=None), + ) + + result = asyncio.run(_run()) + # 2 lists (folderless list-1 + folder's list-2) × 2 tasks each = 4 tasks + assert result["indexed"] == 4 + assert result["workspaces"] == 1 + assert result["lists"] == 2 + sourceIds = {c.sourceId for c in knowledge.calls} + assert len(sourceIds) == 4 + for job in knowledge.calls: + assert job.sourceKind == "clickup_task" + assert job.mimeType == "application/vnd.clickup.task+json" + assert job.mandateId == "m1" + assert job.provenance["connectionId"] == "c1" + assert job.provenance["authority"] == "clickup" + assert job.provenance["teamId"] == "team-1" + assert job.contentVersion # numeric millisecond string + # At least the header content-object is present. 
+ ids = [co["contentObjectId"] for co in job.contentObjects] + assert "header" in ids + + +def test_bootstrap_reports_duplicates_on_second_run(): + svc = _FakeClickupService(taskCount=1) + duplicates = { + _syntheticTaskId("c1", "list-1-task-0"), + _syntheticTaskId("c1", "list-2-task-0"), + } + knowledge = _FakeKnowledgeService(duplicateIds=duplicates) + connection = SimpleNamespace(mandateId="m1", userId="u1") + + async def _run(): + return await bootstrapClickup( + connectionId="c1", + adapter=_adapter(svc), + connection=connection, + knowledgeService=knowledge, + limits=ClickupBootstrapLimits(maxAgeDays=None), + ) + + result = asyncio.run(_run()) + assert result["indexed"] == 0 + assert result["skippedDuplicate"] == 2 + + +def test_bootstrap_skips_tasks_older_than_maxAgeDays(): + svc = _FakeClickupService(taskCount=2, oldTask=True) + knowledge = _FakeKnowledgeService() + connection = SimpleNamespace(mandateId="m1", userId="u1") + + async def _run(): + return await bootstrapClickup( + connectionId="c1", + adapter=_adapter(svc), + connection=connection, + knowledgeService=knowledge, + limits=ClickupBootstrapLimits(maxAgeDays=180), + ) + + result = asyncio.run(_run()) + # 2 lists × (1 recent + 1 skipped old) = 2 indexed + 2 skippedPolicy + assert result["indexed"] == 2 + assert result["skippedPolicy"] == 2 + + +def test_bootstrap_maxTasks_caps_ingestion(): + svc = _FakeClickupService(taskCount=2) + knowledge = _FakeKnowledgeService() + connection = SimpleNamespace(mandateId="m1", userId="u1") + + async def _run(): + return await bootstrapClickup( + connectionId="c1", + adapter=_adapter(svc), + connection=connection, + knowledgeService=knowledge, + limits=ClickupBootstrapLimits(maxAgeDays=None, maxTasks=3), + ) + + result = asyncio.run(_run()) + assert result["indexed"] == 3 + + +if __name__ == "__main__": + test_bootstrap_walks_team_space_lists_and_tasks() + test_bootstrap_reports_duplicates_on_second_run() + test_bootstrap_skips_tasks_older_than_maxAgeDays() + 
test_bootstrap_maxTasks_caps_ingestion() + print("OK — bootstrapClickup tests passed") diff --git a/tests/unit/services/test_bootstrap_gdrive.py b/tests/unit/services/test_bootstrap_gdrive.py new file mode 100644 index 00000000..1b88677e --- /dev/null +++ b/tests/unit/services/test_bootstrap_gdrive.py @@ -0,0 +1,225 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +"""Bootstrap Google Drive tests with a fake adapter + knowledge service. + +Verifies: +- Drive walk traverses root → subfolders, respecting `maxDepth`. +- Every file triggers `requestIngestion` with `sourceKind="gdrive_item"`. +- Duplicate runs (same modifiedTime revision) report `skippedDuplicate`. +- Provenance carries `authority="google"` and the Drive file id. +- Recency filter skips files older than `maxAgeDays`. +""" + +import asyncio +import os +import sys +from dataclasses import dataclass +from datetime import datetime, timedelta, timezone +from types import SimpleNamespace +from typing import Any, Dict, List, Optional + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../..")) + +from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGdrive import ( + bootstrapGdrive, + GdriveBootstrapLimits, + _syntheticFileId, +) + + +@dataclass +class _ExtEntry: + name: str + path: str + isFolder: bool = False + size: Optional[int] = None + mimeType: Optional[str] = None + metadata: Dict[str, Any] = None + + +def _today_iso(offsetDays: int = 0) -> str: + return (datetime.now(timezone.utc) + timedelta(days=offsetDays)).strftime("%Y-%m-%dT%H:%M:%SZ") + + +class _FakeDriveAdapter: + """Minimal DriveAdapter stand-in. 
+ + Layout: + "/" (root) → 2 files + 1 folder (sub) + "/sub_id" → 1 file + """ + + def __init__(self, recent_only: bool = True): + self.downloaded: List[str] = [] + self._recent = _today_iso(0) + self._old = _today_iso(-400) + self._recent_only = recent_only + + async def browse(self, path: str, filter=None, limit=None): + if path in ("/", "", "root"): + return [ + _ExtEntry( + name="f1.txt", path="/f1", size=20, + mimeType="text/plain", + metadata={"id": "f1", "modifiedTime": self._recent}, + ), + _ExtEntry( + name="f2.txt", path="/f2", size=20, + mimeType="text/plain", + metadata={"id": "f2", "modifiedTime": self._recent if self._recent_only else self._old}, + ), + _ExtEntry( + name="Subfolder", path="/sub_id", isFolder=True, + mimeType="application/vnd.google-apps.folder", + metadata={"id": "sub_id", "modifiedTime": self._recent}, + ), + ] + if path == "/sub_id": + return [ + _ExtEntry( + name="f3.txt", path="/f3", size=20, + mimeType="text/plain", + metadata={"id": "f3", "modifiedTime": self._recent}, + ), + ] + return [] + + async def download(self, path: str) -> bytes: + self.downloaded.append(path) + return path.encode("utf-8") + + +class _FakeKnowledgeService: + def __init__(self, duplicateIds=None): + self.calls: List[SimpleNamespace] = [] + self._duplicateIds = duplicateIds or set() + + async def requestIngestion(self, job): + self.calls.append(job) + status = "duplicate" if job.sourceId in self._duplicateIds else "indexed" + return SimpleNamespace( + jobId=f"{job.sourceKind}:{job.sourceId}", + status=status, contentHash="h", + fileId=job.sourceId, index=None, error=None, + ) + + +def _fakeRunExtraction(data, name, mime, options): + return SimpleNamespace( + parts=[ + SimpleNamespace( + id="p1", + data=data.decode("utf-8") if isinstance(data, bytes) else str(data), + typeGroup="text", + label="page:1", + metadata={"pageIndex": 0}, + ) + ] + ) + + +def test_bootstrap_walks_drive_and_subfolders(): + adapter = _FakeDriveAdapter() + knowledge = 
_FakeKnowledgeService() + connection = SimpleNamespace(mandateId="m1", userId="u1") + + async def _run(): + return await bootstrapGdrive( + connectionId="c1", + adapter=adapter, + connection=connection, + knowledgeService=knowledge, + runExtractionFn=_fakeRunExtraction, + limits=GdriveBootstrapLimits(maxAgeDays=None), + ) + + result = asyncio.run(_run()) + assert len(knowledge.calls) == 3 + sourceIds = {c.sourceId for c in knowledge.calls} + assert sourceIds == { + _syntheticFileId("c1", "f1"), + _syntheticFileId("c1", "f2"), + _syntheticFileId("c1", "f3"), + } + assert result["indexed"] == 3 + assert result["skippedDuplicate"] == 0 + assert adapter.downloaded == ["/f1", "/f2", "/f3"] + + +def test_bootstrap_reports_duplicates_on_second_run(): + adapter = _FakeDriveAdapter() + duplicateIds = { + _syntheticFileId("c1", "f1"), + _syntheticFileId("c1", "f2"), + _syntheticFileId("c1", "f3"), + } + knowledge = _FakeKnowledgeService(duplicateIds=duplicateIds) + connection = SimpleNamespace(mandateId="m1", userId="u1") + + async def _run(): + return await bootstrapGdrive( + connectionId="c1", + adapter=adapter, + connection=connection, + knowledgeService=knowledge, + runExtractionFn=_fakeRunExtraction, + limits=GdriveBootstrapLimits(maxAgeDays=None), + ) + + result = asyncio.run(_run()) + assert result["indexed"] == 0 + assert result["skippedDuplicate"] == 3 + + +def test_bootstrap_skips_files_older_than_maxAgeDays(): + adapter = _FakeDriveAdapter(recent_only=False) # f2 is 400 days old + knowledge = _FakeKnowledgeService() + connection = SimpleNamespace(mandateId="m1", userId="u1") + + async def _run(): + return await bootstrapGdrive( + connectionId="c1", + adapter=adapter, + connection=connection, + knowledgeService=knowledge, + runExtractionFn=_fakeRunExtraction, + limits=GdriveBootstrapLimits(maxAgeDays=180), + ) + + result = asyncio.run(_run()) + assert result["indexed"] == 2 # f1, f3 + assert result["skippedPolicy"] == 1 # f2 filtered out + + +def 
test_bootstrap_passes_connection_provenance(): + adapter = _FakeDriveAdapter() + knowledge = _FakeKnowledgeService() + connection = SimpleNamespace(mandateId="m1", userId="u1") + + async def _run(): + return await bootstrapGdrive( + connectionId="c1", + adapter=adapter, + connection=connection, + knowledgeService=knowledge, + runExtractionFn=_fakeRunExtraction, + limits=GdriveBootstrapLimits(maxAgeDays=None), + ) + + asyncio.run(_run()) + for job in knowledge.calls: + assert job.sourceKind == "gdrive_item" + assert job.mandateId == "m1" + assert job.provenance["connectionId"] == "c1" + assert job.provenance["authority"] == "google" + assert job.provenance["service"] == "drive" + assert job.contentVersion # modifiedTime ISO string + + +if __name__ == "__main__": + test_bootstrap_walks_drive_and_subfolders() + test_bootstrap_reports_duplicates_on_second_run() + test_bootstrap_skips_files_older_than_maxAgeDays() + test_bootstrap_passes_connection_provenance() + print("OK — bootstrapGdrive tests passed") diff --git a/tests/unit/services/test_bootstrap_gmail.py b/tests/unit/services/test_bootstrap_gmail.py new file mode 100644 index 00000000..4f7cfe4d --- /dev/null +++ b/tests/unit/services/test_bootstrap_gmail.py @@ -0,0 +1,240 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +"""Bootstrap Gmail tests with a fake googleGet + knowledge service. + +Verifies: +- Default labels (INBOX + SENT) are traversed. +- Each message produces a requestIngestion call with sourceKind=gmail_message + and structured contentObjects (header / snippet / body). +- Pagination via `nextPageToken` is followed. +- historyId is forwarded as contentVersion → idempotency. +- MIME body extraction walks nested parts (multipart/alternative). 
+""" + +import asyncio +import base64 +import os +import sys +from types import SimpleNamespace + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../..")) + +from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGmail import ( + bootstrapGmail, + GmailBootstrapLimits, + _syntheticMessageId, + _buildContentObjects, + _walkPayloadForBody, +) + + +def _b64url(text: str) -> str: + return base64.urlsafe_b64encode(text.encode("utf-8")).decode("ascii").rstrip("=") + + +def _msg(mid: str, subject: str = "Hi", body: str = "Hello world", historyId: str = "h1"): + return { + "id": mid, + "threadId": f"thread-{mid}", + "historyId": historyId, + "internalDate": "1700000000000", + "snippet": body[:120], + "payload": { + "headers": [ + {"name": "Subject", "value": subject}, + {"name": "From", "value": "Alice "}, + {"name": "To", "value": "Bob "}, + {"name": "Date", "value": "Tue, 01 Jan 2025 10:00:00 +0000"}, + ], + "mimeType": "text/plain", + "body": {"data": _b64url(body), "size": len(body)}, + "parts": [], + }, + } + + +class _FakeGoogleGet: + """Records URLs + returns the wired-up page or message response.""" + + def __init__(self, messages_by_label, paginated_label=None, page2=None): + self._messages = messages_by_label + self._paginated = paginated_label + self._page2 = page2 or [] + self._served_first_page = set() + self.requested = [] + + async def __call__(self, url: str): + self.requested.append(url) + # List page: contains `/users/me/messages?labelIds=...` + if "/users/me/messages?" 
in url: + for label, msgs in self._messages.items(): + if f"labelIds={label}" in url: + if ( + label == self._paginated + and label not in self._served_first_page + ): + self._served_first_page.add(label) + return { + "messages": [{"id": m["id"]} for m in msgs], + "nextPageToken": "token-2", + } + if label == self._paginated and "pageToken=token-2" in url: + return { + "messages": [{"id": m["id"]} for m in self._page2], + } + return {"messages": [{"id": m["id"]} for m in msgs]} + return {"messages": []} + # Detail fetch: /users/me/messages/{id}?format=full + if "/users/me/messages/" in url and "format=full" in url: + msgId = url.split("/users/me/messages/")[-1].split("?")[0] + for msgs in self._messages.values(): + for m in msgs: + if m["id"] == msgId: + return m + for m in self._page2: + if m["id"] == msgId: + return m + return {"error": "not found"} + + +class _FakeKnowledgeService: + def __init__(self, duplicateIds=None): + self.calls = [] + self._duplicates = duplicateIds or set() + + async def requestIngestion(self, job): + self.calls.append(job) + status = "duplicate" if job.sourceId in self._duplicates else "indexed" + return SimpleNamespace( + jobId=job.sourceId, status=status, contentHash="h", + fileId=job.sourceId, index=None, error=None, + ) + + +def test_buildContentObjects_emits_header_snippet_body(): + parts = _buildContentObjects(_msg("m1", body="Hello\nWorld"), maxBodyChars=8000) + ids = [p["contentObjectId"] for p in parts] + assert ids == ["header", "snippet", "body"] + header = parts[0]["data"] + assert "Subject: Hi" in header + assert "From: Alice " in header + assert "To: Bob " in header + + +def test_walkPayloadForBody_prefers_plain_over_html(): + payload = { + "mimeType": "multipart/alternative", + "parts": [ + {"mimeType": "text/plain", "body": {"data": _b64url("plain body")}}, + {"mimeType": "text/html", "body": {"data": _b64url("

html body

")}}, + ], + } + bodies = _walkPayloadForBody(payload) + assert bodies["text"] == "plain body" + assert bodies["html"] == "

html body

" + + +def test_walkPayloadForBody_falls_back_to_html(): + payload = { + "mimeType": "multipart/alternative", + "parts": [ + {"mimeType": "text/html", "body": {"data": _b64url("

only html

")}}, + ], + } + bodies = _walkPayloadForBody(payload) + assert bodies["text"] == "" + assert "only html" in bodies["html"] + + +def test_bootstrap_gmail_indexes_messages_from_inbox_and_sent(): + fake_get = _FakeGoogleGet({ + "INBOX": [_msg("m1"), _msg("m2")], + "SENT": [_msg("m3")], + }) + knowledge = _FakeKnowledgeService() + connection = SimpleNamespace(mandateId="m1", userId="u1") + + async def _run(): + return await bootstrapGmail( + connectionId="c1", + adapter=SimpleNamespace(_token="t"), + connection=connection, + knowledgeService=knowledge, + limits=GmailBootstrapLimits(maxAgeDays=None), + googleGetFn=fake_get, + ) + + result = asyncio.run(_run()) + assert result["indexed"] == 3 + sourceIds = {c.sourceId for c in knowledge.calls} + assert sourceIds == { + _syntheticMessageId("c1", "m1"), + _syntheticMessageId("c1", "m2"), + _syntheticMessageId("c1", "m3"), + } + for job in knowledge.calls: + assert job.sourceKind == "gmail_message" + assert job.mimeType == "message/rfc822" + assert job.provenance["connectionId"] == "c1" + assert job.provenance["authority"] == "google" + assert job.provenance["service"] == "gmail" + assert job.contentVersion == "h1" + assert any(co["contentObjectId"] == "header" for co in job.contentObjects) + + +def test_bootstrap_gmail_follows_pagination(): + fake_get = _FakeGoogleGet( + messages_by_label={"INBOX": [_msg("m1")], "SENT": []}, + paginated_label="INBOX", + page2=[_msg("m2"), _msg("m3")], + ) + knowledge = _FakeKnowledgeService() + connection = SimpleNamespace(mandateId="m1", userId="u1") + + async def _run(): + return await bootstrapGmail( + connectionId="c1", + adapter=SimpleNamespace(_token="t"), + connection=connection, + knowledgeService=knowledge, + limits=GmailBootstrapLimits(maxAgeDays=None), + googleGetFn=fake_get, + ) + + result = asyncio.run(_run()) + assert result["indexed"] == 3 + + +def test_bootstrap_gmail_reports_duplicates(): + fake_get = _FakeGoogleGet({"INBOX": [_msg("m1"), _msg("m2")], "SENT": []}) + 
duplicates = { + _syntheticMessageId("c1", "m1"), + _syntheticMessageId("c1", "m2"), + } + knowledge = _FakeKnowledgeService(duplicateIds=duplicates) + connection = SimpleNamespace(mandateId="m1", userId="u1") + + async def _run(): + return await bootstrapGmail( + connectionId="c1", + adapter=SimpleNamespace(_token="t"), + connection=connection, + knowledgeService=knowledge, + limits=GmailBootstrapLimits(maxAgeDays=None), + googleGetFn=fake_get, + ) + + result = asyncio.run(_run()) + assert result["indexed"] == 0 + assert result["skippedDuplicate"] == 2 + + +if __name__ == "__main__": + test_buildContentObjects_emits_header_snippet_body() + test_walkPayloadForBody_prefers_plain_over_html() + test_walkPayloadForBody_falls_back_to_html() + test_bootstrap_gmail_indexes_messages_from_inbox_and_sent() + test_bootstrap_gmail_follows_pagination() + test_bootstrap_gmail_reports_duplicates() + print("OK — bootstrapGmail tests passed") diff --git a/tests/unit/services/test_knowledge_ingest_consumer.py b/tests/unit/services/test_knowledge_ingest_consumer.py index 760e1ed6..6b27a6e8 100644 --- a/tests/unit/services/test_knowledge_ingest_consumer.py +++ b/tests/unit/services/test_knowledge_ingest_consumer.py @@ -99,17 +99,18 @@ def test_onConnectionRevoked_ignores_missing_id(monkeypatch): assert seen == [] -def test_bootstrap_job_skips_non_pilot_authority(monkeypatch): +def test_bootstrap_job_skips_unsupported_authority(monkeypatch): async def _run(): result = await consumer._bootstrapJobHandler( - {"payload": {"connectionId": "c1", "authority": "google"}}, + {"payload": {"connectionId": "c1", "authority": "slack"}}, lambda *_: None, ) return result result = asyncio.run(_run()) assert result["skipped"] is True - assert result["authority"] == "google" + assert result["authority"] == "slack" + assert result["reason"] == "unsupported_authority" def test_bootstrap_job_dispatches_msft_parts(monkeypatch): @@ -123,8 +124,6 @@ def test_bootstrap_job_dispatches_msft_parts(monkeypatch): 
calls["ol"] += 1 return {"indexed": 2} - # subConnectorSync* are lazy-imported inside the handler; install fake - # modules before invoking. fakeSharepoint = types.ModuleType("subConnectorSyncSharepoint") fakeSharepoint.bootstrapSharepoint = _fakeSp fakeOutlook = types.ModuleType("subConnectorSyncOutlook") @@ -152,6 +151,70 @@ def test_bootstrap_job_dispatches_msft_parts(monkeypatch): assert result["outlook"] == {"indexed": 2} +def test_bootstrap_job_dispatches_google_parts(monkeypatch): + calls = {"gd": 0, "gm": 0} + + async def _fakeGd(connectionId, progressCb=None): + calls["gd"] += 1 + return {"indexed": 7} + + async def _fakeGm(connectionId, progressCb=None): + calls["gm"] += 1 + return {"indexed": 11} + + fakeGdrive = types.ModuleType("subConnectorSyncGdrive") + fakeGdrive.bootstrapGdrive = _fakeGd + fakeGmail = types.ModuleType("subConnectorSyncGmail") + fakeGmail.bootstrapGmail = _fakeGm + monkeypatch.setitem( + sys.modules, + "modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGdrive", + fakeGdrive, + ) + monkeypatch.setitem( + sys.modules, + "modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGmail", + fakeGmail, + ) + + async def _run(): + return await consumer._bootstrapJobHandler( + {"payload": {"connectionId": "c1", "authority": "google"}}, + lambda *_: None, + ) + + result = asyncio.run(_run()) + assert calls == {"gd": 1, "gm": 1} + assert result["drive"] == {"indexed": 7} + assert result["gmail"] == {"indexed": 11} + + +def test_bootstrap_job_dispatches_clickup_part(monkeypatch): + calls = {"cu": 0} + + async def _fakeCu(connectionId, progressCb=None): + calls["cu"] += 1 + return {"indexed": 4} + + fakeClickup = types.ModuleType("subConnectorSyncClickup") + fakeClickup.bootstrapClickup = _fakeCu + monkeypatch.setitem( + sys.modules, + "modules.serviceCenter.services.serviceKnowledge.subConnectorSyncClickup", + fakeClickup, + ) + + async def _run(): + return await consumer._bootstrapJobHandler( + {"payload": {"connectionId": 
"c1", "authority": "clickup"}}, + lambda *_: None, + ) + + result = asyncio.run(_run()) + assert calls == {"cu": 1} + assert result["clickup"] == {"indexed": 4} + + if __name__ == "__main__": # Usable without pytest fixtures for a quick smoke run. class _MP: From 93cb6939dc6a42212b100259538f6951594114fb Mon Sep 17 00:00:00 2001 From: Ida Date: Wed, 29 Apr 2026 09:13:57 +0200 Subject: [PATCH 07/18] feat: frontend consent integration --- modules/datamodels/datamodelUam.py | 18 +- modules/interfaces/interfaceDbApp.py | 41 +-- modules/routes/routeDataConnections.py | 13 +- modules/routes/routeSecurityClickup.py | 23 +- modules/routes/routeSecurityGoogle.py | 23 +- modules/routes/routeSecurityMsft.py | 23 +- .../serviceKnowledge/mainServiceKnowledge.py | 9 +- .../subConnectorIngestConsumer.py | 21 ++ .../serviceKnowledge/subConnectorPrefs.py | 101 ++++++ .../subConnectorSyncClickup.py | 65 ++-- .../subConnectorSyncGdrive.py | 16 +- .../serviceKnowledge/subConnectorSyncGmail.py | 60 +++- .../subConnectorSyncOutlook.py | 65 ++-- .../subConnectorSyncSharepoint.py | 10 +- tests/unit/services/test_p1d_consent_prefs.py | 298 ++++++++++++++++++ 15 files changed, 678 insertions(+), 108 deletions(-) create mode 100644 modules/serviceCenter/services/serviceKnowledge/subConnectorPrefs.py create mode 100644 tests/unit/services/test_p1d_consent_prefs.py diff --git a/modules/datamodels/datamodelUam.py b/modules/datamodels/datamodelUam.py index 0f7fe6b8..6aba24eb 100644 --- a/modules/datamodels/datamodelUam.py +++ b/modules/datamodels/datamodelUam.py @@ -475,7 +475,23 @@ class UserConnection(PowerOnModel): description="OAuth scopes granted for this connection", json_schema_extra={"frontend_type": "list", "frontend_readonly": True, "frontend_required": False, "label": "Gewährte Berechtigungen"}, ) - + knowledgeIngestionEnabled: bool = Field( + default=False, + description="Whether the user has consented to knowledge ingestion for this connection", + json_schema_extra={"frontend_type": 
"boolean", "frontend_readonly": False, "frontend_required": False, "label": "Wissensdatenbank aktiv"}, + ) + knowledgePreferences: Optional[Dict[str, Any]] = Field( + default=None, + description=( + "Per-connection knowledge ingestion preferences. schemaVersion=1 keys: " + "neutralizeBeforeEmbed (bool), mailContentDepth (metadata|snippet|full), " + "mailIndexAttachments (bool), filesIndexBinaries (bool), mimeAllowlist (list[str]), " + "clickupScope (titles|title_description|with_comments), " + "surfaceToggles (dict per authority), maxAgeDays (int)." + ), + json_schema_extra={"frontend_type": "json", "frontend_readonly": False, "frontend_required": False, "label": "Wissenspräferenzen"}, + ) + @computed_field @property def connectionReference(self) -> str: diff --git a/modules/interfaces/interfaceDbApp.py b/modules/interfaces/interfaceDbApp.py index 51519a29..04ae82ff 100644 --- a/modules/interfaces/interfaceDbApp.py +++ b/modules/interfaces/interfaceDbApp.py @@ -1268,19 +1268,7 @@ class AppObjects: result = [] for conn_dict in connections: try: - # Create UserConnection object - connection = UserConnection( - id=conn_dict["id"], - userId=conn_dict["userId"], - authority=conn_dict.get("authority"), - externalId=conn_dict.get("externalId", ""), - externalUsername=conn_dict.get("externalUsername", ""), - externalEmail=conn_dict.get("externalEmail"), - status=conn_dict.get("status", "pending"), - connectedAt=conn_dict.get("connectedAt"), - lastChecked=conn_dict.get("lastChecked"), - expiresAt=conn_dict.get("expiresAt"), - ) + connection = UserConnection.model_validate(conn_dict) result.append(connection) except Exception as e: logger.error( @@ -1317,18 +1305,21 @@ class AppObjects: if connections: conn_dict = connections[0] - return UserConnection( - id=conn_dict["id"], - userId=conn_dict["userId"], - authority=conn_dict.get("authority"), - externalId=conn_dict.get("externalId", ""), - externalUsername=conn_dict.get("externalUsername", ""), - 
externalEmail=conn_dict.get("externalEmail"), - status=conn_dict.get("status", "pending"), - connectedAt=conn_dict.get("connectedAt"), - lastChecked=conn_dict.get("lastChecked"), - expiresAt=conn_dict.get("expiresAt"), - ) + try: + return UserConnection.model_validate(conn_dict) + except Exception: + return UserConnection( + id=conn_dict["id"], + userId=conn_dict["userId"], + authority=conn_dict.get("authority"), + externalId=conn_dict.get("externalId", ""), + externalUsername=conn_dict.get("externalUsername", ""), + externalEmail=conn_dict.get("externalEmail"), + status=conn_dict.get("status", "pending"), + connectedAt=conn_dict.get("connectedAt"), + lastChecked=conn_dict.get("lastChecked"), + expiresAt=conn_dict.get("expiresAt"), + ) return None except Exception as e: logger.error(f"Error getting user connection by ID: {str(e)}") diff --git a/modules/routes/routeDataConnections.py b/modules/routes/routeDataConnections.py index b8ccf4bf..51549d6a 100644 --- a/modules/routes/routeDataConnections.py +++ b/modules/routes/routeDataConnections.py @@ -351,11 +351,18 @@ def create_connection( externalUsername="", # Will be set after OAuth status=ConnectionStatus.PENDING # Start with PENDING status ) - + + # Apply knowledge consent + preferences from request body before persisting + knowledge_enabled = connection_data.get("knowledgeIngestionEnabled") + if isinstance(knowledge_enabled, bool): + connection.knowledgeIngestionEnabled = knowledge_enabled + knowledge_prefs = connection_data.get("knowledgePreferences") + if isinstance(knowledge_prefs, dict): + connection.knowledgePreferences = knowledge_prefs + # Save connection record - models now handle timestamp serialization automatically interface.db.recordModify(UserConnection, connection.id, connection.model_dump()) - - + return connection except HTTPException: diff --git a/modules/routes/routeSecurityClickup.py b/modules/routes/routeSecurityClickup.py index 698e3ca1..d6f71d20 100644 --- 
a/modules/routes/routeSecurityClickup.py +++ b/modules/routes/routeSecurityClickup.py @@ -244,12 +244,23 @@ async def auth_connect_callback( try: from modules.shared.callbackRegistry import callbackRegistry - callbackRegistry.trigger( - "connection.established", - connectionId=connection.id, - authority=str(getattr(connection.authority, "value", connection.authority) or "clickup"), - userId=str(user.id), - ) + if connection.knowledgeIngestionEnabled: + callbackRegistry.trigger( + "connection.established", + connectionId=connection.id, + authority=str(getattr(connection.authority, "value", connection.authority) or "clickup"), + userId=str(user.id), + ) + else: + logger.info( + "ingestion.connection.bootstrap.skipped — knowledge ingestion disabled by user", + extra={ + "event": "ingestion.connection.bootstrap.skipped", + "connectionId": connection.id, + "authority": "clickup", + "reason": "consent_disabled", + }, + ) except Exception as _cbErr: logger.warning("connection.established callback failed for %s: %s", connection.id, _cbErr) diff --git a/modules/routes/routeSecurityGoogle.py b/modules/routes/routeSecurityGoogle.py index 2b6a70f5..7b6c1c64 100644 --- a/modules/routes/routeSecurityGoogle.py +++ b/modules/routes/routeSecurityGoogle.py @@ -482,12 +482,23 @@ async def auth_connect_callback( try: from modules.shared.callbackRegistry import callbackRegistry - callbackRegistry.trigger( - "connection.established", - connectionId=connection.id, - authority=str(getattr(connection.authority, "value", connection.authority) or "google"), - userId=str(user.id), - ) + if connection.knowledgeIngestionEnabled: + callbackRegistry.trigger( + "connection.established", + connectionId=connection.id, + authority=str(getattr(connection.authority, "value", connection.authority) or "google"), + userId=str(user.id), + ) + else: + logger.info( + "ingestion.connection.bootstrap.skipped — knowledge ingestion disabled by user", + extra={ + "event": "ingestion.connection.bootstrap.skipped", 
+ "connectionId": connection.id, + "authority": "google", + "reason": "consent_disabled", + }, + ) except Exception as _cbErr: logger.warning("connection.established callback failed for %s: %s", connection.id, _cbErr) diff --git a/modules/routes/routeSecurityMsft.py b/modules/routes/routeSecurityMsft.py index e087a44c..a2768a2b 100644 --- a/modules/routes/routeSecurityMsft.py +++ b/modules/routes/routeSecurityMsft.py @@ -423,12 +423,23 @@ async def auth_connect_callback( try: from modules.shared.callbackRegistry import callbackRegistry - callbackRegistry.trigger( - "connection.established", - connectionId=connection.id, - authority=str(getattr(connection.authority, "value", connection.authority) or "msft"), - userId=str(user.id), - ) + if connection.knowledgeIngestionEnabled: + callbackRegistry.trigger( + "connection.established", + connectionId=connection.id, + authority=str(getattr(connection.authority, "value", connection.authority) or "msft"), + userId=str(user.id), + ) + else: + logger.info( + "ingestion.connection.bootstrap.skipped — knowledge ingestion disabled by user", + extra={ + "event": "ingestion.connection.bootstrap.skipped", + "connectionId": connection.id, + "authority": "msft", + "reason": "consent_disabled", + }, + ) except Exception as _cbErr: logger.warning("connection.established callback failed for %s: %s", connection.id, _cbErr) diff --git a/modules/serviceCenter/services/serviceKnowledge/mainServiceKnowledge.py b/modules/serviceCenter/services/serviceKnowledge/mainServiceKnowledge.py index 0267e2fd..6698e164 100644 --- a/modules/serviceCenter/services/serviceKnowledge/mainServiceKnowledge.py +++ b/modules/serviceCenter/services/serviceKnowledge/mainServiceKnowledge.py @@ -48,6 +48,9 @@ class IngestionJob: containerPath: Optional[str] = None contentVersion: Optional[str] = None provenance: Optional[Dict[str, Any]] = None + # Connector-driven neutralization: True when the user opted in via §2.6 preferences. 
+ # For sourceKind == "file", _indexFileInternal resolves this from FileItem.neutralize instead. + neutralize: bool = False @dataclass @@ -205,6 +208,7 @@ class KnowledgeService: containerPath=job.containerPath, sourceKind=job.sourceKind, connectionId=(job.provenance or {}).get("connectionId"), + neutralize=job.neutralize, ) except Exception as exc: logger.error( @@ -391,6 +395,7 @@ class KnowledgeService: containerPath: str = None, sourceKind: str = "file", connectionId: Optional[str] = None, + neutralize: bool = False, ) -> FileContentIndex: """Index a file's content objects and create embeddings for text chunks. @@ -421,7 +426,7 @@ class KnowledgeService: resolvedMandateId = mandateId resolvedFeatureInstanceId = featureInstanceId resolvedUserId = userId - _shouldNeutralize = False + _shouldNeutralize = neutralize # caller-supplied flag (connector prefs / IngestionJob) if sourceKind == "file": try: from modules.datamodels.datamodelFiles import FileItem as _FileItem @@ -435,7 +440,7 @@ class KnowledgeService: if _fileRecords: _fileRecord = _fileRecords[0] _get = (lambda k, d=None: _fileRecord.get(k, d)) if isinstance(_fileRecord, dict) else (lambda k, d=None: getattr(_fileRecord, k, d)) - _shouldNeutralize = bool(_get("neutralize", False)) + _shouldNeutralize = bool(_get("neutralize", False)) # FileItem is authoritative for uploads _fileScope = _get("scope") if _fileScope: resolvedScope = _fileScope diff --git a/modules/serviceCenter/services/serviceKnowledge/subConnectorIngestConsumer.py b/modules/serviceCenter/services/serviceKnowledge/subConnectorIngestConsumer.py index f9b3533d..e27e2d29 100644 --- a/modules/serviceCenter/services/serviceKnowledge/subConnectorIngestConsumer.py +++ b/modules/serviceCenter/services/serviceKnowledge/subConnectorIngestConsumer.py @@ -135,6 +135,27 @@ async def _bootstrapJobHandler( progressCb(5, f"resolving {authority} connection") + # Defensive consent check: if the connection has since disabled knowledge ingestion + # (e.g. 
user toggled setting after the job was enqueued), skip all walkers. + try: + from modules.interfaces.interfaceDbApp import getRootInterface + _root = getRootInterface() + _conn = _root.getUserConnectionById(connectionId) + if _conn and not getattr(_conn, "knowledgeIngestionEnabled", True): + logger.info( + "ingestion.connection.bootstrap.skipped — consent disabled connectionId=%s", + connectionId, + extra={ + "event": "ingestion.connection.bootstrap.skipped", + "connectionId": connectionId, + "authority": authority, + "reason": "consent_disabled", + }, + ) + return {"connectionId": connectionId, "authority": authority, "skipped": True, "reason": "consent_disabled"} + except Exception as _guardErr: + logger.debug("Could not load connection for consent guard: %s", _guardErr) + def _normalize(res: Any, label: str) -> Dict[str, Any]: if isinstance(res, Exception): logger.error( diff --git a/modules/serviceCenter/services/serviceKnowledge/subConnectorPrefs.py b/modules/serviceCenter/services/serviceKnowledge/subConnectorPrefs.py new file mode 100644 index 00000000..950400ce --- /dev/null +++ b/modules/serviceCenter/services/serviceKnowledge/subConnectorPrefs.py @@ -0,0 +1,101 @@ +"""Per-connection knowledge ingestion preference helpers. + +Walkers call `loadConnectionPrefs(connectionId)` once at bootstrap start and +receive a `ConnectionIngestionPrefs` dataclass they can pass down into their +inner loops. All fields have safe defaults so walkers stay backward-compatible +with connections that predate the §2.6 preference schema (knowledgePreferences +is None). 
+""" +from __future__ import annotations + +import logging +from dataclasses import dataclass, field +from typing import Any, Dict, List, Optional + +logger = logging.getLogger(__name__) + +_DEFAULT_MAX_AGE_DAYS = 90 +_DEFAULT_MAIL_DEPTH = "full" +_DEFAULT_CLICKUP_SCOPE = "title_description" + + +@dataclass +class ConnectionIngestionPrefs: + """Parsed per-connection preferences for knowledge ingestion walkers.""" + + # PII + neutralizeBeforeEmbed: bool = False + + # Mail (Outlook + Gmail) + mailContentDepth: str = _DEFAULT_MAIL_DEPTH # "metadata" | "snippet" | "full" + mailIndexAttachments: bool = False + + # Files (Drive / SharePoint / OneDrive) + filesIndexBinaries: bool = True + mimeAllowlist: List[str] = field(default_factory=list) # empty = all allowed + + # ClickUp + clickupScope: str = _DEFAULT_CLICKUP_SCOPE # "titles" | "title_description" | "with_comments" + clickupIndexAttachments: bool = False + + # Per-authority surface toggles (default everything on) + gmailEnabled: bool = True + driveEnabled: bool = True + sharepointEnabled: bool = True + outlookEnabled: bool = True + + # Time window + maxAgeDays: int = _DEFAULT_MAX_AGE_DAYS # 0 = no limit + + +def loadConnectionPrefs(connectionId: str) -> ConnectionIngestionPrefs: + """Load and parse per-connection preferences from the database. + + Returns safe defaults for any missing or unparseable values so walkers + never fail due to missing preference data. 
+ """ + try: + from modules.interfaces.interfaceDbApp import getRootInterface + root = getRootInterface() + conn = root.getUserConnectionById(connectionId) + if not conn: + logger.debug("loadConnectionPrefs: connection %s not found, using defaults", connectionId) + return ConnectionIngestionPrefs() + + raw: Optional[Dict[str, Any]] = getattr(conn, "knowledgePreferences", None) + if not raw or not isinstance(raw, dict): + return ConnectionIngestionPrefs() + + def _bool(key: str, default: bool) -> bool: + v = raw.get(key) + return bool(v) if isinstance(v, bool) else default + + def _str(key: str, allowed: List[str], default: str) -> str: + v = raw.get(key) + return v if v in allowed else default + + def _int(key: str, default: int) -> int: + v = raw.get(key) + return int(v) if isinstance(v, int) else default + + surface = raw.get("surfaceToggles") or {} + google_surf = surface.get("google") or {} + msft_surf = surface.get("msft") or {} + + return ConnectionIngestionPrefs( + neutralizeBeforeEmbed=_bool("neutralizeBeforeEmbed", False), + mailContentDepth=_str("mailContentDepth", ["metadata", "snippet", "full"], _DEFAULT_MAIL_DEPTH), + mailIndexAttachments=_bool("mailIndexAttachments", False), + filesIndexBinaries=_bool("filesIndexBinaries", True), + mimeAllowlist=list(raw.get("mimeAllowlist") or []), + clickupScope=_str("clickupScope", ["titles", "title_description", "with_comments"], _DEFAULT_CLICKUP_SCOPE), + clickupIndexAttachments=_bool("clickupIndexAttachments", False), + gmailEnabled=bool(google_surf.get("gmail", True)), + driveEnabled=bool(google_surf.get("drive", True)), + sharepointEnabled=bool(msft_surf.get("sharepoint", True)), + outlookEnabled=bool(msft_surf.get("outlook", True)), + maxAgeDays=_int("maxAgeDays", _DEFAULT_MAX_AGE_DAYS), + ) + except Exception as exc: + logger.warning("loadConnectionPrefs failed for %s, using defaults: %s", connectionId, exc) + return ConnectionIngestionPrefs() diff --git 
a/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncClickup.py b/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncClickup.py index 16e94e59..31ac9687 100644 --- a/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncClickup.py +++ b/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncClickup.py @@ -46,6 +46,10 @@ class ClickupBootstrapLimits: # ClickUp `closed` tasks often carry the most useful RAG context # ("why was this shipped the way it was?"). includeClosed: bool = True + # Pass-through to IngestionJob.neutralize + neutralize: bool = False + # Content scope: "titles" | "title_description" | "with_comments" + clickupScope: str = "title_description" @dataclass @@ -88,7 +92,14 @@ def _isRecent(dateUpdatedMs: Any, maxAgeDays: Optional[int]) -> bool: def _buildContentObjects(task: Dict[str, Any], limits: ClickupBootstrapLimits) -> List[Dict[str, Any]]: - """Header (name/status/metadata) + description + text_content, all text.""" + """Header (name/status/metadata) + optional description + text_content. 
+ + `limits.clickupScope` controls how much is embedded: + - "titles": task name + status metadata only + - "title_description": header + description / text_content (default) + - "with_comments": header + description + text_content + (comments themselves are not yet fetched in v1) + """ name = task.get("name") or f"Task {task.get('id', '')}" status = ((task.get("status") or {}).get("status")) or "" assignees = ", ".join( @@ -129,24 +140,26 @@ def _buildContentObjects(task: Dict[str, Any], limits: ClickupBootstrapLimits) - "contextRef": {"part": "header"}, }] - description = _truncate(task.get("description"), limits.maxDescriptionChars) - if description: - parts.append({ - "contentObjectId": "description", - "contentType": "text", - "data": description, - "contextRef": {"part": "description"}, - }) - # text_content is ClickUp's rendered-markdown version; include if it adds - # something beyond the plain description (common for bullet lists, checklists). - textContent = _truncate(task.get("text_content"), limits.maxDescriptionChars) - if textContent and textContent != description: - parts.append({ - "contentObjectId": "text_content", - "contentType": "text", - "data": textContent, - "contextRef": {"part": "text_content"}, - }) + scope = getattr(limits, "clickupScope", "title_description") + if scope in ("title_description", "with_comments"): + description = _truncate(task.get("description"), limits.maxDescriptionChars) + if description: + parts.append({ + "contentObjectId": "description", + "contentType": "text", + "data": description, + "contextRef": {"part": "description"}, + }) + # text_content is ClickUp's rendered-markdown version; include if it adds + # something beyond the plain description (common for bullet lists, checklists). 
+ textContent = _truncate(task.get("text_content"), limits.maxDescriptionChars) + if textContent and textContent != description: + parts.append({ + "contentObjectId": "text_content", + "contentType": "text", + "data": textContent, + "contextRef": {"part": "text_content"}, + }) return parts @@ -160,7 +173,16 @@ async def bootstrapClickup( limits: Optional[ClickupBootstrapLimits] = None, ) -> Dict[str, Any]: """Walk workspaces → lists → tasks and ingest each task as a virtual doc.""" - limits = limits or ClickupBootstrapLimits() + from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs + prefs = loadConnectionPrefs(connectionId) + + if not limits: + limits = ClickupBootstrapLimits( + maxAgeDays=prefs.maxAgeDays if prefs.maxAgeDays > 0 else None, + neutralize=prefs.neutralizeBeforeEmbed, + clickupScope=prefs.clickupScope, + ) + startMs = time.time() result = ClickupBootstrapResult(connectionId=connectionId) @@ -406,6 +428,7 @@ async def _ingestTask( mandateId=mandateId, contentObjects=contentObjects, contentVersion=revision or None, + neutralize=limits.neutralize, provenance={ "connectionId": connectionId, "authority": "clickup", @@ -416,7 +439,7 @@ async def _ingestTask( "spaceId": ((task.get("space") or {}).get("id")), "url": task.get("url"), "status": ((task.get("status") or {}).get("status")), - "tier": "body", + "tier": limits.clickupScope, }, ) ) diff --git a/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncGdrive.py b/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncGdrive.py index 3e73a040..5e4e659b 100644 --- a/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncGdrive.py +++ b/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncGdrive.py @@ -43,6 +43,10 @@ class GdriveBootstrapLimits: maxDepth: int = MAX_DEPTH_DEFAULT # Only ingest files modified within the last N days. None disables filter. 
maxAgeDays: Optional[int] = MAX_AGE_DAYS_DEFAULT + # Pass-through to IngestionJob.neutralize + neutralize: bool = False + # Whether to skip binary/non-text files + filesIndexBinaries: bool = True @dataclass @@ -115,7 +119,16 @@ async def bootstrapGdrive( runExtractionFn: Optional[Callable[..., Any]] = None, ) -> Dict[str, Any]: """Walk My Drive starting from the virtual root folder.""" - limits = limits or GdriveBootstrapLimits() + from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs + prefs = loadConnectionPrefs(connectionId) + + if not limits: + limits = GdriveBootstrapLimits( + maxAgeDays=prefs.maxAgeDays if prefs.maxAgeDays > 0 else None, + neutralize=prefs.neutralizeBeforeEmbed, + filesIndexBinaries=prefs.filesIndexBinaries, + ) + startMs = time.time() result = GdriveBootstrapResult(connectionId=connectionId) @@ -352,6 +365,7 @@ async def _ingestOne( mandateId=mandateId, contentObjects=contentObjects, contentVersion=revision, + neutralize=limits.neutralize, provenance={ "connectionId": connectionId, "authority": "google", diff --git a/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncGmail.py b/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncGmail.py index 827add6b..21fec83d 100644 --- a/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncGmail.py +++ b/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncGmail.py @@ -42,6 +42,10 @@ class GmailBootstrapLimits: maxAttachmentBytes: int = MAX_ATTACHMENT_BYTES_DEFAULT # Only fetch messages newer than N days. None disables filter. 
maxAgeDays: Optional[int] = 90 + # Content depth: "metadata" | "snippet" | "full" + mailContentDepth: str = "full" + # Pass-through to IngestionJob.neutralize + neutralize: bool = False @dataclass @@ -112,7 +116,18 @@ def _headerMap(payload: Dict[str, Any]) -> Dict[str, str]: } -def _buildContentObjects(message: Dict[str, Any], maxBodyChars: int) -> List[Dict[str, Any]]: +def _buildContentObjects( + message: Dict[str, Any], + maxBodyChars: int, + mailContentDepth: str = "full", +) -> List[Dict[str, Any]]: + """Build content objects for a Gmail message. + + `mailContentDepth` controls how much is embedded: + - "metadata": header only (subject, from, to, date) + - "snippet": header + Gmail snippet (~155 chars, no full body) + - "full": header + snippet + cleaned full body (default) + """ payload = message.get("payload") or {} headers = _headerMap(payload) subject = headers.get("subject") or "(no subject)" @@ -122,10 +137,6 @@ def _buildContentObjects(message: Dict[str, Any], maxBodyChars: int) -> List[Dic date = headers.get("date") or "" snippet = message.get("snippet") or "" - bodies = _walkPayloadForBody(payload) - rawBody = bodies["text"] or bodies["html"] - cleanedBody = cleanEmailBody(rawBody, maxChars=maxBodyChars) if rawBody else "" - parts: List[Dict[str, Any]] = [] header = ( f"Subject: {subject}\n" @@ -140,20 +151,24 @@ def _buildContentObjects(message: Dict[str, Any], maxBodyChars: int) -> List[Dic "data": header, "contextRef": {"part": "header"}, }) - if snippet: + if mailContentDepth in ("snippet", "full") and snippet: parts.append({ "contentObjectId": "snippet", "contentType": "text", "data": snippet, "contextRef": {"part": "snippet"}, }) - if cleanedBody: - parts.append({ - "contentObjectId": "body", - "contentType": "text", - "data": cleanedBody, - "contextRef": {"part": "body"}, - }) + if mailContentDepth == "full": + bodies = _walkPayloadForBody(payload) + rawBody = bodies["text"] or bodies["html"] + cleanedBody = cleanEmailBody(rawBody, 
maxChars=maxBodyChars) if rawBody else "" + if cleanedBody: + parts.append({ + "contentObjectId": "body", + "contentType": "text", + "data": cleanedBody, + "contextRef": {"part": "body"}, + }) return parts @@ -168,7 +183,17 @@ async def bootstrapGmail( googleGetFn: Optional[Callable[..., Any]] = None, ) -> Dict[str, Any]: """Enumerate Gmail labels (INBOX + SENT default) and ingest messages.""" - limits = limits or GmailBootstrapLimits() + from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs + prefs = loadConnectionPrefs(connectionId) + + if not limits: + limits = GmailBootstrapLimits( + includeAttachments=prefs.mailIndexAttachments, + maxAgeDays=prefs.maxAgeDays if prefs.maxAgeDays > 0 else None, + mailContentDepth=prefs.mailContentDepth, + neutralize=prefs.neutralizeBeforeEmbed, + ) + startMs = time.time() result = GmailBootstrapResult(connectionId=connectionId) @@ -344,7 +369,9 @@ async def _ingestMessage( syntheticId = _syntheticMessageId(connectionId, messageId) fileName = f"{subject[:80].strip()}.eml" if subject else f"{messageId}.eml" - contentObjects = _buildContentObjects(message, limits.maxBodyChars) + contentObjects = _buildContentObjects( + message, limits.maxBodyChars, mailContentDepth=limits.mailContentDepth + ) try: handle = await knowledgeService.requestIngestion( IngestionJob( @@ -356,6 +383,7 @@ async def _ingestMessage( mandateId=mandateId, contentObjects=contentObjects, contentVersion=str(revision) if revision else None, + neutralize=limits.neutralize, provenance={ "connectionId": connectionId, "authority": "google", @@ -363,7 +391,7 @@ async def _ingestMessage( "externalItemId": messageId, "label": labelId, "threadId": message.get("threadId"), - "tier": "body", + "tier": limits.mailContentDepth, }, ) ) diff --git a/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncOutlook.py b/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncOutlook.py index b3f425ac..64a3545f 100644 --- 
a/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncOutlook.py +++ b/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncOutlook.py @@ -40,6 +40,10 @@ class OutlookBootstrapLimits: maxAttachmentBytes: int = MAX_ATTACHMENT_BYTES_DEFAULT # Only fetch messages newer than N days. None disables filter. maxAgeDays: Optional[int] = 90 + # Content depth: "metadata" | "snippet" | "full" + mailContentDepth: str = "full" + # Pass-through to IngestionJob.neutralize + neutralize: bool = False @dataclass @@ -78,7 +82,18 @@ def _joinRecipients(recipients: List[Dict[str, Any]]) -> str: return ", ".join(filter(None, [_extractRecipient(r) for r in recipients or []])) -def _buildContentObjects(message: Dict[str, Any], maxBodyChars: int) -> List[Dict[str, Any]]: +def _buildContentObjects( + message: Dict[str, Any], + maxBodyChars: int, + mailContentDepth: str = "full", +) -> List[Dict[str, Any]]: + """Build content objects for an Outlook message. + + `mailContentDepth` mirrors the Gmail walker: + - "metadata": header only + - "snippet": header + bodyPreview (~255 chars) + - "full": header + snippet + cleaned body (default) + """ subject = message.get("subject") or "(no subject)" fromAddr = _extractRecipient(message.get("from") or {}) toAddr = _joinRecipients(message.get("toRecipients") or []) @@ -86,14 +101,6 @@ def _buildContentObjects(message: Dict[str, Any], maxBodyChars: int) -> List[Dic received = message.get("receivedDateTime") or "" snippet = message.get("bodyPreview") or "" - body = message.get("body") or {} - bodyContent = body.get("content") or "" - bodyType = (body.get("contentType") or "").lower() - if bodyType == "html" or (bodyContent and "<" in bodyContent and ">" in bodyContent): - cleanedBody = cleanEmailBody(bodyContent, maxChars=maxBodyChars) - else: - cleanedBody = cleanEmailBody(bodyContent, maxChars=maxBodyChars) if bodyContent else "" - parts: List[Dict[str, Any]] = [] header = ( f"Subject: {subject}\n" @@ -108,20 +115,24 @@ def 
_buildContentObjects(message: Dict[str, Any], maxBodyChars: int) -> List[Dic "data": header, "contextRef": {"part": "header"}, }) - if snippet: + if mailContentDepth in ("snippet", "full") and snippet: parts.append({ "contentObjectId": "snippet", "contentType": "text", "data": snippet, "contextRef": {"part": "snippet"}, }) - if cleanedBody: - parts.append({ - "contentObjectId": "body", - "contentType": "text", - "data": cleanedBody, - "contextRef": {"part": "body"}, - }) + if mailContentDepth == "full": + body = message.get("body") or {} + bodyContent = body.get("content") or "" + cleanedBody = cleanEmailBody(bodyContent, maxChars=maxBodyChars) if bodyContent else "" + if cleanedBody: + parts.append({ + "contentObjectId": "body", + "contentType": "text", + "data": cleanedBody, + "contextRef": {"part": "body"}, + }) return parts @@ -135,7 +146,17 @@ async def bootstrapOutlook( limits: Optional[OutlookBootstrapLimits] = None, ) -> Dict[str, Any]: """Enumerate Outlook folders (inbox + sent by default) and ingest messages.""" - limits = limits or OutlookBootstrapLimits() + from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs + prefs = loadConnectionPrefs(connectionId) + + if not limits: + limits = OutlookBootstrapLimits( + includeAttachments=prefs.mailIndexAttachments, + maxAgeDays=prefs.maxAgeDays if prefs.maxAgeDays > 0 else None, + mailContentDepth=prefs.mailContentDepth, + neutralize=prefs.neutralizeBeforeEmbed, + ) + startMs = time.time() result = OutlookBootstrapResult(connectionId=connectionId) @@ -330,7 +351,9 @@ async def _ingestMessage( syntheticId = _syntheticMessageId(connectionId, messageId) fileName = f"{subject[:80].strip()}.eml" if subject else f"{messageId}.eml" - contentObjects = _buildContentObjects(message, limits.maxBodyChars) + contentObjects = _buildContentObjects( + message, limits.maxBodyChars, mailContentDepth=limits.mailContentDepth + ) # Always at least the header is emitted, so `contentObjects` 
is non-empty. try: handle = await knowledgeService.requestIngestion( @@ -343,13 +366,14 @@ async def _ingestMessage( mandateId=mandateId, contentObjects=contentObjects, contentVersion=revision, + neutralize=limits.neutralize, provenance={ "connectionId": connectionId, "authority": "msft", "service": "outlook", "externalItemId": messageId, "internetMessageId": message.get("internetMessageId"), - "tier": "body", + "tier": limits.mailContentDepth, }, ) ) @@ -504,6 +528,7 @@ async def _ingestAttachments( userId=userId, mandateId=mandateId, contentObjects=contentObjects, + neutralize=limits.neutralize, provenance={ "connectionId": connectionId, "authority": "msft", diff --git a/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncSharepoint.py b/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncSharepoint.py index 0bceecac..07fef7a8 100644 --- a/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncSharepoint.py +++ b/modules/serviceCenter/services/serviceKnowledge/subConnectorSyncSharepoint.py @@ -39,6 +39,8 @@ class SharepointBootstrapLimits: skipMimePrefixes: tuple = SKIP_MIME_PREFIXES_DEFAULT maxDepth: int = MAX_DEPTH_DEFAULT maxSites: int = MAX_SITES_DEFAULT + # Pass-through to IngestionJob.neutralize + neutralize: bool = False @dataclass @@ -105,7 +107,12 @@ async def bootstrapSharepoint( `connectionId` (and optionally a progressCb) and everything else is resolved against the registered services. 
""" - limits = limits or SharepointBootstrapLimits() + from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs + prefs = loadConnectionPrefs(connectionId) + + if not limits: + limits = SharepointBootstrapLimits(neutralize=prefs.neutralizeBeforeEmbed) + startMs = time.time() result = SharepointBootstrapResult(connectionId=connectionId) @@ -349,6 +356,7 @@ async def _ingestOne( mandateId=mandateId, contentObjects=contentObjects, contentVersion=revision, + neutralize=limits.neutralize, provenance=provenance, ) ) diff --git a/tests/unit/services/test_p1d_consent_prefs.py b/tests/unit/services/test_p1d_consent_prefs.py new file mode 100644 index 00000000..e00b0dfc --- /dev/null +++ b/tests/unit/services/test_p1d_consent_prefs.py @@ -0,0 +1,298 @@ +#!/usr/bin/env python3 +"""Unit tests for P1d: consent gating, preference parsing, and walker behaviour. + +Tests +----- +1. Bootstrap runner skips when ``knowledgeIngestionEnabled=False``. +2. ``loadConnectionPrefs`` returns safe defaults when preferences are absent. +3. ``loadConnectionPrefs`` maps all §2.6 keys correctly from a full prefs dict. +4. Gmail walker passes ``neutralize=True`` and ``mailContentDepth`` to IngestionJob. +5. Gmail walker produces only a header content-object when depth="metadata". +6. ClickUp walker skips description when scope="titles". +""" + +from __future__ import annotations + +import asyncio +import os +import sys +import types +import unittest +from typing import Any, Dict, Optional +from unittest.mock import AsyncMock, MagicMock, patch + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../..")) + + +# --------------------------------------------------------------------------- +# 1. 
Bootstrap runner consent gate +# --------------------------------------------------------------------------- + +class TestBootstrapConsentGate(unittest.TestCase): + """_bootstrapJobHandler must no-op when knowledgeIngestionEnabled is False.""" + + def _makeJob(self, connectionId="c-test", authority="google"): + return {"payload": {"connectionId": connectionId, "authority": authority}} + + def _makeConn(self, enabled: bool): + conn = MagicMock() + conn.knowledgeIngestionEnabled = enabled + return conn + + def test_skips_when_consent_disabled(self): + from modules.serviceCenter.services.serviceKnowledge import subConnectorIngestConsumer as sut + + fake_root = MagicMock() + fake_root.getUserConnectionById.return_value = self._makeConn(False) + + with patch("modules.interfaces.interfaceDbApp.getRootInterface", return_value=fake_root): + result = asyncio.get_event_loop().run_until_complete( + sut._bootstrapJobHandler(self._makeJob(), lambda *a: None) + ) + + assert result.get("skipped") is True + assert result.get("reason") == "consent_disabled" + fake_root.getUserConnectionById.assert_called_once_with("c-test") + + def test_proceeds_when_consent_enabled(self): + """When consent is enabled, the handler should call at least one walker.""" + from modules.serviceCenter.services.serviceKnowledge import subConnectorIngestConsumer as sut + + fake_root = MagicMock() + fake_root.getUserConnectionById.return_value = self._makeConn(True) + + # Patch the inner walker so it doesn't do real I/O. 
+ async def _fakeBootstrap(**kwargs): + return {"indexed": 0} + + with ( + patch("modules.interfaces.interfaceDbApp.getRootInterface", return_value=fake_root), + patch( + "modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGdrive.bootstrapGdrive", + new=AsyncMock(return_value={"indexed": 0}), + ), + patch( + "modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGmail.bootstrapGmail", + new=AsyncMock(return_value={"indexed": 0}), + ), + ): + result = asyncio.get_event_loop().run_until_complete( + sut._bootstrapJobHandler(self._makeJob(authority="google"), lambda *a: None) + ) + + # Should not have 'skipped' at the top level. + assert result.get("skipped") is not True + assert result.get("authority") == "google" + + +# --------------------------------------------------------------------------- +# 2 + 3. loadConnectionPrefs +# --------------------------------------------------------------------------- + +class TestLoadConnectionPrefs(unittest.TestCase): + def _makeConn(self, prefs: Optional[Dict[str, Any]]): + conn = MagicMock() + conn.knowledgePreferences = prefs + return conn + + def _mockRoot(self, prefs): + root = MagicMock() + root.getUserConnectionById.return_value = self._makeConn(prefs) + return root + + def test_returns_safe_defaults_when_prefs_none(self): + from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import ( + ConnectionIngestionPrefs, + loadConnectionPrefs, + ) + + with patch("modules.interfaces.interfaceDbApp.getRootInterface", return_value=self._mockRoot(None)): + prefs = loadConnectionPrefs("x") + + assert prefs.neutralizeBeforeEmbed is False + assert prefs.mailContentDepth == "full" + assert prefs.mailIndexAttachments is False + assert prefs.maxAgeDays == 90 + assert prefs.clickupScope == "title_description" + assert prefs.gmailEnabled is True + assert prefs.driveEnabled is True + + def test_maps_all_keys(self): + from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import 
loadConnectionPrefs + + raw = { + "neutralizeBeforeEmbed": True, + "mailContentDepth": "metadata", + "mailIndexAttachments": True, + "filesIndexBinaries": False, + "clickupScope": "with_comments", + "maxAgeDays": 30, + "surfaceToggles": { + "google": {"gmail": False, "drive": True}, + "msft": {"sharepoint": False, "outlook": True}, + }, + } + + with patch("modules.interfaces.interfaceDbApp.getRootInterface", return_value=self._mockRoot(raw)): + prefs = loadConnectionPrefs("x") + + assert prefs.neutralizeBeforeEmbed is True + assert prefs.mailContentDepth == "metadata" + assert prefs.mailIndexAttachments is True + assert prefs.filesIndexBinaries is False + assert prefs.clickupScope == "with_comments" + assert prefs.maxAgeDays == 30 + assert prefs.gmailEnabled is False + assert prefs.driveEnabled is True + assert prefs.sharepointEnabled is False + assert prefs.outlookEnabled is True + + def test_invalid_depth_falls_back_to_default(self): + from modules.serviceCenter.services.serviceKnowledge.subConnectorPrefs import loadConnectionPrefs + + raw = {"mailContentDepth": "everything_please"} + + with patch("modules.interfaces.interfaceDbApp.getRootInterface", return_value=self._mockRoot(raw)): + prefs = loadConnectionPrefs("x") + + assert prefs.mailContentDepth == "full" + + +# --------------------------------------------------------------------------- +# 4. 
Gmail walker passes neutralize + mailContentDepth to IngestionJob +# --------------------------------------------------------------------------- + +class TestGmailWalkerPrefs(unittest.TestCase): + def _make_message(self, *, subject="Test", snippet="hello", body_text="full body"): + import base64 + encoded = base64.urlsafe_b64encode(body_text.encode()).decode() + return { + "id": "msg-1", + "historyId": "h-42", + "threadId": "t-1", + "snippet": snippet, + "payload": { + "mimeType": "multipart/alternative", + "headers": [ + {"name": "Subject", "value": subject}, + {"name": "From", "value": "alice@example.com"}, + {"name": "To", "value": "bob@example.com"}, + {"name": "Date", "value": "Mon, 20 Apr 2026 10:00:00 +0000"}, + ], + "parts": [ + { + "mimeType": "text/plain", + "body": {"data": encoded}, + } + ], + }, + } + + def test_neutralize_flag_forwarded(self): + from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGmail import ( + GmailBootstrapLimits, + _ingestMessage, + GmailBootstrapResult, + ) + from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob + + captured_jobs = [] + + async def fake_requestIngestion(job: IngestionJob): + captured_jobs.append(job) + return MagicMock(status="indexed", error=None) + + ks = MagicMock() + ks.requestIngestion = fake_requestIngestion + + limits = GmailBootstrapLimits(neutralize=True, mailContentDepth="full") + result = GmailBootstrapResult(connectionId="c-1") + + asyncio.get_event_loop().run_until_complete( + _ingestMessage( + googleGetFn=AsyncMock(return_value={}), + knowledgeService=ks, + connectionId="c-1", + mandateId="", + userId="u-1", + labelId="INBOX", + message=self._make_message(), + limits=limits, + result=result, + progressCb=None, + ) + ) + + assert len(captured_jobs) == 1 + assert captured_jobs[0].neutralize is True + + def test_metadata_depth_yields_only_header(self): + from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGmail import ( + 
_buildContentObjects, + ) + + message = self._make_message(snippet="hi", body_text="should be excluded") + parts = _buildContentObjects(message, maxBodyChars=4000, mailContentDepth="metadata") + ids = [p["contentObjectId"] for p in parts] + assert ids == ["header"] + + def test_snippet_depth_yields_header_and_snippet(self): + from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncGmail import ( + _buildContentObjects, + ) + + message = self._make_message(snippet="hi", body_text="should be excluded") + parts = _buildContentObjects(message, maxBodyChars=4000, mailContentDepth="snippet") + ids = [p["contentObjectId"] for p in parts] + assert "header" in ids + assert "snippet" in ids + assert "body" not in ids + + +# --------------------------------------------------------------------------- +# 5. ClickUp walker respects clickupScope="titles" +# --------------------------------------------------------------------------- + +class TestClickupWalkerScope(unittest.TestCase): + def _make_task(self): + return { + "id": "task-1", + "name": "Ship feature X", + "date_updated": "1713888000000", + "description": "This should be omitted", + "text_content": "Also omitted", + "status": {"status": "open"}, + "assignees": [], + "tags": [], + "list": {"name": "Backlog"}, + "folder": {}, + "space": {"name": "Engineering"}, + } + + def test_titles_scope_omits_description(self): + from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncClickup import ( + ClickupBootstrapLimits, + _buildContentObjects, + ) + + limits = ClickupBootstrapLimits(clickupScope="titles") + parts = _buildContentObjects(self._make_task(), limits) + ids = [p["contentObjectId"] for p in parts] + assert ids == ["header"] + assert "description" not in ids + + def test_with_description_scope_includes_description(self): + from modules.serviceCenter.services.serviceKnowledge.subConnectorSyncClickup import ( + ClickupBootstrapLimits, + _buildContentObjects, + ) + + limits = 
ClickupBootstrapLimits(clickupScope="title_description") + parts = _buildContentObjects(self._make_task(), limits) + ids = [p["contentObjectId"] for p in parts] + assert "header" in ids + assert "description" in ids + + +if __name__ == "__main__": + unittest.main() From 4a840e9e6e9a0951e885d1f390b2ca1551c6a9ce Mon Sep 17 00:00:00 2001 From: Ida Date: Wed, 29 Apr 2026 14:16:52 +0200 Subject: [PATCH 08/18] added neutralization option to indexing new connections --- .../services/serviceAi/mainServiceAi.py | 40 +++++++++++++++---- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/modules/serviceCenter/services/serviceAi/mainServiceAi.py b/modules/serviceCenter/services/serviceAi/mainServiceAi.py index 6428bed3..f3742356 100644 --- a/modules/serviceCenter/services/serviceAi/mainServiceAi.py +++ b/modules/serviceCenter/services/serviceAi/mainServiceAi.py @@ -164,12 +164,29 @@ class AiService: # SPEECH_TEAMS: Dedicated pipeline, bypasses standard model selection if request.options and request.options.operationType == OperationTypeEnum.SPEECH_TEAMS: return await self._handleSpeechTeams(request) - - # FAIL-SAFE: Pre-flight billing validation (like 0 CHF credit card check) - self._preflightBillingCheck() - - # Balance & provider permission checks - await self._checkBillingBeforeAiCall() + + _opType = request.options.operationType if request.options else None + _isNeutralizationCall = _opType in ( + OperationTypeEnum.NEUTRALIZATION_TEXT, + OperationTypeEnum.NEUTRALIZATION_IMAGE, + ) + + if not _isNeutralizationCall: + # FAIL-SAFE: Pre-flight billing validation (like 0 CHF credit card check) + self._preflightBillingCheck() + # Balance & provider permission checks + await self._checkBillingBeforeAiCall() + else: + # Neutralization calls are system-level operations (connector anonymization). + # They run without a mandate context (e.g. 
personal-scope connections) and + # are billed the same way as embedding calls: best-effort, skipped when no + # billing settings exist for an empty mandate. + logger.debug( + "callAi: skipping billing preflight for neutralization call " + "(operationType=%s, user=%s)", + _opType, + getattr(getattr(self.services, 'user', None), 'id', 'unknown'), + ) # Calculate effective allowedProviders: RBAC ∩ Workflow effectiveProviders = self._calculateEffectiveProviders() @@ -218,8 +235,15 @@ class AiService: Rehydration happens on the final AiCallResponse (not on individual str deltas). """ await self.ensureAiObjectsInitialized() - self._preflightBillingCheck() - await self._checkBillingBeforeAiCall() + + _streamOpType = request.options.operationType if request.options else None + _isNeutralizationStream = _streamOpType in ( + OperationTypeEnum.NEUTRALIZATION_TEXT, + OperationTypeEnum.NEUTRALIZATION_IMAGE, + ) + if not _isNeutralizationStream: + self._preflightBillingCheck() + await self._checkBillingBeforeAiCall() effectiveProviders = self._calculateEffectiveProviders() if effectiveProviders and request.options: From ce671f61b6c4862b4709a3f40a373658eac114a1 Mon Sep 17 00:00:00 2001 From: Ida Date: Wed, 29 Apr 2026 14:26:32 +0200 Subject: [PATCH 09/18] feat: app-scheduler ausgebaut um nachts bestehende connections zu indexieren --- modules/interfaces/interfaceDbApp.py | 22 +++++ .../subConnectorIngestConsumer.py | 86 ++++++++++++++++++- 2 files changed, 107 insertions(+), 1 deletion(-) diff --git a/modules/interfaces/interfaceDbApp.py b/modules/interfaces/interfaceDbApp.py index 04ae82ff..e7c7e6be 100644 --- a/modules/interfaces/interfaceDbApp.py +++ b/modules/interfaces/interfaceDbApp.py @@ -1281,6 +1281,28 @@ class AppObjects: logger.error(f"Error getting user connections: {str(e)}") return [] + def getActiveKnowledgeConnections(self) -> List[UserConnection]: + """Return all UserConnections with knowledgeIngestionEnabled=True and status=active. 
+ + Used by the daily re-sync scheduler to determine which connections to re-index. + """ + try: + rows = self.db.getRecordset( + UserConnection, + recordFilter={"knowledgeIngestionEnabled": True, "status": ConnectionStatus.ACTIVE.value}, + ) + result = [] + for row in rows or []: + try: + conn = UserConnection.model_validate(row) if isinstance(row, dict) else row + result.append(conn) + except Exception as _e: + logger.warning(f"getActiveKnowledgeConnections: could not parse row: {_e}") + return result + except Exception as e: + logger.error(f"getActiveKnowledgeConnections failed: {e}") + return [] + def getUserConnectionById(self, connectionId: str) -> Optional[UserConnection]: """Get a single UserConnection by ID or by reference string (connection:authority:username).""" try: diff --git a/modules/serviceCenter/services/serviceKnowledge/subConnectorIngestConsumer.py b/modules/serviceCenter/services/serviceKnowledge/subConnectorIngestConsumer.py index e27e2d29..97ac61d5 100644 --- a/modules/serviceCenter/services/serviceKnowledge/subConnectorIngestConsumer.py +++ b/modules/serviceCenter/services/serviceKnowledge/subConnectorIngestConsumer.py @@ -238,6 +238,89 @@ async def _bootstrapJobHandler( } +async def _scheduledDailyResync() -> None: + """Enqueue a connection.bootstrap job for every active knowledge connection. + + Runs once per day (default 2 AM Europe/Zurich). Each job re-walks the + connector and hands new / changed items to KnowledgeService.requestIngestion. + Unchanged items are deduplicated by content-hash and skipped automatically. 
+ """ + try: + from modules.interfaces.interfaceDbApp import getRootInterface + rootInterface = getRootInterface() + connections = rootInterface.getActiveKnowledgeConnections() + except Exception as exc: + logger.error("knowledge.daily_resync: could not load connections: %s", exc, exc_info=True) + return + + if not connections: + logger.info("knowledge.daily_resync: no active knowledge connections — nothing to do") + return + + logger.info( + "knowledge.daily_resync: enqueuing bootstrap for %d connection(s)", + len(connections), + extra={"event": "knowledge.daily_resync.started", "count": len(connections)}, + ) + + enqueued = 0 + skipped = 0 + for conn in connections: + connectionId = str(conn.id) + authority = conn.authority.value if hasattr(conn.authority, "value") else str(conn.authority) + userId = str(conn.userId) + payload: Dict[str, Any] = { + "connectionId": connectionId, + "authority": authority.lower(), + "userId": userId, + } + try: + await startJob( + BOOTSTRAP_JOB_TYPE, + payload, + triggeredBy="scheduler.daily_resync", + ) + enqueued += 1 + logger.debug( + "knowledge.daily_resync: queued connectionId=%s authority=%s", + connectionId, authority, + ) + except Exception as exc: + skipped += 1 + logger.error( + "knowledge.daily_resync: failed to enqueue connectionId=%s: %s", + connectionId, exc, + ) + + logger.info( + "knowledge.daily_resync: done — enqueued=%d skipped=%d", + enqueued, skipped, + extra={"event": "knowledge.daily_resync.done", "enqueued": enqueued, "skipped": skipped}, + ) + + +def registerDailyResyncScheduler(*, hour: int = 2, minute: int = 0) -> None: + """Register the daily knowledge re-sync cron job. Idempotent. + + Args: + hour: Hour of day to run (0–23, default 2 → 2 AM Europe/Zurich). + minute: Minute within the hour (default 0). 
+ """ + try: + from modules.shared.eventManagement import eventManager + eventManager.registerCron( + jobId="knowledge.daily_resync", + func=_scheduledDailyResync, + cronKwargs={"hour": str(hour), "minute": str(minute)}, + ) + logger.info( + "knowledge.daily_resync scheduler registered (daily %02d:%02d Europe/Zurich)", + hour, minute, + ) + except Exception as exc: + logger.warning("knowledge.daily_resync scheduler registration failed (non-critical): %s", exc) + + def registerKnowledgeIngestionConsumer() -> None: """Register callback subscribers + background job handler. Idempotent.""" global _registered @@ -246,5 +329,6 @@ def registerKnowledgeIngestionConsumer() -> None: callbackRegistry.register("connection.established", _onConnectionEstablished) callbackRegistry.register("connection.revoked", _onConnectionRevoked) registerJobHandler(BOOTSTRAP_JOB_TYPE, _bootstrapJobHandler) + registerDailyResyncScheduler() _registered = True - logger.info("KnowledgeIngestionConsumer registered (established/revoked + %s handler)", BOOTSTRAP_JOB_TYPE) + logger.info("KnowledgeIngestionConsumer registered (established/revoked + %s handler + daily resync)", BOOTSTRAP_JOB_TYPE) From 72d3175f49928b242f45c81b4eafab0dfefd7171 Mon Sep 17 00:00:00 2001 From: Ida Date: Wed, 29 Apr 2026 18:16:02 +0200 Subject: [PATCH 10/18] Gruppierung im Formgenerator fertig --- modules/datamodels/datamodelPagination.py | 84 +++++++++++-- modules/interfaces/interfaceDbApp.py | 53 +++++++++ modules/routes/routeDataConnections.py | 75 ++++++------ modules/routes/routeDataFiles.py | 27 +++-- modules/routes/routeDataMandates.py | 76 ++++++------ modules/routes/routeDataPrompts.py | 59 +++++---- modules/routes/routeDataUsers.py | 58 +++++---- modules/routes/routeHelpers.py | 139 ++++++++++++++++++++++ 8 files changed, 424 insertions(+), 147 deletions(-) diff --git a/modules/datamodels/datamodelPagination.py b/modules/datamodels/datamodelPagination.py index 2719327b..7bda7717 100644 --- 
a/modules/datamodels/datamodelPagination.py +++ b/modules/datamodels/datamodelPagination.py @@ -13,6 +13,42 @@ import math T = TypeVar('T') +# --------------------------------------------------------------------------- +# Table Grouping models +# --------------------------------------------------------------------------- + +class TableGroupNode(BaseModel): + """ + A single node in a user-defined group tree for a FormGeneratorTable. + + Items belong to exactly one group (no multi-membership). + Groups can be nested to arbitrary depth via subGroups. + """ + id: str + name: str + itemIds: List[str] = Field(default_factory=list) + subGroups: List['TableGroupNode'] = Field(default_factory=list) + order: int = 0 + isExpanded: bool = True + +TableGroupNode.model_rebuild() + + +class TableGrouping(BaseModel): + """ + Persisted grouping configuration for one (user, contextKey) pair. + Stored in table_groupings in poweron_app (auto-created). + + contextKey convention: API path without /api/ prefix and without trailing slash. + Examples: "connections", "prompts", "admin/users", "trustee/{instanceId}/documents" + """ + id: str + userId: str + contextKey: str + rootGroups: List[TableGroupNode] = Field(default_factory=list) + updatedAt: Optional[float] = None + + class SortField(BaseModel): """ Single sort field configuration. @@ -24,12 +60,23 @@ class SortField(BaseModel): class PaginationParams(BaseModel): """ Complete pagination state including page, sorting, and filters. + + Grouping extensions (both optional — omit when not using grouping): + groupId — Scope the request to items belonging to this group. + The backend resolves it to an itemIds IN-filter before + applying normal pagination/search/filter logic. + Also applied for mode=ids and mode=filterValues so that + bulk-select and filter-dropdowns respect the group scope. 
+ saveGroupTree — If present the backend persists this tree for the current + (user, contextKey) pair *before* fetching, then returns + the confirmed tree in the response groupTree field. + Omit on every request that does not change the group tree. """ page: int = Field(ge=1, description="Current page number (1-based)") pageSize: int = Field(ge=1, le=1000, description="Number of items per page") sort: List[SortField] = Field(default_factory=list, description="List of sort fields in priority order") filters: Optional[Dict[str, Any]] = Field( - default=None, + default=None, description="""Filter criteria dictionary. Supports: - General search: {"search": "text"} - searches across all text fields (case-insensitive) - Field-specific filters: @@ -38,6 +85,14 @@ class PaginationParams(BaseModel): - Supported operators: equals/eq, contains, startsWith, endsWith, gt, gte, lt, lte, in, notIn - Multiple filters are combined with AND logic""" ) + groupId: Optional[str] = Field( + default=None, + description="Scope request to items of this group (resolved server-side to itemIds IN-filter)", + ) + saveGroupTree: Optional[List[Dict[str, Any]]] = Field( + default=None, + description="If set, persist this group tree before fetching (optimistic save)", + ) class PaginationRequest(BaseModel): @@ -74,10 +129,19 @@ class PaginationMetadata(BaseModel): class PaginatedResponse(BaseModel, Generic[T]): """ Response containing paginated data and metadata. + + groupTree is included when the endpoint supports table grouping and the + current user has a saved group tree for the requested contextKey. + It is None when grouping is not configured for the endpoint or the user + has not created any groups yet. Frontend must treat None as an empty tree. 
""" items: List[T] = Field(..., description="Array of items for current page") pagination: Optional[PaginationMetadata] = Field(..., description="Pagination metadata (None if pagination not applied)") - + groupTree: Optional[List[TableGroupNode]] = Field( + default=None, + description="Current group tree for this (user, contextKey) pair — None if no grouping configured", + ) + model_config = ConfigDict(arbitrary_types_allowed=True) @@ -85,29 +149,33 @@ def normalize_pagination_dict(pagination_dict: Dict[str, Any]) -> Dict[str, Any] """ Normalize pagination dictionary to handle frontend variations. Moves top-level "search" field into filters if present. - + Grouping fields (groupId, saveGroupTree) are passed through as-is. + Args: pagination_dict: Raw pagination dictionary from frontend - + Returns: Normalized pagination dictionary ready for PaginationParams parsing """ if not pagination_dict: return pagination_dict - + # Create a copy to avoid modifying the original normalized = dict(pagination_dict) - + # Ensure required fields have sensible defaults if "page" not in normalized: normalized["page"] = 1 if "pageSize" not in normalized: normalized["pageSize"] = 25 - + # Move top-level "search" into filters if present if "search" in normalized: if "filters" not in normalized or normalized["filters"] is None: normalized["filters"] = {} normalized["filters"]["search"] = normalized.pop("search") - + + # groupId / saveGroupTree are valid PaginationParams fields — pass through unchanged. + # No transformation needed; Pydantic will validate them. 
+ return normalized diff --git a/modules/interfaces/interfaceDbApp.py b/modules/interfaces/interfaceDbApp.py index e7c7e6be..6f1d9487 100644 --- a/modules/interfaces/interfaceDbApp.py +++ b/modules/interfaces/interfaceDbApp.py @@ -4027,6 +4027,59 @@ class AppObjects: logger.error(f"Error deleting role {roleId}: {str(e)}") raise + # ------------------------------------------------------------------------- + # Table Grouping (user-defined groups for FormGeneratorTable instances) + # ------------------------------------------------------------------------- + + def getTableGrouping(self, contextKey: str): + """ + Load the group tree for the current user and the given contextKey. + + Returns a TableGrouping instance or None if no grouping has been saved yet. + contextKey identifies the table instance, e.g. "connections", "prompts", + "admin/users", "trustee/{instanceId}/documents". + """ + from modules.datamodels.datamodelPagination import TableGrouping + try: + records = self.db.getRecordset( + TableGrouping, + recordFilter={"userId": str(self.userId), "contextKey": contextKey}, + ) + if not records: + return None + row = records[0] + return TableGrouping.model_validate(row) if isinstance(row, dict) else row + except Exception as e: + logger.error(f"getTableGrouping failed for user={self.userId} key={contextKey}: {e}") + return None + + def upsertTableGrouping(self, contextKey: str, rootGroups: list): + """ + Create or replace the group tree for the current user and contextKey. + + rootGroups is a list of TableGroupNode-compatible dicts (the full tree). + Returns the saved TableGrouping instance. 
+ """ + from modules.datamodels.datamodelPagination import TableGrouping + from modules.shared.timeUtils import getUtcTimestamp + try: + existing = self.getTableGrouping(contextKey) + data = { + "id": existing.id if existing else str(uuid.uuid4()), + "userId": str(self.userId), + "contextKey": contextKey, + "rootGroups": rootGroups, + "updatedAt": getUtcTimestamp(), + } + if existing: + self.db.recordModify(TableGrouping, existing.id, data) + else: + self.db.recordCreate(TableGrouping, data) + return TableGrouping.model_validate(data) + except Exception as e: + logger.error(f"upsertTableGrouping failed for user={self.userId} key={contextKey}: {e}") + raise + # Public Methods diff --git a/modules/routes/routeDataConnections.py b/modules/routes/routeDataConnections.py index 51549d6a..124d2fb4 100644 --- a/modules/routes/routeDataConnections.py +++ b/modules/routes/routeDataConnections.py @@ -152,10 +152,28 @@ async def get_connections( - GET /api/connections/?mode=filterValues&column=status - GET /api/connections/?mode=ids """ - from modules.routes.routeHelpers import handleFilterValuesInMemory, handleIdsInMemory, enrichRowsWithFkLabels + from modules.routes.routeHelpers import ( + handleFilterValuesInMemory, handleIdsInMemory, enrichRowsWithFkLabels, + handleGroupingInRequest, applyGroupScopeFilter, + ) + + CONTEXT_KEY = "connections" + + # Parse pagination params early — needed for grouping in all modes + paginationParams = None + if pagination: + try: + paginationDict = json.loads(pagination) + if paginationDict: + paginationDict = normalize_pagination_dict(paginationDict) + paginationParams = PaginationParams(**paginationDict) + except (json.JSONDecodeError, ValueError) as e: + raise HTTPException(status_code=400, detail=f"Invalid pagination parameter: {str(e)}") + + interface = getInterface(currentUser) + groupCtx = handleGroupingInRequest(paginationParams, interface, CONTEXT_KEY) def _buildEnhancedItems(): - interface = getInterface(currentUser) connections = 
interface.getUserConnections(currentUser.id) items = [] for connection in connections: @@ -182,6 +200,7 @@ async def get_connections( try: items = _buildEnhancedItems() enrichRowsWithFkLabels(items, UserConnection) + items = applyGroupScopeFilter(items, groupCtx.itemIds) return handleFilterValuesInMemory(items, column, pagination) except Exception as e: logger.error(f"Error getting filter values for connections: {str(e)}") @@ -189,63 +208,40 @@ async def get_connections( if mode == "ids": try: - return handleIdsInMemory(_buildEnhancedItems(), pagination) + items = applyGroupScopeFilter(_buildEnhancedItems(), groupCtx.itemIds) + return handleIdsInMemory(items, pagination) except Exception as e: logger.error(f"Error getting IDs for connections: {str(e)}") raise HTTPException(status_code=500, detail=str(e)) try: - interface = getInterface(currentUser) - # NOTE: Cannot use db.getRecordsetPaginated() here because each connection # is enriched with computed tokenStatus/tokenExpiresAt (requires per-row DB lookup). # Token refresh also may trigger re-fetch. Connections per user are typically < 10, # so in-memory pagination is acceptable. 
- - # Parse pagination parameter - paginationParams = None - if pagination: - try: - paginationDict = json.loads(pagination) - if paginationDict: - # Normalize pagination dict (handles top-level "search" field) - paginationDict = normalize_pagination_dict(paginationDict) - paginationParams = PaginationParams(**paginationDict) - except (json.JSONDecodeError, ValueError) as e: - raise HTTPException( - status_code=400, - detail=f"Invalid pagination parameter: {str(e)}" - ) - + # SECURITY FIX: All users (including admins) can only see their own connections - # This prevents admin from seeing other users' connections and causing confusion connections = interface.getUserConnections(currentUser.id) - + # Perform silent token refresh for expired OAuth connections try: refresh_result = await token_refresh_service.refresh_expired_tokens(currentUser.id) if refresh_result.get("refreshed", 0) > 0: logger.info(f"Silently refreshed {refresh_result['refreshed']} tokens for user {currentUser.id}") - # Re-fetch connections to get updated token status connections = interface.getUserConnections(currentUser.id) except Exception as e: logger.warning(f"Silent token refresh failed for user {currentUser.id}: {str(e)}") - # Continue with original connections even if refresh fails - - # Enhance each connection with token status information and convert to dict + enhanced_connections_dict = [] for connection in connections: - # Get token status for this connection tokenStatus, tokenExpiresAt = getTokenStatusForConnection(interface, connection.id) - - # Convert to dict for filtering/sorting connection_dict = { "id": connection.id, "userId": connection.userId, "authority": connection.authority.value if hasattr(connection.authority, 'value') else str(connection.authority), "externalId": connection.externalId, "externalUsername": connection.externalUsername or "", - "externalEmail": connection.externalEmail, # Keep None instead of converting to empty string + "externalEmail": 
connection.externalEmail, "status": connection.status.value if hasattr(connection.status, 'value') else str(connection.status), "connectedAt": connection.connectedAt, "lastChecked": connection.lastChecked, @@ -254,24 +250,26 @@ async def get_connections( "tokenExpiresAt": tokenExpiresAt } enhanced_connections_dict.append(connection_dict) - + enrichRowsWithFkLabels(enhanced_connections_dict, UserConnection) + enhanced_connections_dict = applyGroupScopeFilter(enhanced_connections_dict, groupCtx.itemIds) if paginationParams is None: return { "items": enhanced_connections_dict, "pagination": None, + "groupTree": groupCtx.groupTree, } - + # Apply filtering if provided if paginationParams.filters: component_interface = ComponentObjects() component_interface.setUserContext(currentUser) enhanced_connections_dict = component_interface._applyFilters( - enhanced_connections_dict, + enhanced_connections_dict, paginationParams.filters ) - + # Apply sorting if provided if paginationParams.sort: component_interface = ComponentObjects() @@ -280,14 +278,14 @@ async def get_connections( enhanced_connections_dict, paginationParams.sort ) - + totalItems = len(enhanced_connections_dict) totalPages = math.ceil(totalItems / paginationParams.pageSize) if totalItems > 0 else 0 - + startIdx = (paginationParams.page - 1) * paginationParams.pageSize endIdx = startIdx + paginationParams.pageSize paged_connections = enhanced_connections_dict[startIdx:endIdx] - + return { "items": paged_connections, "pagination": PaginationMetadata( @@ -298,6 +296,7 @@ async def get_connections( sort=paginationParams.sort, filters=paginationParams.filters ).model_dump(), + "groupTree": groupCtx.groupTree, } except HTTPException: diff --git a/modules/routes/routeDataFiles.py b/modules/routes/routeDataFiles.py index 3abccdc4..8168d8d2 100644 --- a/modules/routes/routeDataFiles.py +++ b/modules/routes/routeDataFiles.py @@ -279,7 +279,6 @@ def get_files( try: paginationDict = json.loads(pagination) if 
paginationDict: - # Normalize pagination dict (handles top-level "search" field) paginationDict = normalize_pagination_dict(paginationDict) paginationParams = PaginationParams(**paginationDict) except (json.JSONDecodeError, ValueError) as e: @@ -287,25 +286,33 @@ def get_files( status_code=400, detail=f"Invalid pagination parameter: {str(e)}" ) - + from modules.routes.routeHelpers import ( handleIdsMode, handleFilterValuesInMemory, + handleGroupingInRequest, applyGroupScopeFilter, ) + import modules.interfaces.interfaceDbApp as _appIface managementInterface = interfaceDbManagement.getInterface( currentUser, mandateId=str(context.mandateId) if context.mandateId else None, featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None ) + appInterface = _appIface.getInterface(currentUser) + groupCtx = handleGroupingInRequest(paginationParams, appInterface, "files/list") + + def _filesToDicts(fileItems): + return [f.model_dump() if hasattr(f, "model_dump") else (dict(f) if not isinstance(f, dict) else f) for f in fileItems] if mode == "filterValues": if not column: raise HTTPException(status_code=400, detail="column parameter required for mode=filterValues") allFiles = managementInterface.getAllFiles() items = allFiles if isinstance(allFiles, list) else (allFiles.items if hasattr(allFiles, "items") else []) - itemDicts = [f.model_dump() if hasattr(f, "model_dump") else (dict(f) if not isinstance(f, dict) else f) for f in items] + itemDicts = _filesToDicts(items) enrichRowsWithFkLabels(itemDicts, FileItem) + itemDicts = applyGroupScopeFilter(itemDicts, groupCtx.itemIds) return handleFilterValuesInMemory(itemDicts, column, pagination) if mode == "ids": @@ -315,10 +322,6 @@ def get_files( recordFilter = None if paginationParams and paginationParams.filters and "folderId" in paginationParams.filters: fVal = paginationParams.filters.get("folderId") - # For a concrete folderId we use recordFilter (exact equality). 
- # For null / empty (= "root") we keep it in pagination.filters so the - # connector applies `IS NULL OR = ''` – files predating the folderId - # fix were stored with an empty string instead of NULL. if fVal is None or (isinstance(fVal, str) and fVal.strip() == ""): paginationParams.filters["folderId"] = None else: @@ -327,11 +330,8 @@ def get_files( result = managementInterface.getAllFiles(pagination=paginationParams, recordFilter=recordFilter) - def _filesToDicts(items): - return [f.model_dump() if hasattr(f, "model_dump") else (dict(f) if not isinstance(f, dict) else f) for f in items] - if paginationParams: - enriched = enrichRowsWithFkLabels(_filesToDicts(result.items), FileItem) + enriched = applyGroupScopeFilter(enrichRowsWithFkLabels(_filesToDicts(result.items), FileItem), groupCtx.itemIds) return { "items": enriched, "pagination": PaginationMetadata( @@ -342,11 +342,12 @@ def get_files( sort=paginationParams.sort, filters=paginationParams.filters ).model_dump(), + "groupTree": groupCtx.groupTree, } else: items = result if isinstance(result, list) else (result.items if hasattr(result, "items") else [result]) - enriched = enrichRowsWithFkLabels(_filesToDicts(items), FileItem) - return {"items": enriched, "pagination": None} + enriched = applyGroupScopeFilter(enrichRowsWithFkLabels(_filesToDicts(items), FileItem), groupCtx.itemIds) + return {"items": enriched, "pagination": None, "groupTree": groupCtx.groupTree} except HTTPException: raise except Exception as e: diff --git a/modules/routes/routeDataMandates.py b/modules/routes/routeDataMandates.py index ef058ed9..47eaee02 100644 --- a/modules/routes/routeDataMandates.py +++ b/modules/routes/routeDataMandates.py @@ -112,8 +112,8 @@ def get_mandates( status_code=status.HTTP_403_FORBIDDEN, detail=routeApiMsg("Admin role required") ) - - # Parse pagination parameter + + # Parse pagination parameter early — needed for grouping in all modes paginationParams = None if pagination: try: @@ -126,14 +126,24 @@ def 
get_mandates( status_code=400, detail=f"Invalid pagination parameter: {str(e)}" ) - + from modules.routes.routeHelpers import ( handleFilterValuesInMemory, handleIdsInMemory, handleFilterValuesMode, handleIdsMode, parseCrossFilterPagination, + handleGroupingInRequest, applyGroupScopeFilter, ) appInterface = interfaceDbApp.getRootInterface() + groupCtx = handleGroupingInRequest(paginationParams, appInterface, "mandates") + + def _mandateItemsForAdmin(): + items = [] + for mid in adminMandateIds: + m = appInterface.getMandate(mid) + if m and getattr(m, "enabled", True): + items.append(m.model_dump() if hasattr(m, 'model_dump') else m if isinstance(m, dict) else vars(m)) + return items if mode == "filterValues": if not column: @@ -144,54 +154,42 @@ def get_mandates( values = appInterface.db.getDistinctColumnValues(Mandate, column, crossPagination) return JSONResponse(content=sorted(values, key=lambda v: str(v).lower())) else: - mandateItems = [] - for mid in adminMandateIds: - m = appInterface.getMandate(mid) - if m and getattr(m, "enabled", True): - mandateItems.append(m.model_dump() if hasattr(m, 'model_dump') else m if isinstance(m, dict) else vars(m)) + mandateItems = applyGroupScopeFilter(_mandateItemsForAdmin(), groupCtx.itemIds) return handleFilterValuesInMemory(mandateItems, column, pagination) if mode == "ids": if isPlatformAdmin: return handleIdsMode(appInterface.db, Mandate, pagination) else: - mandateItems = [] - for mid in adminMandateIds: - m = appInterface.getMandate(mid) - if m and getattr(m, "enabled", True): - mandateItems.append(m.model_dump() if hasattr(m, 'model_dump') else m if isinstance(m, dict) else vars(m)) + mandateItems = applyGroupScopeFilter(_mandateItemsForAdmin(), groupCtx.itemIds) return handleIdsInMemory(mandateItems, pagination) if isPlatformAdmin: result = appInterface.getAllMandates(pagination=paginationParams) - else: - allMandates = [] - for mandateId in adminMandateIds: - mandate = appInterface.getMandate(mandateId) - if mandate 
and getattr(mandate, "enabled", True): - mandateDict = mandate if isinstance(mandate, dict) else mandate.model_dump() if hasattr(mandate, 'model_dump') else vars(mandate) - allMandates.append(mandateDict) - result = allMandates - paginationParams = None - - if paginationParams and hasattr(result, 'items'): - return PaginatedResponse( - items=result.items, - pagination=PaginationMetadata( - currentPage=paginationParams.page, - pageSize=paginationParams.pageSize, - totalItems=result.totalItems, - totalPages=result.totalPages, - sort=paginationParams.sort, - filters=paginationParams.filters + items = result.items if hasattr(result, 'items') else (result if isinstance(result, list) else []) + items = applyGroupScopeFilter( + [i.model_dump() if hasattr(i, 'model_dump') else (i if isinstance(i, dict) else vars(i)) for i in items], + groupCtx.itemIds, + ) + if paginationParams and hasattr(result, 'items'): + return PaginatedResponse( + items=items, + pagination=PaginationMetadata( + currentPage=paginationParams.page, + pageSize=paginationParams.pageSize, + totalItems=result.totalItems, + totalPages=result.totalPages, + sort=paginationParams.sort, + filters=paginationParams.filters + ), + groupTree=groupCtx.groupTree, ) - ) + else: + return PaginatedResponse(items=items, pagination=None, groupTree=groupCtx.groupTree) else: - items = result if isinstance(result, list) else (result.items if hasattr(result, 'items') else result) - return PaginatedResponse( - items=items, - pagination=None - ) + mandateItems = applyGroupScopeFilter(_mandateItemsForAdmin(), groupCtx.itemIds) + return PaginatedResponse(items=mandateItems, pagination=None, groupTree=groupCtx.groupTree) + except HTTPException: raise except Exception as e: diff --git a/modules/routes/routeDataPrompts.py b/modules/routes/routeDataPrompts.py index ee99b912..84559ebb 100644 --- a/modules/routes/routeDataPrompts.py +++ b/modules/routes/routeDataPrompts.py @@ -44,27 +44,15 @@ def get_prompts( - filterValues: distinct 
values for a column (cross-filtered) - ids: all IDs matching current filters """ - from modules.routes.routeHelpers import handleFilterValuesInMemory, handleIdsInMemory, enrichRowsWithFkLabels + from modules.routes.routeHelpers import ( + handleFilterValuesInMemory, handleIdsInMemory, enrichRowsWithFkLabels, + handleGroupingInRequest, applyGroupScopeFilter, + ) + from modules.interfaces.interfaceDbApp import getInterface as getAppInterface - def _promptsToEnrichedDicts(promptItems): - dicts = [r.model_dump() if hasattr(r, 'model_dump') else (dict(r) if not isinstance(r, dict) else r) for r in promptItems] - enrichRowsWithFkLabels(dicts, Prompt) - return dicts - - if mode == "filterValues": - if not column: - raise HTTPException(status_code=400, detail="column parameter required for mode=filterValues") - managementInterface = interfaceDbManagement.getInterface(currentUser) - result = managementInterface.getAllPrompts(pagination=None) - items = _promptsToEnrichedDicts(result) - return handleFilterValuesInMemory(items, column, pagination) - - if mode == "ids": - managementInterface = interfaceDbManagement.getInterface(currentUser) - result = managementInterface.getAllPrompts(pagination=None) - items = _promptsToEnrichedDicts(result) - return handleIdsInMemory(items, pagination) + CONTEXT_KEY = "prompts" + # Parse pagination params early — needed for grouping in all modes paginationParams = None if pagination: try: @@ -74,12 +62,35 @@ def get_prompts( paginationParams = PaginationParams(**paginationDict) except (json.JSONDecodeError, ValueError) as e: raise HTTPException(status_code=400, detail=f"Invalid pagination parameter: {str(e)}") - + + appInterface = getAppInterface(currentUser) + groupCtx = handleGroupingInRequest(paginationParams, appInterface, CONTEXT_KEY) + + def _promptsToEnrichedDicts(promptItems): + dicts = [r.model_dump() if hasattr(r, 'model_dump') else (dict(r) if not isinstance(r, dict) else r) for r in promptItems] + enrichRowsWithFkLabels(dicts, 
Prompt) + return dicts + managementInterface = interfaceDbManagement.getInterface(currentUser) + + if mode == "filterValues": + if not column: + raise HTTPException(status_code=400, detail="column parameter required for mode=filterValues") + result = managementInterface.getAllPrompts(pagination=None) + items = _promptsToEnrichedDicts(result) + items = applyGroupScopeFilter(items, groupCtx.itemIds) + return handleFilterValuesInMemory(items, column, pagination) + + if mode == "ids": + result = managementInterface.getAllPrompts(pagination=None) + items = _promptsToEnrichedDicts(result) + items = applyGroupScopeFilter(items, groupCtx.itemIds) + return handleIdsInMemory(items, pagination) + result = managementInterface.getAllPrompts(pagination=paginationParams) - + if paginationParams: - items = _promptsToEnrichedDicts(result.items) + items = applyGroupScopeFilter(_promptsToEnrichedDicts(result.items), groupCtx.itemIds) return { "items": items, "pagination": PaginationMetadata( @@ -90,12 +101,14 @@ def get_prompts( sort=paginationParams.sort, filters=paginationParams.filters ).model_dump(), + "groupTree": groupCtx.groupTree, } else: - items = _promptsToEnrichedDicts(result) + items = applyGroupScopeFilter(_promptsToEnrichedDicts(result), groupCtx.itemIds) return { "items": items, "pagination": None, + "groupTree": groupCtx.groupTree, } diff --git a/modules/routes/routeDataUsers.py b/modules/routes/routeDataUsers.py index 6d72b763..25d20c39 100644 --- a/modules/routes/routeDataUsers.py +++ b/modules/routes/routeDataUsers.py @@ -208,6 +208,21 @@ def get_users( - GET /api/users/ (no pagination - returns all users in mandate) - GET /api/users/?pagination={"page":1,"pageSize":10,"sort":[]} """ + # Parse pagination early — needed for grouping in all modes + _paginationParams = None + if pagination: + try: + _pd = json.loads(pagination) + if _pd: + _pd = normalize_pagination_dict(_pd) + _paginationParams = PaginationParams(**_pd) + except (json.JSONDecodeError, ValueError) as 
e: + raise HTTPException(status_code=400, detail=f"Invalid pagination parameter: {str(e)}") + + from modules.routes.routeHelpers import handleGroupingInRequest as _handleGrouping, applyGroupScopeFilter as _applyGroupScope + _appInterfaceForGrouping = interfaceDbApp.getInterface(context.user, mandateId=context.mandateId) + _groupCtx = _handleGrouping(_paginationParams, _appInterfaceForGrouping, "users") + if mode == "filterValues": if not column: raise HTTPException(status_code=400, detail="column parameter required for mode=filterValues") @@ -217,27 +232,15 @@ def get_users( return _getUserFilterOrIds(context, pagination, idsMode=True) try: - paginationParams = None - if pagination: - try: - paginationDict = json.loads(pagination) - if paginationDict: - paginationDict = normalize_pagination_dict(paginationDict) - paginationParams = PaginationParams(**paginationDict) - except (json.JSONDecodeError, ValueError) as e: - raise HTTPException( - status_code=400, - detail=f"Invalid pagination parameter: {str(e)}" - ) - - appInterface = interfaceDbApp.getInterface(context.user, mandateId=context.mandateId) + paginationParams = _paginationParams + appInterface = _appInterfaceForGrouping if context.mandateId: # Get users for specific mandate using getUsersByMandate result = appInterface.getUsersByMandate(str(context.mandateId), paginationParams) - + if paginationParams and hasattr(result, 'items'): - enriched = enrichRowsWithFkLabels(_usersToDicts(result.items), User) + enriched = _applyGroupScope(enrichRowsWithFkLabels(_usersToDicts(result.items), User), _groupCtx.itemIds) return { "items": enriched, "pagination": PaginationMetadata( @@ -248,17 +251,18 @@ def get_users( sort=paginationParams.sort, filters=paginationParams.filters ).model_dump(), + "groupTree": _groupCtx.groupTree, } else: users = result if isinstance(result, list) else result.items if hasattr(result, 'items') else [] - enriched = enrichRowsWithFkLabels(_usersToDicts(users), User) - return {"items": 
enriched, "pagination": None} + enriched = _applyGroupScope(enrichRowsWithFkLabels(_usersToDicts(users), User), _groupCtx.itemIds) + return {"items": enriched, "pagination": None, "groupTree": _groupCtx.groupTree} elif context.isPlatformAdmin: # PlatformAdmin without mandateId — DB-level pagination via interface result = appInterface.getAllUsers(paginationParams) - + if paginationParams and hasattr(result, 'items'): - enriched = enrichRowsWithFkLabels(_usersToDicts(result.items), User) + enriched = _applyGroupScope(enrichRowsWithFkLabels(_usersToDicts(result.items), User), _groupCtx.itemIds) return { "items": enriched, "pagination": PaginationMetadata( @@ -269,11 +273,12 @@ def get_users( sort=paginationParams.sort, filters=paginationParams.filters ).model_dump(), + "groupTree": _groupCtx.groupTree, } else: users = result if isinstance(result, list) else (result.items if hasattr(result, 'items') else []) - enriched = enrichRowsWithFkLabels(_usersToDicts(users), User) - return {"items": enriched, "pagination": None} + enriched = _applyGroupScope(enrichRowsWithFkLabels(_usersToDicts(users), User), _groupCtx.itemIds) + return {"items": enriched, "pagination": None, "groupTree": _groupCtx.groupTree} else: # Non-SysAdmin without mandateId: aggregate users across all admin mandates rootInterface = getRootInterface() @@ -313,16 +318,16 @@ def get_users( ] from modules.routes.routeHelpers import applyFiltersAndSort as _applyFiltersAndSortHelper - filteredUsers = _applyFiltersAndSortHelper(allUsers, paginationParams) + filteredUsers = _applyGroupScope(_applyFiltersAndSortHelper(allUsers, paginationParams), _groupCtx.itemIds) enriched = enrichRowsWithFkLabels(filteredUsers, User) - + if paginationParams: import math totalItems = len(enriched) totalPages = math.ceil(totalItems / paginationParams.pageSize) if totalItems > 0 else 0 startIdx = (paginationParams.page - 1) * paginationParams.pageSize endIdx = startIdx + paginationParams.pageSize - + return { "items": 
enriched[startIdx:endIdx], "pagination": PaginationMetadata( @@ -333,9 +338,10 @@ def get_users( sort=paginationParams.sort, filters=paginationParams.filters ).model_dump(), + "groupTree": _groupCtx.groupTree, } else: - return {"items": enriched, "pagination": None} + return {"items": enriched, "pagination": None, "groupTree": _groupCtx.groupTree} except HTTPException: raise except Exception as e: diff --git a/modules/routes/routeHelpers.py b/modules/routes/routeHelpers.py index 37bfa3b2..0f0b8ea7 100644 --- a/modules/routes/routeHelpers.py +++ b/modules/routes/routeHelpers.py @@ -701,3 +701,142 @@ def paginateInMemory( offset = (paginationParams.page - 1) * paginationParams.pageSize pageItems = items[offset:offset + paginationParams.pageSize] return pageItems, totalItems + + +# --------------------------------------------------------------------------- +# Table Grouping helpers +# --------------------------------------------------------------------------- + +from dataclasses import dataclass, field as dc_field + + +@dataclass +class GroupingContext: + """ + Result of handleGroupingInRequest. + Carries the group tree for the response and the resolved item-ID set for + group-scope filtering (None = no active group scope). + """ + groupTree: Optional[list] # List[TableGroupNode] serialised as dicts — for response + itemIds: Optional[set] # Set[str] when groupId was set, else None + + +def _collectItemIds(nodes: list, groupId: str) -> Optional[set]: + """ + Recursively search *nodes* for a node whose id == groupId and collect + all itemIds from it and all its descendant subGroups. + Returns None if the group is not found. 
+ """ + for node in nodes: + nodeId = node.get("id") if isinstance(node, dict) else getattr(node, "id", None) + if nodeId == groupId: + ids: set = set() + _collectAllIds(node, ids) + return ids + subGroups = node.get("subGroups", []) if isinstance(node, dict) else getattr(node, "subGroups", []) + result = _collectItemIds(subGroups, groupId) + if result is not None: + return result + return None + + +def _collectAllIds(node, ids: set) -> None: + """Collect itemIds from a node and all its descendants into ids.""" + nodeItemIds = node.get("itemIds", []) if isinstance(node, dict) else getattr(node, "itemIds", []) + for iid in nodeItemIds: + ids.add(str(iid)) + subGroups = node.get("subGroups", []) if isinstance(node, dict) else getattr(node, "subGroups", []) + for child in subGroups: + _collectAllIds(child, ids) + + +def handleGroupingInRequest( + paginationParams: Optional[PaginationParams], + interface, + contextKey: str, +) -> GroupingContext: + """ + Central grouping handler — call at the start of every list route that + supports table grouping. + + Steps (in order): + 1. If paginationParams.saveGroupTree is set: + persist the new tree via interface.upsertTableGrouping, then clear + saveGroupTree from paginationParams so it is not treated as a filter. + 2. Load the current group tree from the DB (used in step 3 and response). + 3. If paginationParams.groupId is set: + resolve it to a Set[str] of itemIds (including all sub-groups), + then clear groupId from paginationParams so it is not treated as a + normal filter field. + 4. Return a GroupingContext with groupTree (for the response) and itemIds + (for applyGroupScopeFilter). + + The caller does NOT need to handle any grouping logic itself — just call + applyGroupScopeFilter(items, groupCtx.itemIds) and embed groupCtx.groupTree + in the response dict. 
+ """ + from modules.datamodels.datamodelPagination import TableGroupNode + + groupTree = None + itemIds = None + + if paginationParams is None: + try: + existing = interface.getTableGrouping(contextKey) + if existing: + groupTree = [n.model_dump() if hasattr(n, "model_dump") else n for n in existing.rootGroups] + except Exception as e: + logger.warning(f"handleGroupingInRequest: getTableGrouping failed: {e}") + return GroupingContext(groupTree=groupTree, itemIds=None) + + # Step 1: persist saveGroupTree if present + if paginationParams.saveGroupTree is not None: + try: + saved = interface.upsertTableGrouping(contextKey, paginationParams.saveGroupTree) + groupTree = [n.model_dump() if hasattr(n, "model_dump") else n for n in saved.rootGroups] + except Exception as e: + logger.error(f"handleGroupingInRequest: upsertTableGrouping failed: {e}") + paginationParams.saveGroupTree = None + + # Step 2: load current tree (only if not already set from save above) + if groupTree is None: + try: + existing = interface.getTableGrouping(contextKey) + if existing: + groupTree = [n.model_dump() if hasattr(n, "model_dump") else n for n in existing.rootGroups] + except Exception as e: + logger.warning(f"handleGroupingInRequest: getTableGrouping failed: {e}") + + # Step 3: resolve groupId to itemIds set + if paginationParams.groupId is not None: + targetGroupId = paginationParams.groupId + paginationParams.groupId = None # remove so it is not treated as a normal filter + if groupTree: + itemIds = _collectItemIds(groupTree, targetGroupId) + if itemIds is None: + logger.warning( + f"handleGroupingInRequest: groupId={targetGroupId!r} not found in tree " + f"for contextKey={contextKey!r} — returning empty set" + ) + itemIds = set() # unknown group → show nothing rather than everything + else: + # groupId sent but no tree saved yet → return empty (nothing belongs to any group) + logger.warning( + f"handleGroupingInRequest: groupId={targetGroupId!r} set but no tree exists " + f"for 
contextKey={contextKey!r} — returning empty set" + ) + itemIds = set() + + return GroupingContext(groupTree=groupTree, itemIds=itemIds) + + +def applyGroupScopeFilter(items: List[Dict[str, Any]], itemIds: Optional[set]) -> List[Dict[str, Any]]: + """ + Filter items to those whose "id" field is in itemIds. + Returns items unchanged when itemIds is None (no active group scope). + Works for both normal list items and for mode=ids / mode=filterValues flows + — call it before handleIdsInMemory / handleFilterValuesInMemory. + """ + if itemIds is None: + return items + return [item for item in items if str(item.get("id", "")) in itemIds] From 880fa4d78783889f5354af0a4bf075073469f0e8 Mon Sep 17 00:00:00 2001 From: ValueOn AG Date: Wed, 29 Apr 2026 21:27:08 +0200 Subject: [PATCH 11/18] plana+c implemented --- app.py | 3 + .../datamodelFeatureGraphicalEditor.py | 13 +- .../interfaceFeatureGraphicalEditor.py | 2 + .../graphicalEditor/nodeDefinitions/ai.py | 2 +- .../graphicalEditor/nodeDefinitions/email.py | 2 +- .../routeFeatureGraphicalEditor.py | 56 +++ .../interfaces/_legacyMigrationTelemetry.py | 198 +++++++++ modules/interfaces/interfaceBootstrap.py | 221 +--------- modules/interfaces/interfaceDbKnowledge.py | 35 +- modules/interfaces/interfaceFeatures.py | 1 + modules/routes/routeAutomationWorkspace.py | 246 +++++++++++ .../workflows/automation2/executionEngine.py | 32 +- modules/workflows/scheduler/mainScheduler.py | 2 + scripts/_archive/README.md | 19 + .../check_orphan_featureinstance.py | 0 .../i18n_rekey_plaintext_keys.py | 0 .../{ => _archive}/migrate_async_to_sync.py | 0 .../script_db_cleanup_duplicate_roles.py | 0 ...cript_db_migrate_accessrules_objectkeys.py | 0 scripts/_listMandates.py | 25 -- scripts/script_db_audit_legacy_state.py | 382 ++++++++++++++++++ .../bootstrap/test_mandateNameMigration.py | 133 ------ tests/unit/rbac/test_sysadmin_migration.py | 209 ---------- .../workflows/test_automation2_graphUtils.py | 11 + 24 files changed, 977 insertions(+), 
615 deletions(-) create mode 100644 modules/interfaces/_legacyMigrationTelemetry.py create mode 100644 modules/routes/routeAutomationWorkspace.py create mode 100644 scripts/_archive/README.md rename scripts/{ => _archive}/check_orphan_featureinstance.py (100%) rename scripts/{ => _archive}/i18n_rekey_plaintext_keys.py (100%) rename scripts/{ => _archive}/migrate_async_to_sync.py (100%) rename scripts/{ => _archive}/script_db_cleanup_duplicate_roles.py (100%) rename scripts/{ => _archive}/script_db_migrate_accessrules_objectkeys.py (100%) delete mode 100644 scripts/_listMandates.py create mode 100644 scripts/script_db_audit_legacy_state.py delete mode 100644 tests/unit/bootstrap/test_mandateNameMigration.py delete mode 100644 tests/unit/rbac/test_sysadmin_migration.py diff --git a/app.py b/app.py index 41271739..adcd5365 100644 --- a/app.py +++ b/app.py @@ -672,6 +672,9 @@ app.include_router(navigationRouter) from modules.routes.routeWorkflowDashboard import router as workflowDashboardRouter app.include_router(workflowDashboardRouter) +from modules.routes.routeAutomationWorkspace import router as automationWorkspaceRouter +app.include_router(automationWorkspaceRouter) + # ============================================================================ # PLUG&PLAY FEATURE ROUTERS # Dynamically load routers from feature containers in modules/features/ diff --git a/modules/features/graphicalEditor/datamodelFeatureGraphicalEditor.py b/modules/features/graphicalEditor/datamodelFeatureGraphicalEditor.py index 5ebf629e..10d1f47f 100644 --- a/modules/features/graphicalEditor/datamodelFeatureGraphicalEditor.py +++ b/modules/features/graphicalEditor/datamodelFeatureGraphicalEditor.py @@ -72,7 +72,7 @@ class AutoWorkflow(PowerOnModel): }, ) featureInstanceId: str = Field( - description="Feature instance ID", + description="Feature instance ID (GE owner instance / RBAC scope)", json_schema_extra={ "frontend_type": "text", "frontend_readonly": True, @@ -81,6 +81,17 @@ class 
AutoWorkflow(PowerOnModel): "fk_target": {"db": "poweron_app", "table": "FeatureInstance", "labelField": "label"}, }, ) + targetFeatureInstanceId: Optional[str] = Field( + default=None, + description="Target feature instance for execution data scope. NULL for templates, mandatory for non-templates.", + json_schema_extra={ + "frontend_type": "select", + "frontend_readonly": False, + "frontend_required": False, + "label": "Ziel-Instanz", + "fk_target": {"db": "poweron_app", "table": "FeatureInstance", "labelField": "label"}, + }, + ) label: str = Field( description="User-friendly workflow name", json_schema_extra={"frontend_type": "text", "frontend_required": True, "label": "Bezeichnung"}, diff --git a/modules/features/graphicalEditor/interfaceFeatureGraphicalEditor.py b/modules/features/graphicalEditor/interfaceFeatureGraphicalEditor.py index c84db9d3..3b665981 100644 --- a/modules/features/graphicalEditor/interfaceFeatureGraphicalEditor.py +++ b/modules/features/graphicalEditor/interfaceFeatureGraphicalEditor.py @@ -217,6 +217,8 @@ class GraphicalEditorObjects: data["id"] = str(uuid.uuid4()) data["mandateId"] = self.mandateId data["featureInstanceId"] = self.featureInstanceId + if not data.get("targetFeatureInstanceId") and not data.get("isTemplate"): + data["targetFeatureInstanceId"] = self.featureInstanceId if "active" not in data or data.get("active") is None: data["active"] = True data["invocations"] = normalize_invocations_list(data.get("invocations")) diff --git a/modules/features/graphicalEditor/nodeDefinitions/ai.py b/modules/features/graphicalEditor/nodeDefinitions/ai.py index d0e0eb22..3273540a 100644 --- a/modules/features/graphicalEditor/nodeDefinitions/ai.py +++ b/modules/features/graphicalEditor/nodeDefinitions/ai.py @@ -10,7 +10,7 @@ AI_NODES = [ "label": t("Prompt"), "description": t("Prompt eingeben und KI führt aus"), "parameters": [ - {"name": "aiPrompt", "type": "string", "required": True, "frontendType": "textarea", + {"name": "aiPrompt", 
"type": "string", "required": True, "frontendType": "templateTextarea", "description": t("KI-Prompt")}, {"name": "resultType", "type": "string", "required": False, "frontendType": "select", "frontendOptions": {"options": ["txt", "json", "md", "csv", "xml", "html", "pdf", "docx", "xlsx", "pptx", "png", "jpg"]}, diff --git a/modules/features/graphicalEditor/nodeDefinitions/email.py b/modules/features/graphicalEditor/nodeDefinitions/email.py index 11ff9895..270b8d63 100644 --- a/modules/features/graphicalEditor/nodeDefinitions/email.py +++ b/modules/features/graphicalEditor/nodeDefinitions/email.py @@ -62,7 +62,7 @@ EMAIL_NODES = [ {"name": "connectionReference", "type": "string", "required": True, "frontendType": "userConnection", "frontendOptions": {"authority": "msft"}, "description": t("E-Mail-Konto")}, - {"name": "context", "type": "string", "required": False, "frontendType": "textarea", + {"name": "context", "type": "string", "required": False, "frontendType": "templateTextarea", "description": t("Kontext / Brief-Beschreibung für die KI-Komposition"), "default": ""}, {"name": "to", "type": "string", "required": False, "frontendType": "text", "description": t("Empfänger (komma-separiert, optional für Entwurf)"), "default": ""}, diff --git a/modules/features/graphicalEditor/routeFeatureGraphicalEditor.py b/modules/features/graphicalEditor/routeFeatureGraphicalEditor.py index aed94a68..4748f39a 100644 --- a/modules/features/graphicalEditor/routeFeatureGraphicalEditor.py +++ b/modules/features/graphicalEditor/routeFeatureGraphicalEditor.py @@ -111,6 +111,44 @@ def _validateInstanceAccess(instanceId: str, context: RequestContext) -> str: return str(instance.mandateId) if instance.mandateId else "" +def _validateTargetInstance( + workflowData: Dict[str, Any], + ownerInstanceId: str, + context: RequestContext, +) -> None: + """Enforce targetFeatureInstanceId rules for non-template workflows. + + - Templates (isTemplate=True) may omit targetFeatureInstanceId. 
+ - Non-templates MUST have a non-empty targetFeatureInstanceId. + - If the targetFeatureInstanceId differs from the GE owner instance, + the user must also have FeatureAccess on that target instance. + """ + if workflowData.get("isTemplate"): + return + + targetId = workflowData.get("targetFeatureInstanceId") + if not targetId: + return + + if targetId == ownerInstanceId: + return + + from modules.interfaces.interfaceDbApp import getRootInterface + rootInterface = getRootInterface() + targetInstance = rootInterface.getFeatureInstance(targetId) + if not targetInstance: + raise HTTPException( + status_code=400, + detail=routeApiMsg("targetFeatureInstanceId refers to a non-existent feature instance"), + ) + targetAccess = rootInterface.getFeatureAccess(str(context.user.id), targetId) + if not targetAccess or not targetAccess.enabled: + raise HTTPException( + status_code=403, + detail=routeApiMsg("Access denied to target feature instance"), + ) + + @router.get("/{instanceId}/node-types") @limiter.limit("60/minute") def get_node_types( @@ -318,9 +356,12 @@ async def post_execute( workflowId = body.get("workflowId") req_nodes = graph.get("nodes") or [] workflow_for_envelope: Optional[Dict[str, Any]] = None + targetFeatureInstanceId: Optional[str] = None if workflowId and not str(workflowId).startswith("transient-"): iface = getGraphicalEditorInterface(context.user, mandateId, instanceId) workflow_for_envelope = iface.getWorkflow(workflowId) + if workflow_for_envelope: + targetFeatureInstanceId = workflow_for_envelope.get("targetFeatureInstanceId") if workflowId and len(req_nodes) == 0: iface = getGraphicalEditorInterface(context.user, mandateId, instanceId) wf = iface.getWorkflow(workflowId) @@ -328,10 +369,18 @@ async def post_execute( graph = wf["graph"] logger.info("graphicalEditor execute: loaded graph from workflow %s", workflowId) workflow_for_envelope = wf + targetFeatureInstanceId = wf.get("targetFeatureInstanceId") if not workflowId: import uuid workflowId = 
f"transient-{uuid.uuid4().hex[:12]}" logger.info("graphicalEditor execute: using transient workflowId=%s", workflowId) + + if targetFeatureInstanceId and targetFeatureInstanceId != instanceId: + _validateTargetInstance( + {"targetFeatureInstanceId": targetFeatureInstanceId}, + instanceId, + context, + ) nodes_count = len(graph.get("nodes") or []) connections_count = len(graph.get("connections") or []) logger.info( @@ -363,6 +412,7 @@ async def post_execute( automation2_interface=ge_interface, run_envelope=run_env, label=_wfLabel, + targetFeatureInstanceId=targetFeatureInstanceId, ) logger.info( "graphicalEditor execute result: success=%s error=%s nodeOutputs_keys=%s failedNode=%s paused=%s", @@ -1371,6 +1421,7 @@ def create_workflow( ) -> dict: """Create a new workflow.""" mandateId = _validateInstanceAccess(instanceId, context) + _validateTargetInstance(body, instanceId, context) iface = getGraphicalEditorInterface(context.user, mandateId, instanceId) created = iface.createWorkflow(body) return created @@ -1388,6 +1439,11 @@ def update_workflow( """Update a workflow.""" mandateId = _validateInstanceAccess(instanceId, context) iface = getGraphicalEditorInterface(context.user, mandateId, instanceId) + existing = iface.getWorkflow(workflowId) + if not existing: + raise HTTPException(status_code=404, detail=routeApiMsg("Workflow not found")) + merged = {**existing, **body} + _validateTargetInstance(merged, instanceId, context) updated = iface.updateWorkflow(workflowId, body) if not updated: raise HTTPException(status_code=404, detail=routeApiMsg("Workflow not found")) diff --git a/modules/interfaces/_legacyMigrationTelemetry.py b/modules/interfaces/_legacyMigrationTelemetry.py new file mode 100644 index 00000000..4a0db04c --- /dev/null +++ b/modules/interfaces/_legacyMigrationTelemetry.py @@ -0,0 +1,198 @@ +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +"""Lightweight Bootstrap-Telemetrie fuer entfernte Migrationsroutinen. 
+ +Wenn eine idempotente Bootstrap-Migration (z.B. ``_migrateAndDropSysAdminRole``) +aus dem Boot-Pfad entfernt wird, koennte ein theoretischer Edge-Case (alte +DB-Restore, manueller INSERT) wieder Legacy-Daten ins System bringen. Damit das +nicht still bleibt, ruft ``initBootstrap`` nach Abschluss aller Init-Schritte +einmalig ``runLegacyDataChecks`` auf -- das logged WARN bei Restbestand. + +Designprinzipien: +- KEINE Schreibzugriffe (rein lesend). +- Process-lokal gecached (``_cache``), damit identische Boots/Reloads den Check + nur einmal laufen lassen. +- Pro Check eine Recordset-Abfrage; Ausnahmen werden als WARN geloggt, nicht + re-raised, damit Telemetrie den Boot nie crasht. +""" + +from __future__ import annotations + +import logging +from typing import Any + +from modules.connectors.connectorDbPostgre import DatabaseConnector +from modules.datamodels.datamodelRbac import Role +from modules.datamodels.datamodelUam import Mandate +from modules.shared.mandateNameUtils import isValidMandateName + +logger = logging.getLogger(__name__) + +_alreadyRan: bool = False + + +def runLegacyDataChecks(db: DatabaseConnector) -> None: + """Logged WARN, falls noch Legacy-Daten existieren, die durch entfernte + Migrationsroutinen behandelt wurden. Prozessweit nur einmal aktiv. + + Aufruf: am Ende von ``initBootstrap``. 
+ """ + global _alreadyRan + if _alreadyRan: + return + _alreadyRan = True + + _checkMandateDescription(db) + _checkMandateSlugRules(db) + _checkLegacyRootMandate(db) + _checkSysadminRole(db) + _backfillTargetFeatureInstanceId() + + +def _safe(checkName: str, fn) -> Any: + try: + return fn() + except Exception as exc: + logger.warning( + "Legacy-data telemetry check '%s' failed: %s: %s", + checkName, type(exc).__name__, exc, + ) + return None + + +def _checkMandateDescription(db: DatabaseConnector) -> None: + def _do() -> None: + rows = db.getRecordset(Mandate) + bad = [ + r.get("id") for r in rows + if r.get("description") and not r.get("label") + ] + if bad: + logger.warning( + "Legacy-data check: %d Mandate row(s) still have description " + "but empty label (removed migration: _migrateMandateDescriptionToLabel). " + "Run scripts/script_db_audit_legacy_state.py for details. IDs: %s", + len(bad), bad[:5], + ) + + _safe("mandate-description", _do) + + +def _checkMandateSlugRules(db: DatabaseConnector) -> None: + def _do() -> None: + rows = db.getRecordset(Mandate) + seen: set[str] = set() + bad: list[str] = [] + for r in sorted(rows, key=lambda x: str(x.get("id", ""))): + mid = r.get("id") + if not mid: + continue + name = (r.get("name") or "").strip() + labelRaw = r.get("label") + labelEmpty = not (labelRaw or "").strip() if labelRaw is not None else True + invalid = not isValidMandateName(name) + collides = name in seen + if not invalid and not collides: + seen.add(name) + if labelEmpty or invalid or collides: + bad.append(str(mid)) + if bad: + logger.warning( + "Legacy-data check: %d Mandate row(s) violate slug/label rules " + "(removed migration: _migrateMandateNameLabelSlugRules). " + "Run scripts/script_db_audit_legacy_state.py for details. 
IDs: %s", + len(bad), bad[:5], + ) + + _safe("mandate-slug-rules", _do) + + +def _checkLegacyRootMandate(db: DatabaseConnector) -> None: + def _do() -> None: + legacy = db.getRecordset(Mandate, recordFilter={"name": "Root"}) + rootRows = db.getRecordset(Mandate, recordFilter={"name": "root"}) + legacyByFlag = [r for r in rootRows if not r.get("isSystem")] + all_ = list(legacy) + legacyByFlag + if all_: + logger.warning( + "Legacy-data check: %d Root-Mandate row(s) still in legacy form " + "(removed migration: initRootMandate-legacy-branch). IDs: %s", + len(all_), [r.get("id") for r in all_][:5], + ) + + _safe("root-mandate-legacy", _do) + + +def _checkSysadminRole(db: DatabaseConnector) -> None: + def _do() -> None: + rootMandates = db.getRecordset( + Mandate, recordFilter={"name": "root", "isSystem": True} + ) + if not rootMandates: + return + rootId = str(rootMandates[0].get("id")) + rows = db.getRecordset( + Role, + recordFilter={ + "roleLabel": "sysadmin", + "mandateId": rootId, + "featureInstanceId": None, + }, + ) + if rows: + logger.warning( + "Legacy-data check: %d 'sysadmin' role(s) still present in root mandate " + "(removed migration: _migrateAndDropSysAdminRole). " + "Authority is now User.isPlatformAdmin -- migrate manually. IDs: %s", + len(rows), [r.get("id") for r in rows], + ) + + _safe("sysadmin-role", _do) + + +def _backfillTargetFeatureInstanceId() -> None: + """Idempotent backfill: set targetFeatureInstanceId = featureInstanceId + for all non-template AutoWorkflow rows where it is still NULL. + + Connects to ``poweron_graphicaleditor`` independently. 
+ """ + def _do() -> None: + from modules.shared.configuration import APP_CONFIG + from modules.features.graphicalEditor.datamodelFeatureGraphicalEditor import AutoWorkflow + + dbHost = APP_CONFIG.get("DB_HOST", "localhost") + dbUser = APP_CONFIG.get("DB_USER") + dbPassword = APP_CONFIG.get("DB_PASSWORD_SECRET") or APP_CONFIG.get("DB_PASSWORD") + dbPort = int(APP_CONFIG.get("DB_PORT", 5432)) + geDb = DatabaseConnector( + dbHost=dbHost, + dbDatabase="poweron_graphicaleditor", + dbUser=dbUser, + dbPassword=dbPassword, + dbPort=dbPort, + userId=None, + ) + if not geDb._ensureTableExists(AutoWorkflow): + return + + rows = geDb.getRecordset(AutoWorkflow) or [] + backfilled = 0 + for r in rows: + if r.get("isTemplate"): + continue + if r.get("targetFeatureInstanceId"): + continue + srcId = r.get("featureInstanceId") + if not srcId: + continue + geDb.recordModify(AutoWorkflow, r["id"], {"targetFeatureInstanceId": srcId}) + backfilled += 1 + + if backfilled: + logger.info( + "targetFeatureInstanceId backfill: set %d non-template AutoWorkflow row(s) " + "to their featureInstanceId", + backfilled, + ) + + _safe("backfill-targetFeatureInstanceId", _do) diff --git a/modules/interfaces/interfaceBootstrap.py b/modules/interfaces/interfaceBootstrap.py index a6ae0052..4bcd0e97 100644 --- a/modules/interfaces/interfaceBootstrap.py +++ b/modules/interfaces/interfaceBootstrap.py @@ -56,14 +56,8 @@ def initBootstrap(db: DatabaseConnector) -> None: logger.info("Starting system bootstrap") - # Initialize root mandate mandateId = initRootMandate(db) - # Migrate existing mandate records: description -> label - _migrateMandateDescriptionToLabel(db) - _migrateMandateNameLabelSlugRules(db) - - # Clean up duplicate roles and fix corrupted templates FIRST _deduplicateRoles(db) # Initialize system role TEMPLATES (mandateId=None, isSystemRole=True) @@ -76,14 +70,6 @@ def initBootstrap(db: DatabaseConnector) -> None: # This also serves as migration for existing mandates that don't have instance 
roles yet _ensureAllMandatesHaveSystemRoles(db) - # Migration: eliminate the legacy ``sysadmin`` role in root mandate - # (replaced by ``User.isPlatformAdmin`` flag — see - # wiki/c-work/4-done/2026-04-sysadmin-authority-split.md). - # Idempotent: noop after first successful run. - if mandateId: - _migrateAndDropSysAdminRole(db, mandateId) - - # Ensure UI rules for navigation items (admin/user/viewer roles) _ensureUiContextRules(db) # Initialize admin user @@ -132,6 +118,15 @@ def initBootstrap(db: DatabaseConnector) -> None: # Ensure billing settings and accounts exist for all mandates _bootstrapBilling() + # Telemetrie: warne falls Restbestaende der entfernten idempotenten + # Migrationen wieder auftauchen (Edge-Case: alter DB-Restore o.ae.). + # Schreibt nicht, scheitert nicht den Boot. + try: + from modules.interfaces._legacyMigrationTelemetry import runLegacyDataChecks + runLegacyDataChecks(db) + except Exception as e: + logger.warning(f"Legacy-data telemetry skipped: {e}") + def _bootstrapBilling() -> None: """ @@ -396,21 +391,12 @@ def initRootMandate(db: DatabaseConnector) -> Optional[str]: Returns: Mandate ID if created or found, None otherwise """ - # Find existing root mandate by name AND isSystem flag existingMandates = db.getRecordset(Mandate, recordFilter={"name": "root", "isSystem": True}) if existingMandates: mandateId = existingMandates[0].get("id") logger.info(f"Root mandate already exists with ID {mandateId}") return mandateId - - # Check for legacy root mandates (name="Root" without isSystem flag) and migrate - legacyMandates = db.getRecordset(Mandate, recordFilter={"name": "Root"}) - if legacyMandates: - mandateId = legacyMandates[0].get("id") - logger.info(f"Migrating legacy Root mandate {mandateId}: setting name='root', isSystem=True") - db.recordModify(Mandate, mandateId, {"name": "root", "isSystem": True}) - return mandateId - + logger.info("Creating Root mandate") rootMandate = Mandate(name="root", label="Root", isSystem=True, 
enabled=True) createdMandate = db.recordCreate(Mandate, rootMandate) @@ -419,98 +405,6 @@ def initRootMandate(db: DatabaseConnector) -> Optional[str]: return mandateId -def _migrateMandateDescriptionToLabel(db: DatabaseConnector) -> None: - """ - Migration: Rename 'description' field to 'label' in all Mandate records. - Copies existing 'description' values to 'label' and removes the old field. - Safe to run multiple times (idempotent). - """ - allMandates = db.getRecordset(Mandate) - migratedCount = 0 - for mandateRecord in allMandates: - mandateId = mandateRecord.get("id") - hasDescription = "description" in mandateRecord and mandateRecord.get("description") is not None - hasLabel = "label" in mandateRecord and mandateRecord.get("label") is not None - - if hasDescription and not hasLabel: - # Copy description to label - updateData = {"label": mandateRecord["description"]} - db.recordModify(Mandate, mandateId, updateData) - migratedCount += 1 - logger.info(f"Migrated mandate {mandateId}: description -> label") - - if migratedCount > 0: - logger.info(f"Migrated {migratedCount} mandate(s) from description to label") - else: - logger.debug("No mandate description->label migration needed") - - -def _migrateMandateNameLabelSlugRules(db: DatabaseConnector) -> None: - """ - Migration: normalize Mandate.name to the slug rules ([a-z0-9-], length 2..32, single - hyphen segments) and ensure Mandate.label is non-empty. - - Rules (see wiki/c-work/1-plan/2026-04-mandate-name-label-logic.md): - 1. If ``label`` is empty/None → set ``label := name`` (or "Mandate" when both empty). - 2. If ``name`` is not a valid slug, or collides with an earlier mandate in stable id - order, allocate a unique slug from the (now non-empty) ``label`` using - ``slugifyMandateName`` + ``allocateUniqueMandateSlug``. - - Idempotent: a second run is a no-op because all valid names stay valid and stay unique. - Each rename and label fill-in is logged for audit. 
- """ - from modules.shared.mandateNameUtils import ( - allocateUniqueMandateSlug, - isValidMandateName, - slugifyMandateName, - ) - - allRows = db.getRecordset(Mandate) - if not allRows: - return - sortedRows = sorted(allRows, key=lambda r: str(r.get("id", ""))) - - used: set[str] = set() - labelFills = 0 - nameRenames: list[tuple[str, str, str]] = [] - - for rec in sortedRows: - mid = rec.get("id") - if not mid: - continue - name = (rec.get("name") or "").strip() - labelRaw = rec.get("label") - label = (labelRaw or "").strip() if labelRaw is not None else "" - - if not label: - label = name if name else "Mandate" - db.recordModify(Mandate, mid, {"label": label}) - labelFills += 1 - logger.info(f"Mandate {mid}: filled empty label with '{label}'") - - nameFits = isValidMandateName(name) - nameCollides = name in used - if nameFits and not nameCollides: - used.add(name) - continue - - base = slugifyMandateName(label) or "mn" - newName = allocateUniqueMandateSlug(base, used) - used.add(newName) - if newName != name: - db.recordModify(Mandate, mid, {"name": newName}) - nameRenames.append((str(mid), name, newName)) - logger.info(f"Mandate {mid}: renamed name '{name}' -> '{newName}'") - - if labelFills or nameRenames: - logger.info( - "Mandate name/label slug migration: %d label fill-in(s), %d name rename(s)", - labelFills, len(nameRenames), - ) - else: - logger.debug("No mandate name/label slug migration needed") - - def initAdminUser(db: DatabaseConnector, mandateId: Optional[str]) -> Optional[str]: """ Creates the Admin user if it doesn't exist. @@ -837,101 +731,6 @@ def copySystemRolesToMandate(db: DatabaseConnector, mandateId: str) -> int: return copiedCount -def _migrateAndDropSysAdminRole(db: DatabaseConnector, mandateId: str) -> None: - """ - One-shot migration: eliminate the legacy ``sysadmin`` role in the root mandate. 
- - Authority semantics moved to two orthogonal flags on User: - - ``isSysAdmin`` → Infrastructure-Operator (RBAC bypass) - - ``isPlatformAdmin`` → Cross-Mandate-Governance (no bypass) - - Migration steps (idempotent): - 1. Find sysadmin role(s) in root mandate. If none exist → done. - 2. For every UserMandateRole row referencing such a role: set - ``user.isPlatformAdmin = True`` (preserves cross-mandate authority). - 3. Delete those UserMandateRole rows. - 4. Delete AccessRules attached to the sysadmin role. - 5. Delete the sysadmin Role record. - - Args: - db: Database connector instance - mandateId: Root mandate ID - """ - sysadminRoles = db.getRecordset( - Role, - recordFilter={"roleLabel": "sysadmin", "mandateId": mandateId, "featureInstanceId": None}, - ) - if not sysadminRoles: - logger.debug("Sysadmin role migration: no legacy sysadmin role present, nothing to do") - return - - sysadminRoleIds = [str(r.get("id")) for r in sysadminRoles if r.get("id")] - logger.warning( - f"Sysadmin role migration: found {len(sysadminRoleIds)} legacy sysadmin role(s) " - f"in root mandate, migrating to isPlatformAdmin flag" - ) - - # 1) Promote every holder to isPlatformAdmin=True - promoted = 0 - for sysadminRoleId in sysadminRoleIds: - umRoleRows = db.getRecordset( - UserMandateRole, recordFilter={"roleId": sysadminRoleId} - ) - userMandateIds = [str(r.get("userMandateId")) for r in umRoleRows if r.get("userMandateId")] - if not userMandateIds: - continue - - # Resolve userIds via UserMandate - userIds = set() - for umId in userMandateIds: - ums = db.getRecordset(UserMandate, recordFilter={"id": umId}) - for um in ums: - uid = um.get("userId") if isinstance(um, dict) else getattr(um, "userId", None) - if uid: - userIds.add(str(uid)) - - for userId in userIds: - users = db.getRecordset(UserInDB, recordFilter={"id": userId}) - if not users: - continue - current = users[0].get("isPlatformAdmin", False) - if not current: - db.recordModify(UserInDB, userId, {"isPlatformAdmin": 
True}) - promoted += 1 - logger.warning( - f"Sysadmin role migration: granted isPlatformAdmin=True to user {userId}" - ) - - # 2) Delete UserMandateRole rows - for umRow in umRoleRows: - rowId = umRow.get("id") if isinstance(umRow, dict) else getattr(umRow, "id", None) - if rowId: - try: - db.recordDelete(UserMandateRole, str(rowId)) - except Exception as e: - logger.error(f"Sysadmin role migration: failed to drop UserMandateRole {rowId}: {e}") - - # 3) Delete AccessRules - accessRules = db.getRecordset(AccessRule, recordFilter={"roleId": sysadminRoleId}) - for ar in accessRules: - arId = ar.get("id") if isinstance(ar, dict) else getattr(ar, "id", None) - if arId: - try: - db.recordDelete(AccessRule, str(arId)) - except Exception as e: - logger.error(f"Sysadmin role migration: failed to drop AccessRule {arId}: {e}") - - # 4) Delete the Role - try: - db.recordDelete(Role, sysadminRoleId) - except Exception as e: - logger.error(f"Sysadmin role migration: failed to drop Role {sysadminRoleId}: {e}") - - logger.warning( - f"Sysadmin role migration: completed; promoted {promoted} user(s) to isPlatformAdmin" - ) - - def _getRoleId(db: DatabaseConnector, roleLabel: str) -> Optional[str]: """ Get role ID by label, using cache or database lookup. diff --git a/modules/interfaces/interfaceDbKnowledge.py b/modules/interfaces/interfaceDbKnowledge.py index f819615e..9d6ba3d4 100644 --- a/modules/interfaces/interfaceDbKnowledge.py +++ b/modules/interfaces/interfaceDbKnowledge.py @@ -603,41 +603,10 @@ def aggregateMandateRagTotalBytes(mandateId: str) -> int: if rid and str(rid) not in byId: byId[str(rid)] = row - # DEPRECATED: file-ID-correlation fallback from poweron_management. - # Only needed for pre-migration data where mandateId/featureInstanceId on the - # FileContentIndex are empty. Safe to remove once all environments are migrated. 
- _fallbackCount = 0 - try: - from modules.datamodels.datamodelFiles import FileItem - from modules.interfaces.interfaceDbManagement import ComponentObjects - mgmtDb = ComponentObjects().db - knowledgeIf = getInterface(None) - - fileIds: set = set() - for f in mgmtDb.getRecordset(FileItem, recordFilter={"mandateId": mandateId}): - fid = f.get("id") if isinstance(f, dict) else getattr(f, "id", None) - if fid: - fileIds.add(str(fid)) - for instId in instIds: - for f in mgmtDb.getRecordset(FileItem, recordFilter={"featureInstanceId": instId}): - fid = f.get("id") if isinstance(f, dict) else getattr(f, "id", None) - if fid: - fileIds.add(str(fid)) - - for fid in fileIds: - if fid in byId: - continue - row = knowledgeIf.getFileContentIndex(fid) - if row: - byId[fid] = row - _fallbackCount += 1 - except Exception as e: - logger.warning("aggregateMandateRagTotalBytes fallback failed: %s", e) - total = sum(int(r.get("totalSize") or 0) for r in byId.values()) logger.info( - "aggregateMandateRagTotalBytes(%s): %d indexes, %d bytes (fallback: %d)", - mandateId, len(byId), total, _fallbackCount, + "aggregateMandateRagTotalBytes(%s): %d indexes, %d bytes", + mandateId, len(byId), total, ) return total diff --git a/modules/interfaces/interfaceFeatures.py b/modules/interfaces/interfaceFeatures.py index ccb64a53..c965edb2 100644 --- a/modules/interfaces/interfaceFeatures.py +++ b/modules/interfaces/interfaceFeatures.py @@ -347,6 +347,7 @@ class FeatureInterface: "templateSourceId": templateId, "templateScope": "instance", "active": True, + "targetFeatureInstanceId": instanceId, }) copied += 1 except Exception as e: diff --git a/modules/routes/routeAutomationWorkspace.py b/modules/routes/routeAutomationWorkspace.py new file mode 100644 index 00000000..6efbdeb6 --- /dev/null +++ b/modules/routes/routeAutomationWorkspace.py @@ -0,0 +1,246 @@ +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +""" +User-facing Automation Workspace API. 
+ +Lists workflow runs the user can access (via FeatureAccess on +targetFeatureInstanceId) and provides detail views with step logs +and linked files. Designed for the "Workspace" tab under +Nutzung > Automation. +""" + +import logging +import math +from typing import Optional + +from fastapi import APIRouter, Depends, Request, Query, Path, HTTPException +from slowapi import Limiter +from slowapi.util import get_remote_address + +from modules.auth.authentication import getRequestContext, RequestContext +from modules.connectors.connectorDbPostgre import DatabaseConnector +from modules.shared.configuration import APP_CONFIG +from modules.features.graphicalEditor.datamodelFeatureGraphicalEditor import ( + AutoRun, + AutoStepLog, + AutoWorkflow, +) +from modules.features.graphicalEditor.interfaceFeatureGraphicalEditor import graphicalEditorDatabase +from modules.shared.i18nRegistry import apiRouteContext + +routeApiMsg = apiRouteContext("routeAutomationWorkspace") +logger = logging.getLogger(__name__) +limiter = Limiter(key_func=get_remote_address) + +router = APIRouter(prefix="/api/automations/runs", tags=["AutomationWorkspace"]) + + +def _getDb() -> DatabaseConnector: + return DatabaseConnector( + dbHost=APP_CONFIG.get("DB_HOST", "localhost"), + dbDatabase=graphicalEditorDatabase, + dbUser=APP_CONFIG.get("DB_USER"), + dbPassword=APP_CONFIG.get("DB_PASSWORD_SECRET") or APP_CONFIG.get("DB_PASSWORD"), + dbPort=int(APP_CONFIG.get("DB_PORT", 5432)), + userId=None, + ) + + +def _getUserAccessibleInstanceIds(userId: str) -> list[str]: + """Return all featureInstanceIds the user has enabled FeatureAccess for.""" + from modules.interfaces.interfaceDbApp import getRootInterface + rootIface = getRootInterface() + allAccess = rootIface.getFeatureAccessesForUser(userId) or [] + return [ + a.featureInstanceId + for a in allAccess + if a.featureInstanceId and a.enabled + ] + + +@router.get("") +@limiter.limit("60/minute") +def listWorkspaceRuns( + request: Request, + scope: str = 
Query("mine", description="mine = own runs, mandate = all accessible"), + status: Optional[str] = Query(None, description="Filter by run status"), + targetInstanceId: Optional[str] = Query(None, description="Filter by targetFeatureInstanceId"), + workflowId: Optional[str] = Query(None, description="Filter by workflow"), + limit: int = Query(50, ge=1, le=200), + offset: int = Query(0, ge=0), + context: RequestContext = Depends(getRequestContext), +) -> dict: + """List workflow runs visible to the user. + + scope=mine: only runs owned by the user. + scope=mandate: all runs where the user has FeatureAccess on the + workflow's targetFeatureInstanceId. + """ + db = _getDb() + if not db._ensureTableExists(AutoRun): + return {"runs": [], "total": 0, "limit": limit, "offset": offset} + + userId = str(context.user.id) if context.user else None + if not userId: + raise HTTPException(status_code=401, detail=routeApiMsg("Authentication required")) + + accessibleInstanceIds = _getUserAccessibleInstanceIds(userId) + if not accessibleInstanceIds: + return {"runs": [], "total": 0, "limit": limit, "offset": offset} + + if not db._ensureTableExists(AutoWorkflow): + return {"runs": [], "total": 0, "limit": limit, "offset": offset} + + wfFilter: dict = {} + if targetInstanceId: + if targetInstanceId not in accessibleInstanceIds: + raise HTTPException(status_code=403, detail=routeApiMsg("Access denied to target instance")) + wfFilter["targetFeatureInstanceId"] = targetInstanceId + workflows = db.getRecordset(AutoWorkflow, recordFilter=wfFilter or None) or [] + + visibleWfIds: set[str] = set() + wfMap: dict = {} + for wf in workflows: + wfDict = dict(wf) + tid = wfDict.get("targetFeatureInstanceId") or wfDict.get("featureInstanceId") + if tid and tid in accessibleInstanceIds: + wfId = wfDict.get("id") + if wfId: + visibleWfIds.add(wfId) + wfMap[wfId] = wfDict + + if workflowId: + if workflowId not in visibleWfIds: + return {"runs": [], "total": 0, "limit": limit, "offset": offset} + 
visibleWfIds = {workflowId} + + if not visibleWfIds: + return {"runs": [], "total": 0, "limit": limit, "offset": offset} + + allRuns = db.getRecordset(AutoRun, recordFilter={}) or [] + filtered = [] + for r in allRuns: + row = dict(r) + if row.get("workflowId") not in visibleWfIds: + continue + if scope == "mine" and row.get("ownerId") != userId: + continue + if status and row.get("status") != status: + continue + filtered.append(row) + + filtered.sort( + key=lambda x: x.get("startedAt") or x.get("sysCreatedAt") or 0, + reverse=True, + ) + total = len(filtered) + page = filtered[offset: offset + limit] + + from modules.routes.routeHelpers import enrichRowsWithFkLabels, resolveMandateLabels, resolveInstanceLabels + + for row in page: + wf = wfMap.get(row.get("workflowId"), {}) + row["workflowLabel"] = row.get("label") or wf.get("label") or row.get("workflowId", "") + row["targetFeatureInstanceId"] = wf.get("targetFeatureInstanceId") or wf.get("featureInstanceId") + + enrichRowsWithFkLabels( + page, + labelResolvers={ + "mandateId": resolveMandateLabels, + "targetFeatureInstanceId": resolveInstanceLabels, + }, + ) + for row in page: + row["targetInstanceLabel"] = row.pop("targetFeatureInstanceIdLabel", None) + row["mandateLabel"] = row.pop("mandateIdLabel", None) + + return {"runs": page, "total": total, "limit": limit, "offset": offset} + + +@router.get("/{runId}/detail") +@limiter.limit("60/minute") +def getWorkspaceRunDetail( + request: Request, + runId: str = Path(..., description="Run ID"), + context: RequestContext = Depends(getRequestContext), +) -> dict: + """Get full detail for a single run: metadata, step logs, linked files.""" + db = _getDb() + userId = str(context.user.id) if context.user else None + if not userId: + raise HTTPException(status_code=401, detail=routeApiMsg("Authentication required")) + + if not db._ensureTableExists(AutoRun): + raise HTTPException(status_code=404, detail=routeApiMsg("Run not found")) + + runs = db.getRecordset(AutoRun, 
recordFilter={"id": runId}) + if not runs: + raise HTTPException(status_code=404, detail=routeApiMsg("Run not found")) + run = dict(runs[0]) + + wfId = run.get("workflowId") + workflow: dict = {} + if wfId and db._ensureTableExists(AutoWorkflow): + wfs = db.getRecordset(AutoWorkflow, recordFilter={"id": wfId}) + if wfs: + workflow = dict(wfs[0]) + + tid = workflow.get("targetFeatureInstanceId") or workflow.get("featureInstanceId") + accessibleIds = _getUserAccessibleInstanceIds(userId) + isOwner = run.get("ownerId") == userId + + if not isOwner and (not tid or tid not in accessibleIds) and not context.isPlatformAdmin: + raise HTTPException(status_code=403, detail=routeApiMsg("Access denied")) + + steps: list = [] + if db._ensureTableExists(AutoStepLog): + stepRecords = db.getRecordset(AutoStepLog, recordFilter={"runId": runId}) or [] + steps = [dict(s) for s in stepRecords] + steps.sort(key=lambda s: s.get("startedAt") or 0) + + fileItems: list = [] + try: + from modules.datamodels.datamodelFiles import FileItem + from modules.interfaces.interfaceDbManagement import ComponentObjects + mgmtDb = ComponentObjects().db + if mgmtDb._ensureTableExists(FileItem): + nodeOutputs = run.get("nodeOutputs") or {} + fileIds: set[str] = set() + for nodeId, output in nodeOutputs.items(): + if not isinstance(output, dict): + continue + for key in ("fileId", "documentId", "fileIds", "documents"): + val = output.get(key) + if isinstance(val, str) and val: + fileIds.add(val) + elif isinstance(val, list): + for v in val: + if isinstance(v, str) and v: + fileIds.add(v) + elif isinstance(v, dict) and v.get("id"): + fileIds.add(v["id"]) + for fid in fileIds: + try: + rec = mgmtDb.getRecord(FileItem, fid) + if rec: + fileItems.append(dict(rec)) + except Exception: + pass + except Exception as e: + logger.warning("getWorkspaceRunDetail: file lookup failed: %s", e) + + run["workflowLabel"] = run.get("label") or workflow.get("label") or wfId + run["targetFeatureInstanceId"] = tid + + return { 
+ "run": run, + "workflow": { + "id": workflow.get("id"), + "label": workflow.get("label"), + "targetFeatureInstanceId": tid, + "featureInstanceId": workflow.get("featureInstanceId"), + "tags": workflow.get("tags", []), + } if workflow else None, + "steps": steps, + "files": fileItems, + } diff --git a/modules/workflows/automation2/executionEngine.py b/modules/workflows/automation2/executionEngine.py index 1d0ca5c8..55a63281 100644 --- a/modules/workflows/automation2/executionEngine.py +++ b/modules/workflows/automation2/executionEngine.py @@ -302,6 +302,30 @@ async def _executeWithRetry(executor, node, context, maxRetries: int = 0, retryD raise lastError +def _substituteFeatureInstancePlaceholders( + graph: Dict[str, Any], + targetFeatureInstanceId: str, +) -> Dict[str, Any]: + """Replace ``{{featureInstanceId}}`` placeholders in the serialised graph. + + Works on the full JSON representation so that placeholders inside nested + parameter dicts, prompt strings, etc. are all caught. Already-resolved + concrete UUIDs (pre-baked by ``_copyTemplateWorkflows``) are left untouched + because the placeholder literal ``{{featureInstanceId}}`` will not match. + """ + import json as _json + raw = _json.dumps(graph) + if "{{featureInstanceId}}" not in raw: + return graph + replaced = raw.replace("{{featureInstanceId}}", targetFeatureInstanceId) + logger.debug( + "_substituteFeatureInstancePlaceholders: resolved %d occurrence(s) -> %s", + raw.count("{{featureInstanceId}}"), + targetFeatureInstanceId, + ) + return _json.loads(replaced) + + async def executeGraph( graph: Dict[str, Any], services: Any, @@ -315,6 +339,7 @@ async def executeGraph( runId: Optional[str] = None, run_envelope: Optional[Dict[str, Any]] = None, label: Optional[str] = None, + targetFeatureInstanceId: Optional[str] = None, ) -> Dict[str, Any]: """ Execute automation2 graph. Returns { success, nodeOutputs, error?, stopped? }. 
@@ -322,14 +347,16 @@ async def executeGraph( pauses the run, and returns { success: False, paused: True, taskId, runId }. For resume: pass initialNodeOutputs (with result for the human node) and startAfterNodeId. For fresh runs: pass run_envelope (unified start payload for the start node); normalized with userId into context.runEnvelope. + targetFeatureInstanceId: resolves {{featureInstanceId}} placeholders in the graph JSON before execution. """ logger.info( - "executeGraph start: instanceId=%s workflowId=%s userId=%s mandateId=%s resume=%s", + "executeGraph start: instanceId=%s workflowId=%s userId=%s mandateId=%s resume=%s targetInstance=%s", instanceId, workflowId, userId, mandateId, startAfterNodeId is not None, + targetFeatureInstanceId, ) from modules.workflows.processing.shared.methodDiscovery import discoverMethods discoverMethods(services) @@ -338,6 +365,9 @@ async def executeGraph( materializeFeatureInstanceRefs, ) + if targetFeatureInstanceId: + graph = _substituteFeatureInstancePlaceholders(graph, targetFeatureInstanceId) + # Phase-5 Schicht-4: typed-ref envelopes are materialized FIRST so the # subsequent connection-ref pass and validation see the canonical shape. 
graph = materializeFeatureInstanceRefs(graph) diff --git a/modules/workflows/scheduler/mainScheduler.py b/modules/workflows/scheduler/mainScheduler.py index bf2cd0fd..0dce2ec5 100644 --- a/modules/workflows/scheduler/mainScheduler.py +++ b/modules/workflows/scheduler/mainScheduler.py @@ -243,6 +243,7 @@ class WorkflowScheduler: runEnv = normalize_run_envelope(runEnv, user_id=str(eventUser.id) if eventUser else None) _wfLabel = wf.get("label") if isinstance(wf, dict) else getattr(wf, "label", None) + _targetInstanceId = wf.get("targetFeatureInstanceId") if isinstance(wf, dict) else getattr(wf, "targetFeatureInstanceId", None) result = await executeGraph( graph=wf["graph"], @@ -254,6 +255,7 @@ class WorkflowScheduler: automation2_interface=iface, run_envelope=runEnv, label=_wfLabel, + targetFeatureInstanceId=_targetInstanceId, ) logger.info( "WorkflowScheduler: executed workflow %s success=%s paused=%s", diff --git a/scripts/_archive/README.md b/scripts/_archive/README.md new file mode 100644 index 00000000..dba3deef --- /dev/null +++ b/scripts/_archive/README.md @@ -0,0 +1,19 @@ +# Archived one-shot scripts + +Diese Scripts haben einmal eine konkrete Daten- oder Code-Migration ausgefuehrt +und werden nicht mehr aktiv aufgerufen. Sie bleiben hier liegen, falls jemand +spaeter auf einem alten DB-Dump oder einem alten Branch nochmal denselben Stand +herstellen muss. + +KEIN aktives Tool. Nicht aus CI, nicht aus Docs verlinken. Bei Aufraeumarbeiten +(z.B. nach 6 Monaten ohne Anwendung) loeschen. 
+ +## Inhalt + +| Datei | Migrationsthema | Archiviert am | Begruendung | +|-------|-----------------|---------------|-------------| +| `check_orphan_featureinstance.py` | Vor-Ort-Check mit hardcoded FeatureInstance-/Mandate-UUIDs | 2026-04-29 | Ad-hoc fuer einen konkreten Vorfall | +| `script_db_cleanup_duplicate_roles.py` | Cleanup doppelter Roles wegen `IS NULL`-Bug in `connectorDbPostgre` | 2026-04-29 | Bug ist laengst gefixt, Cleanup ueberall durchgelaufen | +| `migrate_async_to_sync.py` | One-shot Codemod `async def` -> `def` fuer FastAPI-Routes | 2026-04-29 | Refactor abgeschlossen | +| `i18n_rekey_plaintext_keys.py` | Frontend `t('dot.notation')` -> `t('Klartext')` Rekey | 2026-04-29 | Frontend-Migration abgeschlossen (siehe `wiki/c-work/4-done/2026-04-ui-i18n-dynamic-language-sets.md`) | +| `script_db_migrate_accessrules_objectkeys.py` | AccessRule-Items: kurz -> vollqualifiziert (Navigation-API) | 2026-04-29 | Navigation-API live, MIGRATION_MAP nur fuer trustee+realestate hardcoded | diff --git a/scripts/check_orphan_featureinstance.py b/scripts/_archive/check_orphan_featureinstance.py similarity index 100% rename from scripts/check_orphan_featureinstance.py rename to scripts/_archive/check_orphan_featureinstance.py diff --git a/scripts/i18n_rekey_plaintext_keys.py b/scripts/_archive/i18n_rekey_plaintext_keys.py similarity index 100% rename from scripts/i18n_rekey_plaintext_keys.py rename to scripts/_archive/i18n_rekey_plaintext_keys.py diff --git a/scripts/migrate_async_to_sync.py b/scripts/_archive/migrate_async_to_sync.py similarity index 100% rename from scripts/migrate_async_to_sync.py rename to scripts/_archive/migrate_async_to_sync.py diff --git a/scripts/script_db_cleanup_duplicate_roles.py b/scripts/_archive/script_db_cleanup_duplicate_roles.py similarity index 100% rename from scripts/script_db_cleanup_duplicate_roles.py rename to scripts/_archive/script_db_cleanup_duplicate_roles.py diff --git 
a/scripts/script_db_migrate_accessrules_objectkeys.py b/scripts/_archive/script_db_migrate_accessrules_objectkeys.py similarity index 100% rename from scripts/script_db_migrate_accessrules_objectkeys.py rename to scripts/_archive/script_db_migrate_accessrules_objectkeys.py diff --git a/scripts/_listMandates.py b/scripts/_listMandates.py deleted file mode 100644 index cf3e9bd2..00000000 --- a/scripts/_listMandates.py +++ /dev/null @@ -1,25 +0,0 @@ -import sys -from pathlib import Path -sys.path.insert(0, str(Path(__file__).resolve().parents[1])) -import psycopg2, psycopg2.extras -from modules.shared.configuration import APP_CONFIG - -c = psycopg2.connect( - host=APP_CONFIG.get('DB_HOST','localhost'), - user=APP_CONFIG.get('DB_USER'), - password=APP_CONFIG.get('DB_PASSWORD_SECRET'), - port=int(APP_CONFIG.get('DB_PORT',5432)), - dbname='poweron_app', -) -cur = c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) -cur.execute('SELECT id, name, label, enabled, "deletedAt", "sysCreatedAt" FROM "Mandate" ORDER BY "sysCreatedAt"') -print("All Mandates in poweron_app:") -for r in cur.fetchall(): - print(f" id={r['id']} name={r['name']} label={r['label']} enabled={r['enabled']} deletedAt={r['deletedAt']}") - -cur.execute('SELECT COUNT(*) AS n FROM "FeatureInstance" WHERE "featureCode" = %s', ("redmine",)) -print(f"\nTotal redmine FeatureInstances in poweron_app: {cur.fetchone()['n']}") - -cur.execute('SELECT id, "mandateId", label, enabled FROM "FeatureInstance" WHERE "featureCode" = %s ORDER BY "sysCreatedAt"', ("redmine",)) -for r in cur.fetchall(): - print(f" fi={r['id']} mandate={r['mandateId']} label={r['label']} enabled={r['enabled']}") diff --git a/scripts/script_db_audit_legacy_state.py b/scripts/script_db_audit_legacy_state.py new file mode 100644 index 00000000..f51a132a --- /dev/null +++ b/scripts/script_db_audit_legacy_state.py @@ -0,0 +1,382 @@ +#!/usr/bin/env python3 +"""Audit-Skript fuer Legacy-Bestaende vor Bootstrap-Cleanup (Plan C). 
+ +Prueft fuer jede der 5 Bootstrap-Migrationsroutinen, ob noch Restbestand +existiert. Wenn alle Checks 0 / GREEN liefern, kann die jeweilige Routine +sicher aus ``interfaceBootstrap.py`` / ``interfaceDbKnowledge.py`` entfernt +werden. + +Checks: + 1. Mandate.description != NULL und Mandate.label leer + -> _migrateMandateDescriptionToLabel + 2. Mandate.label leer ODER Mandate.name verstoesst gegen Slug-Regeln + -> _migrateMandateNameLabelSlugRules + 3. Mandate mit name='Root' und isSystem=False + -> initRootMandate Legacy-Zweig + 4. Role mit roleLabel='sysadmin' im Root-Mandat + -> _migrateAndDropSysAdminRole + 5. FileContentIndex mit leerem mandateId UND leerem featureInstanceId + -> aggregateMandateRagTotalBytes Fallback-Block + +Verwendung: + python -m scripts.script_db_audit_legacy_state # text-output + python -m scripts.script_db_audit_legacy_state --json # JSON-output + python -m scripts.script_db_audit_legacy_state --purge-rag-orphans + # loescht FileContentIndex-Rows ohne mandateId UND ohne featureInstanceId + # (Voraussetzung fuer Removal des aggregateMandateRagTotalBytes-Fallback) + +Exit-Code: + 0 alle Checks GREEN (Removal sicher) + 1 mind. ein Check RED (erst Daten bereinigen) + 2 Skript-Fehler (DB nicht erreichbar etc.) + +Lese-Zugriffe sind die Default. Schreibzugriffe NUR mit explizitem +``--purge-*``-Flag. 
+""" + +from __future__ import annotations + +import argparse +import json +import logging +import os +import sys +from dataclasses import dataclass, field +from typing import Any, Callable, Dict, List, Optional + + +_gatewayDir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +if _gatewayDir not in sys.path: + sys.path.insert(0, _gatewayDir) + +from dotenv import load_dotenv + +_envPath = os.path.join(_gatewayDir, "env_dev.env") +if os.path.exists(_envPath): + load_dotenv(_envPath) + +from modules.datamodels.datamodelUam import Mandate +from modules.datamodels.datamodelRbac import Role +from modules.datamodels.datamodelKnowledge import FileContentIndex +from modules.security.rootAccess import getRootDbAppConnector +from modules.interfaces.interfaceDbKnowledge import KnowledgeObjects +from modules.shared.mandateNameUtils import isValidMandateName + +logging.basicConfig(level=logging.WARNING, format="%(message)s") +logger = logging.getLogger(__name__) + + +@dataclass +class _CheckResult: + """Ergebnis eines einzelnen Audit-Checks.""" + + name: str + routine: str + location: str + count: int + status: str + samples: List[Dict[str, Any]] = field(default_factory=list) + error: Optional[str] = None + + def toDict(self) -> Dict[str, Any]: + return { + "name": self.name, + "routine": self.routine, + "location": self.location, + "count": self.count, + "status": self.status, + "samples": self.samples, + "error": self.error, + } + + +def _getAppDb(): + return getRootDbAppConnector() + + +def _getKnowledgeDb(): + return KnowledgeObjects().db + + +def _checkMandateDescription(db) -> _CheckResult: + """Mandate.description noch vorhanden und label leer?""" + rows = db.getRecordset(Mandate) + legacy = [ + { + "id": r.get("id"), + "name": r.get("name"), + "description": str(r.get("description"))[:60] if r.get("description") else None, + "label": r.get("label"), + } + for r in rows + if r.get("description") and not r.get("label") + ] + return _CheckResult( + 
name="mandate-description-to-label", + routine="_migrateMandateDescriptionToLabel", + location="interfaces/interfaceBootstrap.py:422-445", + count=len(legacy), + status="GREEN" if not legacy else "RED", + samples=legacy[:5], + ) + + +def _checkMandateSlugRules(db) -> _CheckResult: + """Mandate.name verletzt Slug-Regeln ODER Mandate.label leer?""" + rows = db.getRecordset(Mandate) + legacy = [] + seen: set[str] = set() + for r in sorted(rows, key=lambda x: str(x.get("id", ""))): + name = (r.get("name") or "").strip() + labelRaw = r.get("label") + labelEmpty = not (labelRaw or "").strip() if labelRaw is not None else True + nameInvalid = not isValidMandateName(name) + nameCollides = name in seen + if not nameInvalid and not nameCollides: + seen.add(name) + if labelEmpty or nameInvalid or nameCollides: + legacy.append( + { + "id": r.get("id"), + "name": name, + "label": r.get("label"), + "labelEmpty": labelEmpty, + "nameInvalid": nameInvalid, + "nameCollides": nameCollides, + } + ) + return _CheckResult( + name="mandate-name-slug-rules", + routine="_migrateMandateNameLabelSlugRules", + location="interfaces/interfaceBootstrap.py:448-511", + count=len(legacy), + status="GREEN" if not legacy else "RED", + samples=legacy[:5], + ) + + +def _checkRootMandateLegacy(db) -> _CheckResult: + """Mandate mit name='Root' (case-sensitive) ODER isSystem=False fuer root?""" + legacyByName = db.getRecordset(Mandate, recordFilter={"name": "Root"}) + rows = db.getRecordset(Mandate, recordFilter={"name": "root"}) + legacyByFlag = [r for r in rows if not r.get("isSystem")] + combined = list(legacyByName) + legacyByFlag + samples = [ + { + "id": r.get("id"), + "name": r.get("name"), + "isSystem": r.get("isSystem"), + } + for r in combined + ] + return _CheckResult( + name="root-mandate-legacy", + routine="initRootMandate-legacy-branch", + location="interfaces/interfaceBootstrap.py:406-412", + count=len(samples), + status="GREEN" if not samples else "RED", + samples=samples[:5], + ) + + +def 
_checkSysadminRole(db) -> _CheckResult: + """Legacy 'sysadmin'-Rolle im Root-Mandat?""" + rootMandates = db.getRecordset(Mandate, recordFilter={"name": "root", "isSystem": True}) + if not rootMandates: + return _CheckResult( + name="sysadmin-role", + routine="_migrateAndDropSysAdminRole", + location="interfaces/interfaceBootstrap.py:840-932", + count=0, + status="GREEN", + samples=[], + error="kein Root-Mandat gefunden -- Check uebersprungen (kann nicht relevant sein)", + ) + rootId = str(rootMandates[0].get("id")) + rows = db.getRecordset( + Role, + recordFilter={"roleLabel": "sysadmin", "mandateId": rootId, "featureInstanceId": None}, + ) + samples = [{"id": r.get("id"), "roleLabel": r.get("roleLabel")} for r in rows] + return _CheckResult( + name="sysadmin-role", + routine="_migrateAndDropSysAdminRole", + location="interfaces/interfaceBootstrap.py:840-932", + count=len(samples), + status="GREEN" if not samples else "RED", + samples=samples[:5], + ) + + +def _checkRagFallback(knowDb) -> _CheckResult: + """FileContentIndex-Rows ohne mandateId UND ohne featureInstanceId?""" + rows = knowDb.getRecordset(FileContentIndex) + legacy = [ + { + "id": r.get("id"), + "fileName": r.get("fileName"), + "totalSize": r.get("totalSize"), + } + for r in rows + if not (r.get("mandateId") or "").strip() and not (r.get("featureInstanceId") or "").strip() + ] + return _CheckResult( + name="rag-fallback-orphan-index", + routine="aggregateMandateRagTotalBytes-fallback", + location="interfaces/interfaceDbKnowledge.py:609-635", + count=len(legacy), + status="GREEN" if not legacy else "RED", + samples=legacy[:5], + ) + + +def _runChecks() -> List[_CheckResult]: + appDb = _getAppDb() + knowDb = _getKnowledgeDb() + + appChecks: List[Callable[[Any], _CheckResult]] = [ + _checkMandateDescription, + _checkMandateSlugRules, + _checkRootMandateLegacy, + _checkSysadminRole, + ] + + results: List[_CheckResult] = [] + for fn in appChecks: + try: + results.append(fn(appDb)) + except Exception as 
exc: + results.append( + _CheckResult( + name=fn.__name__, + routine="?", + location="?", + count=-1, + status="ERROR", + error=f"{type(exc).__name__}: {exc}", + ) + ) + + try: + results.append(_checkRagFallback(knowDb)) + except Exception as exc: + results.append( + _CheckResult( + name="rag-fallback-orphan-index", + routine="aggregateMandateRagTotalBytes-fallback", + location="interfaces/interfaceDbKnowledge.py:609-635", + count=-1, + status="ERROR", + error=f"{type(exc).__name__}: {exc}", + ) + ) + + return results + + +def _printText(results: List[_CheckResult]) -> None: + print("=" * 78) + print("BOOTSTRAP-MIGRATIONS LEGACY-STATE-AUDIT") + print("=" * 78) + for r in results: + marker = { + "GREEN": "[OK]", + "RED": "[!!]", + "ERROR": "[ERR]", + }.get(r.status, "[?]") + print(f"\n{marker} {r.name}") + print(f" Routine : {r.routine}") + print(f" Location: {r.location}") + print(f" Count : {r.count}") + print(f" Status : {r.status}") + if r.error: + print(f" Note : {r.error}") + if r.samples: + print(f" Samples : (max 5)") + for s in r.samples: + print(f" {s}") + + print("\n" + "=" * 78) + greens = sum(1 for r in results if r.status == "GREEN") + reds = sum(1 for r in results if r.status == "RED") + errs = sum(1 for r in results if r.status == "ERROR") + print(f"SUMMARY: {greens} GREEN {reds} RED {errs} ERROR ({len(results)} total)") + if reds == 0 and errs == 0: + print("VERDICT: alle Migrationsroutinen koennen entfernt werden.") + elif errs > 0: + print("VERDICT: Audit unvollstaendig (Fehler) -- bitte Skript fixen.") + else: + print("VERDICT: erst Daten bereinigen, dann Routinen entfernen.") + print("=" * 78) + + +def _purgeRagOrphans() -> int: + """Loescht alle FileContentIndex-Rows ohne mandateId UND ohne featureInstanceId. + + Returns: Anzahl geloeschter Rows. 
+ """ + knowDb = _getKnowledgeDb() + rows = knowDb.getRecordset(FileContentIndex) + orphans = [ + r for r in rows + if not (r.get("mandateId") or "").strip() + and not (r.get("featureInstanceId") or "").strip() + ] + if not orphans: + print("Keine RAG-Orphans gefunden -- nichts zu purgen.") + return 0 + + print(f"Purge {len(orphans)} RAG-Orphan(s):") + deleted = 0 + for r in orphans: + rid = r.get("id") + try: + knowDb.recordDelete(FileContentIndex, str(rid)) + deleted += 1 + print(f" geloescht: {rid} {r.get('fileName')}") + except Exception as exc: + print(f" FEHLER {rid}: {type(exc).__name__}: {exc}", file=sys.stderr) + print(f"Purge abgeschlossen: {deleted}/{len(orphans)} geloescht.") + return deleted + + +def main() -> int: + parser = argparse.ArgumentParser( + description="Audit-Skript fuer Legacy-Bestaende (Bootstrap-Cleanup Plan C)" + ) + parser.add_argument("--json", action="store_true", help="JSON-Output statt Text") + parser.add_argument( + "--purge-rag-orphans", + action="store_true", + help="WRITE: loescht FileContentIndex-Rows ohne mandateId UND featureInstanceId", + ) + args = parser.parse_args() + + if args.purge_rag_orphans: + try: + _purgeRagOrphans() + except Exception as exc: + print(f"FATAL: Purge fehlgeschlagen -- {type(exc).__name__}: {exc}", file=sys.stderr) + return 2 + print() + + try: + results = _runChecks() + except Exception as exc: + print(f"FATAL: konnte Audit nicht starten -- {type(exc).__name__}: {exc}", file=sys.stderr) + return 2 + + if args.json: + print(json.dumps([r.toDict() for r in results], indent=2, default=str)) + else: + _printText(results) + + if any(r.status == "ERROR" for r in results): + return 2 + if any(r.status == "RED" for r in results): + return 1 + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/unit/bootstrap/test_mandateNameMigration.py b/tests/unit/bootstrap/test_mandateNameMigration.py deleted file mode 100644 index d09a6846..00000000 --- 
a/tests/unit/bootstrap/test_mandateNameMigration.py +++ /dev/null @@ -1,133 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2025 Patrick Motsch -# All rights reserved. -""" -Unit tests for ``_migrateMandateNameLabelSlugRules`` in interfaceBootstrap. - -Covers: -- legacy ``name``/``label`` rows get fixed (label fill, slug rename), -- collisions across legacy rows resolve via -2/-3 suffixes in stable id order, -- valid rows are left untouched (idempotency), -- second invocation is a no-op. -""" - -from typing import Any, Dict, List, Optional - -import pytest - -from modules.datamodels.datamodelUam import Mandate -from modules.interfaces.interfaceBootstrap import _migrateMandateNameLabelSlugRules -from modules.shared.mandateNameUtils import isValidMandateName - - -class _FakeDb: - """Minimal connector simulating getRecordset(Mandate)+recordModify(Mandate, id, data).""" - - def __init__(self, rows: List[Dict[str, Any]]): - self.rows: List[Dict[str, Any]] = [dict(r) for r in rows] - self.modifyCalls: List[Dict[str, Any]] = [] - - def getRecordset(self, model, recordFilter: Optional[Dict[str, Any]] = None): - if model is not Mandate: - return [] - if not recordFilter: - return [dict(r) for r in self.rows] - out = [] - for r in self.rows: - if all(r.get(k) == v for k, v in recordFilter.items()): - out.append(dict(r)) - return out - - def recordModify(self, model, recordId: str, data: Dict[str, Any]): - self.modifyCalls.append({"id": str(recordId), "data": dict(data)}) - for r in self.rows: - if str(r.get("id")) == str(recordId): - r.update(data) - return r - return None - - -def _row(mid: str, name: Any, label: Any = None) -> Dict[str, Any]: - return {"id": mid, "name": name, "label": label} - - -class TestMigrationFillsLabel: - def test_emptyLabelGetsNameAsLabel(self): - db = _FakeDb([_row("a1", "good-name", None)]) - _migrateMandateNameLabelSlugRules(db) - assert db.rows[0]["label"] == "good-name" - assert db.rows[0]["name"] == "good-name" - - def 
test_emptyLabelAndEmptyNameFallsBackToMandate(self): - db = _FakeDb([_row("a1", "", "")]) - _migrateMandateNameLabelSlugRules(db) - assert db.rows[0]["label"] == "Mandate" - assert isValidMandateName(db.rows[0]["name"]) - - -class TestMigrationRenamesInvalidNames: - def test_invalidNameGetsSlugFromLabel(self): - db = _FakeDb([_row("a1", "Home patrick", "Home Patrick")]) - _migrateMandateNameLabelSlugRules(db) - assert db.rows[0]["name"] == "home-patrick" - assert db.rows[0]["label"] == "Home Patrick" - - def test_umlautsTransliterated(self): - db = _FakeDb([_row("a1", "Müller AG", "Müller AG")]) - _migrateMandateNameLabelSlugRules(db) - assert db.rows[0]["name"] == "mueller-ag" - - -class TestMigrationCollisions: - def test_collisionsResolveByStableIdOrder(self): - rows = [ - _row("z1", "Home patrick", "Home Patrick"), - _row("a1", "home-patrick", "Home Patrick Two"), - ] - db = _FakeDb(rows) - _migrateMandateNameLabelSlugRules(db) - byId = {r["id"]: r for r in db.rows} - assert byId["a1"]["name"] == "home-patrick" - assert byId["z1"]["name"] == "home-patrick-2" - - def test_threeWayCollisionGetsThirdSuffix(self): - rows = [ - _row("id-aaa", "home-patrick", "Home Patrick"), - _row("id-bbb", "Home patrick", "Home Patrick"), - _row("id-ccc", "home patrick", "Home Patrick"), - ] - db = _FakeDb(rows) - _migrateMandateNameLabelSlugRules(db) - names = sorted(r["name"] for r in db.rows) - assert names == ["home-patrick", "home-patrick-2", "home-patrick-3"] - - -class TestMigrationIdempotency: - def test_secondRunIsNoop(self): - rows = [ - _row("a1", "home-patrick", "Home Patrick"), - _row("b1", "Home Müller", ""), - ] - db = _FakeDb(rows) - _migrateMandateNameLabelSlugRules(db) - assert all(isValidMandateName(r["name"]) for r in db.rows) - firstChanges = list(db.modifyCalls) - db.modifyCalls.clear() - _migrateMandateNameLabelSlugRules(db) - assert db.modifyCalls == [], ( - f"expected no further changes after first migration, got {db.modifyCalls}; " - f"firstRun changes: 
{firstChanges}" - ) - - def test_validRowsLeftUntouched(self): - rows = [_row("a1", "root", "Root"), _row("b1", "alpina-treuhand", "Alpina Treuhand AG")] - db = _FakeDb(rows) - _migrateMandateNameLabelSlugRules(db) - assert db.modifyCalls == [] - - -class TestMigrationEmpty: - def test_emptyDbDoesNothing(self): - db = _FakeDb([]) - _migrateMandateNameLabelSlugRules(db) - assert db.modifyCalls == [] diff --git a/tests/unit/rbac/test_sysadmin_migration.py b/tests/unit/rbac/test_sysadmin_migration.py deleted file mode 100644 index 8ca077bf..00000000 --- a/tests/unit/rbac/test_sysadmin_migration.py +++ /dev/null @@ -1,209 +0,0 @@ -# Copyright (c) 2025 Patrick Motsch -# All rights reserved. -""" -Unit tests for the one-shot sysadmin role -> isPlatformAdmin migration. - -Covers acceptance criteria from -``wiki/c-work/4-done/2026-04-sysadmin-authority-split.md``: - -- AC#4 -> Existing sysadmin role-holders are promoted to ``isPlatformAdmin=True`` - and the legacy role is removed (Role + UserMandateRole + AccessRules) - when the gateway boots. -- AC#10 -> The migration is idempotent and removes ALL artefacts (Role, - AccessRules, UserMandateRole) of the legacy ``sysadmin`` role. - -Strategy: use an in-memory fake ``DatabaseConnector`` that records calls -and returns deterministic recordsets for ``Role``/``UserMandateRole``/ -``UserMandate``/``UserInDB``/``AccessRule`` lookups. 
-""" - -from __future__ import annotations - -from typing import Any, Dict, List -from unittest.mock import Mock - -from modules.interfaces.interfaceBootstrap import _migrateAndDropSysAdminRole -from modules.datamodels.datamodelMembership import UserMandate, UserMandateRole -from modules.datamodels.datamodelRbac import AccessRule, Role -from modules.datamodels.datamodelUam import UserInDB - - -_ROOT_MANDATE_ID = "root-mandate-id" -_SYSADMIN_ROLE_ID = "sysadmin-role-id" -_USER_MANDATE_ID = "user-mandate-id" -_USER_ID = "legacy-user-id" -_UMR_ROW_ID = "umr-row-id" -_ACCESS_RULE_ID = "access-rule-id" - - -def _buildFakeDb( - *, - sysadminRoles: List[Dict[str, Any]], - umRoleRows: List[Dict[str, Any]], - userMandateRows: List[Dict[str, Any]], - users: List[Dict[str, Any]], - accessRules: List[Dict[str, Any]], -) -> Mock: - """Build a fake ``DatabaseConnector`` that maps model -> recordset.""" - - deletes: List[tuple] = [] - modifies: List[tuple] = [] - - def _getRecordset(model, recordFilter=None, **_): # noqa: ANN001 - recordFilter = recordFilter or {} - if model is Role: - label = recordFilter.get("roleLabel") - mandateId = recordFilter.get("mandateId") - if label == "sysadmin" and mandateId == _ROOT_MANDATE_ID: - return list(sysadminRoles) - return [] - if model is UserMandateRole: - wanted = recordFilter.get("roleId") - return [r for r in umRoleRows if r.get("roleId") == wanted] - if model is UserMandate: - wanted = recordFilter.get("id") - return [r for r in userMandateRows if r.get("id") == wanted] - if model is UserInDB: - wanted = recordFilter.get("id") - return [r for r in users if r.get("id") == wanted] - if model is AccessRule: - wanted = recordFilter.get("roleId") - return [r for r in accessRules if r.get("roleId") == wanted] - return [] - - def _recordModify(model, recordId, payload): # noqa: ANN001 - modifies.append((model, recordId, payload)) - # Reflect the change so a subsequent migration call is idempotent. 
- if model is UserInDB: - for u in users: - if u.get("id") == recordId: - u.update(payload) - return True - - def _recordDelete(model, recordId): # noqa: ANN001 - deletes.append((model, recordId)) - if model is UserMandateRole: - umRoleRows[:] = [r for r in umRoleRows if r.get("id") != recordId] - elif model is AccessRule: - accessRules[:] = [r for r in accessRules if r.get("id") != recordId] - elif model is Role: - sysadminRoles[:] = [r for r in sysadminRoles if r.get("id") != recordId] - return True - - db = Mock() - db.getRecordset = Mock(side_effect=_getRecordset) - db.recordModify = Mock(side_effect=_recordModify) - db.recordDelete = Mock(side_effect=_recordDelete) - db._modifies = modifies # exposed for assertions - db._deletes = deletes - return db - - -def _seed(): - return { - "sysadminRoles": [{"id": _SYSADMIN_ROLE_ID, "roleLabel": "sysadmin", - "mandateId": _ROOT_MANDATE_ID}], - "umRoleRows": [{"id": _UMR_ROW_ID, "roleId": _SYSADMIN_ROLE_ID, - "userMandateId": _USER_MANDATE_ID}], - "userMandateRows": [{"id": _USER_MANDATE_ID, "userId": _USER_ID, - "mandateId": _ROOT_MANDATE_ID}], - "users": [{"id": _USER_ID, "username": "legacy", - "isSysAdmin": False, "isPlatformAdmin": False}], - "accessRules": [{"id": _ACCESS_RULE_ID, "roleId": _SYSADMIN_ROLE_ID}], - } - - -# --------------------------------------------------------------------------- -# AC #4 — promote + drop on first run -# --------------------------------------------------------------------------- - - -def testMigrationPromotesUserAndDropsArtefacts(): - """AC#4: legacy holder is promoted; Role+AccessRule+UMR are deleted.""" - seed = _seed() - db = _buildFakeDb(**seed) - - _migrateAndDropSysAdminRole(db, _ROOT_MANDATE_ID) - - # User got isPlatformAdmin=True - assert seed["users"][0]["isPlatformAdmin"] is True - assert any( - m[0] is UserInDB and m[2] == {"isPlatformAdmin": True} - for m in db._modifies - ), "Expected UserInDB.isPlatformAdmin promotion call" - - # All three artefact tables had their 
rows deleted. - deletedModels = {m[0] for m in db._deletes} - assert UserMandateRole in deletedModels, "UserMandateRole row not deleted" - assert AccessRule in deletedModels, "AccessRule row not deleted" - assert Role in deletedModels, "Sysadmin Role record not deleted" - - # And the seeded lists are empty after the migration. - assert seed["umRoleRows"] == [] - assert seed["accessRules"] == [] - assert seed["sysadminRoles"] == [] - - -# --------------------------------------------------------------------------- -# AC #10 — idempotent: a second run is a no-op -# --------------------------------------------------------------------------- - - -def testMigrationIsIdempotent(): - """AC#10: a second invocation finds no sysadmin role and exits silently.""" - seed = _seed() - db = _buildFakeDb(**seed) - - _migrateAndDropSysAdminRole(db, _ROOT_MANDATE_ID) - firstModifies = list(db._modifies) - firstDeletes = list(db._deletes) - - _migrateAndDropSysAdminRole(db, _ROOT_MANDATE_ID) - - # No additional writes on the second call. - assert db._modifies == firstModifies, ( - "Second migration call must not perform additional writes" - ) - assert db._deletes == firstDeletes, ( - "Second migration call must not perform additional deletes" - ) - - -def testMigrationSkipsAlreadyPromotedUsers(): - """If a user already has ``isPlatformAdmin=True``, no redundant write.""" - seed = _seed() - seed["users"][0]["isPlatformAdmin"] = True # already promoted - db = _buildFakeDb(**seed) - - _migrateAndDropSysAdminRole(db, _ROOT_MANDATE_ID) - - # No promotion write for an already-promoted user. - promotionWrites = [ - m for m in db._modifies - if m[0] is UserInDB and m[2].get("isPlatformAdmin") is True - ] - assert promotionWrites == [], ( - "Should not re-write isPlatformAdmin if user already has it" - ) - - # But role + access-rule cleanup still happens. 
- deletedModels = {m[0] for m in db._deletes} - assert Role in deletedModels - assert AccessRule in deletedModels - assert UserMandateRole in deletedModels - - -def testMigrationOnEmptyDbIsNoop(): - """No legacy sysadmin role at all -> no calls, no errors.""" - db = _buildFakeDb( - sysadminRoles=[], - umRoleRows=[], - userMandateRows=[], - users=[], - accessRules=[], - ) - - _migrateAndDropSysAdminRole(db, _ROOT_MANDATE_ID) - - assert db._modifies == [] - assert db._deletes == [] diff --git a/tests/unit/workflows/test_automation2_graphUtils.py b/tests/unit/workflows/test_automation2_graphUtils.py index ff5df2cc..5ea7126a 100644 --- a/tests/unit/workflows/test_automation2_graphUtils.py +++ b/tests/unit/workflows/test_automation2_graphUtils.py @@ -66,6 +66,17 @@ class TestResolveParameterReferences: value = "Land: {{n1.country}}" assert resolveParameterReferences(value, node_outputs) == "Land: CH" + def test_legacy_string_template_loop_current_item_nested(self): + """Same shape as executionEngine sets on loop node id during body iteration.""" + node_outputs = { + "loop93": { + "currentItem": {"subject": "Hello", "body": {"content": "World"}}, + "currentIndex": 0, + }, + } + value = "Subj: {{loop93.currentItem.subject}} Body: {{loop93.currentItem.body.content}}" + assert resolveParameterReferences(value, node_outputs) == "Subj: Hello Body: World" + class TestWildcardIteration: """Phase-4 typed Bindings-Resolver: ``*`` segment iterates over a list. 
From b12671bbb531b11cfaa986ddee861cddf1693069 Mon Sep 17 00:00:00 2001 From: ValueOn AG Date: Wed, 29 Apr 2026 22:54:17 +0200 Subject: [PATCH 12/18] fixes before document generation refactory styles --- modules/features/trustee/mainTrustee.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/modules/features/trustee/mainTrustee.py b/modules/features/trustee/mainTrustee.py index fba4346a..05e01e8a 100644 --- a/modules/features/trustee/mainTrustee.py +++ b/modules/features/trustee/mainTrustee.py @@ -440,15 +440,24 @@ TEMPLATE_WORKFLOWS = [ {"id": "analyse", "type": "ai.prompt", "label": "Budget-Analyse", "_method": "ai", "_action": "process", "parameters": { "aiPrompt": ( - "Fuehre einen Budget-Soll/Ist-Vergleich durch.\n" - "Die Budget-Datei (Excel) wurde als Dokument uebergeben. " - "Die aktuellen Buchhaltungsdaten sind im Kontext verfuegbar.\n" - "1. Lies die Soll-Werte aus dem uebergebenen Budget-Dokument\n" - "2. Vergleiche sie mit den Ist-Werten aus der Buchhaltung pro Konto\n" - "3. Berechne die Abweichung (absolut und prozentual)\n" - "4. Erstelle ein Abweichungs-Chart (Balkendiagramm: Soll vs. Ist pro Konto)\n" - "5. Markiere kritische Abweichungen (>10%) und gib eine kurze Einschaetzung" + "Fuehre einen Budget-Soll/Ist-Vergleich durch und liefere EIN Excel-Dokument " + "mit folgender Struktur:\n\n" + "1. Tabelle \"Konten-Vergleich\" -- EINE Tabelle, EINE Zeile pro Konto:\n" + " Spalten: Konto-Nr | Konto-Name | Soll | Ist | Abweichung absolut | " + "Abweichung % | Status (OK / Warnung / Kritisch).\n" + "2. EINE Visualisierung \"Soll vs. Ist gesamt\" -- ein einziges " + "Balkendiagramm UNTER der Tabelle, das ALLE Konten in einer Grafik " + "gegenueberstellt (gruppierte Balken: Soll und Ist je Konto).\n" + "3. 
Kurzer Management-Summary-Absatz (3-5 Saetze) UNTER dem Chart " + "mit den 3 groessten Abweichungen (>10%) und einer fachlichen " + "Einschaetzung.\n\n" + "Verwende die uebergebene Budget-Datei als Soll-Quelle und die im " + "Kontext bereitgestellten Buchhaltungsdaten als Ist-Quelle.\n" + "WICHTIG: Erstelle KEINEN separaten Chart pro Konto. Nur EIN " + "Uebersichts-Chart ueber alle Konten ist gewuenscht." ), + "resultType": "xlsx", + "documentTheme": "finance", "documentList": {"type": "ref", "nodeId": "trigger", "path": ["payload", "documentList"]}, "context": {"type": "ref", "nodeId": "refresh", "path": ["data", "accountingData"]}, "simpleMode": False, From afd7e9d941042534c314dee74c7d6cd965c37d59 Mon Sep 17 00:00:00 2001 From: ValueOn AG Date: Wed, 29 Apr 2026 23:12:46 +0200 Subject: [PATCH 13/18] plan d implemented - generationn styles --- modules/datamodels/datamodelAi.py | 1 + modules/datamodels/datamodelJson.py | 17 +- .../graphicalEditor/nodeDefinitions/ai.py | 25 +- modules/features/trustee/mainTrustee.py | 25 +- .../workspace/datamodelFeatureWorkspace.py | 19 +- .../workspace/routeFeatureWorkspace.py | 76 +++++ modules/interfaces/interfaceAiObjects.py | 34 ++ .../serviceAgent/coreTools/_mediaTools.py | 248 +++++--------- .../services/serviceAi/mainServiceAi.py | 49 ++- .../mainServiceGeneration.py | 7 +- .../renderers/documentRendererBaseTemplate.py | 111 +++++- .../renderers/rendererDocx.py | 323 +++++++++--------- .../renderers/rendererHtml.py | 205 ++++++++++- .../renderers/rendererPdf.py | 144 +++++--- .../renderers/rendererPptx.py | 159 ++++++--- .../renderers/rendererXlsx.py | 55 +-- .../serviceGeneration/styleDefaults.py | 75 ++++ .../serviceGeneration/subDocumentUtility.py | 111 ++++-- modules/workflows/methods/methodAi/_common.py | 18 + .../methods/methodAi/actions/consolidate.py | 2 + .../methodAi/actions/convertDocument.py | 4 + .../methods/methodAi/actions/generateCode.py | 10 + .../methodAi/actions/generateDocument.py | 10 + 
.../methods/methodAi/actions/process.py | 13 + .../methodAi/actions/summarizeDocument.py | 4 + .../methodAi/actions/translateDocument.py | 4 + tests/serviceAi/__init__.py | 0 .../test_allowed_models_whitelist.py | 14 + tests/serviceGeneration/__init__.py | 0 .../test_inline_image_paragraph.py | 23 ++ .../test_md_to_json_consolidation.py | 71 ++++ .../serviceGeneration/test_style_resolver.py | 39 +++ 32 files changed, 1411 insertions(+), 485 deletions(-) create mode 100644 modules/serviceCenter/services/serviceGeneration/styleDefaults.py create mode 100644 modules/workflows/methods/methodAi/_common.py create mode 100644 tests/serviceAi/__init__.py create mode 100644 tests/serviceAi/test_allowed_models_whitelist.py create mode 100644 tests/serviceGeneration/__init__.py create mode 100644 tests/serviceGeneration/test_inline_image_paragraph.py create mode 100644 tests/serviceGeneration/test_md_to_json_consolidation.py create mode 100644 tests/serviceGeneration/test_style_resolver.py diff --git a/modules/datamodels/datamodelAi.py b/modules/datamodels/datamodelAi.py index cfc10db2..786eea7d 100644 --- a/modules/datamodels/datamodelAi.py +++ b/modules/datamodels/datamodelAi.py @@ -162,6 +162,7 @@ class AiCallOptions(BaseModel): # Provider filtering (from UI multiselect or automation config) allowedProviders: Optional[List[str]] = Field(default=None, description="List of allowed AI providers to use (empty = all RBAC-permitted)") + allowedModels: Optional[List[str]] = Field(default=None, description="Whitelist of allowed model names (AND-filter with allowedProviders). 
None/empty = all allowed.") class AiCallRequest(BaseModel): diff --git a/modules/datamodels/datamodelJson.py b/modules/datamodels/datamodelJson.py index 784cc042..0228fbad 100644 --- a/modules/datamodels/datamodelJson.py +++ b/modules/datamodels/datamodelJson.py @@ -6,7 +6,7 @@ Unified JSON document schema and helpers used by both generation prompts and ren This defines a single canonical template and the supported section types. """ -from typing import List +from typing import List, Literal, TypedDict # Canonical list of supported section types across the system supportedSectionTypes: List[str] = [ @@ -18,6 +18,21 @@ supportedSectionTypes: List[str] = [ "image", ] +class InlineRun(TypedDict, total=False): + """Single inline content run. Every paragraph/cell/list-item is a List[InlineRun].""" + type: Literal["text", "image", "link", "bold", "italic", "code"] + value: str # text content (for text/bold/italic/code/link-label) + fileId: str # for type=image: reference to FileItem + base64Data: str # for type=image: resolved base64 (post-processing) + mimeType: str # for type=image: e.g. "image/png" + widthPt: int # for type=image: optional render width + href: str # for type=link: URL target + +supportedInlineRunTypes: List[str] = [ + "text", "image", "link", "bold", "italic", "code", +] + + # Canonical JSON template used for AI generation (documents array + sections) # This template is used for STRUCTURE generation - sections have empty elements arrays. # For content generation, elements arrays will be populated later. 
diff --git a/modules/features/graphicalEditor/nodeDefinitions/ai.py b/modules/features/graphicalEditor/nodeDefinitions/ai.py index 3273540a..0336e382 100644 --- a/modules/features/graphicalEditor/nodeDefinitions/ai.py +++ b/modules/features/graphicalEditor/nodeDefinitions/ai.py @@ -3,6 +3,15 @@ from modules.shared.i18nRegistry import t +_AI_COMMON_PARAMS = [ + {"name": "requireNeutralization", "type": "boolean", "required": False, + "frontendType": "checkbox", "default": False, + "description": t("Eingaben fuer diesen Call neutralisieren")}, + {"name": "allowedModels", "type": "array", "required": False, + "frontendType": "modelMultiSelect", "default": [], + "description": t("Erlaubte LLM-Modelle (leer = alle erlaubten)")}, +] + AI_NODES = [ { "id": "ai.prompt", @@ -19,7 +28,7 @@ AI_NODES = [ "description": t("Dokumentenliste (via Wire oder DataRef)"), "default": ""}, {"name": "simpleMode", "type": "boolean", "required": False, "frontendType": "checkbox", "description": t("Einfacher Modus"), "default": True}, - ], + ] + _AI_COMMON_PARAMS, "inputs": 1, "outputs": 1, "inputPorts": {0: {"accepts": [ @@ -38,7 +47,7 @@ AI_NODES = [ "parameters": [ {"name": "prompt", "type": "string", "required": True, "frontendType": "textarea", "description": t("Recherche-Anfrage")}, - ], + ] + _AI_COMMON_PARAMS, "inputs": 1, "outputs": 1, "inputPorts": {0: {"accepts": ["Transit"]}}, @@ -58,7 +67,7 @@ AI_NODES = [ {"name": "summaryLength", "type": "string", "required": False, "frontendType": "select", "frontendOptions": {"options": ["brief", "medium", "detailed"]}, "description": t("Kurz, mittel oder ausführlich"), "default": "medium"}, - ], + ] + _AI_COMMON_PARAMS, "inputs": 1, "outputs": 1, "inputPorts": {0: {"accepts": ["DocumentList", "Transit"]}}, @@ -77,7 +86,7 @@ AI_NODES = [ "description": t("Dokumentenliste (via Wire oder DataRef)"), "default": ""}, {"name": "targetLanguage", "type": "string", "required": True, "frontendType": "text", "description": t("Zielsprache (z.B. 
de, en, French)")}, - ], + ] + _AI_COMMON_PARAMS, "inputs": 1, "outputs": 1, "inputPorts": {0: {"accepts": ["DocumentList", "Transit"]}}, @@ -97,7 +106,7 @@ AI_NODES = [ {"name": "targetFormat", "type": "string", "required": True, "frontendType": "select", "frontendOptions": {"options": ["docx", "pdf", "xlsx", "csv", "txt", "html", "json", "md"]}, "description": t("Zielformat")}, - ], + ] + _AI_COMMON_PARAMS, "inputs": 1, "outputs": 1, "inputPorts": {0: {"accepts": ["DocumentList", "Transit"]}}, @@ -114,7 +123,7 @@ AI_NODES = [ "parameters": [ {"name": "prompt", "type": "string", "required": True, "frontendType": "textarea", "description": t("Generierungs-Prompt")}, - ], + ] + _AI_COMMON_PARAMS, "inputs": 1, "outputs": 1, "inputPorts": {0: {"accepts": ["Transit"]}}, @@ -134,7 +143,7 @@ AI_NODES = [ {"name": "resultType", "type": "string", "required": False, "frontendType": "select", "frontendOptions": {"options": ["py", "js", "ts", "html", "java", "cpp", "txt", "json", "csv", "xml"]}, "description": t("Datei-Endung der erzeugten Code-Datei"), "default": "py"}, - ], + ] + _AI_COMMON_PARAMS, "inputs": 1, "outputs": 1, "inputPorts": {0: {"accepts": ["Transit"]}}, @@ -154,7 +163,7 @@ AI_NODES = [ "description": t("Konsolidierungsmodus"), "default": "summarize"}, {"name": "prompt", "type": "string", "required": False, "frontendType": "textarea", "description": t("Optionaler Prompt für die Konsolidierung"), "default": ""}, - ], + ] + _AI_COMMON_PARAMS, "inputs": 1, "outputs": 1, "inputPorts": {0: {"accepts": ["AggregateResult", "Transit"]}}, diff --git a/modules/features/trustee/mainTrustee.py b/modules/features/trustee/mainTrustee.py index 05e01e8a..d8f7a804 100644 --- a/modules/features/trustee/mainTrustee.py +++ b/modules/features/trustee/mainTrustee.py @@ -361,6 +361,17 @@ QUICK_ACTIONS = [ # The placeholder {{featureInstanceId}} is replaced by _copyTemplateWorkflows. 
# --------------------------------------------------------------------------- +_FINANCE_STYLE_HINT = ( + "\n\nWenn du ein Dokument erstellst, verwende einen professionellen Finanz-Stil:\n" + "- Schriftart: Calibri\n" + "- Primaerfarbe: #1F3864 (Dunkelblau)\n" + "- Akzentfarbe: #2980B9\n" + "- Tabellen mit dunklem Header (#1F3864, weisse Schrift)\n" + "- Konservatives, seriöses Layout\n" + "Nutze den style-Parameter von renderDocument um diese Vorgaben umzusetzen." +) + + def _buildAnalysisWorkflowGraph(prompt: str) -> Dict[str, Any]: """Build a standard analysis graph: trigger -> refreshAccountingData -> ai.prompt.""" return { @@ -370,8 +381,9 @@ def _buildAnalysisWorkflowGraph(prompt: str) -> Dict[str, Any]: "parameters": {"featureInstanceId": "{{featureInstanceId}}", "forceRefresh": False}, "position": {"x": 250, "y": 0}}, {"id": "analyse", "type": "ai.prompt", "label": "Analyse", "_method": "ai", "_action": "process", "parameters": { - "aiPrompt": prompt, + "aiPrompt": prompt + _FINANCE_STYLE_HINT, "context": {"type": "ref", "nodeId": "refresh", "path": ["data", "accountingData"]}, + "requireNeutralization": True, "simpleMode": False, }, "position": {"x": 500, "y": 0}}, ], @@ -454,10 +466,19 @@ TEMPLATE_WORKFLOWS = [ "Verwende die uebergebene Budget-Datei als Soll-Quelle und die im " "Kontext bereitgestellten Buchhaltungsdaten als Ist-Quelle.\n" "WICHTIG: Erstelle KEINEN separaten Chart pro Konto. Nur EIN " - "Uebersichts-Chart ueber alle Konten ist gewuenscht." + "Uebersichts-Chart ueber alle Konten ist gewuenscht.\n\n" + "Hinweis: Das documentTheme ist 'finance'. Wenn du ein Dokument erstellst, " + "verwende einen professionellen Finanz-Stil:\n" + "- Schriftart: Calibri\n" + "- Primaerfarbe: #1F3864 (Dunkelblau)\n" + "- Akzentfarbe: #2980B9\n" + "- Tabellen mit dunklem Header (#1F3864, weisse Schrift)\n" + "- Konservatives, seriöses Layout\n" + "Nutze den style-Parameter von renderDocument um diese Vorgaben umzusetzen." 
), "resultType": "xlsx", "documentTheme": "finance", + "requireNeutralization": True, "documentList": {"type": "ref", "nodeId": "trigger", "path": ["payload", "documentList"]}, "context": {"type": "ref", "nodeId": "refresh", "path": ["data", "accountingData"]}, "simpleMode": False, diff --git a/modules/features/workspace/datamodelFeatureWorkspace.py b/modules/features/workspace/datamodelFeatureWorkspace.py index b12d4b84..4e32702c 100644 --- a/modules/features/workspace/datamodelFeatureWorkspace.py +++ b/modules/features/workspace/datamodelFeatureWorkspace.py @@ -2,8 +2,8 @@ # All rights reserved. """Workspace feature data models — WorkspaceUserSettings.""" -from typing import Optional -from pydantic import BaseModel, Field +from typing import List, Optional +from pydantic import Field from modules.datamodels.datamodelBase import PowerOnModel from modules.shared.i18nRegistry import i18nModel import uuid @@ -52,3 +52,18 @@ class WorkspaceUserSettings(PowerOnModel): description="Max agent rounds override (None = instance default)", json_schema_extra={"label": "Max. 
Agenten-Runden", "frontend_type": "number", "frontend_readonly": False, "frontend_required": False}, ) + requireNeutralization: bool = Field( + default=False, + description="Default neutralization setting for this user", + json_schema_extra={"label": "Neutralisierung", "frontend_type": "checkbox", "frontend_readonly": False, "frontend_required": False}, + ) + allowedProviders: List[str] = Field( + default_factory=list, + description="Allowed AI providers (empty = all permitted by RBAC)", + json_schema_extra={"label": "Erlaubte Provider", "frontend_type": "multiselect", "frontend_readonly": False, "frontend_required": False}, + ) + allowedModels: List[str] = Field( + default_factory=list, + description="Allowed AI models (empty = all permitted)", + json_schema_extra={"label": "Erlaubte Modelle", "frontend_type": "modelMultiSelect", "frontend_readonly": False, "frontend_required": False}, + ) diff --git a/modules/features/workspace/routeFeatureWorkspace.py b/modules/features/workspace/routeFeatureWorkspace.py index 3e1a54b7..5b0d4d7a 100644 --- a/modules/features/workspace/routeFeatureWorkspace.py +++ b/modules/features/workspace/routeFeatureWorkspace.py @@ -110,6 +110,7 @@ class WorkspaceInputRequest(BaseModel): workflowId: Optional[str] = Field(default=None, description="Continue existing workflow") userLanguage: str = Field(default="en", description="User language code") allowedProviders: List[str] = Field(default_factory=list, description="Restrict AI to these providers") + allowedModels: List[str] = Field(default_factory=list, description="Restrict AI to these models") requireNeutralization: Optional[bool] = Field(default=None, description="Per-request neutralization override") @@ -635,6 +636,7 @@ async def streamWorkspaceStart( userLanguage=userInput.userLanguage, instanceConfig=instanceConfig, allowedProviders=userInput.allowedProviders, + allowedModels=userInput.allowedModels, requireNeutralization=userInput.requireNeutralization, 
billingFeatureCode=wsBillingFeatureCode, ) @@ -692,6 +694,7 @@ async def _runWorkspaceAgent( userLanguage: str = "en", instanceConfig: Dict[str, Any] = None, allowedProviders: List[str] = None, + allowedModels: List[str] = None, requireNeutralization: Optional[bool] = None, billingFeatureCode: Optional[str] = None, ): @@ -715,6 +718,9 @@ async def _runWorkspaceAgent( logger.info(f"Workspace agent: allowedProviders={allowedProviders}") else: logger.debug("Workspace agent: no allowedProviders in request") + if allowedModels: + aiService.services.allowedModels = allowedModels + logger.info(f"Workspace agent: allowedModels={allowedModels}") if requireNeutralization is not None: ctx.requireNeutralization = requireNeutralization @@ -2139,6 +2145,76 @@ async def updateGeneralSettings( return await getGeneralSettings(request, instanceId, context) +# ========================================================================= +# User-level AI settings (neutralisation, providers, models) +# ========================================================================= + +@router.get("/{instanceId}/user-settings") +@limiter.limit("120/minute") +async def getWorkspaceUserSettings( + request: Request, + instanceId: str = Path(...), + context: RequestContext = Depends(getRequestContext), +): + """Get the current user's workspace AI settings (auto-creates with defaults if not exists).""" + _mandateId, _ = _validateInstanceAccess(instanceId, context) + wsInterface = _getWorkspaceInterface(context, instanceId) + userId = str(context.user.id) + + settings = wsInterface.getWorkspaceUserSettings(userId) + if settings: + return JSONResponse({ + "requireNeutralization": settings.requireNeutralization, + "allowedProviders": settings.allowedProviders, + "allowedModels": settings.allowedModels, + }) + + data = { + "userId": userId, + "mandateId": str(context.mandateId) if context.mandateId else "", + "featureInstanceId": instanceId, + } + created = wsInterface.saveWorkspaceUserSettings(data) + 
return JSONResponse({ + "requireNeutralization": created.requireNeutralization, + "allowedProviders": created.allowedProviders, + "allowedModels": created.allowedModels, + }) + + +@router.put("/{instanceId}/user-settings") +@limiter.limit("120/minute") +async def putWorkspaceUserSettings( + request: Request, + instanceId: str = Path(...), + body: dict = Body(...), + context: RequestContext = Depends(getRequestContext), +): + """Save the current user's workspace AI settings.""" + _mandateId, _ = _validateInstanceAccess(instanceId, context) + wsInterface = _getWorkspaceInterface(context, instanceId) + userId = str(context.user.id) + + data = { + "userId": userId, + "mandateId": str(context.mandateId) if context.mandateId else "", + "featureInstanceId": instanceId, + } + if "requireNeutralization" in body: + data["requireNeutralization"] = bool(body["requireNeutralization"]) + if "allowedProviders" in body: + data["allowedProviders"] = body["allowedProviders"] + if "allowedModels" in body: + data["allowedModels"] = body["allowedModels"] + + saved = wsInterface.saveWorkspaceUserSettings(data) + return JSONResponse({ + "requireNeutralization": saved.requireNeutralization, + "allowedProviders": saved.allowedProviders, + "allowedModels": saved.allowedModels, + }) + + # ========================================================================= # RAG / Knowledge — anonymised instance statistics (presentation / KPIs) # ========================================================================= diff --git a/modules/interfaces/interfaceAiObjects.py b/modules/interfaces/interfaceAiObjects.py index a859ffa7..dcf819cc 100644 --- a/modules/interfaces/interfaceAiObjects.py +++ b/modules/interfaces/interfaceAiObjects.py @@ -111,6 +111,19 @@ class AiObjects: processingTime=0.0, bytesSent=0, bytesReceived=0, errorCount=1, ) + allowedModels = getattr(options, 'allowedModels', None) if options else None + if allowedModels: + filteredModels = [m for m in availableModels if m.name in 
allowedModels] + if filteredModels: + availableModels = filteredModels + else: + errorMsg = f"No models match allowedModels {allowedModels} (providers={allowedProviders}) for operation {options.operationType}" + logger.error(errorMsg) + return AiCallResponse( + content=errorMsg, modelName="error", priceCHF=0.0, + processingTime=0.0, bytesSent=0, bytesReceived=0, errorCount=1, + ) + failoverModelList = modelSelector.getFailoverModelList(prompt, context, options, availableModels) if not failoverModelList: @@ -364,6 +377,19 @@ class AiObjects: ) return + allowedModels = getattr(options, 'allowedModels', None) if options else None + if allowedModels: + filtered = [m for m in availableModels if m.name in allowedModels] + if filtered: + availableModels = filtered + else: + yield AiCallResponse( + content=f"No models match allowedModels {allowedModels} (providers={allowedProviders}) for operation {options.operationType}", + modelName="error", priceCHF=0.0, processingTime=0.0, + bytesSent=0, bytesReceived=0, errorCount=1, + ) + return + failoverModelList = modelSelector.getFailoverModelList( request.prompt, request.context or "", options, availableModels ) @@ -516,6 +542,14 @@ class AiObjects: else: logger.warning(f"No embedding models match allowedProviders {allowedProviders}") + allowedModels = getattr(options, 'allowedModels', None) if options else None + if allowedModels: + filtered = [m for m in availableModels if m.name in allowedModels] + if filtered: + availableModels = filtered + else: + logger.warning(f"No embedding models match allowedModels {allowedModels}") + failoverModelList = modelSelector.getFailoverModelList( combinedText, "", options, availableModels ) diff --git a/modules/serviceCenter/services/serviceAgent/coreTools/_mediaTools.py b/modules/serviceCenter/services/serviceAgent/coreTools/_mediaTools.py index c2a4842b..7b071996 100644 --- a/modules/serviceCenter/services/serviceAgent/coreTools/_mediaTools.py +++ 
b/modules/serviceCenter/services/serviceAgent/coreTools/_mediaTools.py @@ -25,142 +25,11 @@ def _registerMediaTools(registry: ToolRegistry, services): # ---- Document rendering tool ---- def _markdownToDocumentJson(markdown: str, title: str, language: str = "de") -> Dict[str, Any]: - """Convert markdown content to the standard document JSON format expected by renderers.""" - import re as _re - - sections = [] - order = 0 - lines = markdown.split("\n") - i = 0 - - def _nextId(): - nonlocal order - order += 1 - return f"s_{order}" - - while i < len(lines): - line = lines[i] - - # --- Headings --- - headingMatch = _re.match(r'^(#{1,6})\s+(.+)', line) - if headingMatch: - level = len(headingMatch.group(1)) - text = headingMatch.group(2).strip() - sections.append({ - "id": _nextId(), "content_type": "heading", "order": order, - "elements": [{"content": {"text": text, "level": level}}], - }) - i += 1 - continue - - # --- Fenced code blocks --- - codeMatch = _re.match(r'^```(\w*)', line) - if codeMatch: - lang = codeMatch.group(1) or "text" - codeLines = [] - i += 1 - while i < len(lines) and not lines[i].startswith("```"): - codeLines.append(lines[i]) - i += 1 - i += 1 - sections.append({ - "id": _nextId(), "content_type": "code_block", "order": order, - "elements": [{"content": {"code": "\n".join(codeLines), "language": lang}}], - }) - continue - - # --- Tables --- - tableMatch = _re.match(r'^\|(.+)\|$', line) - if tableMatch and (i + 1) < len(lines) and _re.match(r'^\|[\s\-:|]+\|$', lines[i + 1]): - headerCells = [c.strip() for c in tableMatch.group(1).split("|")] - i += 2 - rows = [] - while i < len(lines) and _re.match(r'^\|(.+)\|$', lines[i]): - rowCells = [c.strip() for c in lines[i][1:-1].split("|")] - rows.append(rowCells) - i += 1 - sections.append({ - "id": _nextId(), "content_type": "table", "order": order, - "elements": [{"content": {"headers": headerCells, "rows": rows}}], - }) - continue - - # --- Bullet / numbered lists --- - listMatch = 
_re.match(r'^(\s*)([-*+]|\d+[.)]) (.+)', line) - if listMatch: - isNumbered = bool(_re.match(r'\d+[.)]', listMatch.group(2))) - items = [] - while i < len(lines) and _re.match(r'^(\s*)([-*+]|\d+[.)]) (.+)', lines[i]): - m = _re.match(r'^(\s*)([-*+]|\d+[.)]) (.+)', lines[i]) - items.append({"text": m.group(3).strip()}) - i += 1 - sections.append({ - "id": _nextId(), "content_type": "bullet_list", "order": order, - "elements": [{"content": {"items": items, "list_type": "numbered" if isNumbered else "bullet"}}], - }) - continue - - # --- Empty lines (skip) --- - if not line.strip(): - i += 1 - continue - - # --- Images: ![alt](file:fileId) or ![alt](url) --- - imgMatch = _re.match(r'^!\[([^\]]*)\]\(([^)]+)\)', line) - if imgMatch: - altText = imgMatch.group(1).strip() or "Image" - src = imgMatch.group(2).strip() - fileId = "" - if src.startswith("file:"): - fileId = src[5:] - sections.append({ - "id": _nextId(), "content_type": "image", "order": order, - "elements": [{ - "content": { - "altText": altText, - "base64Data": "", - "_fileRef": fileId, - "_srcUrl": src if not fileId else "", - } - }], - }) - i += 1 - continue - - # --- Paragraph (collect consecutive non-empty lines) --- - paraLines = [] - while i < len(lines) and lines[i].strip() and not _re.match(r'^(#{1,6}\s|```|\|.+\||!\[|(\s*)([-*+]|\d+[.)]) )', lines[i]): - paraLines.append(lines[i]) - i += 1 - if paraLines: - sections.append({ - "id": _nextId(), "content_type": "paragraph", "order": order, - "elements": [{"content": {"text": " ".join(paraLines)}}], - }) - continue - - i += 1 - - if not sections: - sections.append({ - "id": _nextId(), "content_type": "paragraph", "order": order, - "elements": [{"content": {"text": markdown.strip() or "(empty)"}}], - }) - - return { - "metadata": { - "split_strategy": "single_document", - "source_documents": [], - "extraction_method": "agent_rendering", - "title": title, - "language": language, - }, - "documents": [{ - "id": "doc_1", - "title": title, - "sections": 
sections, - }], - } + """Delegate to the consolidated parser in subDocumentUtility.""" + from modules.serviceCenter.services.serviceGeneration.subDocumentUtility import markdownToDocumentJson + result = markdownToDocumentJson(markdown, title, language) + result["metadata"]["extraction_method"] = "agent_rendering" + return result async def _renderDocument(args: Dict[str, Any], context: Dict[str, Any]): """Render agent-produced markdown content into any document format via the RendererRegistry.""" @@ -245,35 +114,75 @@ def _registerMediaTools(registry: ToolRegistry, services): except Exception as e: logger.warning(f"renderDocument: knowledge service unavailable: {e}") resolvedImages = 0 + + def _resolveImageRef(targetObj, fileRefKey="_fileRef", fileIdKey="fileId"): + """Resolve a single image reference dict to base64Data in-place.""" + nonlocal resolvedImages + fileRef = targetObj.get(fileRefKey, "") or targetObj.get(fileIdKey, "") + if not fileRef or targetObj.get("base64Data"): + return + if knowledgeService: + chunks = knowledgeService._knowledgeDb.getContentChunks(fileRef) + imageChunks = [c for c in (chunks or []) if c.get("contentType") == "image"] + if imageChunks: + targetObj["base64Data"] = imageChunks[0].get("data", "") + chunkMime = imageChunks[0].get("contextRef", {}).get("mimeType", "image/png") + targetObj["mimeType"] = chunkMime + resolvedImages += 1 + if not targetObj.get("base64Data"): + try: + rawBytes = services.chat.getFileData(fileRef) + if rawBytes: + import base64 as _b64 + targetObj["base64Data"] = _b64.b64encode(rawBytes).decode("ascii") + targetObj["mimeType"] = "image/png" + resolvedImages += 1 + except Exception as e: + logger.warning(f"renderDocument: image resolve failed for fileRef={fileRef}: {e}") + targetObj.pop("_fileRef", None) + targetObj.pop("_srcUrl", None) + + def _resolveInlineRuns(runsList): + """Scan a list of inline runs and resolve any image runs with fileId.""" + for run in runsList: + if run.get("type") == "image" and 
run.get("fileId") and not run.get("base64Data"): + _resolveImageRef(run, fileRefKey="fileId", fileIdKey="fileId") + for doc in structuredContent.get("documents", []): for section in doc.get("sections", []): - if section.get("content_type") != "image": + cType = section.get("content_type") + # Block-level image sections + if cType == "image": + for element in section.get("elements", []): + contentObj = element.get("content", {}) + _resolveImageRef(contentObj) continue - for element in section.get("elements", []): - contentObj = element.get("content", {}) - fileRef = contentObj.get("_fileRef", "") - if not fileRef or contentObj.get("base64Data"): - continue - if knowledgeService: - chunks = knowledgeService._knowledgeDb.getContentChunks(fileRef) - imageChunks = [c for c in (chunks or []) if c.get("contentType") == "image"] - if imageChunks: - contentObj["base64Data"] = imageChunks[0].get("data", "") - chunkMime = imageChunks[0].get("contextRef", {}).get("mimeType", "image/png") - contentObj["mimeType"] = chunkMime - resolvedImages += 1 - if not contentObj.get("base64Data"): - try: - rawBytes = services.chat.getFileData(fileRef) - if rawBytes: - import base64 as _b64 - contentObj["base64Data"] = _b64.b64encode(rawBytes).decode("ascii") - contentObj["mimeType"] = "image/png" - resolvedImages += 1 - except Exception as e: - logger.warning(f"renderDocument: image resolve failed for fileRef={fileRef}: {e}") - contentObj.pop("_fileRef", None) - contentObj.pop("_srcUrl", None) + # Paragraphs with inlineRuns + if cType == "paragraph": + for element in section.get("elements", []): + runs = element.get("content", {}).get("inlineRuns") + if runs: + _resolveInlineRuns(runs) + continue + # Bullet lists - items are List[List[InlineRun]] + if cType == "bullet_list": + for element in section.get("elements", []): + items = element.get("content", {}).get("items", []) + for item in items: + if isinstance(item, list): + _resolveInlineRuns(item) + continue + # Tables - headers and row 
cells are List[InlineRun] + if cType == "table": + for element in section.get("elements", []): + contentObj = element.get("content", {}) + for cell in contentObj.get("headers", []): + if isinstance(cell, list): + _resolveInlineRuns(cell) + for row in contentObj.get("rows", []): + for cell in row: + if isinstance(cell, list): + _resolveInlineRuns(cell) sectionCount = len(structuredContent.get("documents", [{}])[0].get("sections", [])) logger.info(f"renderDocument: parsed {sectionCount} sections from markdown ({len(content)} chars), resolved {resolvedImages} image(s), format={outputFormat}") @@ -285,6 +194,7 @@ def _registerMediaTools(registry: ToolRegistry, services): language=language, title=title, userPrompt=content, + style=args.get("style"), ) if not documents: @@ -367,6 +277,20 @@ def _registerMediaTools(registry: ToolRegistry, services): "outputFormat": {"type": "string", "description": "Target format: pdf, docx, xlsx, pptx, csv, html, md, json, txt", "default": "pdf"}, "title": {"type": "string", "description": "Document title", "default": "Document"}, "language": {"type": "string", "description": "Document language (ISO 639-1)", "default": "de"}, + "style": { + "type": "object", + "description": ( + "Optional style overrides for the rendered document. Supports nested keys: " + "fonts (primary, monospace), colors (primary, secondary, accent, background), " + "headings (h1-h4 with sizePt, weight, color, spaceBeforePt, spaceAfterPt), " + "paragraph (sizePt, lineSpacing, color), table (headerBg, headerFg, headerSizePt, " + "bodySizePt, rowBandingEven, rowBandingOdd, borderColor, borderWidthPt), " + "list (bulletChar, indentPt, sizePt), image (defaultWidthPt, maxWidthPt, alignment), " + "codeBlock (fontSizePt, background, borderColor), " + "page (format, marginsPt, showPageNumbers, headerHeight, footerHeight, headerLogo, headerText, footerText). " + "Only provided keys override defaults; omitted keys keep their default values." 
+ ), + }, }, }, readOnly=False, diff --git a/modules/serviceCenter/services/serviceAi/mainServiceAi.py b/modules/serviceCenter/services/serviceAi/mainServiceAi.py index 6428bed3..18ac46bc 100644 --- a/modules/serviceCenter/services/serviceAi/mainServiceAi.py +++ b/modules/serviceCenter/services/serviceAi/mainServiceAi.py @@ -86,7 +86,7 @@ class _ServicesAdapter: return getattr(w, "featureCode", None) if w else None def __getattr__(self, name: str): - if name in ("allowedProviders", "preferredProviders", "currentUserLanguage"): + if name in ("allowedProviders", "allowedModels", "preferredProviders", "currentUserLanguage"): return getattr(self.workflow, name, None) if self.workflow else None raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'") @@ -177,6 +177,11 @@ class AiService: request.options = request.options.model_copy(update={'allowedProviders': effectiveProviders}) logger.debug(f"Effective allowedProviders for AI request: {effectiveProviders}") + # Calculate effective allowedModels: Workflow ∩ Request (node-level) + effectiveModels = self._calculateEffectiveModels(request) + if effectiveModels and request.options: + request.options = request.options.model_copy(update={'allowedModels': effectiveModels}) + # Neutralize prompt if enabled (before AI call) _wasNeutralized = False _excludedDocs: List[str] = [] @@ -225,6 +230,11 @@ class AiService: if effectiveProviders and request.options: request.options = request.options.model_copy(update={'allowedProviders': effectiveProviders}) + # Calculate effective allowedModels: Workflow ∩ Request (node-level) + effectiveModels = self._calculateEffectiveModels(request) + if effectiveModels and request.options: + request.options = request.options.model_copy(update={'allowedModels': effectiveModels}) + # Neutralize prompt if enabled (before streaming) _wasNeutralized = False _excludedDocs: List[str] = [] @@ -1240,6 +1250,43 @@ detectedIntent-Werte: logger.warning(f"Error calculating effective 
providers: {e}") return None + def _calculateEffectiveModels(self, request: AiCallRequest = None) -> Optional[List[str]]: + """ + Calculate effective allowed models: Workflow.allowedModels ∩ request.options.allowedModels. + + AND-logic intersection: + - If workflow specifies allowedModels, start with those. + - If request (node-level) also specifies allowedModels, intersect. + - Returns None if no model filtering is needed. + """ + try: + effectiveModels = None + + # Workflow-level allowedModels (from automation config) + workflowModels = getattr(self.services, 'allowedModels', None) + if workflowModels: + effectiveModels = list(workflowModels) + + # Request-level (node-level) allowedModels + requestModels = None + if request and request.options and request.options.allowedModels: + requestModels = request.options.allowedModels + + if requestModels: + if effectiveModels: + effectiveModels = [m for m in effectiveModels if m in requestModels] + else: + effectiveModels = list(requestModels) + + if effectiveModels: + logger.debug(f"Model filter: Workflow={workflowModels}, Request={requestModels}, Effective={effectiveModels}") + + return effectiveModels if effectiveModels else None + + except Exception as e: + logger.warning(f"Error calculating effective models: {e}") + return None + async def ensureAiObjectsInitialized(self): """Ensure aiObjects is initialized and submodules are ready.""" if self.aiObjects is None: diff --git a/modules/serviceCenter/services/serviceGeneration/mainServiceGeneration.py b/modules/serviceCenter/services/serviceGeneration/mainServiceGeneration.py index b9377404..6afcc0a8 100644 --- a/modules/serviceCenter/services/serviceGeneration/mainServiceGeneration.py +++ b/modules/serviceCenter/services/serviceGeneration/mainServiceGeneration.py @@ -14,6 +14,7 @@ from .subDocumentUtility import ( detectMimeTypeFromData, convertDocumentDataToString ) +from .styleDefaults import resolveStyle logger = logging.getLogger(__name__) @@ -382,7 +383,7 @@ class 
GenerationService: 'workflowId': 'unknown' } - async def renderReport(self, extractedContent: Dict[str, Any], outputFormat: str, language: str, title: str, userPrompt: str = None, aiService=None, parentOperationId: Optional[str] = None) -> List[RenderedDocument]: + async def renderReport(self, extractedContent: Dict[str, Any], outputFormat: str, language: str, title: str, userPrompt: str = None, aiService=None, parentOperationId: Optional[str] = None, style: Optional[Dict[str, Any]] = None) -> List[RenderedDocument]: """ Render extracted JSON content to the specified output format. Processes EACH document separately and calls renderer for each. @@ -399,12 +400,14 @@ class GenerationService: userPrompt: User's original prompt for report generation aiService: AI service instance for generation prompt creation parentOperationId: Optional parent operation ID for hierarchical logging + style: Optional style overrides (deep-merged with DEFAULT_STYLE) Returns: List of RenderedDocument objects. 
Each RenderedDocument represents one rendered file (main document or supporting file) """ try: + resolvedStyle = resolveStyle(style) # Validate JSON input if not isinstance(extractedContent, dict): raise ValueError("extractedContent must be a JSON dictionary") @@ -469,7 +472,7 @@ class GenerationService: docTitle = doc.get("title", title) # Render this document (can return multiple files, e.g., HTML + images) - renderedDocs = await renderer.render(singleDocContent, docTitle, userPrompt, aiService) + renderedDocs = await renderer.render(singleDocContent, docTitle, userPrompt, aiService, style=resolvedStyle) allRenderedDocuments.extend(renderedDocs) logger.info(f"Rendered {len(documents)} document(s) into {len(allRenderedDocuments)} file(s)") diff --git a/modules/serviceCenter/services/serviceGeneration/renderers/documentRendererBaseTemplate.py b/modules/serviceCenter/services/serviceGeneration/renderers/documentRendererBaseTemplate.py index b080ce88..583c423c 100644 --- a/modules/serviceCenter/services/serviceGeneration/renderers/documentRendererBaseTemplate.py +++ b/modules/serviceCenter/services/serviceGeneration/renderers/documentRendererBaseTemplate.py @@ -84,7 +84,7 @@ class BaseRenderer(ABC): return list(supportedSectionTypes) @abstractmethod - async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]: + async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]: """ Render extracted JSON content to multiple documents. Each renderer must implement this method. @@ -95,6 +95,9 @@ class BaseRenderer(ABC): title: Report title userPrompt: Original user prompt for context aiService: AI service instance for additional processing + style: Fully-resolved unified style dict from styleDefaults.resolveStyle(). 
+ When provided, renderers use these values instead of their + own defaults / AI-generated styles. Returns: List of RenderedDocument objects. @@ -102,6 +105,112 @@ class BaseRenderer(ABC): Even if only one document is returned, it must be wrapped in a list. """ pass + + def _convertUnifiedStyleToInternal(self, style: Dict[str, Any]) -> Dict[str, Any]: + """Convert the unified resolvedStyle dict (from styleDefaults) into + the renderer-internal style-set format that all rendering methods already + consume. Override in subclasses for format-specific tweaks.""" + h1 = style["headings"]["h1"] + h2 = style["headings"]["h2"] + h3 = style["headings"].get("h3", h2) + h4 = style["headings"].get("h4", h3) + tbl = style["table"] + para = style["paragraph"] + lst = style["list"] + cb = style["codeBlock"] + return { + "title": { + "font_size": h1["sizePt"], "color": h1["color"], + "bold": h1.get("weight") == "bold", "align": "left", + }, + "heading1": { + "font_size": h1["sizePt"], "color": h1["color"], + "bold": h1.get("weight") == "bold", "align": "left", + }, + "heading2": { + "font_size": h2["sizePt"], "color": h2["color"], + "bold": h2.get("weight") == "bold", "align": "left", + }, + "heading3": { + "font_size": h3["sizePt"], "color": h3["color"], + "bold": h3.get("weight") == "bold", "align": "left", + }, + "heading4": { + "font_size": h4["sizePt"], "color": h4["color"], + "bold": h4.get("weight") == "bold", "align": "left", + }, + "paragraph": { + "font_size": para["sizePt"], "color": para["color"], + "bold": False, "align": "left", + }, + "table_header": { + "background": tbl["headerBg"], "text_color": tbl["headerFg"], + "bold": True, "align": "center", + }, + "table_cell": { + "background": tbl["rowBandingOdd"], "text_color": para["color"], + "bold": False, "align": "left", + }, + "table_border": { + "style": "grid", "color": tbl["borderColor"], + }, + "bullet_list": { + "font_size": lst["sizePt"], "color": para["color"], + "indent": lst["indentPt"], + }, + 
"code_block": { + "font": style["fonts"]["monospace"], + "font_size": cb["fontSizePt"], "color": para["color"], + "background": cb["background"], + }, + } + + @staticmethod + def _inlineRunsFromContent(content: Dict[str, Any], *, itemsKey: str = None) -> Any: + """Extract inline runs from new-format content, falling back to old format. + + For paragraphs (itemsKey=None): + new: content["inlineRuns"] -> List[InlineRun] + old: content["text"] -> wrapped in [{"type":"text","value":text}] + + For list items (itemsKey="items"): + new: content["items"] is List[List[InlineRun]] + old: content["items"] is List[str] or List[{"text":…}] + Returns the items list (caller decides per-item conversion). + + For table headers/cells: + new: each header/cell is List[InlineRun] + old: each header/cell is a plain str + Caller handles per-cell. + """ + if itemsKey: + return content.get(itemsKey, []) + inlineRuns = content.get("inlineRuns") + if inlineRuns: + return inlineRuns + text = content.get("text", "") + if text: + return [{"type": "text", "value": text}] + return [] + + @staticmethod + def _inlineRunsForCell(cell) -> list: + """Normalize a single table header or cell value to List[InlineRun]. + Accepts either a plain string or an already-correct list of run dicts.""" + if isinstance(cell, list): + return cell + return [{"type": "text", "value": str(cell) if cell is not None else ""}] + + @staticmethod + def _inlineRunsForListItem(item) -> list: + """Normalize a single list item to List[InlineRun]. 
+ Accepts a plain string, a dict with 'text', or an already-correct list of run dicts.""" + if isinstance(item, list): + return item + if isinstance(item, dict): + text = item.get("text", "") + return [{"type": "text", "value": text}] + return [{"type": "text", "value": str(item)}] def _determineFilename(self, title: str, mimeType: str) -> str: """Determine filename from title and mimeType.""" diff --git a/modules/serviceCenter/services/serviceGeneration/renderers/rendererDocx.py b/modules/serviceCenter/services/serviceGeneration/renderers/rendererDocx.py index 7a1277ca..ab37f756 100644 --- a/modules/serviceCenter/services/serviceGeneration/renderers/rendererDocx.py +++ b/modules/serviceCenter/services/serviceGeneration/renderers/rendererDocx.py @@ -53,18 +53,17 @@ class RendererDocx(BaseRenderer): from modules.datamodels.datamodelJson import supportedSectionTypes return list(supportedSectionTypes) - async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]: + async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]: """Render extracted JSON content to DOCX format using AI-analyzed styling.""" self.services.utils.debugLogToFile(f"DOCX RENDER CALLED: title={title}, user_prompt={userPrompt[:50] if userPrompt else 'None'}...", "DOCX_RENDERER") try: if not DOCX_AVAILABLE: - # Fallback to HTML if python-docx not available from .rendererHtml import RendererHtml htmlRenderer = RendererHtml() - return await htmlRenderer.render(extractedContent, title, userPrompt, aiService) + return await htmlRenderer.render(extractedContent, title, userPrompt, aiService, style=style) # Generate DOCX using AI-analyzed styling - docx_content = await self._generateDocxFromJson(extractedContent, title, userPrompt, aiService) + docx_content = await self._generateDocxFromJson(extractedContent, title, 
userPrompt, aiService, unifiedStyle=style) # Extract metadata for document type and other info metadata = extractedContent.get("metadata", {}) if extractedContent else {} @@ -114,23 +113,27 @@ class RendererDocx(BaseRenderer): ) ] - async def _generateDocxFromJson(self, json_content: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> str: + async def _generateDocxFromJson(self, json_content: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, unifiedStyle: Dict[str, Any] = None) -> str: """Generate DOCX content from structured JSON document.""" import time start_time = time.time() try: self.logger.debug("_generateDocxFromJson: Starting document generation") - # Create new document doc = Document() self.logger.debug(f"_generateDocxFromJson: Document created in {time.time() - start_time:.2f}s") - # Get style set: use styles from metadata if available, otherwise enhance with AI - template_from_metadata = None - if json_content and isinstance(json_content.get("metadata"), dict): - template_from_metadata = json_content["metadata"].get("templateName") + # Phase 3: prefer unified style when provided style_start = time.time() self.logger.debug("_generateDocxFromJson: About to get style set") - styleSet = await self._getStyleSet(json_content, userPrompt, aiService, templateName=template_from_metadata) + if unifiedStyle: + styleSet = self._convertUnifiedStyleToInternal(unifiedStyle) + self._unifiedStyle = unifiedStyle + else: + template_from_metadata = None + if json_content and isinstance(json_content.get("metadata"), dict): + template_from_metadata = json_content["metadata"].get("templateName") + styleSet = await self._getStyleSet(json_content, userPrompt, aiService, templateName=template_from_metadata) + self._unifiedStyle = None self.logger.debug(f"_generateDocxFromJson: Style set retrieved in {time.time() - style_start:.2f}s") # Setup basic document styles and create all styles from style set @@ -298,11 +301,11 @@ class 
RendererDocx(BaseRenderer): def _setupBasicDocumentStyles(self, doc: Document) -> None: """Set up basic document styles.""" try: - # Set default font style = doc.styles['Normal'] font = style.font - font.name = 'Calibri' - font.size = Pt(11) + us = getattr(self, '_unifiedStyle', None) + font.name = us["fonts"]["primary"] if us else 'Calibri' + font.size = Pt(us["paragraph"]["sizePt"] if us else 11) except Exception as e: self.logger.warning(f"Could not set up basic document styles: {str(e)}") @@ -421,6 +424,8 @@ class RendererDocx(BaseRenderer): def _addMarkdownInlineRuns(self, paragraph, text: str) -> None: """Parse markdown inline formatting and add corresponding Runs to a python-docx paragraph.""" pos = 0 + us = getattr(self, '_unifiedStyle', None) + monoFont = us["fonts"]["monospace"] if us else "Courier New" for m in self._MD_INLINE_RE.finditer(text): if m.start() > pos: paragraph.add_run(text[pos:m.start()]) @@ -434,12 +439,45 @@ class RendererDocx(BaseRenderer): paragraph.add_run(m.group(6)).italic = True elif m.group(7): run = paragraph.add_run(m.group(7)) - run.font.name = "Courier New" + run.font.name = monoFont run.font.size = Pt(9) pos = m.end() if pos < len(text): paragraph.add_run(text[pos:]) + def _renderInlineRuns(self, runs: list, paragraph, styleSet: Dict[str, Any]) -> None: + """Process a list of InlineRun dicts into python-docx Runs on a paragraph.""" + us = getattr(self, '_unifiedStyle', None) + monoFont = us["fonts"]["monospace"] if us else "Courier New" + for run in runs: + runType = run.get("type", "text") + value = run.get("value", "") + if runType == "text": + paragraph.add_run(value) + elif runType == "bold": + paragraph.add_run(value).bold = True + elif runType == "italic": + paragraph.add_run(value).italic = True + elif runType == "code": + r = paragraph.add_run(value) + r.font.name = monoFont + r.font.size = Pt(9) + elif runType == "link": + r = paragraph.add_run(value) + r.font.underline = True + r.font.color.rgb = RGBColor(0x29, 
0x80, 0xB9) + elif runType == "image": + b64 = run.get("base64Data", "") + if b64: + try: + imgBytes = base64.b64decode(b64) + imgStream = io.BytesIO(imgBytes) + paragraph.add_run().add_picture(imgStream, width=Inches(2)) + except Exception: + paragraph.add_run(f"[Image: {run.get('altText', '')}]") + else: + paragraph.add_run(value) + def _renderJsonTable(self, doc: Document, table_data: Dict[str, Any], styles: Dict[str, Any]) -> None: """ Render a JSON table to DOCX using AI-generated styles. @@ -485,7 +523,7 @@ class RendererDocx(BaseRenderer): except Exception as e: self.logger.error(f"Error rendering table: {str(e)}", exc_info=True) - def _renderTableFastXml(self, doc: Document, headers: List[str], rows: List[List[Any]], styles: Dict[str, Any]) -> None: + def _renderTableFastXml(self, doc: Document, headers: list, rows: list, styles: Dict[str, Any]) -> None: """ High-performance table rendering using direct XML manipulation. @@ -546,24 +584,34 @@ class RendererDocx(BaseRenderer): # Build all rows using fast XML rows_start = time.time() - # Header row - headerRow = self._createTableRowXml(headers, isHeader=True) + # Resolve header style colors + tableStyle = styles.get("table_header", {}) + headerBg = tableStyle.get("background", "") + headerFg = tableStyle.get("text_color", "") + + # Flatten inline-run headers to plain strings for fast XML path + flatHeaders = [] + for h in headers: + runs = self._inlineRunsForCell(h) + flatHeaders.append("".join(r.get("value", "") for r in runs)) + + headerRow = self._createTableRowXml(flatHeaders, isHeader=True, headerBgHex=headerBg or None, headerFgHex=headerFg or None) tbl.append(headerRow) - + header_time = time.time() - rows_start self.logger.debug(f"_renderTableFastXml: Header row created in {header_time:.3f}s") - - # Data rows - batch process for performance + data_start = time.time() rowCount = len(rows) - + for idx, rowData in enumerate(rows): - # Convert all cells to strings - cellTexts = [str(cell) if cell is not 
None else '' for cell in rowData] - # Pad if needed - while len(cellTexts) < len(headers): + cellTexts = [] + for cell in rowData: + runs = self._inlineRunsForCell(cell) + cellTexts.append("".join(r.get("value", "") for r in runs)) + while len(cellTexts) < len(flatHeaders): cellTexts.append('') - + row = self._createTableRowXml(cellTexts, isHeader=False) tbl.append(row) @@ -641,74 +689,64 @@ class RendererDocx(BaseRenderer): return tblBorders - def _createTableRowXml(self, cells: List[str], isHeader: bool = False) -> Any: - """ - Create a table row XML element with cells. - - This is the core fast-path: builds the row XML directly without - going through python-docx's slow cell.text assignment. - """ + def _createTableRowXml(self, cells: list, isHeader: bool = False, headerBgHex: str = None, headerFgHex: str = None) -> Any: + """Create a table row XML element with cells. + Fast-path: builds row XML directly via lxml.""" from docx.oxml.shared import OxmlElement, qn - + + if headerBgHex is None: + us = getattr(self, '_unifiedStyle', None) + headerBgHex = us["table"]["headerBg"].lstrip('#') if us else '1F3864' + else: + headerBgHex = headerBgHex.lstrip('#') + if headerFgHex is None: + us = getattr(self, '_unifiedStyle', None) + headerFgHex = us["table"]["headerFg"].lstrip('#') if us else 'FFFFFF' + else: + headerFgHex = headerFgHex.lstrip('#') + tr = OxmlElement('w:tr') - - # Row properties for header if isHeader: trPr = OxmlElement('w:trPr') - tblHeader = OxmlElement('w:tblHeader') - trPr.append(tblHeader) + trPr.append(OxmlElement('w:tblHeader')) tr.append(trPr) - + for cellText in cells: - # Create cell tc = OxmlElement('w:tc') - - # Cell properties tcPr = OxmlElement('w:tcPr') tcW = OxmlElement('w:tcW') tcW.set(qn('w:type'), 'auto') tcW.set(qn('w:w'), '0') tcPr.append(tcW) - - # Header cell styling - light blue background + if isHeader: shd = OxmlElement('w:shd') shd.set(qn('w:val'), 'clear') shd.set(qn('w:color'), 'auto') - shd.set(qn('w:fill'), '4472C4') # 
Professional blue + shd.set(qn('w:fill'), headerBgHex) tcPr.append(shd) - + tc.append(tcPr) - - # Paragraph with text p = OxmlElement('w:p') - - # Add run with text r = OxmlElement('w:r') - - # Header text styling - bold and white + if isHeader: rPr = OxmlElement('w:rPr') - b = OxmlElement('w:b') - rPr.append(b) - # White text color + rPr.append(OxmlElement('w:b')) color = OxmlElement('w:color') - color.set(qn('w:val'), 'FFFFFF') + color.set(qn('w:val'), headerFgHex) rPr.append(color) r.append(rPr) - - # Text element + t = OxmlElement('w:t') - # Preserve spaces if text starts/ends with whitespace if cellText and (cellText[0] == ' ' or cellText[-1] == ' '): t.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve') t.text = cellText r.append(t) - p.append(r) tc.append(p) tr.append(tc) - + return tr def _applyHorizontalBordersOnly(self, table) -> None: @@ -836,47 +874,37 @@ class RendererDocx(BaseRenderer): def _renderJsonBulletList(self, doc: Document, list_data: Dict[str, Any], styles: Dict[str, Any]) -> None: """Render a JSON bullet list to DOCX using AI-generated styles - OPTIMIZED for performance.""" try: - # Extract from nested content structure content = list_data.get("content", {}) if not isinstance(content, dict): return items = content.get("items", []) bullet_style = styles.get("bullet_list", {}) - - # Pre-calculate and cache style objects to avoid repeated parsing - font_size_pt = None + + font_size_pt = Pt(bullet_style["font_size"]) if bullet_style.get("font_size") else None text_color_rgb = None - if bullet_style: - if "font_size" in bullet_style: - font_size_pt = Pt(bullet_style["font_size"]) - if "color" in bullet_style: - color_hex = bullet_style["color"].lstrip('#') - text_color_rgb = RGBColor(int(color_hex[0:2], 16), int(color_hex[2:4], 16), int(color_hex[4:6], 16)) - + if bullet_style.get("color"): + color_hex = bullet_style["color"].lstrip('#') + text_color_rgb = RGBColor(int(color_hex[0:2], 16), int(color_hex[2:4], 16), int(color_hex[4:6], 
16)) + for item in items: - itemText = item if isinstance(item, str) else (item.get("text", "") if isinstance(item, dict) else "") - if not itemText: + itemRuns = self._inlineRunsForListItem(item) + if not itemRuns or not any(r.get("value") for r in itemRuns): continue para = doc.add_paragraph(style='List Bullet') - self._addMarkdownInlineRuns(para, itemText) - - # Apply bullet list styling from style set - use cached objects - if bullet_style and para.runs: - # Use direct access instead of iterating - if len(para.runs) > 0: - run = para.runs[0] - if font_size_pt: - run.font.size = font_size_pt - if text_color_rgb: - run.font.color.rgb = text_color_rgb - else: - # Create run if none exists - run = para.add_run() - if font_size_pt: - run.font.size = font_size_pt - if text_color_rgb: - run.font.color.rgb = text_color_rgb - + isNewRunFormat = isinstance(item, list) + if isNewRunFormat: + self._renderInlineRuns(itemRuns, para, styles) + else: + itemText = "".join(r.get("value", "") for r in itemRuns) + self._addMarkdownInlineRuns(para, itemText) + + if bullet_style and para.runs and len(para.runs) > 0: + run = para.runs[0] + if font_size_pt: + run.font.size = font_size_pt + if text_color_rgb: + run.font.color.rgb = text_color_rgb + except Exception as e: self.logger.warning(f"Error rendering bullet list: {str(e)}") @@ -905,90 +933,79 @@ class RendererDocx(BaseRenderer): def _renderJsonParagraph(self, doc: Document, paragraph_data: Dict[str, Any], styles: Dict[str, Any]) -> None: """Render a JSON paragraph to DOCX using AI-generated styles.""" try: - # Extract from nested content structure content = paragraph_data.get("content", {}) if isinstance(content, dict): - text = content.get("text", "") + inlineRuns = self._inlineRunsFromContent(content) elif isinstance(content, str): - text = content + inlineRuns = [{"type": "text", "value": content}] else: - text = "" - - # CRITICAL: Prevent rendering base64 image data as text - # Base64 image data typically starts with /9j/ 
(JPEG) or iVBORw0KGgo (PNG) - if text and (text.startswith("/9j/") or text.startswith("iVBORw0KGgo") or - (len(text) > 100 and all(c in "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=" for c in text[:100]))): - # This looks like base64 data - don't render as text - self.logger.warning(f"Skipping rendering of what appears to be base64 data in paragraph (length: {len(text)})") + inlineRuns = [] + + if not inlineRuns: + return + + plainText = "".join(r.get("value", "") for r in inlineRuns) + if plainText and (plainText.startswith("/9j/") or plainText.startswith("iVBORw0KGgo") or + (len(plainText) > 100 and all(c in "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=" for c in plainText[:100]))): + self.logger.warning(f"Skipping rendering of what appears to be base64 data in paragraph (length: {len(plainText)})") para = doc.add_paragraph("[Error: Image data found in text content - image embedding may have failed]") if para.runs: - para.runs[0].font.color.rgb = RGBColor(255, 0, 0) # Red color for error + para.runs[0].font.color.rgb = RGBColor(255, 0, 0) return - - if text: - para = doc.add_paragraph() - self._addMarkdownInlineRuns(para, text) - paragraph_style = styles.get("paragraph", {}) - if paragraph_style: - # Pre-calculate and cache style objects - font_size_pt = None - text_color_rgb = None - if "font_size" in paragraph_style: - font_size_pt = Pt(paragraph_style["font_size"]) - if "color" in paragraph_style: - color_hex = paragraph_style["color"].lstrip('#') - text_color_rgb = RGBColor(int(color_hex[0:2], 16), int(color_hex[2:4], 16), int(color_hex[4:6], 16)) - bold = paragraph_style.get("bold", False) - - # Use direct access instead of iterating - if len(para.runs) > 0: - run = para.runs[0] - if font_size_pt: - run.font.size = font_size_pt - run.font.bold = bold - if text_color_rgb: - run.font.color.rgb = text_color_rgb + + para = doc.add_paragraph() + hasNewRuns = content.get("inlineRuns") if isinstance(content, dict) else None 
+ if hasNewRuns: + self._renderInlineRuns(inlineRuns, para, styles) + else: + self._addMarkdownInlineRuns(para, plainText) + + paragraph_style = styles.get("paragraph", {}) + if paragraph_style: + font_size_pt = Pt(paragraph_style["font_size"]) if "font_size" in paragraph_style else None + text_color_rgb = None + if "color" in paragraph_style: + color_hex = paragraph_style["color"].lstrip('#') + text_color_rgb = RGBColor(int(color_hex[0:2], 16), int(color_hex[2:4], 16), int(color_hex[4:6], 16)) + bold = paragraph_style.get("bold", False) + if len(para.runs) > 0: + run = para.runs[0] + if font_size_pt: + run.font.size = font_size_pt + run.font.bold = bold + if text_color_rgb: + run.font.color.rgb = text_color_rgb + if "align" in paragraph_style: + align = paragraph_style["align"] + if align == "center": + para.alignment = WD_ALIGN_PARAGRAPH.CENTER + elif align == "right": + para.alignment = WD_ALIGN_PARAGRAPH.RIGHT else: - # Create run if none exists - run = para.add_run() - if font_size_pt: - run.font.size = font_size_pt - run.font.bold = bold - if text_color_rgb: - run.font.color.rgb = text_color_rgb - - if "align" in paragraph_style: - align = paragraph_style["align"] - if align == "center": - para.alignment = WD_ALIGN_PARAGRAPH.CENTER - elif align == "right": - para.alignment = WD_ALIGN_PARAGRAPH.RIGHT - else: - para.alignment = WD_ALIGN_PARAGRAPH.LEFT - + para.alignment = WD_ALIGN_PARAGRAPH.LEFT + except Exception as e: self.logger.warning(f"Error rendering paragraph: {str(e)}") def _renderJsonCodeBlock(self, doc: Document, code_data: Dict[str, Any], styles: Dict[str, Any]) -> None: """Render a JSON code block to DOCX using AI-generated styles.""" try: - # Extract from nested content structure content = code_data.get("content", {}) if not isinstance(content, dict): return code = content.get("code", "") language = content.get("language", "") code_style = styles.get("code_block", {}) - + us = getattr(self, '_unifiedStyle', None) + if code: if language: lang_para 
= doc.add_paragraph(f"Code ({language}):") if len(lang_para.runs) > 0: lang_para.runs[0].bold = True - - # Pre-calculate and cache style objects - code_font_name = code_style.get("font", "Courier New") - code_font_size_pt = Pt(code_style.get("font_size", 9)) + + code_font_name = code_style.get("font", us["fonts"]["monospace"] if us else "Courier New") + code_font_size_pt = Pt(code_style.get("font_size", us["codeBlock"]["fontSizePt"] if us else 9)) code_text_color_rgb = None if "color" in code_style: color_hex = code_style["color"].lstrip('#') diff --git a/modules/serviceCenter/services/serviceGeneration/renderers/rendererHtml.py b/modules/serviceCenter/services/serviceGeneration/renderers/rendererHtml.py index 58143ac2..b39efd50 100644 --- a/modules/serviceCenter/services/serviceGeneration/renderers/rendererHtml.py +++ b/modules/serviceCenter/services/serviceGeneration/renderers/rendererHtml.py @@ -40,7 +40,7 @@ class RendererHtml(BaseRenderer): from modules.datamodels.datamodelJson import supportedSectionTypes return list(supportedSectionTypes) - async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]: + async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]: """ Render HTML document with images as separate files. Returns list of documents: [HTML document, image1, image2, ...] 
@@ -54,7 +54,7 @@ class RendererHtml(BaseRenderer): self._renderedImages = images # Generate HTML using AI-analyzed styling - htmlContent = await self._generateHtmlFromJson(extractedContent, title, userPrompt, aiService) + htmlContent = await self._generateHtmlFromJson(extractedContent, title, userPrompt, aiService, style=style) # Replace base64 data URIs with relative file paths if images exist if images: @@ -107,11 +107,16 @@ class RendererHtml(BaseRenderer): return resultDocuments - async def _generateHtmlFromJson(self, jsonContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> str: + async def _generateHtmlFromJson(self, jsonContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> str: """Generate HTML content from structured JSON document using AI-generated styling.""" try: - # Get style set: use styles from metadata if available, otherwise enhance with AI - styles = await self._getStyleSet(jsonContent, userPrompt, aiService) + # Use unified style when provided, otherwise fall back to existing flow + if style: + styles = self._convertUnifiedStyleToInternal(style) + self._unifiedStyle = style + else: + styles = await self._getStyleSet(jsonContent, userPrompt, aiService) + self._unifiedStyle = None # Validate JSON structure if not self._validateJsonStructure(jsonContent): @@ -272,6 +277,10 @@ class RendererHtml(BaseRenderer): def _generateCssStyles(self, styles: Dict[str, Any]) -> str: """Generate CSS from style definitions.""" + # When unified style is available, generate CSS directly from it + if getattr(self, "_unifiedStyle", None): + return self._generateCssFromUnifiedStyle(self._unifiedStyle) + css_parts = [] # Body styles @@ -368,6 +377,164 @@ class RendererHtml(BaseRenderer): return '\n'.join(css_parts) + def _generateCssFromUnifiedStyle(self, style: Dict[str, Any]) -> str: + """Generate CSS directly from unified style dict.""" + fonts = style.get("fonts", {}) + colors = 
style.get("colors", {}) + headings = style.get("headings", {}) + para = style.get("paragraph", {}) + tbl = style.get("table", {}) + lst = style.get("list", {}) + cb = style.get("codeBlock", {}) + page = style.get("page", {}) + + primaryFont = fonts.get("primary", "Arial, sans-serif") + monoFont = fonts.get("monospace", "Courier New, monospace") + bgColor = colors.get("background", "#FFFFFF") + primaryColor = colors.get("primary", "#1F3864") + paraColor = para.get("color", "#333333") + paraSizePt = para.get("sizePt", 11) + lineSpacing = para.get("lineSpacing", 1.15) + + css_parts = [] + + # Body + css_parts.append("body {") + css_parts.append(f" font-family: {primaryFont};") + css_parts.append(f" background: {bgColor};") + css_parts.append(f" color: {paraColor};") + css_parts.append(f" font-size: {paraSizePt}pt;") + css_parts.append(f" line-height: {lineSpacing};") + margins = page.get("marginsPt", {}) + if margins: + css_parts.append(f" margin: {margins.get('top', 60)}pt {margins.get('right', 60)}pt {margins.get('bottom', 60)}pt {margins.get('left', 60)}pt;") + else: + css_parts.append(" margin: 0; padding: 20px;") + css_parts.append("}") + + # Document title (uses h1 style) + h1 = headings.get("h1", {}) + css_parts.append(".document-title {") + css_parts.append(f" font-size: {h1.get('sizePt', 24)}pt;") + css_parts.append(f" color: {h1.get('color', primaryColor)};") + css_parts.append(f" font-weight: {h1.get('weight', 'bold')};") + css_parts.append(" margin: 0 0 1em 0;") + css_parts.append("}") + + # Headings h1-h4 + for level in range(1, 5): + key = f"h{level}" + h = headings.get(key, h1 if level == 1 else headings.get(f"h{level-1}", {})) + css_parts.append(f"h{level} {{") + css_parts.append(f" font-size: {h.get('sizePt', max(24 - (level-1)*4, 12))}pt;") + css_parts.append(f" color: {h.get('color', primaryColor)};") + css_parts.append(f" font-weight: {h.get('weight', 'bold')};") + css_parts.append(f" margin: 1.2em 0 0.4em 0;") + css_parts.append("}") + + # 
Paragraphs + css_parts.append("p {") + css_parts.append(f" font-size: {paraSizePt}pt;") + css_parts.append(f" color: {paraColor};") + css_parts.append(f" line-height: {lineSpacing};") + css_parts.append(" margin: 0 0 1em 0;") + css_parts.append("}") + + # Tables + borderColor = tbl.get("borderColor", "#DEE2E6") + css_parts.append("table {") + css_parts.append(f" border-collapse: collapse;") + css_parts.append(f" width: 100%;") + css_parts.append(f" margin: 1em 0;") + css_parts.append(f" border: 1px solid {borderColor};") + css_parts.append("}") + + # Table headers + css_parts.append("th {") + css_parts.append(f" background: {tbl.get('headerBg', '#1F3864')};") + css_parts.append(f" color: {tbl.get('headerFg', '#FFFFFF')};") + css_parts.append(" font-weight: bold;") + css_parts.append(" text-align: center;") + css_parts.append(f" padding: 10px;") + css_parts.append(f" border: 1px solid {borderColor};") + css_parts.append("}") + + # Table cells + css_parts.append("td {") + css_parts.append(f" color: {paraColor};") + css_parts.append(" padding: 8px;") + css_parts.append(f" border: 1px solid {borderColor};") + css_parts.append("}") + + # Lists + css_parts.append("ul {") + css_parts.append(f" font-size: {lst.get('sizePt', paraSizePt)}pt;") + css_parts.append(f" color: {paraColor};") + css_parts.append(f" padding-left: {lst.get('indentPt', 18)}pt;") + css_parts.append(" margin: 0 0 1em 0;") + css_parts.append("}") + + # Code blocks + css_parts.append("pre {") + css_parts.append(f" font-family: {monoFont};") + css_parts.append(f" font-size: {cb.get('fontSizePt', 9)}pt;") + css_parts.append(f" color: {paraColor};") + css_parts.append(f" background: {cb.get('background', '#F8F9FA')};") + css_parts.append(f" border: 1px solid {cb.get('borderColor', '#E2E8F0')};") + css_parts.append(" border-radius: 4px;") + css_parts.append(" padding: 1em;") + css_parts.append(" margin: 1em 0;") + css_parts.append(" overflow-x: auto;") + css_parts.append("}") + + # Images + 
css_parts.append("img {") + css_parts.append(" max-width: 100%;") + css_parts.append(" height: auto;") + css_parts.append(" margin: 1em 0;") + css_parts.append(" border-radius: 4px;") + css_parts.append("}") + + # Generated info + css_parts.append(".generated-info {") + css_parts.append(" font-size: 0.9em;") + css_parts.append(" color: #666;") + css_parts.append(" text-align: center;") + css_parts.append(" margin-top: 2em;") + css_parts.append(" padding-top: 1em;") + css_parts.append(" border-top: 1px solid #ddd;") + css_parts.append("}") + + return '\n'.join(css_parts) + + def _renderInlineRuns(self, runs: list) -> str: + """Convert inline runs to HTML markup.""" + import html as htmlLib + parts = [] + for run in runs: + runType = run.get("type", "text") + value = htmlLib.escape(run.get("value", "")) + if runType == "text": + parts.append(value) + elif runType == "bold": + parts.append(f"{value}") + elif runType == "italic": + parts.append(f"{value}") + elif runType == "code": + parts.append(f"{value}") + elif runType == "link": + href = htmlLib.escape(run.get("href", "")) + parts.append(f'{value}') + elif runType == "image": + b64 = run.get("base64Data", "") + mime = run.get("mimeType", "image/png") + alt = value + if b64: + parts.append(f'{alt}') + else: + parts.append(value) + return "".join(parts) + def _renderJsonSection(self, section: Dict[str, Any], styles: Dict[str, Any]) -> str: """Render a single JSON section to HTML using AI-generated styles. Supports three content formats: reference, object (base64), extracted_text. @@ -419,6 +586,11 @@ class RendererHtml(BaseRenderer): # Regular paragraph element - extract from nested content structure (standard JSON format) content = element.get("content", {}) if isinstance(content, dict): + # New format: inlineRuns + inlineRuns = content.get("inlineRuns") + if inlineRuns and isinstance(inlineRuns, list): + htmlParts.append(f'

{self._renderInlineRuns(inlineRuns)}

') + continue text = content.get("text", "") elif isinstance(content, str): text = content @@ -495,7 +667,8 @@ class RendererHtml(BaseRenderer): # Table header htmlParts.append('') for header in headers: - htmlParts.append(f'{header}') + runs = self._inlineRunsForCell(header) + htmlParts.append(f'{self._renderInlineRuns(runs)}') htmlParts.append('') # Table body @@ -503,7 +676,8 @@ class RendererHtml(BaseRenderer): for row in rows: htmlParts.append('') for cellData in row: - htmlParts.append(f'{cellData}') + runs = self._inlineRunsForCell(cellData) + htmlParts.append(f'{self._renderInlineRuns(runs)}') htmlParts.append('') htmlParts.append('') @@ -528,10 +702,8 @@ class RendererHtml(BaseRenderer): htmlParts = ['
    '] for item in items: - if isinstance(item, str): - htmlParts.append(f'
  • {item}
  • ') - elif isinstance(item, dict) and "text" in item: - htmlParts.append(f'
  • {item["text"]}
  • ') + runs = self._inlineRunsForListItem(item) + htmlParts.append(f'
  • {self._renderInlineRuns(runs)}
  • ') htmlParts.append('
') return '\n'.join(htmlParts) @@ -571,6 +743,11 @@ class RendererHtml(BaseRenderer): if isinstance(el, dict): content = el.get("content", {}) if isinstance(content, dict): + # New format: inlineRuns + inlineRuns = content.get("inlineRuns") + if inlineRuns and isinstance(inlineRuns, list): + texts.append(self._renderInlineRuns(inlineRuns)) + continue text = content.get("text", "") elif isinstance(content, str): text = content @@ -581,16 +758,18 @@ class RendererHtml(BaseRenderer): elif isinstance(el, str): texts.append(el) if texts: - # Join multiple paragraphs with

tags return '\n'.join(f'

{text}

' for text in texts) return "" elif isinstance(paragraphData, str): return f'

{paragraphData}

' elif isinstance(paragraphData, dict): - # Handle nested content structure: element.content vs element.text # Extract from nested content structure content = paragraphData.get("content", {}) if isinstance(content, dict): + # New format: inlineRuns + inlineRuns = content.get("inlineRuns") + if inlineRuns and isinstance(inlineRuns, list): + return f'

{self._renderInlineRuns(inlineRuns)}

' text = content.get("text", "") elif isinstance(content, str): text = content diff --git a/modules/serviceCenter/services/serviceGeneration/renderers/rendererPdf.py b/modules/serviceCenter/services/serviceGeneration/renderers/rendererPdf.py index df2aff10..31537980 100644 --- a/modules/serviceCenter/services/serviceGeneration/renderers/rendererPdf.py +++ b/modules/serviceCenter/services/serviceGeneration/renderers/rendererPdf.py @@ -106,17 +106,17 @@ class RendererPdf(BaseRenderer): from modules.datamodels.datamodelJson import supportedSectionTypes return list(supportedSectionTypes) - async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]: + async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]: """Render extracted JSON content to PDF format using AI-analyzed styling.""" try: if not REPORTLAB_AVAILABLE: # Fallback to HTML if reportlab not available from .rendererHtml import RendererHtml html_renderer = RendererHtml() - return await html_renderer.render(extractedContent, title, userPrompt, aiService) + return await html_renderer.render(extractedContent, title, userPrompt, aiService, style=style) # Generate PDF using AI-analyzed styling - pdf_content = await self._generatePdfFromJson(extractedContent, title, userPrompt, aiService) + pdf_content = await self._generatePdfFromJson(extractedContent, title, userPrompt, aiService, unifiedStyle=style) # Extract metadata for document type and other info metadata = extractedContent.get("metadata", {}) if extractedContent else {} @@ -163,11 +163,28 @@ class RendererPdf(BaseRenderer): ) ] - async def _generatePdfFromJson(self, json_content: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> str: + async def _generatePdfFromJson(self, json_content: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, 
unifiedStyle: Dict[str, Any] = None) -> str: """Generate PDF content from structured JSON document using AI-generated styling.""" try: - # Get style set: use styles from metadata if available, otherwise enhance with AI - styles = await self._getStyleSet(json_content, userPrompt, aiService) + # Get style set from unified style or legacy approach + if unifiedStyle: + styles = self._convertUnifiedStyleToInternal(unifiedStyle) + self._unifiedStyle = unifiedStyle + for level in range(1, 7): + hKey = f"heading{level}" + if hKey not in styles: + styles[hKey] = self._defaultHeadingStyleDef(level) + else: + styles[hKey].setdefault("space_after", 12) + styles[hKey].setdefault("space_before", 12) + styles["paragraph"].setdefault("space_after", 6) + styles["paragraph"].setdefault("line_height", unifiedStyle["paragraph"].get("lineSpacing", 1.2)) + styles["bullet_list"].setdefault("space_after", 3) + styles["code_block"].setdefault("space_after", 6) + styles["code_block"].setdefault("align", "left") + else: + styles = await self._getStyleSet(json_content, userPrompt, aiService) + self._unifiedStyle = None # Validate JSON structure if not self._validateJsonStructure(json_content): @@ -179,15 +196,13 @@ class RendererPdf(BaseRenderer): # Create a buffer to hold the PDF buffer = io.BytesIO() - # Create PDF document - doc = SimpleDocTemplate( - buffer, - pagesize=A4, - rightMargin=72, - leftMargin=72, - topMargin=72, - bottomMargin=18 - ) + # Create PDF document with unified page margins or defaults + pageCfg = unifiedStyle["page"] if unifiedStyle else None + if pageCfg: + m = pageCfg["marginsPt"] + doc = SimpleDocTemplate(buffer, pagesize=A4, rightMargin=m["right"], leftMargin=m["left"], topMargin=m["top"], bottomMargin=m["bottom"]) + else: + doc = SimpleDocTemplate(buffer, pagesize=A4, rightMargin=72, leftMargin=72, topMargin=72, bottomMargin=18) # Build PDF content (no cover page — body starts on page 1; filename still uses `title`) story = [] @@ -609,6 +624,31 @@ class 
RendererPdf(BaseRenderer): .replace(">", ">") ) + def _renderInlineRunsToPdfXml(self, runs: list) -> str: + """Convert inline runs to ReportLab Paragraph XML.""" + parts = [] + us = getattr(self, '_unifiedStyle', None) + monoFont = us["fonts"]["monospace"] if us else "Courier" + for run in runs: + runType = run.get("type", "text") + value = self._escapeReportlabXml(run.get("value", "")) + if runType == "text": + parts.append(value) + elif runType == "bold": + parts.append(f"{value}") + elif runType == "italic": + parts.append(f"{value}") + elif runType == "code": + parts.append(f'{value}') + elif runType == "link": + href = self._escapeReportlabXml(run.get("href", "")) + parts.append(f'{value}') + elif runType == "image": + parts.append(f"[Image: {value}]") + else: + parts.append(value) + return "".join(parts) + def _applyInlineMarkdownToEscapedPlain(self, text: str) -> str: """Escape XML then apply bold/italic to a segment with no `code` spans (code is handled separately).""" if not text: @@ -744,10 +784,10 @@ class RendererPdf(BaseRenderer): return [] headers = content.get("headers", []) rows = content.get("rows", []) - + if not headers or not rows: return [] - + numCols = len(headers) colWidth = _PDF_CONTENT_WIDTH_PT / max(numCols, 1) colWidths = [colWidth] * numCols @@ -755,8 +795,12 @@ class RendererPdf(BaseRenderer): hdrPs = self._createTableCellParagraphStyle(styles, header=True, tableStyleKey="table_header") cellPs = self._createTableCellParagraphStyle(styles, header=False, tableStyleKey="table_cell") - def _cellPara(val, ps): - return self._paragraphFromInlineMarkdown(str(val) if val is not None else "", ps) + def _cellPara(cell, ps): + runs = self._inlineRunsForCell(cell) + if isinstance(cell, list): + xml = self._renderInlineRunsToPdfXml(runs) + return Paragraph(_wrapEmojiSpansInXml(xml), ps) + return self._paragraphFromInlineMarkdown(str(cell) if cell is not None else "", ps) headerRow = [_cellPara(h, hdrPs) for h in headers] bodyRows = [] @@ -786,7 
+830,7 @@ class RendererPdf(BaseRenderer): ] table.setStyle(TableStyle(table_style)) return [table, Spacer(1, 12)] - + except Exception as e: self.logger.warning(f"Error rendering table: {str(e)}") return [] @@ -794,32 +838,29 @@ class RendererPdf(BaseRenderer): def _renderJsonBulletList(self, list_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]: """Render a JSON bullet list to PDF elements using AI-generated styles.""" try: - # Extract from nested content structure content = list_data.get("content", {}) if not isinstance(content, dict): return [] items = content.get("items", []) - bullet_style_def = styles.get("bullet_list", {}) - + bulletStyleDef = styles.get("bullet_list", {}) + normalStyle = self._createNormalStyle(styles) + elements = [] for item in items: - if isinstance(item, str): - elements.append( - Paragraph(f"• {self._markdownInlineToReportlabXml(item)}", self._createNormalStyle(styles)) - ) + runs = self._inlineRunsForListItem(item) + if isinstance(item, list): + xml = self._renderInlineRunsToPdfXml(runs) + elements.append(Paragraph(f"\u2022 {_wrapEmojiSpansInXml(xml)}", normalStyle)) + elif isinstance(item, str): + elements.append(Paragraph(f"\u2022 {self._markdownInlineToReportlabXml(item)}", normalStyle)) elif isinstance(item, dict) and "text" in item: - elements.append( - Paragraph( - f"• {self._markdownInlineToReportlabXml(item['text'])}", - self._createNormalStyle(styles), - ) - ) - + elements.append(Paragraph(f"\u2022 {self._markdownInlineToReportlabXml(item['text'])}", normalStyle)) + if elements: - elements.append(Spacer(1, bullet_style_def.get("space_after", 3))) - + elements.append(Spacer(1, bulletStyleDef.get("space_after", 3))) + return elements - + except Exception as e: self.logger.warning(f"Error rendering bullet list: {str(e)}") return [] @@ -848,20 +889,27 @@ class RendererPdf(BaseRenderer): def _renderJsonParagraph(self, paragraph_data: Dict[str, Any], styles: Dict[str, Any]) -> List[Any]: """Render a JSON paragraph to PDF 
elements using AI-generated styles.""" try: - # Extract from nested content structure content = paragraph_data.get("content", {}) - if isinstance(content, dict): - text = content.get("text", "") - elif isinstance(content, str): - text = content - else: - text = "" - + if isinstance(content, str): + content = {"text": content} + if not isinstance(content, dict): + return [] + + normalStyle = self._createNormalStyle(styles) + + if "inlineRuns" in content: + runs = self._inlineRunsFromContent(content) + xml = self._renderInlineRunsToPdfXml(runs) + if xml: + return [Paragraph(_wrapEmojiSpansInXml(xml), normalStyle)] + return [] + + text = content.get("text", "") if text: - return [self._paragraphFromInlineMarkdown(text, self._createNormalStyle(styles))] - + return [self._paragraphFromInlineMarkdown(text, normalStyle)] + return [] - + except Exception as e: self.logger.warning(f"Error rendering paragraph: {str(e)}") return [] diff --git a/modules/serviceCenter/services/serviceGeneration/renderers/rendererPptx.py b/modules/serviceCenter/services/serviceGeneration/renderers/rendererPptx.py index 3bdff7f1..49ee8048 100644 --- a/modules/serviceCenter/services/serviceGeneration/renderers/rendererPptx.py +++ b/modules/serviceCenter/services/serviceGeneration/renderers/rendererPptx.py @@ -59,7 +59,7 @@ class RendererPptx(BaseRenderer): from modules.datamodels.datamodelJson import supportedSectionTypes return list(supportedSectionTypes) - async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]: + async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]: """ Render content as PowerPoint presentation from JSON data. 
@@ -68,7 +68,7 @@ class RendererPptx(BaseRenderer): title: Title for the presentation userPrompt: User prompt for AI styling aiService: AI service for styling - **kwargs: Additional rendering options + style: Unified style dict from pipeline (preferred over AI-generated styles) Returns: Base64-encoded PowerPoint presentation as string @@ -81,8 +81,19 @@ class RendererPptx(BaseRenderer): from pptx.dml.color import RGBColor import re - # Get style set: use styles from metadata if available, otherwise enhance with AI - styles = await self._getStyleSet(extractedContent, userPrompt, aiService) + # Get style set: prefer unified style, then metadata, then AI-enhanced + if style: + internalStyle = self._convertUnifiedStyleToInternal(style) + defaultPptx = self._getDefaultStyleSet() + for key in ("slide_size", "content_per_slide", "design_theme", "color_scheme", "background_style", "accent_colors", "professional_grade", "executive_ready"): + internalStyle[key] = defaultPptx.get(key) + internalStyle["heading"] = internalStyle["heading1"] + internalStyle["subheading"] = internalStyle["heading2"] + styles = internalStyle + self._unifiedStyle = style + else: + styles = await self._getStyleSet(extractedContent, userPrompt, aiService) + self._unifiedStyle = None # Create new presentation prs = Presentation() @@ -910,15 +921,17 @@ JSON ONLY. NO OTHER TEXT.""" # Extract from nested content structure content = paragraph_data.get("content", {}) if isinstance(content, dict): - text = content.get("text", "") + if content.get("inlineRuns"): + text = "".join(r.get("value", "") for r in content["inlineRuns"]) + else: + text = content.get("text", "") elif isinstance(content, str): text = content else: text = "" if text: - # Limit paragraph length based on content density - max_length = 200 # Default limit + max_length = 200 if len(text) > max_length: text = text[:max_length] + "..." @@ -1303,6 +1316,32 @@ JSON ONLY. 
NO OTHER TEXT.""" r.text = text[pos:] _applyBase(r) + def _renderInlineRunsPptx(self, runs, paragraph, fontSize=None, fontColor=None): + """Process InlineRun dicts into pptx text runs.""" + from pptx.util import Pt + paragraph.text = "" + us = getattr(self, '_unifiedStyle', None) + monoFont = us["fonts"]["monospace"] if us else "Courier New" + for run in runs: + runType = run.get("type", "text") + value = run.get("value", "") + r = paragraph.add_run() + r.text = value + if fontSize: + r.font.size = fontSize + if fontColor: + r.font.color.rgb = fontColor + if runType == "bold": + r.font.bold = True + elif runType == "italic": + r.font.italic = True + elif runType == "code": + r.font.name = monoFont + if fontSize and hasattr(fontSize, 'pt'): + r.font.size = Pt(max(8, int(fontSize.pt * 0.85))) + elif runType == "link": + r.font.underline = True + def _addTableToSlide(self, slide, element: Dict[str, Any], styles: Dict[str, Any], top: float = None, max_width: float = None) -> None: """Add a PowerPoint table to slide.""" try: @@ -1374,7 +1413,8 @@ JSON ONLY. NO OTHER TEXT.""" cell = table.cell(0, col_idx) # Clear existing text and set new text cell.text_frame.clear() - header_text = str(header) if header else "" + cellRuns = self._inlineRunsForCell(header) + header_text = "".join(r.get("value", "") for r in cellRuns) cell.text = header_text # Ensure paragraph exists @@ -1420,7 +1460,8 @@ JSON ONLY. NO OTHER TEXT.""" cell = table.cell(row_idx, col_idx) # Clear existing text and set new text cell.text_frame.clear() - cell_text = str(cell_data) if cell_data is not None else "" + cellRuns = self._inlineRunsForCell(cell_data) + cell_text = "".join(r.get("value", "") for r in cellRuns) cell.text = cell_text # Ensure paragraph exists @@ -1462,9 +1503,8 @@ JSON ONLY. 
NO OTHER TEXT.""" fontColor = RGBColor(*self._getSafeColor(listStyle.get("color", (47, 47, 47)))) for item in items: - itemText = item.get("text", "") if isinstance(item, dict) else str(item) - if not itemText or not itemText.strip(): - continue + runs = self._inlineRunsForListItem(item) + isNewFormat = isinstance(item, list) p = text_frame.add_paragraph() p.level = 0 @@ -1472,21 +1512,33 @@ JSON ONLY. NO OTHER TEXT.""" p.space_before = Pt(2) p.space_after = Pt(2) - # Consistent bullet prefix - self._addMarkdownInlineRuns(p, f" • {itemText}", fontSize=fontSize, fontColor=fontColor, fontBold=False) + if isNewFormat: + bulletRuns = [{"type": "text", "value": " \u2022 "}] + runs + self._renderInlineRunsPptx(bulletRuns, p, fontSize=fontSize, fontColor=fontColor) + else: + itemText = item.get("text", "") if isinstance(item, dict) else str(item) + if not itemText or not itemText.strip(): + continue + self._addMarkdownInlineRuns(p, f" \u2022 {itemText}", fontSize=fontSize, fontColor=fontColor, fontBold=False) - # Subitems + # Subitems (only for dict-style items) if isinstance(item, dict): for sub in item.get("subitems", []): - subText = sub.get("text", "") if isinstance(sub, dict) else str(sub) - if not subText: - continue + subRuns = self._inlineRunsForListItem(sub) + isSubNew = isinstance(sub, list) sp = text_frame.add_paragraph() sp.level = 0 sp.alignment = PP_ALIGN.LEFT sp.space_before = Pt(1) sp.space_after = Pt(1) - self._addMarkdownInlineRuns(sp, f" – {subText}", fontSize=fontSize, fontColor=fontColor, fontBold=False) + if isSubNew: + subBulletRuns = [{"type": "text", "value": " \u2013 "}] + subRuns + self._renderInlineRunsPptx(subBulletRuns, sp, fontSize=fontSize, fontColor=fontColor) + else: + subText = sub.get("text", "") if isinstance(sub, dict) else str(sub) + if not subText: + continue + self._addMarkdownInlineRuns(sp, f" \u2013 {subText}", fontSize=fontSize, fontColor=fontColor, fontBold=False) except Exception as e: logger.warning(f"Error adding bullet list 
to slide: {str(e)}") @@ -1540,42 +1592,53 @@ JSON ONLY. NO OTHER TEXT.""" # Extract from nested content structure content = element.get("content", {}) if isinstance(content, dict): + inlineRuns = self._inlineRunsFromContent(content) + hasInlineRuns = content.get("inlineRuns") is not None text = content.get("text", "") elif isinstance(content, str): text = content + inlineRuns = [{"type": "text", "value": text}] if text else [] + hasInlineRuns = False else: text = "" + inlineRuns = [] + hasInlineRuns = False - if text: - p = text_frame.add_paragraph() - p.level = 0 - - try: - if hasattr(p, 'paragraph_format'): - p.paragraph_format.bullet.type = None - except (AttributeError, TypeError): - pass - - paragraph_style = styles.get("paragraph", {}) - base_font_size = paragraph_style.get("font_size", 14) - calculated_size = max(10, int(base_font_size * font_size_multiplier)) - fSize = Pt(calculated_size) - fColor = RGBColor(*self._getSafeColor(paragraph_style.get("color", (47, 47, 47)))) - fBold = paragraph_style.get("bold", False) + if not inlineRuns and not text: + return + + p = text_frame.add_paragraph() + p.level = 0 + + try: + if hasattr(p, 'paragraph_format'): + p.paragraph_format.bullet.type = None + except (AttributeError, TypeError): + pass + + paragraph_style = styles.get("paragraph", {}) + base_font_size = paragraph_style.get("font_size", 14) + calculated_size = max(10, int(base_font_size * font_size_multiplier)) + fSize = Pt(calculated_size) + fColor = RGBColor(*self._getSafeColor(paragraph_style.get("color", (47, 47, 47)))) + fBold = paragraph_style.get("bold", False) + + if hasInlineRuns: + self._renderInlineRunsPptx(inlineRuns, p, fontSize=fSize, fontColor=fColor) + else: self._addMarkdownInlineRuns(p, text, fontSize=fSize, fontColor=fColor, fontBold=fBold) - - # Add proper spacing - p.space_before = Pt(6) # Space before paragraph - p.space_after = Pt(6) # Space after paragraph - p.line_spacing = 1.2 # Line spacing for readability - - align = 
paragraph_style.get("align", "left") - if align == "center": - p.alignment = PP_ALIGN.CENTER - elif align == "right": - p.alignment = PP_ALIGN.RIGHT - else: - p.alignment = PP_ALIGN.LEFT + + p.space_before = Pt(6) + p.space_after = Pt(6) + p.line_spacing = 1.2 + + align = paragraph_style.get("align", "left") + if align == "center": + p.alignment = PP_ALIGN.CENTER + elif align == "right": + p.alignment = PP_ALIGN.RIGHT + else: + p.alignment = PP_ALIGN.LEFT except Exception as e: logger.warning(f"Error adding paragraph to slide: {str(e)}") diff --git a/modules/serviceCenter/services/serviceGeneration/renderers/rendererXlsx.py b/modules/serviceCenter/services/serviceGeneration/renderers/rendererXlsx.py index 79f5688c..3c6fdd5e 100644 --- a/modules/serviceCenter/services/serviceGeneration/renderers/rendererXlsx.py +++ b/modules/serviceCenter/services/serviceGeneration/renderers/rendererXlsx.py @@ -68,17 +68,17 @@ class RendererXlsx(BaseRenderer): from modules.datamodels.datamodelJson import supportedSectionTypes return list(supportedSectionTypes) - async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]: + async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]: """Render extracted JSON content to Excel format using AI-analyzed styling.""" try: if not OPENPYXL_AVAILABLE: # Fallback to CSV if openpyxl not available from .rendererCsv import RendererCsv csvRenderer = RendererCsv() - return await csvRenderer.render(extractedContent, title, userPrompt, aiService) + return await csvRenderer.render(extractedContent, title, userPrompt, aiService, style=style) # Generate Excel using AI-analyzed styling - excelContent = await self._generateExcelFromJson(extractedContent, title, userPrompt, aiService) + excelContent = await self._generateExcelFromJson(extractedContent, title, userPrompt, 
aiService, style=style) # Extract metadata for document type and other info metadata = extractedContent.get("metadata", {}) if extractedContent else {} @@ -298,15 +298,22 @@ class RendererXlsx(BaseRenderer): except Exception as e: self.logger.warning(f"Could not populate analysis sheet: {str(e)}") - async def _generateExcelFromJson(self, jsonContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> str: + async def _generateExcelFromJson(self, jsonContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> str: """Generate Excel content from structured JSON document using AI-generated styling.""" try: # Debug output self.services.utils.debugLogToFile(f"EXCEL JSON CONTENT TYPE: {type(jsonContent)}", "EXCEL_RENDERER") self.services.utils.debugLogToFile(f"EXCEL JSON CONTENT KEYS: {list(jsonContent.keys()) if isinstance(jsonContent, dict) else 'Not a dict'}", "EXCEL_RENDERER") - # Get style set: use styles from metadata if available, otherwise enhance with AI - styles = await self._getStyleSet(jsonContent, userPrompt, aiService) + # Store unified style for use by inline-run helpers + self._unifiedStyle = style + + # Get style set: prefer unified style, fall back to legacy approach + if style: + styles = self._convertUnifiedStyleToInternal(style) + styles = self._convertColorsFormat(styles) + else: + styles = await self._getStyleSet(jsonContent, userPrompt, aiService) # Validate JSON structure (standardized schema: {metadata: {...}, documents: [{sections: [...]}]}) if not self._validateJsonStructure(jsonContent): @@ -511,6 +518,10 @@ class RendererXlsx(BaseRenderer): "code_block": {"font": "Courier New", "font_size": 10, "color": "FF2F2F2F", "background": "FFF5F5F5"} } + def _renderInlineRuns(self, runs: list) -> str: + """Flatten inline runs to plain text for Excel cells.""" + return "".join(r.get("value", "") for r in runs) + async def _getAiStylesWithExcelColors(self, aiService, styleTemplate: 
str, defaultStyles: Dict[str, Any]) -> Dict[str, Any]: """Get AI styles with proper Excel color conversion.""" if not aiService: @@ -1206,7 +1217,9 @@ class RendererXlsx(BaseRenderer): # Add headers with formatting - OPTIMIZED: use cached style objects for col, header in enumerate(headers, 1): - sanitized_header = self._sanitizeCellValue(header) + runs = self._inlineRunsForCell(header) + headerText = self._renderInlineRuns(runs) + sanitized_header = self._sanitizeCellValue(headerText) cell = sheet.cell(row=headerRow, column=col, value=sanitized_header) # Apply styling with fallbacks - use pre-calculated objects @@ -1272,7 +1285,9 @@ class RendererXlsx(BaseRenderer): cell_values = cell_values[:header_count] for col, cell_value in enumerate(cell_values, 1): - sanitized_value = self._sanitizeCellValue(cell_value) + runs = self._inlineRunsForCell(cell_value) + cellText = self._renderInlineRuns(runs) + sanitized_value = self._sanitizeCellValue(cellText) cell = sheet.cell(row=startRow, column=col, value=sanitized_value) # Apply styling with fallbacks - use pre-calculated objects @@ -1311,20 +1326,20 @@ class RendererXlsx(BaseRenderer): def _addListToExcel(self, sheet, element: Dict[str, Any], styles: Dict[str, Any], startRow: int) -> int: """Add a list element to Excel sheet. 
Expects nested content structure.""" try: - # Extract from nested content structure content = element.get("content", {}) if not isinstance(content, dict): return startRow - list_items = content.get("items") or [] - # Ensure list_items is a list - if not isinstance(list_items, list): - list_items = [] + listItems = content.get("items") or [] + if not isinstance(listItems, list): + listItems = [] - list_style = styles.get("bullet_list", {}) - for item in list_items: - sheet.cell(row=startRow, column=1, value=f"• {item}") - if list_style.get("color"): - sheet.cell(row=startRow, column=1).font = Font(color=self._getSafeColor(list_style["color"])) + listStyle = styles.get("bullet_list", {}) + for item in listItems: + runs = self._inlineRunsForListItem(item) + text = self._renderInlineRuns(runs) + sheet.cell(row=startRow, column=1, value=f"\u2022 {text}") + if listStyle.get("color"): + sheet.cell(row=startRow, column=1).font = Font(color=self._getSafeColor(listStyle["color"])) startRow += 1 return startRow @@ -1336,10 +1351,10 @@ class RendererXlsx(BaseRenderer): def _addParagraphToExcel(self, sheet, element: Dict[str, Any], styles: Dict[str, Any], startRow: int) -> int: """Add a paragraph element to Excel sheet. Expects nested content structure.""" try: - # Extract from nested content structure content = element.get("content", {}) if isinstance(content, dict): - text = content.get("text", "") + runs = self._inlineRunsFromContent(content) + text = self._renderInlineRuns(runs) elif isinstance(content, str): text = content else: diff --git a/modules/serviceCenter/services/serviceGeneration/styleDefaults.py b/modules/serviceCenter/services/serviceGeneration/styleDefaults.py new file mode 100644 index 00000000..b5a92641 --- /dev/null +++ b/modules/serviceCenter/services/serviceGeneration/styleDefaults.py @@ -0,0 +1,75 @@ +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. 
+"""Default style definitions and style resolution for document rendering.""" + +from typing import Any, Dict + + +DEFAULT_STYLE: Dict[str, Any] = { + "fonts": { + "primary": "Calibri", + "monospace": "Consolas", + }, + "colors": { + "primary": "#1F3864", + "secondary": "#2C3E50", + "accent": "#2980B9", + "background": "#FFFFFF", + }, + "headings": { + "h1": {"sizePt": 24, "weight": "bold", "color": "#1F3864", "spaceBeforePt": 12, "spaceAfterPt": 6}, + "h2": {"sizePt": 18, "weight": "bold", "color": "#1F3864", "spaceBeforePt": 10, "spaceAfterPt": 4}, + "h3": {"sizePt": 14, "weight": "bold", "color": "#2C3E50", "spaceBeforePt": 8, "spaceAfterPt": 3}, + "h4": {"sizePt": 12, "weight": "bold", "color": "#2C3E50", "spaceBeforePt": 6, "spaceAfterPt": 2}, + }, + "paragraph": {"sizePt": 11, "lineSpacing": 1.15, "color": "#333333"}, + "table": { + "headerBg": "#1F3864", + "headerFg": "#FFFFFF", + "headerSizePt": 10, + "bodySizePt": 10, + "rowBandingEven": "#F2F6FC", + "rowBandingOdd": "#FFFFFF", + "borderColor": "#CBD5E1", + "borderWidthPt": 0.5, + }, + "list": {"bulletChar": "\u2022", "indentPt": 18, "sizePt": 11}, + "image": {"defaultWidthPt": 480, "maxWidthPt": 800, "alignment": "center"}, + "codeBlock": {"fontSizePt": 9, "background": "#F8F9FA", "borderColor": "#E2E8F0"}, + "page": { + "format": "A4", + "marginsPt": {"top": 60, "bottom": 60, "left": 60, "right": 60}, + "showPageNumbers": True, + "headerHeight": 30, + "footerHeight": 30, + "headerLogo": None, + "headerText": "", + "footerText": "", + }, +} + + +def _deepMerge(base: Dict[str, Any], override: Dict[str, Any]) -> Dict[str, Any]: + """Recursively merge override into base. 
Both dicts left unchanged; returns new dict.""" + result = {} + for key in base: + if key in override: + baseVal = base[key] + overVal = override[key] + if isinstance(baseVal, dict) and isinstance(overVal, dict): + result[key] = _deepMerge(baseVal, overVal) + else: + result[key] = overVal + else: + result[key] = base[key] + for key in override: + if key not in base: + result[key] = override[key] + return result + + +def resolveStyle(agentStyle: dict | None) -> Dict[str, Any]: + """Deep-merge DEFAULT_STYLE <- agentStyle. Returns fully resolved style dict.""" + if not agentStyle: + return dict(DEFAULT_STYLE) + return _deepMerge(DEFAULT_STYLE, agentStyle) diff --git a/modules/serviceCenter/services/serviceGeneration/subDocumentUtility.py b/modules/serviceCenter/services/serviceGeneration/subDocumentUtility.py index 8a3e7cea..594fbe02 100644 --- a/modules/serviceCenter/services/serviceGeneration/subDocumentUtility.py +++ b/modules/serviceCenter/services/serviceGeneration/subDocumentUtility.py @@ -9,11 +9,70 @@ from typing import Any, Dict logger = logging.getLogger(__name__) +def _parseInlineRuns(text: str) -> list: + """ + Parse inline markdown formatting into a list of InlineRun dicts. + Handles: images, links, bold, italic, inline code, plain text. + Uses a regex-based tokenizer that processes tokens left-to-right. 
+ if not text: + return [{"type": "text", "value": ""}] + + # Pattern order matters: images before links, bold before italic + _TOKEN_RE = re.compile( + r'!\[(?P<imgAlt>[^\]]*)\]\((?P<imgSrc>[^)"]+)(?:\s+"(?P<imgWidth>\d+)pt")?\)' # image + r'|\[(?P<linkText>[^\]]+)\]\((?P<linkHref>[^)]+)\)' # link + r'|`(?P<code>[^`]+)`' # inline code + r'|\*\*(?P<bold>.+?)\*\*' # bold + r'|(?<!\w)\*(?P<italic1>.+?)\*(?!\w)' # italic *x* + r'|(?<!\w)_(?P<italic2>.+?)_(?!\w)' # italic _x_ + ) + + runs = [] + lastEnd = 0 + + for m in _TOKEN_RE.finditer(text): + # Plain text before this match + if m.start() > lastEnd: + runs.append({"type": "text", "value": text[lastEnd:m.start()]}) + + if m.group("imgAlt") is not None or m.group("imgSrc") is not None: + alt = (m.group("imgAlt") or "").strip() or "Image" + src = (m.group("imgSrc") or "").strip() + widthStr = m.group("imgWidth") + run = {"type": "image", "value": alt} + if src.startswith("file:"): + run["fileId"] = src[5:] + else: + run["href"] = src + if widthStr: + run["widthPt"] = int(widthStr) + runs.append(run) + elif m.group("linkText") is not None: + runs.append({"type": "link", "value": m.group("linkText"), "href": m.group("linkHref")}) + elif m.group("code") is not None: + runs.append({"type": "code", "value": m.group("code")}) + elif m.group("bold") is not None: + runs.append({"type": "bold", "value": m.group("bold")}) + elif m.group("italic1") is not None: + runs.append({"type": "italic", "value": m.group("italic1")}) + elif m.group("italic2") is not None: + runs.append({"type": "italic", "value": m.group("italic2")}) + + lastEnd = m.end() + + # Trailing plain text + if lastEnd < len(text): + runs.append({"type": "text", "value": text[lastEnd:]}) + + return runs if runs else [{"type": "text", "value": text}] + + +def markdownToDocumentJson(markdown: str, title: str, language: str = "de") -> Dict[str, Any]: """ - Convert markdown content to the standard document JSON format expected by renderReport. - Supports headings, code blocks, tables, lists, images (file: refs), paragraphs. 
- For plain text: wraps entire content in a single paragraph section. + Convert markdown content to the standard document JSON format with Inline-Run model. + Sections use inlineRuns (list of run dicts) instead of plain text strings. + Supports headings, code blocks, tables, lists, images, paragraphs. """ if not isinstance(markdown, str): markdown = str(markdown) if markdown else "" @@ -31,7 +90,7 @@ def markdownToDocumentJson(markdown: str, title: str, language: str = "de") -> D while i < len(lines): line = lines[i] - # Headings + # Headings (plain text, no inline formatting) headingMatch = re.match(r"^(#{1,6})\s+(.+)", line) if headingMatch: level = len(headingMatch.group(1)) @@ -43,7 +102,7 @@ def markdownToDocumentJson(markdown: str, title: str, language: str = "de") -> D i += 1 continue - # Fenced code blocks + # Fenced code blocks (no inline formatting) codeMatch = re.match(r"^```(\w*)", line) if codeMatch: lang = codeMatch.group(1) or "text" @@ -59,14 +118,14 @@ def markdownToDocumentJson(markdown: str, title: str, language: str = "de") -> D }) continue - # Tables + # Tables - cells are List[InlineRun] tableMatch = re.match(r"^\|(.+)\|$", line) if tableMatch and (i + 1) < len(lines) and re.match(r"^\|[\s\-:|]+\|$", lines[i + 1]): - headerCells = [c.strip() for c in tableMatch.group(1).split("|")] + headerCells = [_parseInlineRuns(c.strip()) for c in tableMatch.group(1).split("|")] i += 2 rows = [] while i < len(lines) and re.match(r"^\|(.+)\|$", lines[i]): - rowCells = [c.strip() for c in lines[i][1:-1].split("|")] + rowCells = [_parseInlineRuns(c.strip()) for c in lines[i][1:-1].split("|")] rows.append(rowCells) i += 1 sections.append({ @@ -75,14 +134,14 @@ def markdownToDocumentJson(markdown: str, title: str, language: str = "de") -> D }) continue - # Bullet / numbered lists + # Bullet / numbered lists - items are List[List[InlineRun]] listMatch = re.match(r"^(\s*)([-*+]|\d+[.)]) (.+)", line) if listMatch: isNumbered = bool(re.match(r"\d+[.)]", 
listMatch.group(2))) items = [] while i < len(lines) and re.match(r"^(\s*)([-*+]|\d+[.)]) (.+)", lines[i]): m = re.match(r"^(\s*)([-*+]|\d+[.)]) (.+)", lines[i]) - items.append({"text": m.group(3).strip()}) + items.append(_parseInlineRuns(m.group(3).strip())) i += 1 sections.append({ "id": _nextId(), "content_type": "bullet_list", "order": order, @@ -95,46 +154,50 @@ def markdownToDocumentJson(markdown: str, title: str, language: str = "de") -> D i += 1 continue - # Images (simplified: store as paragraph with ref for now - full resolution needs Knowledge Store) - imgMatch = re.match(r"^!\[([^\]]*)\]\(([^)]+)\)", line) + # Standalone image on its own line -> block-level image section + imgMatch = re.match(r"^!\[([^\]]*)\]\(([^)\"]+)(?:\s+\"(\d+)pt\")?\)\s*$", line) if imgMatch: altText = imgMatch.group(1).strip() or "Image" src = imgMatch.group(2).strip() + widthStr = imgMatch.group(3) fileId = src[5:] if src.startswith("file:") else "" + content = { + "altText": altText, + "base64Data": "", + "_fileRef": fileId, + "_srcUrl": src if not fileId else "", + } + if widthStr: + content["widthPt"] = int(widthStr) sections.append({ "id": _nextId(), "content_type": "image", "order": order, - "elements": [{ - "content": { - "altText": altText, - "base64Data": "", - "_fileRef": fileId, - "_srcUrl": src if not fileId else "", - } - }], + "elements": [{"content": content}], }) i += 1 continue - # Paragraph + # Paragraph - produces inlineRuns paraLines = [] while i < len(lines) and lines[i].strip() and not re.match( - r"^(#{1,6}\s|```|\|.+\||!\[|(\s*)([-*+]|\d+[.)]) )", lines[i] + r"^(#{1,6}\s|```|\|.+\||!\[[^\]]*\]\([^)]+\)\s*$|(\s*)([-*+]|\d+[.)]) )", lines[i] ): paraLines.append(lines[i]) i += 1 if paraLines: + combinedText = " ".join(paraLines) sections.append({ "id": _nextId(), "content_type": "paragraph", "order": order, - "elements": [{"content": {"text": " ".join(paraLines)}}], + "elements": [{"content": {"inlineRuns": _parseInlineRuns(combinedText)}}], }) continue i += 
1 if not sections: + fallbackText = markdown.strip() or "(empty)" sections.append({ "id": _nextId(), "content_type": "paragraph", "order": order, - "elements": [{"content": {"text": markdown.strip() or "(empty)"}}], + "elements": [{"content": {"inlineRuns": _parseInlineRuns(fallbackText)}}], }) return { diff --git a/modules/workflows/methods/methodAi/_common.py b/modules/workflows/methods/methodAi/_common.py new file mode 100644 index 00000000..9e77d431 --- /dev/null +++ b/modules/workflows/methods/methodAi/_common.py @@ -0,0 +1,18 @@ +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. + +"""Shared helpers for AI workflow actions.""" + + +def applyCommonAiParams(parameters: dict, request) -> None: + """Apply common AI parameters (requireNeutralization, allowedModels) from node to request.""" + requireNeutralization = parameters.get("requireNeutralization") + if requireNeutralization is not None: + request.requireNeutralization = bool(requireNeutralization) + + allowedModels = parameters.get("allowedModels") + if allowedModels and isinstance(allowedModels, list): + if not request.options: + from modules.datamodels.datamodelAi import AiCallOptions + request.options = AiCallOptions() + request.options.allowedModels = allowedModels diff --git a/modules/workflows/methods/methodAi/actions/consolidate.py b/modules/workflows/methods/methodAi/actions/consolidate.py index fa622507..7483507e 100644 --- a/modules/workflows/methods/methodAi/actions/consolidate.py +++ b/modules/workflows/methods/methodAi/actions/consolidate.py @@ -67,6 +67,8 @@ async def consolidate(self, parameters: Dict[str, Any]) -> ActionResult: prompt=prompt, options=AiCallOptions(operationType=OperationTypeEnum.DATA_ANALYSE), ) + from modules.workflows.methods.methodAi._common import applyCommonAiParams + applyCommonAiParams(parameters, req) resp = await ai_service.callAi(req) except (SubscriptionInactiveException, BillingContextError): raise diff --git 
a/modules/workflows/methods/methodAi/actions/convertDocument.py b/modules/workflows/methods/methodAi/actions/convertDocument.py index 39d6e16f..b2ed908b 100644 --- a/modules/workflows/methods/methodAi/actions/convertDocument.py +++ b/modules/workflows/methods/methodAi/actions/convertDocument.py @@ -36,6 +36,10 @@ async def convertDocument(self, parameters: Dict[str, Any]) -> ActionResult: } if parentOperationId: processParams["parentOperationId"] = parentOperationId + if parameters.get("allowedModels"): + processParams["allowedModels"] = parameters["allowedModels"] + if parameters.get("requireNeutralization") is not None: + processParams["requireNeutralization"] = parameters["requireNeutralization"] return await self.process(processParams) diff --git a/modules/workflows/methods/methodAi/actions/generateCode.py b/modules/workflows/methods/methodAi/actions/generateCode.py index 313057a0..5ec6b51d 100644 --- a/modules/workflows/methods/methodAi/actions/generateCode.py +++ b/modules/workflows/methods/methodAi/actions/generateCode.py @@ -55,6 +55,16 @@ async def generateCode(self, parameters: Dict[str, Any]) -> ActionResult: processingMode=ProcessingModeEnum.DETAILED ) + # Apply node-level AI params + allowedModels = parameters.get("allowedModels") + if allowedModels and isinstance(allowedModels, list): + options.allowedModels = allowedModels + requireNeutralization = parameters.get("requireNeutralization") + if requireNeutralization is not None: + _ctx = getattr(self.services, '_context', None) + if _ctx: + _ctx.requireNeutralization = bool(requireNeutralization) + # outputFormat: Optional - if None, formats determined from prompt by AI aiResponse: AiResponse = await self.services.ai.callAiContent( prompt=prompt, diff --git a/modules/workflows/methods/methodAi/actions/generateDocument.py b/modules/workflows/methods/methodAi/actions/generateDocument.py index 0709b924..18c158c1 100644 --- a/modules/workflows/methods/methodAi/actions/generateDocument.py +++ 
b/modules/workflows/methods/methodAi/actions/generateDocument.py @@ -59,6 +59,16 @@ async def generateDocument(self, parameters: Dict[str, Any]) -> ActionResult: compressContext=False ) + # Apply node-level AI params + allowedModels = parameters.get("allowedModels") + if allowedModels and isinstance(allowedModels, list): + options.allowedModels = allowedModels + requireNeutralization = parameters.get("requireNeutralization") + if requireNeutralization is not None: + _ctx = getattr(self.services, '_context', None) + if _ctx: + _ctx.requireNeutralization = bool(requireNeutralization) + # outputFormat: Optional - if None, formats determined from prompt by AI aiResponse: AiResponse = await self.services.ai.callAiContent( prompt=prompt, diff --git a/modules/workflows/methods/methodAi/actions/process.py b/modules/workflows/methods/methodAi/actions/process.py index 63e0f33e..d82ac4f7 100644 --- a/modules/workflows/methods/methodAi/actions/process.py +++ b/modules/workflows/methods/methodAi/actions/process.py @@ -212,6 +212,9 @@ async def process(self, parameters: Dict[str, Any]) -> ActionResult: ) ) + from modules.workflows.methods.methodAi._common import applyCommonAiParams + applyCommonAiParams(parameters, request) + aiResponse_obj = await self.services.ai.callAi(request) # Convert AiCallResponse to AiResponse format @@ -243,6 +246,16 @@ async def process(self, parameters: Dict[str, Any]) -> ActionResult: operationType=OperationTypeEnum.IMAGE_GENERATE if isImageGeneration else OperationTypeEnum.DATA_GENERATE ) + # Apply node-level AI params (allowedModels, requireNeutralization) + allowedModels = parameters.get("allowedModels") + if allowedModels and isinstance(allowedModels, list): + options.allowedModels = allowedModels + requireNeutralization = parameters.get("requireNeutralization") + if requireNeutralization is not None: + _ctx = getattr(self.services, '_context', None) + if _ctx: + _ctx.requireNeutralization = bool(requireNeutralization) + # Get generationIntent 
from parameters (required for DATA_GENERATE) # Default to "document" if not provided (most common use case) # For code generation, use ai.generateCode action or explicitly pass generationIntent="code" diff --git a/modules/workflows/methods/methodAi/actions/summarizeDocument.py b/modules/workflows/methods/methodAi/actions/summarizeDocument.py index e32c1965..4c2bb2bc 100644 --- a/modules/workflows/methods/methodAi/actions/summarizeDocument.py +++ b/modules/workflows/methods/methodAi/actions/summarizeDocument.py @@ -39,6 +39,10 @@ async def summarizeDocument(self, parameters: Dict[str, Any]) -> ActionResult: } if parentOperationId: processParams["parentOperationId"] = parentOperationId + if parameters.get("allowedModels"): + processParams["allowedModels"] = parameters["allowedModels"] + if parameters.get("requireNeutralization") is not None: + processParams["requireNeutralization"] = parameters["requireNeutralization"] return await self.process(processParams) diff --git a/modules/workflows/methods/methodAi/actions/translateDocument.py b/modules/workflows/methods/methodAi/actions/translateDocument.py index bb6f8437..dc0533a9 100644 --- a/modules/workflows/methods/methodAi/actions/translateDocument.py +++ b/modules/workflows/methods/methodAi/actions/translateDocument.py @@ -41,6 +41,10 @@ async def translateDocument(self, parameters: Dict[str, Any]) -> ActionResult: processParams["resultType"] = resultType if parentOperationId: processParams["parentOperationId"] = parentOperationId + if parameters.get("allowedModels"): + processParams["allowedModels"] = parameters["allowedModels"] + if parameters.get("requireNeutralization") is not None: + processParams["requireNeutralization"] = parameters["requireNeutralization"] return await self.process(processParams) diff --git a/tests/serviceAi/__init__.py b/tests/serviceAi/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/serviceAi/test_allowed_models_whitelist.py 
b/tests/serviceAi/test_allowed_models_whitelist.py new file mode 100644 index 00000000..4593afd9 --- /dev/null +++ b/tests/serviceAi/test_allowed_models_whitelist.py @@ -0,0 +1,14 @@ +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +import pytest +from modules.datamodels.datamodelAi import AiCallOptions + + +def test_allowed_models_field_exists(): + opts = AiCallOptions(allowedModels=["gpt-5-mini", "claude-4-7-opus"]) + assert opts.allowedModels == ["gpt-5-mini", "claude-4-7-opus"] + + +def test_allowed_models_default_none(): + opts = AiCallOptions() + assert opts.allowedModels is None diff --git a/tests/serviceGeneration/__init__.py b/tests/serviceGeneration/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/serviceGeneration/test_inline_image_paragraph.py b/tests/serviceGeneration/test_inline_image_paragraph.py new file mode 100644 index 00000000..be0c5d19 --- /dev/null +++ b/tests/serviceGeneration/test_inline_image_paragraph.py @@ -0,0 +1,23 @@ +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +import pytest +from modules.serviceCenter.services.serviceGeneration.subDocumentUtility import markdownToDocumentJson + + +def test_inline_image_in_paragraph(): + md = "Results show ![chart](file:abc \"200pt\") clearly." 
+ result = markdownToDocumentJson(md, "Test") + runs = result["documents"][0]["sections"][0]["elements"][0]["content"]["inlineRuns"] + types = [r["type"] for r in runs] + assert "text" in types + assert "image" in types + imgRun = next(r for r in runs if r["type"] == "image") + assert imgRun.get("fileId") == "abc" + + +def test_multiple_inline_images(): + md = "A ![x](file:1) B ![y](file:2) C" + result = markdownToDocumentJson(md, "Test") + runs = result["documents"][0]["sections"][0]["elements"][0]["content"]["inlineRuns"] + images = [r for r in runs if r["type"] == "image"] + assert len(images) == 2 diff --git a/tests/serviceGeneration/test_md_to_json_consolidation.py b/tests/serviceGeneration/test_md_to_json_consolidation.py new file mode 100644 index 00000000..83118374 --- /dev/null +++ b/tests/serviceGeneration/test_md_to_json_consolidation.py @@ -0,0 +1,71 @@ +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. +import pytest +from modules.serviceCenter.services.serviceGeneration.subDocumentUtility import markdownToDocumentJson + + +def test_basic_paragraph(): + result = markdownToDocumentJson("Hello world", "Test") + doc = result["documents"][0] + section = doc["sections"][0] + assert section["content_type"] == "paragraph" + assert section["elements"][0]["content"]["inlineRuns"][0] == {"type": "text", "value": "Hello world"} + + +def test_inline_bold(): + result = markdownToDocumentJson("This is **bold** text", "Test") + runs = result["documents"][0]["sections"][0]["elements"][0]["content"]["inlineRuns"] + assert any(r["type"] == "bold" and r["value"] == "bold" for r in runs) + + +def test_inline_image(): + result = markdownToDocumentJson("Text ![logo](file:abc123) more", "Test") + runs = result["documents"][0]["sections"][0]["elements"][0]["content"]["inlineRuns"] + assert any(r["type"] == "image" and r.get("fileId") == "abc123" for r in runs) + + +def test_inline_link(): + result = markdownToDocumentJson("Click [here](https://example.com)", "Test") 
+ runs = result["documents"][0]["sections"][0]["elements"][0]["content"]["inlineRuns"] + assert any(r["type"] == "link" and r.get("href") == "https://example.com" for r in runs) + + +def test_table_cells_are_inline_runs(): + md = "| A | B |\n| --- | --- |\n| **x** | y |" + result = markdownToDocumentJson(md, "Test") + section = result["documents"][0]["sections"][0] + assert section["content_type"] == "table" + rows = section["elements"][0]["content"]["rows"] + assert isinstance(rows[0][0], list) + + +def test_bullet_list_inline_runs(): + md = "- Item **one**\n- Item two" + result = markdownToDocumentJson(md, "Test") + section = result["documents"][0]["sections"][0] + assert section["content_type"] == "bullet_list" + items = section["elements"][0]["content"]["items"] + assert isinstance(items[0], list) + + +def test_standalone_image_block(): + md = "![Big chart](file:chart123)" + result = markdownToDocumentJson(md, "Test") + section = result["documents"][0]["sections"][0] + assert section["content_type"] == "image" + + +def test_heading_unchanged(): + result = markdownToDocumentJson("# Title", "Test") + section = result["documents"][0]["sections"][0] + assert section["content_type"] == "heading" + assert section["elements"][0]["content"]["text"] == "Title" + assert section["elements"][0]["content"]["level"] == 1 + + +def test_code_block_unchanged(): + md = "```python\nprint('hi')\n```" + result = markdownToDocumentJson(md, "Test") + section = result["documents"][0]["sections"][0] + assert section["content_type"] == "code_block" + assert section["elements"][0]["content"]["code"] == "print('hi')" diff --git a/tests/serviceGeneration/test_style_resolver.py b/tests/serviceGeneration/test_style_resolver.py new file mode 100644 index 00000000..6b2b649a --- /dev/null +++ b/tests/serviceGeneration/test_style_resolver.py @@ -0,0 +1,39 @@ +# Copyright (c) 2025 Patrick Motsch +# All rights reserved. 
+import pytest +from modules.serviceCenter.services.serviceGeneration.styleDefaults import resolveStyle, DEFAULT_STYLE + + +def test_resolve_none_returns_defaults(): + result = resolveStyle(None) + assert result == DEFAULT_STYLE + + +def test_resolve_empty_returns_defaults(): + result = resolveStyle({}) + assert result == DEFAULT_STYLE + + +def test_override_single_color(): + result = resolveStyle({"colors": {"primary": "#FF0000"}}) + assert result["colors"]["primary"] == "#FF0000" + assert result["colors"]["secondary"] == DEFAULT_STYLE["colors"]["secondary"] + + +def test_override_nested_heading(): + result = resolveStyle({"headings": {"h1": {"sizePt": 30}}}) + assert result["headings"]["h1"]["sizePt"] == 30 + assert result["headings"]["h1"]["weight"] == "bold" + + +def test_override_font(): + result = resolveStyle({"fonts": {"primary": "Arial"}}) + assert result["fonts"]["primary"] == "Arial" + assert result["fonts"]["monospace"] == "Consolas" + + +def test_full_style_passthrough(): + custom = {"fonts": {"primary": "Helvetica", "monospace": "Monaco"}} + result = resolveStyle(custom) + assert result["fonts"]["primary"] == "Helvetica" + assert result["fonts"]["monospace"] == "Monaco" From b500bfa6c1320f05b43a5c63c7934a69d0bf8a4c Mon Sep 17 00:00:00 2001 From: ValueOn AG Date: Wed, 29 Apr 2026 23:27:52 +0200 Subject: [PATCH 14/18] plan D fixed --- .../services/serviceAi/mainServiceAi.py | 4 ++ .../renderers/rendererPdf.py | 39 ++++++++++++------- 2 files changed, 30 insertions(+), 13 deletions(-) diff --git a/modules/serviceCenter/services/serviceAi/mainServiceAi.py b/modules/serviceCenter/services/serviceAi/mainServiceAi.py index 18ac46bc..3b800fb5 100644 --- a/modules/serviceCenter/services/serviceAi/mainServiceAi.py +++ b/modules/serviceCenter/services/serviceAi/mainServiceAi.py @@ -51,6 +51,10 @@ class _ServicesAdapter: def workflow(self): return self._context.workflow + @workflow.setter + def workflow(self, value): + self._context.workflow = value + @property 
def chat(self): return self._get_service("chat") diff --git a/modules/serviceCenter/services/serviceGeneration/renderers/rendererPdf.py b/modules/serviceCenter/services/serviceGeneration/renderers/rendererPdf.py index 31537980..7913a246 100644 --- a/modules/serviceCenter/services/serviceGeneration/renderers/rendererPdf.py +++ b/modules/serviceCenter/services/serviceGeneration/renderers/rendererPdf.py @@ -247,13 +247,28 @@ class RendererPdf(BaseRenderer): removed = False for idx, flowable in enumerate(story): fRepr = repr(flowable) + if "Image" in fRepr and hasattr(flowable, 'drawWidth') and hasattr(flowable, 'drawHeight'): + from reportlab.platypus import Image as ReportLabImage + if isinstance(flowable, ReportLabImage): + frameH = 650.0 + frameW = 450.0 + if flowable.drawHeight > frameH or flowable.drawWidth > frameW: + scaleW = frameW / flowable.drawWidth if flowable.drawWidth > frameW else 1.0 + scaleH = frameH / flowable.drawHeight if flowable.drawHeight > frameH else 1.0 + s = min(scaleW, scaleH) * 0.9 + flowable.drawWidth = flowable.drawWidth * s + flowable.drawHeight = flowable.drawHeight * s + flowable._width = flowable.drawWidth + flowable._height = flowable.drawHeight + removed = True + break if "Table" in fRepr and hasattr(flowable, '_cellvalues'): try: nRows = len(flowable._cellvalues) nCols = len(flowable._cellvalues[0]) if flowable._cellvalues else 0 if nRows == 1 and nCols == 1: errPara = Paragraph( - "[Code block omitted — content too large for PDF page]", + "[Code block omitted - content too large for PDF page]", self._createNormalStyle({}), ) story[idx] = errPara @@ -1078,20 +1093,18 @@ class RendererPdf(BaseRenderer): pilImage = PILImage.open(imageStream) originalWidth, originalHeight = pilImage.size - # Calculate available page dimensions (A4 with margins: 72pt left/right, 72pt top, 18pt bottom) pageWidth = A4[0] # 595.27 points pageHeight = A4[1] # 841.89 points - leftMargin = 72 - rightMargin = 72 - topMargin = 72 - bottomMargin = 18 - - # Use 
actual frame dimensions from SimpleDocTemplate - # Frame is smaller than page minus margins due to internal spacing - # From error message: frame is 439.27559055118115 x 739.8897637795277 - # Use conservative values with safety margin - availableWidth = 430.0 # Slightly smaller than frame width for safety - availableHeight = 730.0 # Slightly smaller than frame height for safety + # Use page dimensions minus margins with generous safety buffer + # A4 = 595.27 x 841.89 pt; frame = page - margins - internal padding + _us = getattr(self, '_unifiedStyle', None) or {} + _pageMgn = (_us.get('page') or {}).get('marginsPt') or {} + marginTop = _pageMgn.get('top', 60) + marginBottom = _pageMgn.get('bottom', 60) + marginLeft = _pageMgn.get('left', 60) + marginRight = _pageMgn.get('right', 60) + availableWidth = pageWidth - marginLeft - marginRight - 20 # 20pt safety + availableHeight = pageHeight - marginTop - marginBottom - 80 # 80pt safety for header/footer # Convert original image size from pixels to points # PIL provides size in pixels, need to convert to points From 06d9910ecd270a0ecbfcbd56c1529280b7854ba2 Mon Sep 17 00:00:00 2001 From: Ida Date: Thu, 30 Apr 2026 12:37:46 +0200 Subject: [PATCH 15/18] file tree ersetzt durch gruppierung im formgenerator --- modules/datamodels/datamodelFileFolder.py | 82 --- modules/datamodels/datamodelFiles.py | 11 - modules/features/graphicalEditor/portTypes.py | 2 +- .../workspace/routeFeatureWorkspace.py | 27 +- modules/interfaces/interfaceDbManagement.py | 439 ++---------- modules/interfaces/interfaceRbac.py | 7 +- modules/migrations/__init__.py | 0 .../migrations/migrate_folders_to_groups.py | 240 +++++++ modules/routes/routeDataFiles.py | 667 ++++++------------ modules/routes/routeHelpers.py | 15 + .../serviceAgent/coreTools/_helpers.py | 77 +- .../serviceAgent/coreTools/_workspaceTools.py | 367 +++++----- .../services/serviceAgent/mainServiceAgent.py | 33 +- .../services/serviceChat/mainServiceChat.py | 52 +- 14 files changed, 795 
insertions(+), 1224 deletions(-) delete mode 100644 modules/datamodels/datamodelFileFolder.py create mode 100644 modules/migrations/__init__.py create mode 100644 modules/migrations/migrate_folders_to_groups.py diff --git a/modules/datamodels/datamodelFileFolder.py b/modules/datamodels/datamodelFileFolder.py deleted file mode 100644 index 4829385e..00000000 --- a/modules/datamodels/datamodelFileFolder.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright (c) 2025 Patrick Motsch -# All rights reserved. -"""FileFolder: hierarchical folder structure for file organization.""" - -from typing import Optional -from pydantic import BaseModel, Field -from modules.datamodels.datamodelBase import PowerOnModel -from modules.shared.i18nRegistry import i18nModel -import uuid - - -@i18nModel("Dateiordner") -class FileFolder(PowerOnModel): - """Hierarchischer Ordner fuer die Dateiverwaltung.""" - id: str = Field( - default_factory=lambda: str(uuid.uuid4()), - description="Primary key", - json_schema_extra={"label": "ID", "frontend_type": "text", "frontend_readonly": True, "frontend_required": False}, - ) - name: str = Field( - description="Folder name", - json_schema_extra={"label": "Name", "frontend_type": "text", "frontend_readonly": False, "frontend_required": True}, - ) - parentId: Optional[str] = Field( - default=None, - description="Parent folder ID (null = root)", - json_schema_extra={ - "label": "Uebergeordneter Ordner", - "frontend_type": "text", - "frontend_readonly": False, - "frontend_required": False, - "fk_target": {"db": "poweron_management", "table": "FileFolder", "labelField": "name"}, - }, - ) - mandateId: Optional[str] = Field( - default=None, - description="Mandate context", - json_schema_extra={ - "label": "Mandanten-ID", - "frontend_type": "text", - "frontend_readonly": True, - "frontend_required": False, - "fk_target": {"db": "poweron_app", "table": "Mandate", "labelField": "label"}, - }, - ) - featureInstanceId: Optional[str] = Field( - default=None, - 
description="Feature instance context", - json_schema_extra={ - "label": "Feature-Instanz-ID", - "frontend_type": "text", - "frontend_readonly": True, - "frontend_required": False, - "fk_target": {"db": "poweron_app", "table": "FeatureInstance", "labelField": "label"}, - }, - ) - scope: str = Field( - default="personal", - description="Data visibility scope: personal, featureInstance, mandate, global. Inherited by files in this folder.", - json_schema_extra={ - "label": "Sichtbarkeit", - "frontend_type": "select", - "frontend_readonly": False, - "frontend_required": False, - "frontend_options": [ - {"value": "personal", "label": "Persönlich"}, - {"value": "featureInstance", "label": "Feature-Instanz"}, - {"value": "mandate", "label": "Mandant"}, - {"value": "global", "label": "Global"}, - ], - }, - ) - neutralize: bool = Field( - default=False, - description="Whether files in this folder should be neutralized before AI processing. Inherited by new/moved files.", - json_schema_extra={ - "label": "Neutralisieren", - "frontend_type": "checkbox", - "frontend_readonly": False, - "frontend_required": False, - }, - ) diff --git a/modules/datamodels/datamodelFiles.py b/modules/datamodels/datamodelFiles.py index 82628e0c..2a547b9c 100644 --- a/modules/datamodels/datamodelFiles.py +++ b/modules/datamodels/datamodelFiles.py @@ -68,17 +68,6 @@ class FileItem(PowerOnModel): description="Tags for categorization and search", json_schema_extra={"label": "Tags", "frontend_type": "tags", "frontend_readonly": False, "frontend_required": False}, ) - folderId: Optional[str] = Field( - default=None, - description="ID of the parent folder", - json_schema_extra={ - "label": "Ordner-ID", - "frontend_type": "text", - "frontend_readonly": False, - "frontend_required": False, - "fk_target": {"db": "poweron_management", "table": "FileFolder", "labelField": "name"}, - }, - ) description: Optional[str] = Field( default=None, description="User-provided description of the file", diff --git 
a/modules/features/graphicalEditor/portTypes.py b/modules/features/graphicalEditor/portTypes.py index e8d5b48d..f1513f9e 100644 --- a/modules/features/graphicalEditor/portTypes.py +++ b/modules/features/graphicalEditor/portTypes.py @@ -83,7 +83,7 @@ PORT_TYPE_CATALOG: Dict[str, PortSchema] = { PortField(name="listId", type="str", description="ClickUp-Listen-ID"), PortField(name="name", type="str", required=False, description="Listenname"), PortField(name="spaceId", type="str", required=False, description="Space-ID"), - PortField(name="folderId", type="str", required=False, description="Ordner-ID"), + PortField(name="groupId", type="str", required=False, description="Gruppen-ID für die Gruppierungszuordnung"), PortField(name="connection", type="ConnectionRef", required=False, description="ClickUp-Verbindung"), ]), diff --git a/modules/features/workspace/routeFeatureWorkspace.py b/modules/features/workspace/routeFeatureWorkspace.py index 3e1a54b7..7ba7acec 100644 --- a/modules/features/workspace/routeFeatureWorkspace.py +++ b/modules/features/workspace/routeFeatureWorkspace.py @@ -1202,7 +1202,7 @@ async def patchWorkspaceWorkflowAttachments( # --------------------------------------------------------------------------- -# File and folder list endpoints +# File endpoints # --------------------------------------------------------------------------- @router.get("/{instanceId}/files") @@ -1210,7 +1210,6 @@ async def patchWorkspaceWorkflowAttachments( async def listWorkspaceFiles( request: Request, instanceId: str = Path(...), - folderId: Optional[str] = Query(None), tags: Optional[str] = Query(None), search: Optional[str] = Query(None), context: RequestContext = Depends(getRequestContext), @@ -1265,30 +1264,6 @@ async def getFileContent( return Response(content=content, media_type=mimeType) -@router.get("/{instanceId}/folders") -@limiter.limit("300/minute") -async def listWorkspaceFolders( - request: Request, - instanceId: str = Path(...), - parentId: Optional[str] = 
Query(None), - context: RequestContext = Depends(getRequestContext), -): - _mandateId, _ = _validateInstanceAccess(instanceId, context) - try: - from modules.serviceCenter import getService - from modules.serviceCenter.context import ServiceCenterContext - ctx = ServiceCenterContext( - user=context.user, - mandate_id=_mandateId or "", - feature_instance_id=instanceId, - ) - chatService = getService("chat", ctx) - folders = chatService.listFolders(parentId=parentId) - return JSONResponse({"folders": folders or []}) - except Exception: - return JSONResponse({"folders": []}) - - @router.get("/{instanceId}/datasources") @limiter.limit("300/minute") async def listWorkspaceDataSources( diff --git a/modules/interfaces/interfaceDbManagement.py b/modules/interfaces/interfaceDbManagement.py index f72597b3..b263c98b 100644 --- a/modules/interfaces/interfaceDbManagement.py +++ b/modules/interfaces/interfaceDbManagement.py @@ -20,7 +20,6 @@ from modules.security.rbac import RbacClass from modules.datamodels.datamodelRbac import AccessRuleContext from modules.datamodels.datamodelUam import AccessLevel from modules.datamodels.datamodelFiles import FilePreview, FileItem, FileData -from modules.datamodels.datamodelFileFolder import FileFolder from modules.datamodels.datamodelUtils import Prompt from modules.datamodels.datamodelMessaging import ( MessagingSubscription, @@ -1103,15 +1102,12 @@ class ComponentObjects: return newfileName counter += 1 - def createFile(self, name: str, mimeType: str, content: bytes, folderId: Optional[str] = None) -> FileItem: + def createFile(self, name: str, mimeType: str, content: bytes) -> FileItem: """Creates a new file entry if user has permission. Computes fileHash and fileSize from content. Duplicate check: if a file with the same user + fileHash + fileName already exists, the existing file is returned instead of creating a new one. Same hash with different name is allowed (intentional copy by user). 
- - Args: - folderId: Optional parent folder ID. None/empty means the root folder. """ if not self.checkRbacPermission(FileItem, "create"): raise PermissionError("No permission to create files") @@ -1139,11 +1135,6 @@ class ComponentObjects: else: scope = "personal" - # Normalize folderId: treat empty string as "no folder" (= root) – NULL in DB - normalizedFolderId: Optional[str] = folderId - if isinstance(normalizedFolderId, str) and not normalizedFolderId.strip(): - normalizedFolderId = None - fileItem = FileItem( mandateId=mandateId, featureInstanceId=featureInstanceId, @@ -1152,7 +1143,6 @@ class ComponentObjects: mimeType=mimeType, fileSize=fileSize, fileHash=fileHash, - folderId=normalizedFolderId, ) # Store in database @@ -1277,382 +1267,47 @@ class ComponentObjects: self.db.connection.rollback() raise FileDeletionError(f"Error deleting files in batch: {str(e)}") - # ---- Folder methods ---- - - _RESERVED_FOLDER_NAMES = {"(Global)"} - - def _validateFolderName(self, name: str, parentId: Optional[str], excludeFolderId: Optional[str] = None): - """Ensures folder name is not reserved and is unique within parent.""" - if name in self._RESERVED_FOLDER_NAMES: - raise ValueError(f"Folder name '{name}' is reserved") - if not name or not name.strip(): - raise ValueError("Folder name cannot be empty") - existingFolders = self.db.getRecordset(FileFolder, recordFilter={"parentId": parentId or ""}) - for f in existingFolders: - if f.get("name") == name and f.get("id") != excludeFolderId: - raise ValueError(f"Folder '{name}' already exists in this directory") - - def _isDescendantOf(self, folderId: str, ancestorId: str) -> bool: - """Checks if folderId is a descendant of ancestorId (circular reference check).""" - visited = set() - currentId = folderId - while currentId: - if currentId == ancestorId: - return True - if currentId in visited: - break - visited.add(currentId) - folders = self.db.getRecordset(FileFolder, recordFilter={"id": currentId}) - if not folders: - 
break - currentId = folders[0].get("parentId") - return False - - def _ensureFeatureInstanceFolder(self, featureInstanceId: str, mandateId: str = "") -> Optional[str]: - """Return the folder ID for a feature instance, creating it on first use. - The folder is named after the feature instance label.""" - existing = self.db.getRecordset( - FileFolder, - recordFilter={ - "featureInstanceId": featureInstanceId, - "sysCreatedBy": self.userId or "", - }, - ) - if existing: - return existing[0].get("id") - - # Resolve the instance label for the folder name - folderName = featureInstanceId[:8] + def _ensureFeatureInstanceGroup(self, featureInstanceId: str, contextKey: str = "files/list") -> Optional[str]: + """Return the groupId of the default group for a feature instance. + Creates the group if it doesn't exist yet.""" try: - from modules.datamodels.datamodelFeatures import FeatureInstance - from modules.security.rootAccess import getRootDbAppConnector - dbApp = getRootDbAppConnector() - instances = dbApp.getRecordset(FeatureInstance, recordFilter={"id": featureInstanceId}) - if instances: - folderName = instances[0].get("label") or folderName + import modules.interfaces.interfaceDbApp as _appIface + appInterface = _appIface.getInterface(self._currentUser) + existing = appInterface.getTableGrouping(contextKey) + nodes = [n.model_dump() if hasattr(n, 'model_dump') else (n if isinstance(n, dict) else vars(n)) for n in (existing.rootGroups if existing else [])] + # Look for group with name matching featureInstanceId + def _find(nds): + for nd in nds: + nid = nd.get("id") if isinstance(nd, dict) else getattr(nd, "id", None) + nmeta = nd.get("meta", {}) if isinstance(nd, dict) else getattr(nd, "meta", {}) + if (nmeta or {}).get("featureInstanceId") == featureInstanceId: + return nid + subs = nd.get("subGroups", []) if isinstance(nd, dict) else getattr(nd, "subGroups", []) + result = _find(subs) + if result: + return result + return None + found = _find(nodes) + if found: + 
return found + # Create new group + import uuid + newId = str(uuid.uuid4()) + newGroup = { + "id": newId, + "name": featureInstanceId, + "itemIds": [], + "subGroups": [], + "meta": {"featureInstanceId": featureInstanceId}, + } + nodes.append(newGroup) + appInterface.upsertTableGrouping(contextKey, nodes) + return newId except Exception as e: - logger.warning(f"Could not resolve feature instance label: {e}") + logger.error(f"_ensureFeatureInstanceGroup failed: {e}") + return None - folder = FileFolder( - name=folderName, - parentId=None, - mandateId=mandateId, - featureInstanceId=featureInstanceId, - ) - created = self.db.recordCreate(FileFolder, folder) - return created.get("id") if isinstance(created, dict) else getattr(created, "id", None) - - def getFolder(self, folderId: str) -> Optional[Dict[str, Any]]: - """Returns a folder by ID if it belongs to the current user.""" - folders = self.db.getRecordset(FileFolder, recordFilter={"id": folderId, "sysCreatedBy": self.userId or ""}) - return folders[0] if folders else None - - def listFolders(self, parentId: Optional[str] = None) -> List[Dict[str, Any]]: - """List folders visible to the current user. - Own folders are always returned. Other users' folders are only - returned when they contain files visible to the current user. 
- Each folder is enriched with ``fileCount``.""" - recordFilter = {} - if parentId is not None: - recordFilter["parentId"] = parentId - folders = self.db.getRecordset(FileFolder, recordFilter=recordFilter if recordFilter else None) - - if not folders: - return folders - - folderIds = [f["id"] for f in folders if f.get("id")] - fileCounts: Dict[str, int] = {} - try: - from modules.interfaces.interfaceRbac import buildFilesScopeWhereClause - scopeClause = buildFilesScopeWhereClause( - self.currentUser, "FileItem", self.db, - self.mandateId, self.featureInstanceId, - [], [], - ) - - self.db._ensure_connection() - with self.db.connection.cursor() as cursor: - baseQuery = ( - 'SELECT "folderId", COUNT(*) AS cnt ' - 'FROM "FileItem" ' - 'WHERE "folderId" = ANY(%s)' - ) - queryValues: list = [folderIds] - - if scopeClause: - baseQuery += ' AND (' + scopeClause["condition"] + ')' - queryValues.extend(scopeClause["values"]) - - baseQuery += ' GROUP BY "folderId"' - cursor.execute(baseQuery, queryValues) - for row in cursor.fetchall(): - fileCounts[row["folderId"]] = row["cnt"] - except Exception as e: - logger.warning(f"Could not count files per folder: {e}") - - userId = self.userId or "" - result = [] - for folder in folders: - fc = fileCounts.get(folder.get("id", ""), 0) - folder["fileCount"] = fc - isOwn = folder.get("sysCreatedBy") == userId - if isOwn or fc > 0: - result.append(folder) - - return result - - def createFolder(self, name: str, parentId: Optional[str] = None) -> Dict[str, Any]: - """Create a new folder with unique name validation.""" - self._validateFolderName(name, parentId) - folder = FileFolder( - name=name, - parentId=parentId, - mandateId=self.mandateId or "", - featureInstanceId=self.featureInstanceId or "", - ) - return self.db.recordCreate(FileFolder, folder) - - def renameFolder(self, folderId: str, newName: str) -> bool: - """Rename a folder with unique name validation.""" - folder = self.getFolder(folderId) - if not folder: - raise 
FileNotFoundError(f"Folder {folderId} not found") - self._validateFolderName(newName, folder.get("parentId"), excludeFolderId=folderId) - return self.db.recordModify(FileFolder, folderId, {"name": newName}) - - def updateFolder(self, folderId: str, updateData: Dict[str, Any]) -> bool: - """ - Update folder metadata (e.g. ``scope``, ``neutralize``). Owner-only, - same access model as renameFolder/moveFolder. Use ``renameFolder`` for - ``name`` changes (uniqueness validation) and ``moveFolder`` for - ``parentId`` changes (cycle/uniqueness validation). - """ - if not updateData: - return True - folder = self.getFolder(folderId) - if not folder: - raise FileNotFoundError(f"Folder {folderId} not found") - forbiddenKeys = {"id", "sysCreatedBy", "sysCreatedAt", "sysUpdatedAt"} - cleaned: Dict[str, Any] = {k: v for k, v in updateData.items() if k not in forbiddenKeys} - if "name" in cleaned: - self._validateFolderName(cleaned["name"], folder.get("parentId"), excludeFolderId=folderId) - return self.db.recordModify(FileFolder, folderId, cleaned) - - def moveFolder(self, folderId: str, targetParentId: Optional[str] = None) -> bool: - """Move a folder to a new parent, with circular reference and unique name checks.""" - folder = self.getFolder(folderId) - if not folder: - raise FileNotFoundError(f"Folder {folderId} not found") - if targetParentId and self._isDescendantOf(targetParentId, folderId): - raise ValueError("Cannot move folder into its own subtree") - self._validateFolderName(folder.get("name", ""), targetParentId, excludeFolderId=folderId) - return self.db.recordModify(FileFolder, folderId, {"parentId": targetParentId}) - - def moveFilesBatch(self, fileIds: List[str], targetFolderId: Optional[str] = None) -> Dict[str, Any]: - """Move multiple files with one SQL update. 
- Owner can always move; non-owners need RBAC ALL level.""" - uniqueIds = [str(fid) for fid in dict.fromkeys(fileIds or []) if fid] - if not uniqueIds: - return {"movedFiles": 0} - - if targetFolderId: - targetFolder = self.getFolder(targetFolderId) - if not targetFolder: - raise FileNotFoundError(f"Target folder {targetFolderId} not found") - - try: - self.db._ensure_connection() - with self.db.connection.cursor() as cursor: - cursor.execute( - 'SELECT "id", "sysCreatedBy" FROM "FileItem" WHERE "id" = ANY(%s)', - (uniqueIds,), - ) - rows = cursor.fetchall() - foundIds = {row["id"] for row in rows} - missing = sorted(set(uniqueIds) - foundIds) - if missing: - raise FileNotFoundError(f"Files not found: {missing}") - - for row in rows: - self._requireFileWriteAccess(row, row["id"], "update") - - accessibleIds = [row["id"] for row in rows] - cursor.execute( - 'UPDATE "FileItem" SET "folderId" = %s, "sysModifiedAt" = %s, "sysModifiedBy" = %s ' - 'WHERE "id" = ANY(%s)', - (targetFolderId, getUtcTimestamp(), self.userId or "", accessibleIds), - ) - movedFiles = cursor.rowcount - - self.db.connection.commit() - return {"movedFiles": movedFiles} - except Exception as e: - logger.error(f"Error moving files in batch: {e}") - self.db.connection.rollback() - raise FileError(f"Error moving files in batch: {str(e)}") - - def moveFoldersBatch(self, folderIds: List[str], targetParentId: Optional[str] = None) -> Dict[str, Any]: - """Move multiple folders with one SQL update after validation.""" - uniqueIds = [str(fid) for fid in dict.fromkeys(folderIds or []) if fid] - if not uniqueIds: - return {"movedFolders": 0} - - foldersToMove: List[Dict[str, Any]] = [] - for folderId in uniqueIds: - folder = self.getFolder(folderId) - if not folder: - raise FileNotFoundError(f"Folder {folderId} not found") - if targetParentId and self._isDescendantOf(targetParentId, folderId): - raise ValueError("Cannot move folder into its own subtree") - foldersToMove.append(folder) - - existingInTarget = 
self.db.getRecordset( - FileFolder, - recordFilter={"parentId": targetParentId or "", "sysCreatedBy": self.userId or ""}, - ) - existingNames = {f.get("name"): f.get("id") for f in existingInTarget} - movingNames: Dict[str, str] = {} - movingIds = set(uniqueIds) - - for folder in foldersToMove: - name = folder.get("name", "") - folderId = folder.get("id") - if name in movingNames and movingNames[name] != folderId: - raise ValueError(f"Folder '{name}' already exists in this move batch") - movingNames[name] = folderId - - existingId = existingNames.get(name) - if existingId and existingId not in movingIds: - raise ValueError(f"Folder '{name}' already exists in target directory") - - try: - self.db._ensure_connection() - with self.db.connection.cursor() as cursor: - cursor.execute( - 'UPDATE "FileFolder" SET "parentId" = %s, "sysModifiedAt" = %s, "sysModifiedBy" = %s ' - 'WHERE "id" = ANY(%s) AND "sysCreatedBy" = %s', - (targetParentId, getUtcTimestamp(), self.userId or "", uniqueIds, self.userId or ""), - ) - movedFolders = cursor.rowcount - - self.db.connection.commit() - return {"movedFolders": movedFolders} - except Exception as e: - logger.error(f"Error moving folders in batch: {e}") - self.db.connection.rollback() - raise FileError(f"Error moving folders in batch: {str(e)}") - - def deleteFolder(self, folderId: str, recursive: bool = False) -> Dict[str, Any]: - """Delete a folder. If recursive, deletes all contents. Returns summary of deletions.""" - folder = self.getFolder(folderId) - if not folder: - raise FileNotFoundError(f"Folder {folderId} not found") - - childFolders = self.db.getRecordset(FileFolder, recordFilter={"parentId": folderId, "sysCreatedBy": self.userId or ""}) - childFiles = self._getFilesByCurrentUser(recordFilter={"folderId": folderId}) - - if not recursive and (childFolders or childFiles): - raise ValueError( - f"Folder '{folder.get('name')}' is not empty " - f"({len(childFiles)} files, {len(childFolders)} subfolders). 
" - f"Use recursive=true to delete contents." - ) - - deletedFiles = 0 - deletedFolders = 0 - - if recursive: - for subFolder in childFolders: - subResult = self.deleteFolder(subFolder["id"], recursive=True) - deletedFiles += subResult.get("deletedFiles", 0) - deletedFolders += subResult.get("deletedFolders", 0) - for childFile in childFiles: - try: - self.deleteFile(childFile["id"]) - deletedFiles += 1 - except Exception as e: - logger.warning(f"Failed to delete file {childFile['id']} during folder deletion: {e}") - - self.db.recordDelete(FileFolder, folderId) - deletedFolders += 1 - - return {"deletedFiles": deletedFiles, "deletedFolders": deletedFolders} - - def deleteFoldersBatch(self, folderIds: List[str], recursive: bool = True) -> Dict[str, Any]: - """Delete multiple folders and their content in batched SQL calls.""" - uniqueIds = [str(fid) for fid in dict.fromkeys(folderIds or []) if fid] - if not uniqueIds: - return {"deletedFiles": 0, "deletedFolders": 0} - - if not recursive: - deletedFiles = 0 - deletedFolders = 0 - for folderId in uniqueIds: - result = self.deleteFolder(folderId, recursive=False) - deletedFiles += result.get("deletedFiles", 0) - deletedFolders += result.get("deletedFolders", 0) - return {"deletedFiles": deletedFiles, "deletedFolders": deletedFolders} - - try: - self.db._ensure_connection() - with self.db.connection.cursor() as cursor: - cursor.execute( - 'SELECT "id" FROM "FileFolder" WHERE "id" = ANY(%s) AND "sysCreatedBy" = %s', - (uniqueIds, self.userId or ""), - ) - rootAccessibleIds = [row["id"] for row in cursor.fetchall()] - if len(rootAccessibleIds) != len(uniqueIds): - missingIds = sorted(set(uniqueIds) - set(rootAccessibleIds)) - raise FileNotFoundError(f"Folders not found or not accessible: {missingIds}") - - cursor.execute( - """ - WITH RECURSIVE folder_tree AS ( - SELECT "id" - FROM "FileFolder" - WHERE "id" = ANY(%s) AND "sysCreatedBy" = %s - UNION ALL - SELECT child."id" - FROM "FileFolder" child - INNER JOIN folder_tree 
ft ON child."parentId" = ft."id" - WHERE child."sysCreatedBy" = %s - ) - SELECT DISTINCT "id" FROM folder_tree - """, - (rootAccessibleIds, self.userId or "", self.userId or ""), - ) - allFolderIds = [row["id"] for row in cursor.fetchall()] - - cursor.execute( - 'SELECT "id" FROM "FileItem" WHERE "folderId" = ANY(%s) AND "sysCreatedBy" = %s', - (allFolderIds, self.userId or ""), - ) - allFileIds = [row["id"] for row in cursor.fetchall()] - - if allFileIds: - cursor.execute('DELETE FROM "FileData" WHERE "id" = ANY(%s)', (allFileIds,)) - cursor.execute( - 'DELETE FROM "FileItem" WHERE "id" = ANY(%s) AND "sysCreatedBy" = %s', - (allFileIds, self.userId or ""), - ) - deletedFiles = cursor.rowcount - else: - deletedFiles = 0 - - cursor.execute( - 'DELETE FROM "FileFolder" WHERE "id" = ANY(%s) AND "sysCreatedBy" = %s', - (allFolderIds, self.userId or ""), - ) - deletedFolders = cursor.rowcount - - self.db.connection.commit() - return {"deletedFiles": deletedFiles, "deletedFolders": deletedFolders} - except Exception as e: - logger.error(f"Error deleting folders in batch: {e}") - self.db.connection.rollback() - raise FileDeletionError(f"Error deleting folders in batch: {str(e)}") - - def copyFile(self, sourceFileId: str, targetFolderId: Optional[str] = None, newFileName: Optional[str] = None) -> FileItem: + def copyFile(self, sourceFileId: str, newFileName: Optional[str] = None) -> FileItem: """Create a full duplicate of a file (FileItem + FileData).""" sourceFile = self.getFile(sourceFileId) if not sourceFile: @@ -1665,11 +1320,6 @@ class ComponentObjects: fileName = newFileName or sourceFile.fileName copiedFile = self.createFile(fileName, sourceFile.mimeType, sourceData) - if targetFolderId: - self.updateFile(copiedFile.id, {"folderId": targetFolderId}) - elif sourceFile.folderId: - self.updateFile(copiedFile.id, {"folderId": sourceFile.folderId}) - self.createFileData(copiedFile.id, sourceData) return copiedFile @@ -1884,18 +1534,14 @@ class ComponentObjects: 
logger.error(f"Error getting file content: {str(e)}") return None - def saveUploadedFile(self, fileContent: bytes, fileName: str, folderId: Optional[str] = None) -> tuple[FileItem, str]: - """Saves an uploaded file if user has permission. - - Args: - folderId: Optional parent folder ID. None means root folder. - """ + def saveUploadedFile(self, fileContent: bytes, fileName: str) -> tuple[FileItem, str]: + """Saves an uploaded file if user has permission.""" try: # Check file creation permission if not self.checkRbacPermission(FileItem, "create"): raise PermissionError("No permission to upload files") - logger.debug(f"Starting upload process for file: {fileName} (folderId={folderId!r})") + logger.debug(f"Starting upload process for file: {fileName}") if not isinstance(fileContent, bytes): logger.error(f"Invalid fileContent type: {type(fileContent)}") @@ -1921,7 +1567,6 @@ class ComponentObjects: name=fileName, mimeType=mimeType, content=fileContent, - folderId=folderId, ) # Save binary data diff --git a/modules/interfaces/interfaceRbac.py b/modules/interfaces/interfaceRbac.py index ad2ac6b5..8ecc51fd 100644 --- a/modules/interfaces/interfaceRbac.py +++ b/modules/interfaces/interfaceRbac.py @@ -204,7 +204,6 @@ TABLE_NAMESPACE = { # Files - benutzer-eigen "FileItem": "files", "FileData": "files", - "FileFolder": "files", # Automation - benutzer-eigen "AutomationDefinition": "automation", "AutomationTemplate": "automation", @@ -529,8 +528,7 @@ def getRecordsetPaginatedWithRBAC( if val is None: # val=None in pagination.filters means "match empty/null" # (same convention as connectorDbPostgre._buildPaginationClauses). - # Covers both historical empty-string values and true NULLs - # e.g. root-folder files where folderId may be "" or NULL. + # Covers both historical empty-string values and true NULLs. 
whereConditions.append(f'("{key}" IS NULL OR "{key}"::TEXT = \'\')') continue if isinstance(val, dict): @@ -689,8 +687,7 @@ def getDistinctColumnValuesWithRBAC( if val is None: # val=None in pagination.filters means "match empty/null" # (same convention as connectorDbPostgre._buildPaginationClauses). - # Covers both historical empty-string values and true NULLs - # e.g. root-folder files where folderId may be "" or NULL. + # Covers both historical empty-string values and true NULLs. whereConditions.append(f'("{key}" IS NULL OR "{key}"::TEXT = \'\')') continue if isinstance(val, dict): diff --git a/modules/migrations/__init__.py b/modules/migrations/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/migrations/migrate_folders_to_groups.py b/modules/migrations/migrate_folders_to_groups.py new file mode 100644 index 00000000..870e1e45 --- /dev/null +++ b/modules/migrations/migrate_folders_to_groups.py @@ -0,0 +1,240 @@ +""" +One-time migration: Convert FileFolder tree + FileItem.folderId → table_groupings. + +Run this BEFORE dropping the physical FileFolder table and FileItem.folderId column +from the database (those are separate Alembic/SQL steps). + +Usage: + python -m modules.migrations.migrate_folders_to_groups [--dry-run] [--verbose] + +Steps: + 1. For each distinct (userId, mandateId) combination that has FileFolder records: + a. Build the full folder tree (recursive) + b. Write it as a TableGroupNode tree into table_groupings (contextKey='files/list') + – merges with any existing groups rather than overwriting + c. For each FileItem with a folderId that maps into this tree, + add its id to the matching group's itemIds + 2. Print a summary (rows migrated, groups created, files assigned) + 3. If not --dry-run: commits the inserts/updates + NOTE: Schema changes (ALTER TABLE DROP COLUMN, DROP TABLE) are intentionally + NOT performed by this script. 
Run the corresponding Alembic migration + (migrations/versions/xxxx_drop_folder_columns.py) afterwards. +""" + +import argparse +import json +import logging +import uuid +from typing import Optional + +logger = logging.getLogger(__name__) + + +# ── Helpers ────────────────────────────────────────────────────────────────── + +def _build_tree(folders: list, parent_id: Optional[str]) -> list: + """Recursively build TableGroupNode-compatible dicts from a flat folder list.""" + children = [f for f in folders if f.get("parentId") == parent_id] + result = [] + for folder in children: + node = { + "id": str(uuid.uuid4()), + "name": folder["name"], + "itemIds": [], + "subGroups": _build_tree(folders, folder["id"]), + "meta": {"migratedFromFolderId": folder["id"]}, + } + result.append(node) + return result + + +def _assign_files_to_nodes(nodes: list, files_by_folder: dict) -> list: + """Recursively assign file IDs to group nodes based on folder mapping.""" + for node in nodes: + folder_id = (node.get("meta") or {}).get("migratedFromFolderId") + if folder_id and folder_id in files_by_folder: + node["itemIds"] = list(files_by_folder[folder_id]) + node["subGroups"] = _assign_files_to_nodes(node.get("subGroups", []), files_by_folder) + return nodes + + +def _count_items(nodes: list) -> int: + total = 0 + for node in nodes: + total += len(node.get("itemIds", [])) + total += _count_items(node.get("subGroups", [])) + return total + + +def _now_ts() -> str: + from modules.shared.timeUtils import getUtcTimestamp + return getUtcTimestamp() + + +# ── Main migration ──────────────────────────────────────────────────────────── + +def run_migration(dry_run: bool = True, verbose: bool = False): + """Main migration entry point.""" + logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO) + logger.info(f"Starting folder→group migration (dry_run={dry_run})") + + from modules.connectors.connectorDbPostgre import getCachedConnector + + connector = getCachedConnector() + if not 
connector or not connector.connection: + logger.error("Could not obtain a DB connection. Aborting.") + return + + conn = connector.connection + cur = conn.cursor() + + # ── 1. Check that the source tables still exist ─────────────────────────── + cur.execute(""" + SELECT EXISTS ( + SELECT 1 FROM information_schema.tables + WHERE table_name = 'FileFolder' + ) + """) + folder_table_exists = cur.fetchone()[0] + + cur.execute(""" + SELECT EXISTS ( + SELECT 1 FROM information_schema.columns + WHERE table_name = 'FileItem' AND column_name = 'folderId' + ) + """) + folder_column_exists = cur.fetchone()[0] + + if not folder_table_exists and not folder_column_exists: + logger.info("FileFolder table and FileItem.folderId column not found — migration already applied or not needed.") + return + + if not folder_table_exists: + logger.warning("FileFolder table missing but FileItem.folderId column still present. Only file assignments will be migrated.") + if not folder_column_exists: + logger.warning("FileItem.folderId column missing but FileFolder table still present. Only group tree structure will be migrated.") + + # ── 2. Load all folders ─────────────────────────────────────────────────── + folders_by_user: dict = {} + if folder_table_exists: + cur.execute('SELECT "id", "name", "parentId", "sysCreatedBy", "mandateId" FROM "FileFolder"') + for row in cur.fetchall(): + fid, fname, parent_id, user_id, mandate_id = row + key = (str(user_id), str(mandate_id) if mandate_id else "") + folders_by_user.setdefault(key, []).append({ + "id": fid, "name": fname, "parentId": parent_id, + }) + logger.info(f"Loaded folders for {len(folders_by_user)} (user, mandate) combinations") + + # ── 3. 
Load file→folder assignments ────────────────────────────────────── + files_by_key: dict = {} + if folder_column_exists: + cur.execute( + 'SELECT "id", "folderId", "sysCreatedBy", "mandateId" FROM "FileItem" WHERE "folderId" IS NOT NULL AND "folderId" != \'\'' + ) + for row in cur.fetchall(): + file_id, folder_id, user_id, mandate_id = row + key = (str(user_id), str(mandate_id) if mandate_id else "") + files_by_key.setdefault(key, {}).setdefault(folder_id, []).append(file_id) + total_files = sum( + sum(len(v) for v in d.values()) for d in files_by_key.values() + ) + logger.info(f"Found {total_files} file→folder assignments across {len(files_by_key)} (user, mandate) combos") + + # ── 4. Combine and upsert groupings ────────────────────────────────────── + all_keys = set(folders_by_user.keys()) | set(files_by_key.keys()) + stats = {"groups_created": 0, "groupings_upserted": 0, "files_assigned": 0} + + for key in all_keys: + user_id, mandate_id = key + folders = folders_by_user.get(key, []) + files_by_folder = files_by_key.get(key, {}) + + # Build tree + roots = _build_tree(folders, None) + roots = _assign_files_to_nodes(roots, files_by_folder) + + # Handle files in unknown folders (folder no longer in tree) + known_folder_ids = {f["id"] for f in folders} + for folder_id, file_ids in files_by_folder.items(): + if folder_id not in known_folder_ids: + # Orphaned files: put them in an "Orphaned" group + roots.append({ + "id": str(uuid.uuid4()), + "name": f"Orphaned (folder {folder_id[:8]}…)", + "itemIds": file_ids, + "subGroups": [], + "meta": {"migratedFromFolderId": folder_id, "orphaned": True}, + }) + + if not roots: + continue + + n_items = _count_items(roots) + stats["groups_created"] += len(roots) + stats["files_assigned"] += n_items + + context_key = "files/list" + if verbose: + logger.debug(f" user={user_id} mandate={mandate_id}: {len(roots)} root groups, {n_items} files") + + if not dry_run: + # Check for existing grouping + cur.execute( + 'SELECT "id", 
"rootGroups" FROM "TableGrouping" WHERE "userId" = %s AND "contextKey" = %s', + (user_id, context_key), + ) + existing_row = cur.fetchone() + + if existing_row: + existing_id, existing_raw = existing_row + existing_roots = json.loads(existing_raw) if isinstance(existing_raw, str) else (existing_raw or []) + # Merge: append migrated groups (avoid duplicates by migratedFromFolderId) + existing_meta_ids = { + (n.get("meta") or {}).get("migratedFromFolderId") + for n in existing_roots + if (n.get("meta") or {}).get("migratedFromFolderId") + } + new_roots = existing_roots + [ + r for r in roots + if (r.get("meta") or {}).get("migratedFromFolderId") not in existing_meta_ids + ] + cur.execute( + 'UPDATE "TableGrouping" SET "rootGroups" = %s, "updatedAt" = %s WHERE "id" = %s', + (json.dumps(new_roots), _now_ts(), existing_id), + ) + else: + new_id = str(uuid.uuid4()) + cur.execute( + 'INSERT INTO "TableGrouping" ("id", "userId", "contextKey", "rootGroups", "updatedAt") VALUES (%s, %s, %s, %s, %s)', + (new_id, user_id, context_key, json.dumps(roots), _now_ts()), + ) + stats["groupings_upserted"] += 1 + + # ── 5. Summary ──────────────────────────────────────────────────────────── + if not dry_run: + conn.commit() + logger.info("Migration committed.") + else: + logger.info("DRY RUN — no changes written.") + + logger.info( + f"Summary: groupings_upserted={stats['groupings_upserted']}, " + f"groups_created={stats['groups_created']}, " + f"files_assigned={stats['files_assigned']}" + ) + logger.info( + "Next steps (run after verifying data):\n" + " 1. Run Alembic migration to DROP COLUMN FileItem.folderId\n" + " 2. 
Run Alembic migration to DROP TABLE FileFolder" + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Migrate FileFolder tree to table_groupings") + parser.add_argument("--dry-run", action="store_true", default=True, help="Preview only, no DB writes (default)") + parser.add_argument("--execute", action="store_true", help="Actually write to DB (disables dry-run)") + parser.add_argument("--verbose", action="store_true", help="Show per-user details") + args = parser.parse_args() + dry_run = not args.execute + run_migration(dry_run=dry_run, verbose=args.verbose) diff --git a/modules/routes/routeDataFiles.py b/modules/routes/routeDataFiles.py index 8168d8d2..c20f3f3a 100644 --- a/modules/routes/routeDataFiles.py +++ b/modules/routes/routeDataFiles.py @@ -12,7 +12,6 @@ from modules.auth import limiter, getCurrentUser, getRequestContext, RequestCont # Import interfaces import modules.interfaces.interfaceDbManagement as interfaceDbManagement from modules.datamodels.datamodelFiles import FileItem, FilePreview -from modules.datamodels.datamodelFileFolder import FileFolder from modules.shared.attributeUtils import getModelAttributeDefinitions from modules.datamodels.datamodelUam import User from modules.datamodels.datamodelPagination import PaginationParams, PaginatedResponse, PaginationMetadata, normalize_pagination_dict @@ -319,16 +318,7 @@ def get_files( recordFilter = {"sysCreatedBy": managementInterface.userId} return handleIdsMode(managementInterface.db, FileItem, pagination, recordFilter) - recordFilter = None - if paginationParams and paginationParams.filters and "folderId" in paginationParams.filters: - fVal = paginationParams.filters.get("folderId") - if fVal is None or (isinstance(fVal, str) and fVal.strip() == ""): - paginationParams.filters["folderId"] = None - else: - paginationParams.filters.pop("folderId") - recordFilter = {"folderId": fVal} - - result = managementInterface.getAllFiles(pagination=paginationParams, 
recordFilter=recordFilter) + result = managementInterface.getAllFiles(pagination=paginationParams) if paginationParams: enriched = applyGroupScopeFilter(enrichRowsWithFkLabels(_filesToDicts(result.items), FileItem), groupCtx.itemIds) @@ -358,6 +348,36 @@ def get_files( ) +def _addFileToGroup(appInterface, fileId: str, groupId: str, contextKey: str = "files/list"): + """Add a file to a group in the persisted groupTree (upsert).""" + from modules.routes.routeHelpers import _collectItemIds + try: + existing = appInterface.getTableGrouping(contextKey) + if not existing: + return + nodes = [n.model_dump() if hasattr(n, 'model_dump') else n for n in existing.rootGroups] + def _add(nds): + for nd in nds: + nid = nd.get("id") if isinstance(nd, dict) else getattr(nd, "id", None) + if nid == groupId: + itemIds = list(nd.get("itemIds", []) if isinstance(nd, dict) else getattr(nd, "itemIds", [])) + if fileId not in itemIds: + itemIds.append(fileId) + if isinstance(nd, dict): + nd["itemIds"] = itemIds + else: + nd.itemIds = itemIds + return True + subs = nd.get("subGroups", []) if isinstance(nd, dict) else getattr(nd, "subGroups", []) + if _add(subs): + return True + return False + _add(nodes) + appInterface.upsertTableGrouping(contextKey, nodes) + except Exception as e: + logger.warning(f"_addFileToGroup failed: {e}") + + @router.post("/upload", status_code=status.HTTP_201_CREATED) @limiter.limit("10/minute") async def upload_file( @@ -365,7 +385,7 @@ async def upload_file( file: UploadFile = File(...), workflowId: Optional[str] = Form(None), featureInstanceId: Optional[str] = Form(None), - folderId: Optional[str] = Form(None), + groupId: Optional[str] = Form(None), currentUser: User = Depends(getCurrentUser), context: RequestContext = Depends(getRequestContext), ) -> JSONResponse: @@ -389,31 +409,22 @@ async def upload_file( status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE, detail=f"File too large. 
Maximum size: {interfaceDbManagement.APP_CONFIG.get('File_Management_MAX_UPLOAD_SIZE_MB')}MB" ) - - # Normalize folderId: empty string / "null" / "root" → None (root folder) - normalizedFolderId: Optional[str] = folderId - if isinstance(normalizedFolderId, str): - trimmed = normalizedFolderId.strip() - if not trimmed or trimmed.lower() in {"null", "none", "root"}: - normalizedFolderId = None - else: - normalizedFolderId = trimmed # Save file via LucyDOM interface in the database fileItem, duplicateType = managementInterface.saveUploadedFile( - fileContent, file.filename, folderId=normalizedFolderId + fileContent, file.filename ) if featureInstanceId and not fileItem.featureInstanceId: managementInterface.updateFile(fileItem.id, {"featureInstanceId": featureInstanceId}) fileItem.featureInstanceId = featureInstanceId - # For exact duplicates we keep the existing record, but move it into the - # target folder so the user actually sees their upload land where they expect. - if duplicateType == "exact_duplicate" and normalizedFolderId != getattr(fileItem, "folderId", None): - managementInterface.updateFile(fileItem.id, {"folderId": normalizedFolderId}) - fileItem.folderId = normalizedFolderId - + # Add to group if groupId was provided + if groupId: + import modules.interfaces.interfaceDbApp as _appIface + appInterface = _appIface.getInterface(currentUser) + _addFileToGroup(appInterface, fileItem.id, groupId) + # Determine response message based on duplicate type if duplicateType == "exact_duplicate": message = f"File '{file.filename}' already exists with identical content. Reusing existing file." 
@@ -478,347 +489,6 @@ async def upload_file( detail=f"Error during file upload: {str(e)}" ) -# ── Folder endpoints (MUST be before /{fileId} catch-all) ───────────────────── - -@router.get("/folders", response_model=List[Dict[str, Any]]) -@limiter.limit("30/minute") -def list_folders( - request: Request, - parentId: Optional[str] = Query(None, description="Parent folder ID (omit for all folders)"), - currentUser: User = Depends(getCurrentUser), - context: RequestContext = Depends(getRequestContext) -) -> List[Dict[str, Any]]: - """List folders for the current user.""" - try: - mgmt = interfaceDbManagement.getInterface( - currentUser, - mandateId=str(context.mandateId) if context.mandateId else None, - featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None, - ) - if parentId is not None: - return mgmt.listFolders(parentId=parentId) - return mgmt.listFolders() - except Exception as e: - logger.error(f"Error listing folders: {e}") - raise HTTPException(status_code=500, detail=str(e)) - - -@router.post("/folders", status_code=status.HTTP_201_CREATED) -@limiter.limit("10/minute") -def create_folder( - request: Request, - body: Dict[str, Any] = Body(...), - currentUser: User = Depends(getCurrentUser), - context: RequestContext = Depends(getRequestContext) -) -> Dict[str, Any]: - """Create a new folder.""" - name = body.get("name", "") - parentId = body.get("parentId") - if not name: - raise HTTPException(status_code=400, detail=routeApiMsg("name is required")) - try: - mgmt = interfaceDbManagement.getInterface( - currentUser, - mandateId=str(context.mandateId) if context.mandateId else None, - featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None, - ) - return mgmt.createFolder(name=name, parentId=parentId) - except ValueError as e: - raise HTTPException(status_code=400, detail=str(e)) - except Exception as e: - logger.error(f"Error creating folder: {e}") - raise HTTPException(status_code=500, 
detail=str(e)) - - -@router.put("/folders/{folderId}") -@limiter.limit("10/minute") -def rename_folder( - request: Request, - folderId: str = Path(...), - body: Dict[str, Any] = Body(...), - currentUser: User = Depends(getCurrentUser), - context: RequestContext = Depends(getRequestContext) -) -> Dict[str, Any]: - """Rename a folder.""" - newName = body.get("name", "") - if not newName: - raise HTTPException(status_code=400, detail=routeApiMsg("name is required")) - try: - mgmt = interfaceDbManagement.getInterface( - currentUser, - mandateId=str(context.mandateId) if context.mandateId else None, - featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None, - ) - mgmt.renameFolder(folderId, newName) - return {"success": True, "folderId": folderId, "name": newName} - except ValueError as e: - raise HTTPException(status_code=400, detail=str(e)) - except Exception as e: - logger.error(f"Error renaming folder: {e}") - raise HTTPException(status_code=500, detail=str(e)) - - -@router.delete("/folders/{folderId}") -@limiter.limit("10/minute") -def delete_folder( - request: Request, - folderId: str = Path(...), - recursive: bool = Query(False, description="Delete folder contents recursively"), - currentUser: User = Depends(getCurrentUser), - context: RequestContext = Depends(getRequestContext) -) -> Dict[str, Any]: - """Delete a folder. 
Use recursive=true to delete non-empty folders.""" - try: - mgmt = interfaceDbManagement.getInterface( - currentUser, - mandateId=str(context.mandateId) if context.mandateId else None, - featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None, - ) - return mgmt.deleteFolder(folderId, recursive=recursive) - except ValueError as e: - raise HTTPException(status_code=400, detail=str(e)) - except Exception as e: - logger.error(f"Error deleting folder: {e}") - raise HTTPException(status_code=500, detail=str(e)) - - -@router.post("/folders/{folderId}/move") -@limiter.limit("10/minute") -def move_folder( - request: Request, - folderId: str = Path(...), - body: Dict[str, Any] = Body(...), - currentUser: User = Depends(getCurrentUser), - context: RequestContext = Depends(getRequestContext) -) -> Dict[str, Any]: - """Move a folder to a new parent.""" - targetParentId = body.get("targetParentId") - try: - mgmt = interfaceDbManagement.getInterface( - currentUser, - mandateId=str(context.mandateId) if context.mandateId else None, - featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None, - ) - mgmt.moveFolder(folderId, targetParentId) - return {"success": True, "folderId": folderId, "parentId": targetParentId} - except ValueError as e: - raise HTTPException(status_code=400, detail=str(e)) - except Exception as e: - logger.error(f"Error moving folder: {e}") - raise HTTPException(status_code=500, detail=str(e)) - - -@router.patch("/folders/{folderId}/scope") -@limiter.limit("10/minute") -def _updateFolderScope( - request: Request, - folderId: str = Path(..., description="ID of the folder"), - scope: str = Body(..., embed=True), - context: RequestContext = Depends(getRequestContext), -) -> Dict[str, Any]: - """Update the scope of a folder. Propagates to all files inside (recursively). 
Global scope requires sysAdmin.""" - validScopes = {"personal", "featureInstance", "mandate", "global"} - if scope not in validScopes: - raise HTTPException(status_code=400, detail=f"Invalid scope: {scope}. Must be one of {validScopes}") - if scope == "global" and not context.isSysAdmin: - raise HTTPException(status_code=403, detail=routeApiMsg("Only sysadmins can set global scope")) - try: - mgmt = interfaceDbManagement.getInterface( - context.user, - mandateId=str(context.mandateId) if context.mandateId else None, - featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None, - ) - folder = mgmt.getFolder(folderId) - if not folder: - raise HTTPException(status_code=404, detail=routeApiMsg("Folder not found")) - mgmt.updateFolder(folderId, {"scope": scope}) - fileIds = _collectFolderFileIds(mgmt, folderId) - for fid in fileIds: - try: - mgmt.updateFile(fid, {"scope": scope}) - except Exception as e: - logger.error("Folder scope propagation: failed to update file %s: %s", fid, e) - logger.info("Updated scope=%s for folder %s: %d files affected", scope, folderId, len(fileIds)) - return {"folderId": folderId, "scope": scope, "filesUpdated": len(fileIds)} - except HTTPException: - raise - except Exception as e: - logger.error(f"Error updating folder scope: {e}") - raise HTTPException(status_code=500, detail=str(e)) - - -@router.patch("/folders/{folderId}/neutralize") -@limiter.limit("10/minute") -def updateFolderNeutralize( - request: Request, - background_tasks: BackgroundTasks, - folderId: str = Path(..., description="ID of the folder"), - neutralize: bool = Body(..., embed=True), - context: RequestContext = Depends(getRequestContext), -) -> Dict[str, Any]: - """Toggle neutralization on a folder. Propagates to all files inside (recursively). - - When turning ON: all files in the folder get ``neutralize=True``, their - knowledge indexes are purged synchronously, and background re-indexing - is triggered. 
- When turning OFF: files revert to ``neutralize=False`` unless they were - individually marked (not implemented yet -- all are reverted). - """ - try: - mgmt = interfaceDbManagement.getInterface( - context.user, - mandateId=str(context.mandateId) if context.mandateId else None, - featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None, - ) - - folder = mgmt.getFolder(folderId) - if not folder: - raise HTTPException(status_code=404, detail=routeApiMsg("Folder not found")) - - mgmt.updateFolder(folderId, {"neutralize": neutralize}) - - fileIds = _collectFolderFileIds(mgmt, folderId) - logger.info("Folder neutralize toggle %s for folder %s: %d files affected", neutralize, folderId, len(fileIds)) - - from modules.interfaces.interfaceDbKnowledge import getInterface as getKnowledgeInterface - knowledgeDb = getKnowledgeInterface() - - for fid in fileIds: - try: - mgmt.updateFile(fid, {"neutralize": neutralize}) - if neutralize: - try: - knowledgeDb.deleteFileContentIndex(fid) - except Exception as e: - logger.warning("Folder neutralize: failed to purge index for file %s: %s", fid, e) - else: - try: - from modules.datamodels.datamodelKnowledge import FileContentIndex - indices = knowledgeDb.db.getRecordset(FileContentIndex, recordFilter={"id": fid}) - for idx in indices: - idxId = idx.get("id") if isinstance(idx, dict) else getattr(idx, "id", None) - if idxId: - knowledgeDb.db.recordModify(FileContentIndex, idxId, { - "neutralizationStatus": "original", - "isNeutralized": False, - }) - except Exception as e: - logger.warning("Folder neutralize OFF: metadata update failed for %s: %s", fid, e) - except Exception as e: - logger.error("Folder neutralize: failed to update file %s: %s", fid, e) - - for fid in fileIds: - fileMeta = mgmt.getFile(fid) - if fileMeta: - fn = fileMeta.fileName if hasattr(fileMeta, "fileName") else fileMeta.get("fileName", "") - mt = fileMeta.mimeType if hasattr(fileMeta, "mimeType") else fileMeta.get("mimeType", "") - 
- async def _reindex(fileId=fid, fileName=fn, mimeType=mt): - try: - await _autoIndexFile(fileId=fileId, fileName=fileName, mimeType=mimeType, user=context.user) - except Exception as ex: - logger.error("Folder neutralize re-index failed for %s: %s", fileId, ex) - - background_tasks.add_task(_reindex) - - return {"folderId": folderId, "neutralize": neutralize, "filesUpdated": len(fileIds)} - except HTTPException: - raise - except Exception as e: - logger.error(f"Error updating folder neutralize flag: {e}") - raise HTTPException(status_code=500, detail=str(e)) - - -def _collectFolderFileIds(mgmt, folderId: str) -> List[str]: - """Recursively collect all file IDs in a folder and its sub-folders.""" - fileIds = [] - try: - files = mgmt.listFiles(folderId=folderId) - if isinstance(files, dict): - files = files.get("files", []) - for f in (files or []): - fid = f.get("id") if isinstance(f, dict) else getattr(f, "id", None) - if fid: - fileIds.append(fid) - except Exception as e: - logger.warning("_collectFolderFileIds: listFiles failed for folder %s: %s", folderId, e) - - try: - subFolders = mgmt.listFolders(parentId=folderId) - for sf in (subFolders or []): - sfId = sf.get("id") if isinstance(sf, dict) else getattr(sf, "id", None) - if sfId: - fileIds.extend(_collectFolderFileIds(mgmt, sfId)) - except Exception as e: - logger.warning("_collectFolderFileIds: listFolders failed for folder %s: %s", folderId, e) - - return fileIds - - -@router.get("/folders/{folderId}/download") -@limiter.limit("10/minute") -def download_folder( - request: Request, - folderId: str = Path(..., description="ID of the folder to download as ZIP"), - currentUser: User = Depends(getCurrentUser), - context: RequestContext = Depends(getRequestContext) -) -> Response: - """Download a folder (including subfolders) as a ZIP archive.""" - import io - import zipfile - import urllib.parse - - try: - mgmt = interfaceDbManagement.getInterface( - currentUser, - mandateId=str(context.mandateId) if 
context.mandateId else None, - featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None, - ) - - folder = mgmt.getFolder(folderId) - if not folder: - raise HTTPException(status_code=404, detail=f"Folder {folderId} not found") - - folderName = folder.get("name", "download") - - def _collectFiles(parentId: str, pathPrefix: str): - """Recursively collect (zipPath, fileId) tuples.""" - entries = [] - for f in mgmt._getFilesByCurrentUser(recordFilter={"folderId": parentId}): - fname = f.get("fileName") or f.get("name") or f.get("id", "file") - entries.append((f"{pathPrefix}{fname}", f["id"])) - for sub in mgmt.listFolders(parentId=parentId): - subName = sub.get("name", sub["id"]) - entries.extend(_collectFiles(sub["id"], f"{pathPrefix}{subName}/")) - return entries - - fileEntries = _collectFiles(folderId, "") - if not fileEntries: - raise HTTPException(status_code=404, detail=routeApiMsg("Folder is empty")) - - buf = io.BytesIO() - with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf: - for zipPath, fileId in fileEntries: - data = mgmt.getFileData(fileId) - if data: - zf.writestr(zipPath, data) - - buf.seek(0) - zipBytes = buf.getvalue() - encodedName = urllib.parse.quote(f"{folderName}.zip") - - return Response( - content=zipBytes, - media_type="application/zip", - headers={ - "Content-Disposition": f"attachment; filename*=UTF-8''{encodedName}" - } - ) - except HTTPException: - raise - except Exception as e: - logger.error(f"Error downloading folder as ZIP: {e}") - raise HTTPException(status_code=500, detail=f"Error downloading folder: {str(e)}") @router.post("/batch-delete") @@ -829,13 +499,11 @@ def batch_delete_items( currentUser: User = Depends(getCurrentUser), context: RequestContext = Depends(getRequestContext) ) -> Dict[str, Any]: - """Batch delete files/folders with a single SQL-backed operation per type.""" + """Batch delete files.""" fileIds = body.get("fileIds") or [] - folderIds = body.get("folderIds") or [] - 
recursiveFolders = bool(body.get("recursiveFolders", True)) - if not isinstance(fileIds, list) or not isinstance(folderIds, list): - raise HTTPException(status_code=400, detail=routeApiMsg("fileIds and folderIds must be arrays")) + if not isinstance(fileIds, list): + raise HTTPException(status_code=400, detail=routeApiMsg("fileIds must be an array")) try: mgmt = interfaceDbManagement.getInterface( @@ -844,17 +512,12 @@ def batch_delete_items( featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None, ) - result = {"deletedFiles": 0, "deletedFolders": 0} + result = {"deletedFiles": 0} if fileIds: fileResult = mgmt.deleteFilesBatch(fileIds) result["deletedFiles"] += fileResult.get("deletedFiles", 0) - if folderIds: - folderResult = mgmt.deleteFoldersBatch(folderIds, recursive=recursiveFolders) - result["deletedFiles"] += folderResult.get("deletedFiles", 0) - result["deletedFolders"] += folderResult.get("deletedFolders", 0) - return result except ValueError as e: raise HTTPException(status_code=400, detail=str(e)) @@ -863,45 +526,189 @@ def batch_delete_items( raise HTTPException(status_code=500, detail=str(e)) -@router.post("/batch-move") -@limiter.limit("10/minute") -def batch_move_items( - request: Request, - body: Dict[str, Any] = Body(...), - currentUser: User = Depends(getCurrentUser), - context: RequestContext = Depends(getRequestContext) -) -> Dict[str, Any]: - """Batch move files/folders with a single SQL-backed operation per type.""" - fileIds = body.get("fileIds") or [] - folderIds = body.get("folderIds") or [] - targetFolderId = body.get("targetFolderId") - targetParentId = body.get("targetParentId") - - if not isinstance(fileIds, list) or not isinstance(folderIds, list): - raise HTTPException(status_code=400, detail=routeApiMsg("fileIds and folderIds must be arrays")) +# ── Group bulk endpoints ────────────────────────────────────────────────────── +def _get_group_item_ids(contextKey: str, groupId: str, appInterface) -> set: 
+ """Collect all file IDs in a group and its sub-groups from the stored groupTree.""" + from modules.routes.routeHelpers import _collectItemIds try: - mgmt = interfaceDbManagement.getInterface( + existing = appInterface.getTableGrouping(contextKey) + if not existing: + return set() + nodes = [n.model_dump() if hasattr(n, 'model_dump') else n for n in existing.rootGroups] + result = _collectItemIds(nodes, groupId) + return result or set() + except Exception as e: + logger.error(f"_get_group_item_ids failed for groupId={groupId}: {e}") + return set() + + +@router.patch("/groups/{groupId}/scope") +@limiter.limit("60/minute") +def patch_group_scope( + request: Request, + groupId: str = Path(..., description="Group ID"), + body: dict = Body(...), + currentUser: User = Depends(getCurrentUser), + context: RequestContext = Depends(getRequestContext), +): + """Set scope for all files in a group (recursive).""" + scope = body.get("scope") + if not scope: + raise HTTPException(status_code=400, detail="scope is required") + try: + import modules.interfaces.interfaceDbApp as _appIface + managementInterface = interfaceDbManagement.getInterface( currentUser, mandateId=str(context.mandateId) if context.mandateId else None, featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None, ) - - result = {"movedFiles": 0, "movedFolders": 0} - - if fileIds: - fileResult = mgmt.moveFilesBatch(fileIds, targetFolderId=targetFolderId) - result["movedFiles"] += fileResult.get("movedFiles", 0) - - if folderIds: - folderResult = mgmt.moveFoldersBatch(folderIds, targetParentId=targetParentId) - result["movedFolders"] += folderResult.get("movedFolders", 0) - - return result - except ValueError as e: - raise HTTPException(status_code=400, detail=str(e)) + appInterface = _appIface.getInterface(currentUser) + fileIds = _get_group_item_ids("files/list", groupId, appInterface) + updated = 0 + for fid in fileIds: + try: + managementInterface.updateFile(fid, {"scope": scope}) 
+ updated += 1 + except Exception as e: + logger.error(f"patch_group_scope: failed to update file {fid}: {e}") + return {"groupId": groupId, "scope": scope, "filesUpdated": updated} + except HTTPException: + raise except Exception as e: - logger.error(f"Error in batch move: {e}") + logger.error(f"patch_group_scope error: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +@router.patch("/groups/{groupId}/neutralize") +@limiter.limit("60/minute") +def patch_group_neutralize( + request: Request, + groupId: str = Path(..., description="Group ID"), + body: dict = Body(...), + currentUser: User = Depends(getCurrentUser), + context: RequestContext = Depends(getRequestContext), +): + """Toggle neutralize for all files in a group (recursive, incl. knowledge purge/reindex).""" + neutralize = body.get("neutralize") + if neutralize is None: + raise HTTPException(status_code=400, detail="neutralize is required") + try: + import modules.interfaces.interfaceDbApp as _appIface + managementInterface = interfaceDbManagement.getInterface( + currentUser, + mandateId=str(context.mandateId) if context.mandateId else None, + featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None, + ) + appInterface = _appIface.getInterface(currentUser) + fileIds = _get_group_item_ids("files/list", groupId, appInterface) + updated = 0 + for fid in fileIds: + try: + managementInterface.updateFile(fid, {"neutralize": neutralize}) + if not neutralize: + try: + from modules.interfaces import interfaceDbKnowledge + kIface = interfaceDbKnowledge.getInterface(currentUser) + kIface.purgeFileKnowledge(fid) + except Exception as ke: + logger.warning(f"patch_group_neutralize: knowledge purge failed for {fid}: {ke}") + updated += 1 + except Exception as e: + logger.error(f"patch_group_neutralize: failed for file {fid}: {e}") + return {"groupId": groupId, "neutralize": neutralize, "filesUpdated": updated} + except HTTPException: + raise + except Exception as e: + 
logger.error(f"patch_group_neutralize error: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +@router.get("/groups/{groupId}/download") +@limiter.limit("20/minute") +async def download_group_zip( + request: Request, + groupId: str = Path(..., description="Group ID"), + currentUser: User = Depends(getCurrentUser), + context: RequestContext = Depends(getRequestContext), +): + """Download all files in a group as a ZIP archive.""" + import io, zipfile + try: + import modules.interfaces.interfaceDbApp as _appIface + managementInterface = interfaceDbManagement.getInterface( + currentUser, + mandateId=str(context.mandateId) if context.mandateId else None, + featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None, + ) + appInterface = _appIface.getInterface(currentUser) + fileIds = _get_group_item_ids("files/list", groupId, appInterface) + if not fileIds: + raise HTTPException(status_code=404, detail="Group not found or empty") + buf = io.BytesIO() + with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf: + for fid in fileIds: + try: + fileMeta = managementInterface.getFile(fid) + fileData = managementInterface.getFileData(fid) + if fileMeta and fileData: + name = (fileMeta.get("fileName") if isinstance(fileMeta, dict) else getattr(fileMeta, "fileName", fid)) or fid + zf.writestr(name, fileData) + except Exception as fe: + logger.warning(f"download_group_zip: skipping file {fid}: {fe}") + buf.seek(0) + from fastapi.responses import StreamingResponse + return StreamingResponse( + buf, + media_type="application/zip", + headers={"Content-Disposition": f'attachment; filename="group-{groupId}.zip"'}, + ) + except HTTPException: + raise + except Exception as e: + logger.error(f"download_group_zip error: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +@router.delete("/groups/{groupId}") +@limiter.limit("30/minute") +def delete_group( + request: Request, + groupId: str = Path(..., description="Group ID"), + 
deleteItems: bool = Query(False, description="If true, also delete all files in the group"), + currentUser: User = Depends(getCurrentUser), + context: RequestContext = Depends(getRequestContext), +): + """Remove a group from the groupTree. Optionally delete all its files.""" + try: + import modules.interfaces.interfaceDbApp as _appIface + appInterface = _appIface.getInterface(currentUser) + fileIds = _get_group_item_ids("files/list", groupId, appInterface) + # Remove group from tree + existing = appInterface.getTableGrouping("files/list") + if existing: + from modules.routes.routeHelpers import _removeGroupFromTree + newRoots = _removeGroupFromTree([n.model_dump() if hasattr(n, 'model_dump') else n for n in existing.rootGroups], groupId) + appInterface.upsertTableGrouping("files/list", newRoots) + # Optionally delete files + deletedFiles = 0 + if deleteItems: + managementInterface = interfaceDbManagement.getInterface( + currentUser, + mandateId=str(context.mandateId) if context.mandateId else None, + featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None, + ) + for fid in fileIds: + try: + managementInterface.deleteFile(fid) + deletedFiles += 1 + except Exception as e: + logger.error(f"delete_group: failed to delete file {fid}: {e}") + return {"groupId": groupId, "deletedFiles": deletedFiles} + except HTTPException: + raise + except Exception as e: + logger.error(f"delete_group error: {e}") raise HTTPException(status_code=500, detail=str(e)) @@ -1102,7 +909,7 @@ def update_file( ) -> FileItem: """Update file info""" try: - _EDITABLE_FIELDS = {"fileName", "scope", "tags", "description", "folderId", "neutralize"} + _EDITABLE_FIELDS = {"fileName", "scope", "tags", "description", "neutralize"} safeData = {k: v for k, v in file_info.items() if k in _EDITABLE_FIELDS} if not safeData: raise HTTPException(status_code=400, detail=routeApiMsg("No editable fields provided")) @@ -1257,37 +1064,3 @@ def preview_file( ) 
-@router.post("/{fileId}/move") -@limiter.limit("10/minute") -def move_file( - request: Request, - fileId: str = Path(...), - body: Dict[str, Any] = Body(...), - currentUser: User = Depends(getCurrentUser), - context: RequestContext = Depends(getRequestContext) -) -> Dict[str, Any]: - """Move a file to a different folder.""" - targetFolderId = body.get("targetFolderId") - try: - mgmt = interfaceDbManagement.getInterface( - currentUser, - mandateId=str(context.mandateId) if context.mandateId else None, - featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None, - ) - mgmt.updateFile(fileId, {"folderId": targetFolderId}) - - if targetFolderId: - try: - targetFolder = mgmt.getFolder(targetFolderId) - folderNeut = (targetFolder.get("neutralize") if isinstance(targetFolder, dict) - else getattr(targetFolder, "neutralize", False)) if targetFolder else False - if folderNeut: - mgmt.updateFile(fileId, {"neutralize": True}) - logger.info("File %s moved to neutralized folder %s — inherited neutralize=True", fileId, targetFolderId) - except Exception as e: - logger.warning("File move: folder neutralize inheritance check failed for %s: %s", fileId, e) - - return {"success": True, "fileId": fileId, "folderId": targetFolderId} - except Exception as e: - logger.error(f"Error moving file: {e}") - raise HTTPException(status_code=500, detail=str(e)) diff --git a/modules/routes/routeHelpers.py b/modules/routes/routeHelpers.py index 0f0b8ea7..9e8644ca 100644 --- a/modules/routes/routeHelpers.py +++ b/modules/routes/routeHelpers.py @@ -750,6 +750,21 @@ def _collectAllIds(node, ids: set) -> None: _collectAllIds(child, ids) +def _removeGroupFromTree(nodes: list, groupId: str) -> list: + """Remove a group node (and all descendants) from the tree by id.""" + result = [] + for node in nodes: + nodeId = node.get("id") if isinstance(node, dict) else getattr(node, "id", None) + if nodeId == groupId: + continue # skip this node (remove it) + subGroups = 
node.get("subGroups", []) if isinstance(node, dict) else getattr(node, "subGroups", []) + filtered_sub = _removeGroupFromTree(subGroups, groupId) + if isinstance(node, dict): + node = {**node, "subGroups": filtered_sub} + result.append(node) + return result + + def handleGroupingInRequest( paginationParams: Optional[PaginationParams], interface, diff --git a/modules/serviceCenter/services/serviceAgent/coreTools/_helpers.py b/modules/serviceCenter/services/serviceAgent/coreTools/_helpers.py index 129de517..37116ee5 100644 --- a/modules/serviceCenter/services/serviceAgent/coreTools/_helpers.py +++ b/modules/serviceCenter/services/serviceAgent/coreTools/_helpers.py @@ -1,6 +1,6 @@ # Copyright (c) 2025 Patrick Motsch # All rights reserved. -"""Shared helpers for core agent tools (file scope, binary detection, temp folder).""" +"""Shared helpers for core agent tools (file scope, binary detection, group helpers).""" import logging import uuid @@ -46,39 +46,60 @@ def _looksLikeBinary(data: bytes, sampleSize: int = 1024) -> bool: return nonPrintable / len(sample) > 0.10 -def _getOrCreateInstanceFolder(chatService, featureInstanceId: str, mandateId: str = "") -> Optional[str]: - """Return the folder ID for a feature instance, creating it on first use. - - Delegates to interfaceDbManagement._ensureFeatureInstanceFolder. - AI tools call this when saving a file without an explicit folderId - so that instance-produced files land in a named folder automatically. - """ - try: - dbMgmt = chatService.interfaceDbComponent - return dbMgmt._ensureFeatureInstanceFolder(featureInstanceId, mandateId) - except Exception as e: - logger.warning(f"Could not get/create instance folder for {featureInstanceId}: {e}") - return None - - def _getOrCreateTempFolder(chatService) -> Optional[str]: - """Return the ID of the root-level 'Temp' folder, creating it if it doesn't exist.""" + """Deprecated stub: folder-based organisation has been replaced by grouping. 
+ + Returns None unconditionally so callers skip the (now removed) folderId + assignment. Remove callers incrementally and delete this stub afterwards. + """ + logger.debug("_getOrCreateTempFolder called – folder support removed, returning None") + return None + + +async def _getOrCreateInstanceGroup( + appInterface, + featureInstanceId: str, + contextKey: str = "files/list", +) -> Optional[str]: + """Return groupId of the default group for a feature instance; create if needed.""" try: - allFolders = chatService.interfaceDbComponent.listFolders() - tempFolder = next( - (f for f in allFolders - if f.get("name") == "Temp" and not f.get("parentId")), - None, - ) - if tempFolder: - return tempFolder.get("id") - newFolder = chatService.interfaceDbComponent.createFolder("Temp", parentId=None) - return newFolder.get("id") if newFolder else None + existing = appInterface.getTableGrouping(contextKey) + nodes = [ + n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n)) + for n in (existing.rootGroups if existing else []) + ] + + def _find(nds): + for nd in nds: + meta = nd.get("meta", {}) if isinstance(nd, dict) else getattr(nd, "meta", {}) + if (meta or {}).get("featureInstanceId") == featureInstanceId: + return nd.get("id") if isinstance(nd, dict) else getattr(nd, "id", None) + found = _find(nd.get("subGroups", []) if isinstance(nd, dict) else getattr(nd, "subGroups", [])) + if found: + return found + return None + + found = _find(nodes) + if found: + return found + newId = str(uuid.uuid4()) + nodes.append({"id": newId, "name": featureInstanceId, "itemIds": [], "subGroups": [], "meta": {"featureInstanceId": featureInstanceId}}) + appInterface.upsertTableGrouping(contextKey, nodes) + return newId except Exception as e: - logger.warning(f"Could not get/create Temp folder: {e}") + logger.error(f"_getOrCreateInstanceGroup: {e}") return None +async def _getOrCreateTempGroup( + appInterface, + sessionId: str, + contextKey: str = "files/list", +) 
-> Optional[str]: + """Return groupId of a temporary group for a session; create if needed.""" + return await _getOrCreateInstanceGroup(appInterface, f"_temp_{sessionId}", contextKey) + + def _attachFileAsChatDocument( services: Any, fileItem: Any, diff --git a/modules/serviceCenter/services/serviceAgent/coreTools/_workspaceTools.py b/modules/serviceCenter/services/serviceAgent/coreTools/_workspaceTools.py index e413c3f0..3b9f5945 100644 --- a/modules/serviceCenter/services/serviceAgent/coreTools/_workspaceTools.py +++ b/modules/serviceCenter/services/serviceAgent/coreTools/_workspaceTools.py @@ -11,8 +11,8 @@ from modules.serviceCenter.services.serviceAgent.toolRegistry import ToolRegistr from modules.serviceCenter.services.serviceAgent.coreTools._helpers import ( _attachFileAsChatDocument, _formatToolFileResult, - _getOrCreateInstanceFolder, - _getOrCreateTempFolder, + _getOrCreateInstanceGroup, + _getOrCreateTempGroup, _looksLikeBinary, _MAX_TOOL_RESULT_CHARS, ) @@ -169,7 +169,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services): try: chatService = services.chat files = chatService.listFiles( - folderId=args.get("folderId"), tags=args.get("tags"), search=args.get("search"), ) @@ -222,18 +221,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services): except Exception as e: return ToolResult(toolCallId="", toolName="searchInFileContent", success=False, error=str(e)) - async def _listFolders(args: Dict[str, Any], context: Dict[str, Any]): - try: - chatService = services.chat - folders = chatService.listFolders(parentId=args.get("parentId")) - folderList = "\n".join( - f"- {f.get('name', 'unnamed')} (id: {f.get('id', '?')})" - for f in folders - ) if folders else "No folders found." 
- return ToolResult(toolCallId="", toolName="listFolders", success=True, data=folderList) - except Exception as e: - return ToolResult(toolCallId="", toolName="listFolders", success=False, error=str(e)) - async def _webSearch(args: Dict[str, Any], context: Dict[str, Any]): query = args.get("query", "") if not query: @@ -271,35 +258,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services): except Exception as e: return ToolResult(toolCallId="", toolName="tagFile", success=False, error=str(e)) - async def _moveFile(args: Dict[str, Any], context: Dict[str, Any]): - fileId = args.get("fileId", "") - targetFolderId = args.get("targetFolderId") - if not fileId: - return ToolResult(toolCallId="", toolName="moveFile", success=False, error="fileId is required") - try: - chatService = services.chat - chatService.interfaceDbComponent.updateFile(fileId, {"folderId": targetFolderId}) - return ToolResult( - toolCallId="", toolName="moveFile", success=True, - data=f"File {fileId} moved to folder {targetFolderId or 'root'}" - ) - except Exception as e: - return ToolResult(toolCallId="", toolName="moveFile", success=False, error=str(e)) - - async def _createFolder(args: Dict[str, Any], context: Dict[str, Any]): - name = args.get("name", "") - if not name: - return ToolResult(toolCallId="", toolName="createFolder", success=False, error="name is required") - try: - chatService = services.chat - folder = chatService.createFolder(name=name, parentId=args.get("parentId")) - return ToolResult( - toolCallId="", toolName="createFolder", success=True, - data=f"Folder '{name}' created (id: {folder.get('id', '?')})" - ) - except Exception as e: - return ToolResult(toolCallId="", toolName="createFolder", success=False, error=str(e)) - async def _writeFile(args: Dict[str, Any], context: Dict[str, Any]): content = args.get("content", "") mode = args.get("mode", "create") @@ -354,12 +312,52 @@ def _registerWorkspaceTools(registry: ToolRegistry, services): fiId = 
context.get("featureInstanceId") or (services.featureInstanceId if services else "") if fiId: dbMgmt.updateFile(fileItem.id, {"featureInstanceId": fiId}) - if args.get("folderId"): - dbMgmt.updateFile(fileItem.id, {"folderId": args["folderId"]}) + if args.get("groupId"): + try: + appIface = chatService.interfaceDbApp + existing = appIface.getTableGrouping("files/list") + nodes = [n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n)) for n in (existing.rootGroups if existing else [])] + def _addToGroup(nds, gid, fid): + for nd in nds: + nid = nd.get("id") if isinstance(nd, dict) else getattr(nd, "id", None) + if nid == gid: + ids = list(nd.get("itemIds", []) if isinstance(nd, dict) else getattr(nd, "itemIds", [])) + if fid not in ids: + ids.append(fid) + if isinstance(nd, dict): + nd["itemIds"] = ids + return True + if _addToGroup(nd.get("subGroups", []) if isinstance(nd, dict) else getattr(nd, "subGroups", []), gid, fid): + return True + return False + _addToGroup(nodes, args["groupId"], fileItem.id) + appIface.upsertTableGrouping("files/list", nodes) + except Exception as _ge: + logger.warning(f"writeFile: failed to add file to group {args['groupId']}: {_ge}") elif fiId: - instanceFolderId = _getOrCreateInstanceFolder(chatService, fiId, context.get("mandateId", "")) - if instanceFolderId: - dbMgmt.updateFile(fileItem.id, {"folderId": instanceFolderId}) + try: + appIface = chatService.interfaceDbApp + instanceGroupId = await _getOrCreateInstanceGroup(appIface, fiId) + if instanceGroupId: + existing = appIface.getTableGrouping("files/list") + nodes = [n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n)) for n in (existing.rootGroups if existing else [])] + def _addToGroup2(nds, gid, fid): + for nd in nds: + nid = nd.get("id") if isinstance(nd, dict) else getattr(nd, "id", None) + if nid == gid: + ids = list(nd.get("itemIds", []) if isinstance(nd, dict) else getattr(nd, "itemIds", [])) + if fid not 
in ids: + ids.append(fid) + if isinstance(nd, dict): + nd["itemIds"] = ids + return True + if _addToGroup2(nd.get("subGroups", []) if isinstance(nd, dict) else getattr(nd, "subGroups", []), gid, fid): + return True + return False + _addToGroup2(nodes, instanceGroupId, fileItem.id) + appIface.upsertTableGrouping("files/list", nodes) + except Exception as _ge: + logger.warning(f"writeFile: failed to add file to instance group for {fiId}: {_ge}") if args.get("tags"): dbMgmt.updateFile(fileItem.id, {"tags": args["tags"]}) @@ -412,13 +410,13 @@ def _registerWorkspaceTools(registry: ToolRegistry, services): registry.register( "listFiles", _listFiles, description=( - "List files in the local workspace. Filter by folder, tags, or search term. " + "List files in the local workspace. Filter by tags or search term. " + "To filter by group, use listItemsInGroup. " "For external data sources, use browseDataSource instead." ), parameters={ "type": "object", "properties": { - "folderId": {"type": "string", "description": "Filter by folder ID"}, "tags": {"type": "array", "items": {"type": "string"}, "description": "Filter by tags (any match)"}, "search": {"type": "string", "description": "Search in file names and descriptions"}, } @@ -445,18 +443,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services): readOnly=True ) - registry.register( - "listFolders", _listFolders, - description="List folders in the local workspace. For external data sources, use browseDataSource instead.", - parameters={ - "type": "object", - "properties": { - "parentId": {"type": "string", "description": "Parent folder ID (omit for root)"}, - } - }, - readOnly=True - ) - registry.register( "webSearch", _webSearch, description="Search the web for general information. 
Use readUrl to fetch content from a known URL instead.", @@ -482,34 +468,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services): readOnly=False ) - registry.register( - "moveFile", _moveFile, - description="Move a file to a different folder in the local workspace.", - parameters={ - "type": "object", - "properties": { - "fileId": {"type": "string", "description": "The file ID to move"}, - "targetFolderId": {"type": "string", "description": "Target folder ID (null for root)"}, - }, - "required": ["fileId"] - }, - readOnly=False - ) - - registry.register( - "createFolder", _createFolder, - description="Create a new folder in the local workspace.", - parameters={ - "type": "object", - "properties": { - "name": {"type": "string", "description": "Folder name"}, - "parentId": {"type": "string", "description": "Parent folder ID (omit for root)"}, - }, - "required": ["name"] - }, - readOnly=False - ) - registry.register( "writeFile", _writeFile, description=( @@ -530,7 +488,7 @@ def _registerWorkspaceTools(registry: ToolRegistry, services): "content": {"type": "string", "description": "Content to write/append"}, "mode": {"type": "string", "enum": ["create", "append", "overwrite"], "description": "Write mode (default: create)"}, "fileId": {"type": "string", "description": "File ID (required for mode=append/overwrite)"}, - "folderId": {"type": "string", "description": "Target folder ID (mode=create only)"}, + "groupId": {"type": "string", "description": "Group ID to place the file in (mode=create only). 
Omit to use the instance default group."}, "tags": {"type": "array", "items": {"type": "string"}, "description": "Tags (mode=create only)"}, }, "required": ["content"] @@ -690,55 +648,7 @@ def _registerWorkspaceTools(registry: ToolRegistry, services): readOnly=True ) - # ---- Phase 2: deleteFolder, renameFolder, moveFolder, copyFile, editFile ---- - - async def _deleteFolder(args: Dict[str, Any], context: Dict[str, Any]): - folderId = args.get("folderId", "") - recursive = args.get("recursive", False) - if not folderId: - return ToolResult(toolCallId="", toolName="deleteFolder", success=False, error="folderId is required") - try: - chatService = services.chat - result = chatService.interfaceDbComponent.deleteFolder(folderId, recursive=recursive) - summary = f"Deleted {result.get('deletedFolders', 1)} folder(s) and {result.get('deletedFiles', 0)} file(s)" - return ToolResult( - toolCallId="", toolName="deleteFolder", success=True, data=summary, - sideEvents=[{"type": "folderDeleted", "data": {"folderId": folderId, **result}}], - ) - except Exception as e: - return ToolResult(toolCallId="", toolName="deleteFolder", success=False, error=str(e)) - - async def _renameFolder(args: Dict[str, Any], context: Dict[str, Any]): - folderId = args.get("folderId", "") - newName = args.get("newName", "") - if not folderId or not newName: - return ToolResult(toolCallId="", toolName="renameFolder", success=False, error="folderId and newName are required") - try: - chatService = services.chat - chatService.interfaceDbComponent.renameFolder(folderId, newName) - return ToolResult( - toolCallId="", toolName="renameFolder", success=True, - data=f"Folder {folderId} renamed to '{newName}'", - sideEvents=[{"type": "folderUpdated", "data": {"folderId": folderId, "name": newName}}], - ) - except Exception as e: - return ToolResult(toolCallId="", toolName="renameFolder", success=False, error=str(e)) - - async def _moveFolder(args: Dict[str, Any], context: Dict[str, Any]): - folderId = 
args.get("folderId", "") - targetParentId = args.get("targetParentId") - if not folderId: - return ToolResult(toolCallId="", toolName="moveFolder", success=False, error="folderId is required") - try: - chatService = services.chat - chatService.interfaceDbComponent.moveFolder(folderId, targetParentId) - return ToolResult( - toolCallId="", toolName="moveFolder", success=True, - data=f"Folder {folderId} moved to {targetParentId or 'root'}", - sideEvents=[{"type": "folderUpdated", "data": {"folderId": folderId, "parentId": targetParentId}}], - ) - except Exception as e: - return ToolResult(toolCallId="", toolName="moveFolder", success=False, error=str(e)) + # ---- Phase 2: copyFile, editFile ---- async def _copyFile(args: Dict[str, Any], context: Dict[str, Any]): fileId = args.get("fileId", "") @@ -748,7 +658,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services): chatService = services.chat copiedFile = chatService.interfaceDbComponent.copyFile( fileId, - targetFolderId=args.get("targetFolderId"), newFileName=args.get("newFileName"), ) return ToolResult( @@ -823,48 +732,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services): except Exception as e: return ToolResult(toolCallId="", toolName="replaceInFile", success=False, error=str(e)) - registry.register( - "deleteFolder", _deleteFolder, - description="Delete a folder from the local workspace. Set recursive=true to delete all contents.", - parameters={ - "type": "object", - "properties": { - "folderId": {"type": "string", "description": "The folder ID to delete"}, - "recursive": {"type": "boolean", "description": "If true, delete folder and all contents (files and subfolders). 
Default: false"}, - }, - "required": ["folderId"] - }, - readOnly=False - ) - - registry.register( - "renameFolder", _renameFolder, - description="Rename a folder in the local workspace.", - parameters={ - "type": "object", - "properties": { - "folderId": {"type": "string", "description": "The folder ID to rename"}, - "newName": {"type": "string", "description": "New folder name"}, - }, - "required": ["folderId", "newName"] - }, - readOnly=False - ) - - registry.register( - "moveFolder", _moveFolder, - description="Move a folder to a different parent in the local workspace.", - parameters={ - "type": "object", - "properties": { - "folderId": {"type": "string", "description": "The folder ID to move"}, - "targetParentId": {"type": "string", "description": "Target parent folder ID (null/omit for root)"}, - }, - "required": ["folderId"] - }, - readOnly=False - ) - registry.register( "copyFile", _copyFile, description="Create an independent copy of a file in the local workspace.", @@ -872,7 +739,6 @@ def _registerWorkspaceTools(registry: ToolRegistry, services): "type": "object", "properties": { "fileId": {"type": "string", "description": "The file ID to copy"}, - "targetFolderId": {"type": "string", "description": "Target folder for the copy (default: same folder)"}, "newFileName": {"type": "string", "description": "New file name (default: same name, auto-numbered if duplicate)"}, }, "required": ["fileId"] @@ -880,6 +746,137 @@ def _registerWorkspaceTools(registry: ToolRegistry, services): readOnly=False ) + # ---- Group tools (replaces folder-based tools) ---- + + async def _listGroups(args: Dict[str, Any], context: Dict[str, Any]): + contextKey = args.get("contextKey", "files/list") + try: + chatService = services.chat + appInterface = chatService.interfaceDbApp + existing = appInterface.getTableGrouping(contextKey) + if not existing: + return ToolResult(toolCallId="", toolName="listGroups", success=True, data="No groups found.") + + def _flatten(nodes, depth=0): + 
result = [] + for n in nodes: + nd = n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n)) + result.append({"id": nd.get("id"), "name": nd.get("name"), "depth": depth, "itemCount": len(nd.get("itemIds", []))}) + result.extend(_flatten(nd.get("subGroups", []), depth + 1)) + return result + + groups = _flatten(existing.rootGroups) + lines = "\n".join( + f"{' ' * g['depth']}- {g['name']} (id: {g['id']}, items: {g['itemCount']})" + for g in groups + ) if groups else "No groups found." + return ToolResult(toolCallId="", toolName="listGroups", success=True, data=lines) + except Exception as e: + return ToolResult(toolCallId="", toolName="listGroups", success=False, error=str(e)) + + async def _listItemsInGroup(args: Dict[str, Any], context: Dict[str, Any]): + groupId = args.get("groupId", "") + contextKey = args.get("contextKey", "files/list") + if not groupId: + return ToolResult(toolCallId="", toolName="listItemsInGroup", success=False, error="groupId is required") + try: + from modules.routes.routeHelpers import _collectItemIds + chatService = services.chat + appInterface = chatService.interfaceDbApp + existing = appInterface.getTableGrouping(contextKey) + if not existing: + return ToolResult(toolCallId="", toolName="listItemsInGroup", success=True, data="No groups found.") + nodes = [n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n)) for n in existing.rootGroups] + ids = _collectItemIds(nodes, groupId) + itemList = list(ids) if ids else [] + return ToolResult( + toolCallId="", toolName="listItemsInGroup", success=True, + data="\n".join(f"- {fid}" for fid in itemList) if itemList else "No items in group.", + ) + except Exception as e: + return ToolResult(toolCallId="", toolName="listItemsInGroup", success=False, error=str(e)) + + async def _addItemsToGroup(args: Dict[str, Any], context: Dict[str, Any]): + groupId = args.get("groupId", "") + itemIds = args.get("itemIds", []) + contextKey = 
args.get("contextKey", "files/list") + if not groupId: + return ToolResult(toolCallId="", toolName="addItemsToGroup", success=False, error="groupId is required") + if not itemIds: + return ToolResult(toolCallId="", toolName="addItemsToGroup", success=False, error="itemIds is required") + try: + chatService = services.chat + appInterface = chatService.interfaceDbApp + existing = appInterface.getTableGrouping(contextKey) + nodes = [n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n)) for n in (existing.rootGroups if existing else [])] + + def _add(nds): + for nd in nds: + nid = nd.get("id") if isinstance(nd, dict) else getattr(nd, "id", None) + if nid == groupId: + existing_ids = list(nd.get("itemIds", []) if isinstance(nd, dict) else getattr(nd, "itemIds", [])) + for fid in itemIds: + if fid not in existing_ids: + existing_ids.append(fid) + if isinstance(nd, dict): + nd["itemIds"] = existing_ids + return True + if _add(nd.get("subGroups", []) if isinstance(nd, dict) else getattr(nd, "subGroups", [])): + return True + return False + + found = _add(nodes) + if not found: + return ToolResult(toolCallId="", toolName="addItemsToGroup", success=False, error=f"Group {groupId} not found") + appInterface.upsertTableGrouping(contextKey, nodes) + return ToolResult( + toolCallId="", toolName="addItemsToGroup", success=True, + data=f"Added {len(itemIds)} item(s) to group {groupId}", + ) + except Exception as e: + return ToolResult(toolCallId="", toolName="addItemsToGroup", success=False, error=str(e)) + + registry.register( + "listGroups", _listGroups, + description="List all groups in the file grouping tree. 
Groups replace folders for organising files.", + parameters={ + "type": "object", + "properties": { + "contextKey": {"type": "string", "description": "Grouping context key (default: 'files/list')"}, + } + }, + readOnly=True + ) + + registry.register( + "listItemsInGroup", _listItemsInGroup, + description="List all file IDs assigned to a specific group (includes sub-groups recursively).", + parameters={ + "type": "object", + "properties": { + "groupId": {"type": "string", "description": "The group ID to inspect"}, + "contextKey": {"type": "string", "description": "Grouping context key (default: 'files/list')"}, + }, + "required": ["groupId"] + }, + readOnly=True + ) + + registry.register( + "addItemsToGroup", _addItemsToGroup, + description="Add one or more file IDs to an existing group.", + parameters={ + "type": "object", + "properties": { + "groupId": {"type": "string", "description": "The group ID to add files to"}, + "itemIds": {"type": "array", "items": {"type": "string"}, "description": "List of file IDs to add"}, + "contextKey": {"type": "string", "description": "Grouping context key (default: 'files/list')"}, + }, + "required": ["groupId", "itemIds"] + }, + readOnly=False + ) + registry.register( "replaceInFile", _replaceInFile, description=( diff --git a/modules/serviceCenter/services/serviceAgent/mainServiceAgent.py b/modules/serviceCenter/services/serviceAgent/mainServiceAgent.py index fdf172aa..372ec5b2 100644 --- a/modules/serviceCenter/services/serviceAgent/mainServiceAgent.py +++ b/modules/serviceCenter/services/serviceAgent/mainServiceAgent.py @@ -268,24 +268,19 @@ class AgentService: info = chatService.getFileInfo(fid) if not info: - folderInfo = chatService.interfaceDbComponent.getFolder(fid) - if folderInfo: - folderName = folderInfo.get("name", fid) - folderFiles = chatService.listFiles(folderId=fid) - desc = f"### Folder: {folderName}\n - id: {fid}\n - type: folder\n - contains: {len(folderFiles)} file(s)" - if folderFiles: - desc += "\n - 
files:" - for ff in folderFiles[:30]: - ffName = ff.get("fileName", "?") - ffId = ff.get("id", "?") - ffMime = ff.get("mimeType", "?") - ffSize = ff.get("fileSize", ff.get("size", "?")) - desc += f"\n * {ffName} (id: {ffId}, type: {ffMime}, size: {ffSize} bytes)" - if len(folderFiles) > 30: - desc += f"\n ... and {len(folderFiles) - 30} more files" - desc += f'\nUse `listFiles(folderId="{fid}")` to get the full file list, then `readFile(fileId)` to read individual files.' - fileDescriptions.append(desc) - continue + # Check if fid is a group ID + try: + groupFileIds = chatService.listFilesInGroup(fid) + if groupFileIds: + allGroups = chatService.listGroups() + groupInfo = next((g for g in allGroups if g.get("id") == fid), None) + groupName = groupInfo.get("name", fid) if groupInfo else fid + desc = f"### Group: {groupName}\n - id: {fid}\n - type: group\n - contains: {len(groupFileIds)} file(s)" + desc += f'\nUse `listItemsInGroup(groupId="{fid}")` to get file IDs, then `readFile(fileId)` to read each.' + fileDescriptions.append(desc) + continue + except Exception: + pass fileDescriptions.append(f"### File id: {fid}") continue @@ -333,7 +328,7 @@ class AgentService: "These files/folders have been uploaded and processed through the extraction pipeline.\n" "Use `readFile(fileId)` to read text content, `readContentObjects(fileId)` for structured access, " "or `describeImage(fileId)` for image analysis.\n" - "For folders, use `listFiles(folderId)` to get the files inside, then `readFile(fileId)` for each.\n" + "For groups, use `listItemsInGroup(groupId)` to get the file IDs inside, then `readFile(fileId)` for each.\n" "For large PDFs/DOCX, avoid huge `renderDocument` tool JSON: build markdown with " "`writeFile` (create + append), then `renderDocument(sourceFileId=that file id, outputFormat=...)`.\n" "For small docs you may pass `content` inline. 
Embed images with `![alt](file:fileId)` in markdown.\n\n" diff --git a/modules/serviceCenter/services/serviceChat/mainServiceChat.py b/modules/serviceCenter/services/serviceChat/mainServiceChat.py index 0630c83b..73bef577 100644 --- a/modules/serviceCenter/services/serviceChat/mainServiceChat.py +++ b/modules/serviceCenter/services/serviceChat/mainServiceChat.py @@ -419,7 +419,7 @@ class ChatService: return None def getFileInfo(self, fileId: str) -> Dict[str, Any]: - """Get file information including new fields (tags, folderId, description, status).""" + """Get file information including new fields (tags, description, status).""" fileItem = self.interfaceDbComponent.getFile(fileId) if fileItem: return { @@ -430,7 +430,6 @@ class ChatService: "fileHash": fileItem.fileHash, "creationDate": fileItem.sysCreatedAt, "tags": getattr(fileItem, "tags", None), - "folderId": getattr(fileItem, "folderId", None), "description": getattr(fileItem, "description", None), "status": getattr(fileItem, "status", None), } @@ -449,14 +448,12 @@ class ChatService: def listFiles( self, - folderId: str = None, tags: List[str] = None, search: str = None, ) -> List[Dict[str, Any]]: """List files for the current user with optional filters. Args: - folderId: Filter by folder (None = root / all). tags: Filter by tags (any match). search: Search in fileName and description. 
@@ -469,10 +466,6 @@ class ChatService: allFiles = self.interfaceDbComponent.getAllFiles() results = [] for fileItem in allFiles: - if folderId is not None: - if fileItem.get("folderId") != folderId: - continue - if tags: itemTags = fileItem.get("tags") or [] if not any(t in itemTags for t in tags): @@ -492,27 +485,40 @@ class ChatService: "fileSize": fileItem.get("fileSize"), "creationDate": fileItem.get("sysCreatedAt"), "tags": fileItem.get("tags"), - "folderId": fileItem.get("folderId"), "description": fileItem.get("description"), "status": fileItem.get("status"), }) return results - def listFolders(self, parentId: str = None) -> List[Dict[str, Any]]: - """List file folders for the current user. + def listGroups(self, contextKey: str = "files/list") -> list: + """List all groups in the groupTree for the current context.""" + try: + existing = self.interfaceDbApp.getTableGrouping(contextKey) + if not existing: + return [] + def _flatten(nodes, depth=0): + result = [] + for n in nodes: + nd = n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n)) + result.append({"id": nd.get("id"), "name": nd.get("name"), "depth": depth, "itemCount": len(nd.get("itemIds", []))}) + result.extend(_flatten(nd.get("subGroups", []), depth + 1)) + return result + return _flatten(existing.rootGroups) + except Exception as e: + return [] - Args: - parentId: Optional parent folder ID to filter by. - None = return ALL folders (for tree building). - - Returns: - List of folder dicts. 
- """ - return self.interfaceDbComponent.listFolders(parentId=parentId) - - def createFolder(self, name: str, parentId: str = None) -> Dict[str, Any]: - """Create a new file folder with unique name validation.""" - return self.interfaceDbComponent.createFolder(name=name, parentId=parentId) + def listFilesInGroup(self, groupId: str, contextKey: str = "files/list") -> list: + """List file IDs in a specific group (recursive).""" + try: + from modules.routes.routeHelpers import _collectItemIds + existing = self.interfaceDbApp.getTableGrouping(contextKey) + if not existing: + return [] + nodes = [n.model_dump() if hasattr(n, "model_dump") else (n if isinstance(n, dict) else vars(n)) for n in existing.rootGroups] + ids = _collectItemIds(nodes, groupId) + return list(ids) if ids else [] + except Exception: + return [] # ---- DataSource CRUD ---- From c140bd14d45d89665b4d9412dee8e7d747cbc9ad Mon Sep 17 00:00:00 2001 From: ValueOn AG Date: Thu, 30 Apr 2026 23:54:45 +0200 Subject: [PATCH 16/18] fixed nodes handovers --- modules/datamodels/datamodelDocref.py | 10 +- .../graphicalEditor/nodeDefinitions/ai.py | 21 ++-- modules/features/trustee/mainTrustee.py | 4 +- modules/interfaces/interfaceBootstrap.py | 95 ++++++++++++++++++ modules/routes/routeAutomationWorkspace.py | 97 +++++++++++++++---- .../services/serviceChat/mainServiceChat.py | 6 -- .../methods/methodAi/actions/process.py | 66 ++++++++++++- .../workflows/methods/methodAi/methodAi.py | 17 ++++ 8 files changed, 276 insertions(+), 40 deletions(-) diff --git a/modules/datamodels/datamodelDocref.py b/modules/datamodels/datamodelDocref.py index 27ba5e2b..e20fb072 100644 --- a/modules/datamodels/datamodelDocref.py +++ b/modules/datamodels/datamodelDocref.py @@ -110,11 +110,13 @@ class DocumentReferenceList(BaseModel): # docItem:documentId references.append(DocumentItemReference(documentId=parts[0])) - # Unknown format - skip or log warning else: - # Try to parse as simple string (backward compatibility) - # Assume it's a 
label if it doesn't match known patterns - if refStr: + if not refStr: + continue + import re + if re.match(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', refStr, re.I): + references.append(DocumentItemReference(documentId=refStr)) + else: references.append(DocumentListReference(label=refStr)) return cls(references=references) diff --git a/modules/features/graphicalEditor/nodeDefinitions/ai.py b/modules/features/graphicalEditor/nodeDefinitions/ai.py index 0336e382..65e97654 100644 --- a/modules/features/graphicalEditor/nodeDefinitions/ai.py +++ b/modules/features/graphicalEditor/nodeDefinitions/ai.py @@ -24,8 +24,13 @@ AI_NODES = [ {"name": "resultType", "type": "string", "required": False, "frontendType": "select", "frontendOptions": {"options": ["txt", "json", "md", "csv", "xml", "html", "pdf", "docx", "xlsx", "pptx", "png", "jpg"]}, "description": t("Ausgabeformat"), "default": "txt"}, - {"name": "documentList", "type": "string", "required": False, "frontendType": "hidden", - "description": t("Dokumentenliste (via Wire oder DataRef)"), "default": ""}, + {"name": "documentList", "type": "DocumentList", "required": False, "frontendType": "dataRef", + "description": t("Dokumentenliste (Upstream-Output binden)"), "default": ""}, + {"name": "context", "type": "string", "required": False, "frontendType": "dataRef", + "description": t("Kontextdaten fuer den Prompt (Upstream-Output binden)"), "default": ""}, + {"name": "documentTheme", "type": "string", "required": False, "frontendType": "select", + "frontendOptions": {"options": ["general", "finance", "legal", "technical", "hr"]}, + "description": t("Dokument-Thema (Style-Hinweis fuer den Renderer)"), "default": "general"}, {"name": "simpleMode", "type": "boolean", "required": False, "frontendType": "checkbox", "description": t("Einfacher Modus"), "default": True}, ] + _AI_COMMON_PARAMS, @@ -62,8 +67,8 @@ AI_NODES = [ "label": t("Dokument zusammenfassen"), "description": t("Dokumentinhalt 
zusammenfassen"), "parameters": [ - {"name": "documentList", "type": "string", "required": True, "frontendType": "hidden", - "description": t("Dokumentenliste (via Wire oder DataRef)"), "default": ""}, + {"name": "documentList", "type": "DocumentList", "required": True, "frontendType": "dataRef", + "description": t("Dokumentenliste (Upstream-Output binden)"), "default": ""}, {"name": "summaryLength", "type": "string", "required": False, "frontendType": "select", "frontendOptions": {"options": ["brief", "medium", "detailed"]}, "description": t("Kurz, mittel oder ausführlich"), "default": "medium"}, @@ -82,8 +87,8 @@ AI_NODES = [ "label": t("Dokument übersetzen"), "description": t("Dokument in Zielsprache übersetzen"), "parameters": [ - {"name": "documentList", "type": "string", "required": True, "frontendType": "hidden", - "description": t("Dokumentenliste (via Wire oder DataRef)"), "default": ""}, + {"name": "documentList", "type": "DocumentList", "required": True, "frontendType": "dataRef", + "description": t("Dokumentenliste (Upstream-Output binden)"), "default": ""}, {"name": "targetLanguage", "type": "string", "required": True, "frontendType": "text", "description": t("Zielsprache (z.B. 
de, en, French)")}, ] + _AI_COMMON_PARAMS, @@ -101,8 +106,8 @@ AI_NODES = [ "label": t("Dokument konvertieren"), "description": t("Dokument in anderes Format konvertieren"), "parameters": [ - {"name": "documentList", "type": "string", "required": True, "frontendType": "hidden", - "description": t("Dokumentenliste (via Wire oder DataRef)"), "default": ""}, + {"name": "documentList", "type": "DocumentList", "required": True, "frontendType": "dataRef", + "description": t("Dokumentenliste (Upstream-Output binden)"), "default": ""}, {"name": "targetFormat", "type": "string", "required": True, "frontendType": "select", "frontendOptions": {"options": ["docx", "pdf", "xlsx", "csv", "txt", "html", "json", "md"]}, "description": t("Zielformat")}, diff --git a/modules/features/trustee/mainTrustee.py b/modules/features/trustee/mainTrustee.py index d8f7a804..b8ab853d 100644 --- a/modules/features/trustee/mainTrustee.py +++ b/modules/features/trustee/mainTrustee.py @@ -383,7 +383,7 @@ def _buildAnalysisWorkflowGraph(prompt: str) -> Dict[str, Any]: "parameters": { "aiPrompt": prompt + _FINANCE_STYLE_HINT, "context": {"type": "ref", "nodeId": "refresh", "path": ["data", "accountingData"]}, - "requireNeutralization": True, + "requireNeutralization": False, "simpleMode": False, }, "position": {"x": 500, "y": 0}}, ], @@ -478,7 +478,7 @@ TEMPLATE_WORKFLOWS = [ ), "resultType": "xlsx", "documentTheme": "finance", - "requireNeutralization": True, + "requireNeutralization": False, "documentList": {"type": "ref", "nodeId": "trigger", "path": ["payload", "documentList"]}, "context": {"type": "ref", "nodeId": "refresh", "path": ["data", "accountingData"]}, "simpleMode": False, diff --git a/modules/interfaces/interfaceBootstrap.py b/modules/interfaces/interfaceBootstrap.py index 4bcd0e97..b7a56a02 100644 --- a/modules/interfaces/interfaceBootstrap.py +++ b/modules/interfaces/interfaceBootstrap.py @@ -115,6 +115,10 @@ def initBootstrap(db: DatabaseConnector) -> None: # Bootstrap system 
workflow templates for graphical editor _bootstrapSystemTemplates(db) + # Sync feature template workflows (update graph of existing instance workflows + # whose templateSourceId matches a current code-defined template) + _syncFeatureTemplateWorkflows() + # Ensure billing settings and accounts exist for all mandates _bootstrapBilling() @@ -190,6 +194,97 @@ def _bootstrapSystemTemplates(db: DatabaseConnector) -> None: logger.warning(f"System workflow template bootstrap failed: {e}") +def _syncFeatureTemplateWorkflows() -> None: + """Sync existing instance-scoped workflows with current code-defined templates. + + For each feature that exposes getTemplateWorkflows(), find all AutoWorkflow + rows whose templateSourceId matches a template ID and update their graph + if the code-defined version has changed. Preserves instance-specific + fields (label, tags, targetFeatureInstanceId, invocations, active). + Idempotent, runs on every boot. + """ + import json + + try: + from modules.system.registry import loadFeatureMainModules + from modules.features.graphicalEditor.datamodelFeatureGraphicalEditor import AutoWorkflow + from modules.features.graphicalEditor.interfaceFeatureGraphicalEditor import graphicalEditorDatabase + + mainModules = loadFeatureMainModules() + + templatesBySourceId: dict = {} + for featureCode, mod in mainModules.items(): + getTemplateWorkflows = getattr(mod, "getTemplateWorkflows", None) + if not getTemplateWorkflows: + continue + try: + templates = getTemplateWorkflows() or [] + except Exception: + continue + for tpl in templates: + tplId = tpl.get("id") + if tplId: + templatesBySourceId[tplId] = tpl + + if not templatesBySourceId: + logger.info("_syncFeatureTemplateWorkflows: no templates found, skipping") + return + logger.info(f"_syncFeatureTemplateWorkflows: found {len(templatesBySourceId)} template(s): {list(templatesBySourceId.keys())}") + + greenfieldDb = DatabaseConnector( + dbHost=APP_CONFIG.get("DB_HOST", "localhost"), + 
dbDatabase=graphicalEditorDatabase, + dbUser=APP_CONFIG.get("DB_USER"), + dbPassword=APP_CONFIG.get("DB_PASSWORD_SECRET") or APP_CONFIG.get("DB_PASSWORD"), + ) + + updated = 0 + for sourceId, tpl in templatesBySourceId.items(): + instances = greenfieldDb.getRecordset(AutoWorkflow, recordFilter={ + "templateSourceId": sourceId, + "isTemplate": False, + }) + if not instances: + continue + + canonicalGraph = tpl.get("graph", {}) + + for inst in instances: + instId = inst.get("id") if isinstance(inst, dict) else getattr(inst, "id", None) + targetInstanceId = ( + inst.get("targetFeatureInstanceId") if isinstance(inst, dict) + else getattr(inst, "targetFeatureInstanceId", None) + ) or "" + + graphJson = json.dumps(canonicalGraph) + graphJson = graphJson.replace("{{featureInstanceId}}", targetInstanceId) + newGraph = json.loads(graphJson) + + existingGraph = inst.get("graph") if isinstance(inst, dict) else getattr(inst, "graph", None) + if isinstance(existingGraph, str): + try: + existingGraph = json.loads(existingGraph) + except Exception: + existingGraph = None + + if existingGraph == newGraph: + logger.debug(f"_syncFeatureTemplateWorkflows: graph unchanged for workflow {instId} (template={sourceId})") + continue + logger.debug(f"_syncFeatureTemplateWorkflows: graph DIFFERS for workflow {instId} (template={sourceId}), updating") + + greenfieldDb.recordModify(AutoWorkflow, instId, {"graph": newGraph}) + updated += 1 + logger.info(f"_syncFeatureTemplateWorkflows: updated graph for workflow {instId} (template={sourceId})") + + if updated: + logger.info(f"_syncFeatureTemplateWorkflows: synced {updated} workflow(s) with current templates") + else: + logger.info("_syncFeatureTemplateWorkflows: all instance graphs already match current templates") + greenfieldDb.close() + except Exception as e: + logger.warning(f"Feature template workflow sync failed: {e}") + + def _buildSystemTemplates(): """Build the graph definitions for platform system templates.""" return [ diff --git 
a/modules/routes/routeAutomationWorkspace.py b/modules/routes/routeAutomationWorkspace.py index 6efbdeb6..b742d7ea 100644 --- a/modules/routes/routeAutomationWorkspace.py +++ b/modules/routes/routeAutomationWorkspace.py @@ -58,6 +58,36 @@ def _getUserAccessibleInstanceIds(userId: str) -> list[str]: ] +_FILE_REF_KEYS = ("fileId", "documentId", "fileIds", "documents") + + +def _extractFileIdsFromValue(value, accumulator: set[str]) -> None: + """Recursively scan a value (dict/list/str) for file id references.""" + if isinstance(value, dict): + for key, sub in value.items(): + if key in _FILE_REF_KEYS: + _collectFileIdsFromRef(sub, accumulator) + else: + _extractFileIdsFromValue(sub, accumulator) + elif isinstance(value, list): + for item in value: + _extractFileIdsFromValue(item, accumulator) + + +def _collectFileIdsFromRef(val, accumulator: set[str]) -> None: + """Add file ids from a value located under a known file-reference key.""" + if isinstance(val, str) and val: + accumulator.add(val) + elif isinstance(val, list): + for v in val: + if isinstance(v, str) and v: + accumulator.add(v) + elif isinstance(v, dict) and v.get("id"): + accumulator.add(v["id"]) + elif isinstance(val, dict) and val.get("id"): + accumulator.add(val["id"]) + + @router.get("") @limiter.limit("60/minute") def listWorkspaceRuns( @@ -198,40 +228,68 @@ def getWorkspaceRunDetail( steps = [dict(s) for s in stepRecords] steps.sort(key=lambda s: s.get("startedAt") or 0) - fileItems: list = [] + allFileIds: set[str] = set() + perStepFileIds: list[tuple[set[str], set[str]]] = [] + for step in steps: + inputIds: set[str] = set() + outputIds: set[str] = set() + _extractFileIdsFromValue(step.get("inputSnapshot") or {}, inputIds) + _extractFileIdsFromValue(step.get("output") or {}, outputIds) + perStepFileIds.append((inputIds, outputIds)) + allFileIds.update(inputIds) + allFileIds.update(outputIds) + + nodeOutputs = run.get("nodeOutputs") or {} + runLevelIds: set[str] = set() + 
_extractFileIdsFromValue(nodeOutputs, runLevelIds) + allFileIds.update(runLevelIds) + + fileMetaById: dict[str, dict] = {} try: from modules.datamodels.datamodelFiles import FileItem from modules.interfaces.interfaceDbManagement import ComponentObjects mgmtDb = ComponentObjects().db if mgmtDb._ensureTableExists(FileItem): - nodeOutputs = run.get("nodeOutputs") or {} - fileIds: set[str] = set() - for nodeId, output in nodeOutputs.items(): - if not isinstance(output, dict): - continue - for key in ("fileId", "documentId", "fileIds", "documents"): - val = output.get(key) - if isinstance(val, str) and val: - fileIds.add(val) - elif isinstance(val, list): - for v in val: - if isinstance(v, str) and v: - fileIds.add(v) - elif isinstance(v, dict) and v.get("id"): - fileIds.add(v["id"]) - for fid in fileIds: + for fid in allFileIds: try: rec = mgmtDb.getRecord(FileItem, fid) if rec: - fileItems.append(dict(rec)) + recDict = dict(rec) + fileMetaById[fid] = { + "id": fid, + "fileName": recDict.get("fileName") or recDict.get("name"), + } except Exception: pass except Exception as e: logger.warning("getWorkspaceRunDetail: file lookup failed: %s", e) + def _resolveFileList(ids: set[str]) -> list[dict]: + return [fileMetaById[fid] for fid in ids if fid in fileMetaById] + + assignedFileIds: set[str] = set() + for step, (inputIds, outputIds) in zip(steps, perStepFileIds): + step["inputFiles"] = _resolveFileList(inputIds) + step["outputFiles"] = _resolveFileList(outputIds) + assignedFileIds.update(inputIds) + assignedFileIds.update(outputIds) + + unassignedFiles = _resolveFileList(allFileIds - assignedFileIds) + allFiles = _resolveFileList(allFileIds) + run["workflowLabel"] = run.get("label") or workflow.get("label") or wfId run["targetFeatureInstanceId"] = tid + targetInstanceLabel = None + if tid: + try: + from modules.routes.routeHelpers import resolveInstanceLabels + labelMap = resolveInstanceLabels([tid]) + targetInstanceLabel = labelMap.get(tid) + except Exception: + pass + 
run["targetInstanceLabel"] = targetInstanceLabel + return { "run": run, "workflow": { @@ -242,5 +300,6 @@ def getWorkspaceRunDetail( "tags": workflow.get("tags", []), } if workflow else None, "steps": steps, - "files": fileItems, + "files": allFiles, + "unassignedFiles": unassignedFiles, } diff --git a/modules/serviceCenter/services/serviceChat/mainServiceChat.py b/modules/serviceCenter/services/serviceChat/mainServiceChat.py index 0630c83b..077596b8 100644 --- a/modules/serviceCenter/services/serviceChat/mainServiceChat.py +++ b/modules/serviceCenter/services/serviceChat/mainServiceChat.py @@ -199,13 +199,8 @@ class ChatService: label = parts[1] messageFound = None for message in workflow.messages: - # Validate message belongs to this workflow msgWorkflowId = getattr(message, 'workflowId', None) if not msgWorkflowId or msgWorkflowId != workflowId: - if msgWorkflowId: - logger.warning(f"Message {message.id} has workflowId {msgWorkflowId} but belongs to workflow {workflowId}. Skipping.") - else: - logger.warning(f"Message {message.id} has no workflowId. Skipping.") continue msgLabel = getattr(message, 'documentsLabel', None) @@ -213,7 +208,6 @@ class ChatService: messageFound = message break - # If found, add documents if messageFound and messageFound.documents: allDocuments.extend(messageFound.documents) else: diff --git a/modules/workflows/methods/methodAi/actions/process.py b/modules/workflows/methods/methodAi/actions/process.py index d82ac4f7..50500929 100644 --- a/modules/workflows/methods/methodAi/actions/process.py +++ b/modules/workflows/methods/methodAi/actions/process.py @@ -73,6 +73,47 @@ def _action_docs_to_content_parts(services, docs: List[Any]) -> List[ContentPart logger.info(f"ai.process: Extracted {len(ec.parts)} parts from {name} (no persistence)") return all_parts +def _resolve_file_refs_to_content_parts(services, fileIdRefs) -> List[ContentPart]: + """Fetch files by ID from the file store and extract content. 
+ Used for automation2 workflows where documents are file-store references, + not chat message attachments.""" + from modules.datamodels.datamodelExtraction import ExtractionOptions, MergeStrategy + + mgmt = getattr(services, 'interfaceDbComponent', None) + extraction = getattr(services, 'extraction', None) + if not mgmt or not extraction: + logger.warning("_resolve_file_refs_to_content_parts: missing interfaceDbComponent or extraction service") + return [] + + allParts: List[ContentPart] = [] + opts = ExtractionOptions(prompt="", mergeStrategy=MergeStrategy()) + for ref in fileIdRefs: + fileId = ref.documentId + fileMeta = mgmt.getFile(fileId) + if not fileMeta: + logger.warning(f"_resolve_file_refs_to_content_parts: file {fileId} not found") + continue + fileData = mgmt.getFileData(fileId) + if not fileData: + logger.warning(f"_resolve_file_refs_to_content_parts: no data for file {fileId}") + continue + fileName = getattr(fileMeta, 'fileName', fileId) + mimeType = getattr(fileMeta, 'mimeType', 'application/octet-stream') + ec = extraction.extractContentFromBytes( + documentBytes=fileData, + fileName=fileName, + mimeType=mimeType, + documentId=fileId, + options=opts, + ) + for p in ec.parts: + if p.data or getattr(p, "typeGroup", "") == "image": + p.metadata.setdefault("originalFileName", fileName) + allParts.append(p) + logger.info(f"_resolve_file_refs_to_content_parts: extracted {len(ec.parts)} parts from {fileName}") + return allParts + + async def process(self, parameters: Dict[str, Any]) -> ActionResult: operationId = None try: @@ -129,6 +170,17 @@ async def process(self, parameters: Dict[str, Any]) -> ActionResult: f"ai.process: Coerced documentList ({type(documentListParam).__name__}) " f"to DocumentReferenceList with {len(documentList.references)} references" ) + + # Resolve DocumentItemReferences (file-ID refs from automation2) directly + # from the file store. These cannot be resolved via chat messages. 
+ from modules.datamodels.datamodelDocref import DocumentItemReference + fileIdRefs = [r for r in documentList.references if isinstance(r, DocumentItemReference)] + if fileIdRefs: + extractedParts = _resolve_file_refs_to_content_parts(self.services, fileIdRefs) + if extractedParts: + inline_content_parts = (inline_content_parts or []) + extractedParts + remaining = [r for r in documentList.references if not isinstance(r, DocumentItemReference)] + documentList = DocumentReferenceList(references=remaining) # Optional: if omitted, formats determined from prompt. Default "txt" is validation fallback only. resultType = parameters.get("resultType") @@ -157,7 +209,19 @@ async def process(self, parameters: Dict[str, Any]) -> ActionResult: mimeMap = {"txt": "text/plain", "json": "application/json", "html": "text/html", "md": "text/markdown", "csv": "text/csv", "xml": "application/xml"} output_mime_type = mimeMap.get(normalized_result_type, "text/plain") if normalized_result_type else "text/plain" - + + # Normalize context: workflow refs may resolve to dict/list instead of str + paramContext = parameters.get("context") + if paramContext is not None and not isinstance(paramContext, str): + try: + paramContext = json.dumps(paramContext, ensure_ascii=False, default=str) + parameters["context"] = paramContext + logger.info(f"ai.process: Serialized non-string context ({type(parameters.get('context')).__name__}) to JSON ({len(paramContext)} chars)") + except Exception as e: + logger.warning(f"ai.process: Failed to serialize context: {e}") + paramContext = str(paramContext) + parameters["context"] = paramContext + # Phase 7.3: Pass documentList and/or contentParts to AI service contentParts: Optional[List[ContentPart]] = inline_content_parts if "contentParts" in parameters and not inline_content_parts: diff --git a/modules/workflows/methods/methodAi/methodAi.py b/modules/workflows/methods/methodAi/methodAi.py index 5265f5c9..ecd60b12 100644 --- 
a/modules/workflows/methods/methodAi/methodAi.py +++ b/modules/workflows/methods/methodAi/methodAi.py @@ -56,6 +56,23 @@ class MethodAi(MethodBase): required=False, description="Document reference(s) in any format to use as input/context" ), + "context": WorkflowActionParameter( + name="context", + type="str", + frontendType=FrontendType.TEXTAREA, + required=False, + default="", + description="Additional context data (string or upstream-bound dict/list, e.g. accounting data) appended to the prompt. Non-string values are JSON-serialized." + ), + "documentTheme": WorkflowActionParameter( + name="documentTheme", + type="str", + frontendType=FrontendType.SELECT, + frontendOptions=["general", "finance", "legal", "technical", "hr"], + required=False, + default="general", + description="Style hint for the document renderer (e.g. finance, legal). Used by the AI agent to choose colors and layout." + ), "resultType": WorkflowActionParameter( name="resultType", type="str", From 3da6e24bec2e9de2dd45bdaea02bea362865e180 Mon Sep 17 00:00:00 2001 From: ValueOn AG Date: Sun, 3 May 2026 22:03:29 +0200 Subject: [PATCH 17/18] fixed component formgeneratortree and truastee workflows --- .../providerClickup/connectorClickup.py | 3 + modules/datamodels/datamodelDocref.py | 18 +- modules/datamodels/datamodelFiles.py | 74 ++++ modules/interfaces/interfaceDbManagement.py | 239 ++++++++++- modules/interfaces/interfaceRbac.py | 1 + modules/migrations/_archive/README.md | 11 + modules/migrations/_archive/__init__.py | 1 + .../migrate_folders_to_groups.py | 47 ++- modules/routes/routeClickup.py | 4 +- modules/routes/routeDataFiles.py | 337 ++++++++++++++- modules/routes/routeSharepoint.py | 2 +- .../serviceAgent/actionToolAdapter.py | 83 +++- .../coreTools/_dataSourceTools.py | 5 +- .../serviceAgent/coreTools/_mediaTools.py | 13 +- .../services/serviceAgent/mainServiceAgent.py | 17 +- .../services/serviceAgent/sandboxExecutor.py | 45 +- .../serviceClickup/mainServiceClickup.py | 16 + 
.../renderers/rendererCodeCsv.py | 2 +- .../renderers/rendererCodeJson.py | 2 +- .../renderers/rendererCodeXml.py | 2 +- .../renderers/rendererCsv.py | 2 +- .../renderers/rendererImage.py | 2 +- .../renderers/rendererJson.py | 2 +- .../renderers/rendererMarkdown.py | 2 +- .../renderers/rendererText.py | 2 +- .../methods/methodAi/actions/process.py | 28 +- .../methodClickup/actions/list_tasks.py | 24 +- .../methods/methodClickup/methodClickup.py | 35 ++ scripts/stage0_filefolder_schema_check.py | 58 +++ tests/unit/interfaces/test_folderRbac.py | 327 +++++++++++++++ tests/unit/routes/test_folder_crud.py | 392 ++++++++++++++++++ 31 files changed, 1727 insertions(+), 69 deletions(-) create mode 100644 modules/migrations/_archive/README.md create mode 100644 modules/migrations/_archive/__init__.py rename modules/migrations/{ => _archive}/migrate_folders_to_groups.py (86%) create mode 100644 scripts/stage0_filefolder_schema_check.py create mode 100644 tests/unit/interfaces/test_folderRbac.py create mode 100644 tests/unit/routes/test_folder_crud.py diff --git a/modules/connectors/providerClickup/connectorClickup.py b/modules/connectors/providerClickup/connectorClickup.py index f8b4fae1..10517db2 100644 --- a/modules/connectors/providerClickup/connectorClickup.py +++ b/modules/connectors/providerClickup/connectorClickup.py @@ -210,6 +210,9 @@ class ClickupListsAdapter(ServiceAdapter): data = await self._svc.getTask(task_id) if isinstance(data, dict) and data.get("error"): return json.dumps(data).encode("utf-8") + returnedId = data.get("id", "") if isinstance(data, dict) else "" + if returnedId and returnedId != task_id: + logger.warning(f"ClickUp download: requested task_id={task_id} but API returned id={returnedId}") payload = json.dumps(data, indent=2).encode("utf-8") return DownloadResult(data=payload, fileName=f"task-{task_id}.json", mimeType="application/json") diff --git a/modules/datamodels/datamodelDocref.py b/modules/datamodels/datamodelDocref.py index 
e20fb072..f4ce09aa 100644 --- a/modules/datamodels/datamodelDocref.py +++ b/modules/datamodels/datamodelDocref.py @@ -155,9 +155,12 @@ def coerceDocumentReferenceList(value: Any) -> DocumentReferenceList: return coerceDocumentReferenceList(value[innerKey]) docId = value.get("documentId") or value.get("id") if docId: + docIdStr = str(docId) + if docIdStr.startswith("docItem:") or docIdStr.startswith("docList:"): + return DocumentReferenceList.from_string_list([docIdStr]) return DocumentReferenceList(references=[ DocumentItemReference( - documentId=str(docId), + documentId=docIdStr, fileName=value.get("fileName") or value.get("name"), ) ]) @@ -180,10 +183,15 @@ def coerceDocumentReferenceList(value: Any) -> DocumentReferenceList: continue docId = item.get("documentId") or item.get("id") if docId: - references.append(DocumentItemReference( - documentId=str(docId), - fileName=item.get("fileName") or item.get("name"), - )) + docIdStr = str(docId) + if docIdStr.startswith("docItem:") or docIdStr.startswith("docList:"): + parsed = DocumentReferenceList.from_string_list([docIdStr]) + references.extend(parsed.references) + else: + references.append(DocumentItemReference( + documentId=docIdStr, + fileName=item.get("fileName") or item.get("name"), + )) elif item.get("label"): references.append(DocumentListReference( label=str(item["label"]), diff --git a/modules/datamodels/datamodelFiles.py b/modules/datamodels/datamodelFiles.py index 2a547b9c..6adf6642 100644 --- a/modules/datamodels/datamodelFiles.py +++ b/modules/datamodels/datamodelFiles.py @@ -10,6 +10,69 @@ import uuid import base64 +@i18nModel("Ordner") +class FileFolder(PowerOnModel): + """Persistenter Datei-Ordner im Management-DB-Kontext (RBAC wie FileItem).""" + + id: str = Field( + default_factory=lambda: str(uuid.uuid4()), + description="Primary key", + json_schema_extra={"label": "ID", "frontend_type": "text", "frontend_readonly": True, "frontend_required": False}, + ) + name: str = Field( + description="Display 
name of the folder", + json_schema_extra={"label": "Name", "frontend_type": "text", "frontend_readonly": False, "frontend_required": True}, + ) + parentId: Optional[str] = Field( + default=None, + description="Parent folder id; empty or None for root", + json_schema_extra={ + "label": "Uebergeordneter Ordner", + "frontend_type": "text", + "frontend_readonly": False, + "frontend_required": False, + "fk_target": {"db": "poweron_management", "table": "FileFolder", "labelField": "name"}, + }, + ) + mandateId: Optional[str] = Field( + default="", + description="ID of the mandate this folder belongs to", + json_schema_extra={ + "label": "Mandant", + "frontend_type": "text", + "frontend_readonly": True, + "frontend_required": False, + "fk_target": {"db": "poweron_app", "table": "Mandate", "labelField": "label"}, + }, + ) + featureInstanceId: Optional[str] = Field( + default="", + description="ID of the feature instance this folder belongs to", + json_schema_extra={ + "label": "Feature-Instanz", + "frontend_type": "text", + "frontend_readonly": True, + "frontend_required": False, + "fk_target": {"db": "poweron_app", "table": "FeatureInstance", "labelField": "label"}, + }, + ) + scope: str = Field( + default="personal", + description="Data visibility scope: personal, featureInstance, mandate, global", + json_schema_extra={"label": "Sichtbarkeit", "frontend_type": "select", "frontend_readonly": False, "frontend_required": False, "frontend_options": [ + {"value": "personal", "label": "Persönlich"}, + {"value": "featureInstance", "label": "Feature-Instanz"}, + {"value": "mandate", "label": "Mandant"}, + {"value": "global", "label": "Global"}, + ]}, + ) + neutralize: bool = Field( + default=False, + description="Whether files in this folder should be neutralized before AI processing", + json_schema_extra={"label": "Neutralisieren", "frontend_type": "checkbox", "frontend_readonly": False, "frontend_required": False}, + ) + + @i18nModel("Datei") class FileItem(PowerOnModel): 
"""Metadaten einer gespeicherten Datei.""" @@ -44,6 +107,17 @@ class FileItem(PowerOnModel): "fk_target": {"db": "poweron_app", "table": "FeatureInstance", "labelField": "label"}, }, ) + folderId: Optional[str] = Field( + default=None, + description="ID of the folder containing this file (if any)", + json_schema_extra={ + "label": "Ordner", + "frontend_type": "text", + "frontend_readonly": False, + "frontend_required": False, + "fk_target": {"db": "poweron_management", "table": "FileFolder", "labelField": "name"}, + }, + ) mimeType: str = Field( description="MIME type of the file", json_schema_extra={"label": "MIME-Typ", "frontend_type": "text", "frontend_readonly": True, "frontend_required": False}, diff --git a/modules/interfaces/interfaceDbManagement.py b/modules/interfaces/interfaceDbManagement.py index b263c98b..120aecce 100644 --- a/modules/interfaces/interfaceDbManagement.py +++ b/modules/interfaces/interfaceDbManagement.py @@ -19,7 +19,7 @@ from modules.interfaces.interfaceRbac import getRecordsetWithRBAC, getRecordsetP from modules.security.rbac import RbacClass from modules.datamodels.datamodelRbac import AccessRuleContext from modules.datamodels.datamodelUam import AccessLevel -from modules.datamodels.datamodelFiles import FilePreview, FileItem, FileData +from modules.datamodels.datamodelFiles import FilePreview, FileItem, FileData, FileFolder from modules.datamodels.datamodelUtils import Prompt from modules.datamodels.datamodelMessaging import ( MessagingSubscription, @@ -1067,7 +1067,242 @@ class ComponentObjects: except Exception as e: logger.error(f"Error converting file record: {str(e)}") return None - + + # ── Folder methods ───────────────────────────────────────────────────────── + + def getOwnFolderTree(self) -> List[Dict[str, Any]]: + """Folders owned by the current user, filtered via RBAC.""" + return getRecordsetWithRBAC( + self.db, FileFolder, self.currentUser, + recordFilter={"sysCreatedBy": self.userId}, + mandateId=self.mandateId, + 
featureInstanceId=self.featureInstanceId, + ) + + def getSharedFolderTree(self) -> List[Dict[str, Any]]: + """Folders visible via scope but NOT owned by the current user. + Adds contextOrphan=True when a folder's parentId is not in the result set.""" + allFolders = getRecordsetWithRBAC( + self.db, FileFolder, self.currentUser, + mandateId=self.mandateId, + featureInstanceId=self.featureInstanceId, + ) + shared = [f for f in allFolders if f.get("sysCreatedBy") != self.userId] + sharedIds = {f["id"] for f in shared} + for f in shared: + f["contextOrphan"] = bool(f.get("parentId") and f["parentId"] not in sharedIds) + return shared + + def getFolder(self, folderId: str) -> Optional[Dict[str, Any]]: + """Return a single folder dict or None.""" + results = getRecordsetWithRBAC( + self.db, FileFolder, self.currentUser, + recordFilter={"id": folderId}, + mandateId=self.mandateId, + featureInstanceId=self.featureInstanceId, + ) + return results[0] if results else None + + def _isFolderOwner(self, folder) -> bool: + createdBy = ( + getattr(folder, "sysCreatedBy", None) + or (folder.get("sysCreatedBy") if isinstance(folder, dict) else None) + ) + return createdBy == self.userId + + def _requireFolderWriteAccess(self, folder, folderId: str, operation: str = "update"): + """Raise PermissionError if the user cannot mutate this folder. + Owners always can. 
Non-owners need RBAC ALL level.""" + if self._isFolderOwner(folder): + return + from modules.interfaces.interfaceRbac import buildDataObjectKey + objectKey = buildDataObjectKey("FileFolder") + permissions = self.rbac.getUserPermissions( + self.currentUser, AccessRuleContext.DATA, objectKey, + mandateId=self.mandateId, featureInstanceId=self.featureInstanceId, + ) + level = getattr(permissions, operation, None) + if level != AccessLevel.ALL: + raise PermissionError( + f"No permission to {operation} folder {folderId} (not owner, access level: {level})" + ) + + def createFolder(self, name: str, parentId: Optional[str] = None) -> Dict[str, Any]: + if not self.checkRbacPermission(FileFolder, "create"): + raise PermissionError("No permission to create folders") + folder = FileFolder( + name=name, + parentId=parentId, + mandateId=self.mandateId or "", + featureInstanceId=self.featureInstanceId or "", + scope="personal", + neutralize=False, + ) + self.db.recordCreate(FileFolder, folder) + return folder.model_dump() + + def renameFolder(self, folderId: str, newName: str) -> Dict[str, Any]: + folder = self.getFolder(folderId) + if not folder: + raise FileNotFoundError(f"Folder {folderId} not found") + self._requireFolderWriteAccess(folder, folderId, "update") + self.db.recordModify(FileFolder, folderId, {"name": newName}) + folder["name"] = newName + return folder + + def moveFolder(self, folderId: str, newParentId: Optional[str] = None) -> Dict[str, Any]: + folder = self.getFolder(folderId) + if not folder: + raise FileNotFoundError(f"Folder {folderId} not found") + self._requireFolderWriteAccess(folder, folderId, "update") + + if newParentId: + parent = self.getFolder(newParentId) + if not parent: + raise FileNotFoundError(f"Target parent folder {newParentId} not found") + self._requireFolderWriteAccess(parent, newParentId, "update") + # Circular-reference guard: newParentId must not be a descendant of folderId + if self._isDescendant(newParentId, folderId): + raise 
ValueError(f"Cannot move folder into its own subtree (circular reference)") + + self.db.recordModify(FileFolder, folderId, {"parentId": newParentId}) + folder["parentId"] = newParentId + return folder + + def _isDescendant(self, candidateId: str, ancestorId: str) -> bool: + """Return True if candidateId is a descendant of (or equal to) ancestorId.""" + visited = set() + current = candidateId + while current: + if current == ancestorId: + return True + if current in visited: + break + visited.add(current) + f = self.getFolder(current) + current = f.get("parentId") if f else None + return False + + def deleteFolderCascade(self, folderId: str) -> Dict[str, Any]: + """Delete a folder and all owned sub-folders + their files.""" + folder = self.getFolder(folderId) + if not folder: + raise FileNotFoundError(f"Folder {folderId} not found") + self._requireFolderWriteAccess(folder, folderId, "delete") + + folderIds = self._collectChildFolderIds(folderId) + + # Verify all child folders are owned + for fid in folderIds: + if fid == folderId: + continue + child = self.getFolder(fid) + if child and not self._isFolderOwner(child): + raise PermissionError(f"Cannot delete folder tree: sub-folder {fid} is not owned by you") + + # Collect files in those folders + fileRows = [] + for fid in folderIds: + items = self.db.getRecordset(FileItem, recordFilter={"folderId": fid}) + fileRows.extend(items) + + for item in fileRows: + itemOwner = item.get("sysCreatedBy") if isinstance(item, dict) else getattr(item, "sysCreatedBy", None) + if itemOwner != self.userId: + itemId = item.get("id") if isinstance(item, dict) else getattr(item, "id", None) + raise PermissionError(f"Cannot delete folder tree: file {itemId} is not owned by you") + + fileIds = [ + (item.get("id") if isinstance(item, dict) else getattr(item, "id", None)) + for item in fileRows + ] + + # Single transaction: delete FileData, FileItem, then FileFolder (children first) + self.db._ensure_connection() + try: + with 
self.db.connection.cursor() as cursor: + if fileIds: + cursor.execute('DELETE FROM "FileData" WHERE "id" = ANY(%s)', (fileIds,)) + cursor.execute('DELETE FROM "FileItem" WHERE "id" = ANY(%s)', (fileIds,)) + orderedIds = list(folderIds) + orderedIds.remove(folderId) + orderedIds.append(folderId) + if orderedIds: + cursor.execute('DELETE FROM "FileFolder" WHERE "id" = ANY(%s)', (orderedIds,)) + self.db.connection.commit() + except Exception: + self.db.connection.rollback() + raise + + return {"deletedFolders": len(folderIds), "deletedFiles": len(fileIds)} + + def _collectChildFolderIds(self, folderId: str) -> List[str]: + """BFS to collect folderId + all descendant folder IDs owned by user.""" + result = [folderId] + queue = [folderId] + while queue: + parentId = queue.pop(0) + children = self.db.getRecordset(FileFolder, recordFilter={"parentId": parentId}) + for child in children: + cid = child.get("id") if isinstance(child, dict) else getattr(child, "id", None) + if cid and cid not in result: + result.append(cid) + queue.append(cid) + return result + + def patchFolderScope(self, folderId: str, scope: str, cascadeToFiles: bool = False) -> Dict[str, Any]: + validScopes = {"personal", "featureInstance", "mandate", "global"} + if scope not in validScopes: + raise ValueError(f"Invalid scope: {scope}. 
Must be one of {validScopes}") + + folder = self.getFolder(folderId) + if not folder: + raise FileNotFoundError(f"Folder {folderId} not found") + self._requireFolderWriteAccess(folder, folderId, "update") + + if scope == "global": + from modules.interfaces.interfaceRbac import buildDataObjectKey + objectKey = buildDataObjectKey("FileFolder") + permissions = self.rbac.getUserPermissions( + self.currentUser, AccessRuleContext.DATA, objectKey, + mandateId=self.mandateId, featureInstanceId=self.featureInstanceId, + ) + if getattr(permissions, "update", None) != AccessLevel.ALL: + raise PermissionError("Setting global scope requires ALL permission") + + self.db.recordModify(FileFolder, folderId, {"scope": scope}) + + filesUpdated = 0 + if cascadeToFiles: + items = self.db.getRecordset(FileItem, recordFilter={"folderId": folderId}) + for item in items: + owner = item.get("sysCreatedBy") if isinstance(item, dict) else getattr(item, "sysCreatedBy", None) + if owner == self.userId: + iid = item.get("id") if isinstance(item, dict) else getattr(item, "id", None) + self.db.recordModify(FileItem, iid, {"scope": scope}) + filesUpdated += 1 + + return {"folderId": folderId, "scope": scope, "filesUpdated": filesUpdated} + + def patchFolderNeutralize(self, folderId: str, neutralize: bool) -> Dict[str, Any]: + folder = self.getFolder(folderId) + if not folder: + raise FileNotFoundError(f"Folder {folderId} not found") + self._requireFolderWriteAccess(folder, folderId, "update") + + self.db.recordModify(FileFolder, folderId, {"neutralize": neutralize}) + + items = self.db.getRecordset(FileItem, recordFilter={"folderId": folderId}) + filesUpdated = 0 + for item in items: + owner = item.get("sysCreatedBy") if isinstance(item, dict) else getattr(item, "sysCreatedBy", None) + if owner == self.userId: + iid = item.get("id") if isinstance(item, dict) else getattr(item, "id", None) + self.db.recordModify(FileItem, iid, {"neutralize": neutralize}) + filesUpdated += 1 + + return {"folderId": 
folderId, "neutralize": neutralize, "filesUpdated": filesUpdated} + def _isfileNameUnique(self, fileName: str, excludeFileId: Optional[str] = None) -> bool: """Checks if a fileName is unique for the current user.""" # Get all files filtered by RBAC (will be filtered by user's access level) diff --git a/modules/interfaces/interfaceRbac.py b/modules/interfaces/interfaceRbac.py index 8ecc51fd..42a32b82 100644 --- a/modules/interfaces/interfaceRbac.py +++ b/modules/interfaces/interfaceRbac.py @@ -204,6 +204,7 @@ TABLE_NAMESPACE = { # Files - benutzer-eigen "FileItem": "files", "FileData": "files", + "FileFolder": "files", # Automation - benutzer-eigen "AutomationDefinition": "automation", "AutomationTemplate": "automation", diff --git a/modules/migrations/_archive/README.md b/modules/migrations/_archive/README.md new file mode 100644 index 00000000..c488801a --- /dev/null +++ b/modules/migrations/_archive/README.md @@ -0,0 +1,11 @@ +# Archived one-off migrations + +`migrate_folders_to_groups.py` copies `FileFolder` + `FileItem.folderId` into `TableGrouping` (`files/list`). It was used during an experimental UI path; **product choice** is to keep physical folders (`FileFolder`, `folderId`) and recover `FormGeneratorTree` (see `wiki/c-work/1-plan/2026-05-formgenerator-tree-and-folder-recovery.md`). + +Run only if you need a historical data rescue: + +```bash +cd gateway +python -m modules.migrations._archive.migrate_folders_to_groups --verbose +python -m modules.migrations._archive.migrate_folders_to_groups --execute --verbose +``` diff --git a/modules/migrations/_archive/__init__.py b/modules/migrations/_archive/__init__.py new file mode 100644 index 00000000..a733bae9 --- /dev/null +++ b/modules/migrations/_archive/__init__.py @@ -0,0 +1 @@ +# Subpackage for archived one-off migration scripts (not part of normal app startup). 
diff --git a/modules/migrations/migrate_folders_to_groups.py b/modules/migrations/_archive/migrate_folders_to_groups.py similarity index 86% rename from modules/migrations/migrate_folders_to_groups.py rename to modules/migrations/_archive/migrate_folders_to_groups.py index 870e1e45..6beed744 100644 --- a/modules/migrations/migrate_folders_to_groups.py +++ b/modules/migrations/_archive/migrate_folders_to_groups.py @@ -1,11 +1,16 @@ """ -One-time migration: Convert FileFolder tree + FileItem.folderId → table_groupings. +One-time migration: Convert FileFolder tree + FileItem.folderId to table_groupings. + +Archived per wiki plan 2026-05-formgenerator-tree-and-folder-recovery (Stage 1.A). +Product direction: keep FileFolder + folderId; do not run DROP migrations. +This script remains for audit / one-off data rescue only. Run this BEFORE dropping the physical FileFolder table and FileItem.folderId column -from the database (those are separate Alembic/SQL steps). +from the database (those would be separate Alembic/SQL steps -- not part of current product path). -Usage: - python -m modules.migrations.migrate_folders_to_groups [--dry-run] [--verbose] +Usage (from gateway working directory): + python -m modules.migrations._archive.migrate_folders_to_groups [--dry-run] [--verbose] + python -m modules.migrations._archive.migrate_folders_to_groups --execute --verbose Steps: 1. 
For each distinct (userId, mandateId) combination that has FileFolder records: @@ -30,6 +35,14 @@ from typing import Optional logger = logging.getLogger(__name__) +def _scalarRow(row): + if row is None: + return None + if isinstance(row, dict): + return next(iter(row.values())) + return row[0] + + # ── Helpers ────────────────────────────────────────────────────────────────── def _build_tree(folders: list, parent_id: Optional[str]) -> list: @@ -76,11 +89,19 @@ def _now_ts() -> str: def run_migration(dry_run: bool = True, verbose: bool = False): """Main migration entry point.""" logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO) - logger.info(f"Starting folder→group migration (dry_run={dry_run})") + logger.info(f"Starting folder to group migration (dry_run={dry_run})") from modules.connectors.connectorDbPostgre import getCachedConnector + from modules.shared.configuration import APP_CONFIG - connector = getCachedConnector() + connector = getCachedConnector( + dbHost=APP_CONFIG.get("DB_HOST", "_no_config_default_data"), + dbDatabase="poweron_management", + dbUser=APP_CONFIG.get("DB_USER"), + dbPassword=APP_CONFIG.get("DB_PASSWORD_SECRET"), + dbPort=int(APP_CONFIG.get("DB_PORT", 5432)), + userId=None, + ) if not connector or not connector.connection: logger.error("Could not obtain a DB connection. 
Aborting.") return @@ -93,17 +114,17 @@ def run_migration(dry_run: bool = True, verbose: bool = False): SELECT EXISTS ( SELECT 1 FROM information_schema.tables WHERE table_name = 'FileFolder' - ) + ) AS ok """) - folder_table_exists = cur.fetchone()[0] + folder_table_exists = bool(_scalarRow(cur.fetchone())) cur.execute(""" SELECT EXISTS ( SELECT 1 FROM information_schema.columns WHERE table_name = 'FileItem' AND column_name = 'folderId' - ) + ) AS ok """) - folder_column_exists = cur.fetchone()[0] + folder_column_exists = bool(_scalarRow(cur.fetchone())) if not folder_table_exists and not folder_column_exists: logger.info("FileFolder table and FileItem.folderId column not found — migration already applied or not needed.") @@ -126,7 +147,7 @@ def run_migration(dry_run: bool = True, verbose: bool = False): }) logger.info(f"Loaded folders for {len(folders_by_user)} (user, mandate) combinations") - # ── 3. Load file→folder assignments ────────────────────────────────────── + # ── 3. Load file to folder assignments ──────────────────────────────────── files_by_key: dict = {} if folder_column_exists: cur.execute( @@ -139,7 +160,7 @@ def run_migration(dry_run: bool = True, verbose: bool = False): total_files = sum( sum(len(v) for v in d.values()) for d in files_by_key.values() ) - logger.info(f"Found {total_files} file→folder assignments across {len(files_by_key)} (user, mandate) combos") + logger.info(f"Found {total_files} file to folder assignments across {len(files_by_key)} (user, mandate) combos") # ── 4. 
Combine and upsert groupings ────────────────────────────────────── all_keys = set(folders_by_user.keys()) | set(files_by_key.keys()) @@ -231,7 +252,7 @@ def run_migration(dry_run: bool = True, verbose: bool = False): if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Migrate FileFolder tree to table_groupings") + parser = argparse.ArgumentParser(description="Migrate FileFolder tree to table_groupings (archived script)") parser.add_argument("--dry-run", action="store_true", default=True, help="Preview only, no DB writes (default)") parser.add_argument("--execute", action="store_true", help="Actually write to DB (disables dry-run)") parser.add_argument("--verbose", action="store_true", help="Show per-user details") diff --git a/modules/routes/routeClickup.py b/modules/routes/routeClickup.py index ccf1c481..c3f4b976 100644 --- a/modules/routes/routeClickup.py +++ b/modules/routes/routeClickup.py @@ -57,8 +57,8 @@ def _svc_for_connection(current_user: User, connection: UserConnection): services = getServices(current_user, None) if not services.clickup.setAccessTokenFromConnection(connection): raise HTTPException( - status_code=status.HTTP_401_UNAUTHORIZED, - detail=routeApiMsg("Failed to set ClickUp access token"), + status_code=status.HTTP_502_BAD_GATEWAY, + detail=routeApiMsg("Failed to set ClickUp access token. 
Connection may be expired or invalid."), ) return services.clickup diff --git a/modules/routes/routeDataFiles.py b/modules/routes/routeDataFiles.py index c20f3f3a..3394b5c5 100644 --- a/modules/routes/routeDataFiles.py +++ b/modules/routes/routeDataFiles.py @@ -11,7 +11,7 @@ from modules.auth import limiter, getCurrentUser, getRequestContext, RequestCont # Import interfaces import modules.interfaces.interfaceDbManagement as interfaceDbManagement -from modules.datamodels.datamodelFiles import FileItem, FilePreview +from modules.datamodels.datamodelFiles import FileItem, FilePreview, FileFolder from modules.shared.attributeUtils import getModelAttributeDefinitions from modules.datamodels.datamodelUam import User from modules.datamodels.datamodelPagination import PaginationParams, PaginatedResponse, PaginationMetadata, normalize_pagination_dict @@ -72,14 +72,18 @@ def _resolveFileWithScope(currentUser: User, context: RequestContext, fileId: st return scopedMgmt, fileItem -async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user): +async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user, *, mandateId: str = None, featureInstanceId: str = None): """Background task: pre-scan + extraction + knowledge indexing. 
Step 1: Structure Pre-Scan (AI-free) -> FileContentIndex (persisted) Step 2: Content extraction via runExtraction -> ContentParts Step 3: KnowledgeService.requestIngestion -> idempotent chunking + embedding -> Knowledge Store""" userId = user.id if hasattr(user, "id") else str(user) try: - mgmtInterface = interfaceDbManagement.getInterface(user) + mgmtInterface = interfaceDbManagement.getInterface( + user, + mandateId=mandateId or None, + featureInstanceId=featureInstanceId or None, + ) mgmtInterface.updateFile(fileId, {"status": "processing"}) rawBytes = mgmtInterface.getFileData(fileId) @@ -250,6 +254,213 @@ router = APIRouter( } ) + +@router.get("/folders/tree") +@limiter.limit("120/minute") +def get_folder_tree( + request: Request, + owner: str = Query("me", description="'me' | 'shared'"), + currentUser: User = Depends(getCurrentUser), + context: RequestContext = Depends(getRequestContext), +): + try: + managementInterface = interfaceDbManagement.getInterface( + currentUser, + mandateId=str(context.mandateId) if context.mandateId else None, + featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None, + ) + o = (owner or "me").strip().lower() + if o == "me": + return managementInterface.getOwnFolderTree() + if o == "shared": + return managementInterface.getSharedFolderTree() + raise HTTPException(status_code=400, detail="owner must be 'me' or 'shared'") + except HTTPException: + raise + except Exception as e: + logger.error(f"get_folder_tree error: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +@router.post("/folders", status_code=status.HTTP_201_CREATED) +@limiter.limit("30/minute") +def create_folder( + request: Request, + body: Dict[str, Any] = Body(...), + currentUser: User = Depends(getCurrentUser), + context: RequestContext = Depends(getRequestContext), +): + try: + name = body.get("name") + if not name or not str(name).strip(): + raise HTTPException(status_code=400, detail="name is required") + parentId = 
body.get("parentId") or None + managementInterface = interfaceDbManagement.getInterface( + currentUser, + mandateId=str(context.mandateId) if context.mandateId else None, + featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None, + ) + return managementInterface.createFolder(str(name).strip(), parentId) + except PermissionError as e: + raise HTTPException(status_code=403, detail=str(e)) + except interfaceDbManagement.FileNotFoundError as e: + raise HTTPException(status_code=404, detail=str(e)) + except HTTPException: + raise + except Exception as e: + logger.error(f"create_folder error: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +@router.patch("/folders/{folderId}") +@limiter.limit("30/minute") +def rename_folder( + request: Request, + folderId: str = Path(...), + body: Dict[str, Any] = Body(...), + currentUser: User = Depends(getCurrentUser), + context: RequestContext = Depends(getRequestContext), +): + try: + name = body.get("name") + if not name or not str(name).strip(): + raise HTTPException(status_code=400, detail="name is required") + managementInterface = interfaceDbManagement.getInterface( + currentUser, + mandateId=str(context.mandateId) if context.mandateId else None, + featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None, + ) + return managementInterface.renameFolder(folderId, str(name).strip()) + except PermissionError as e: + raise HTTPException(status_code=403, detail=str(e)) + except interfaceDbManagement.FileNotFoundError as e: + raise HTTPException(status_code=404, detail=str(e)) + except HTTPException: + raise + except Exception as e: + logger.error(f"rename_folder error: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +@router.post("/folders/{folderId}/move") +@limiter.limit("30/minute") +def move_folder( + request: Request, + folderId: str = Path(...), + body: Dict[str, Any] = Body(...), + currentUser: User = Depends(getCurrentUser), + 
context: RequestContext = Depends(getRequestContext), +): + try: + newParentId = body.get("parentId") + managementInterface = interfaceDbManagement.getInterface( + currentUser, + mandateId=str(context.mandateId) if context.mandateId else None, + featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None, + ) + return managementInterface.moveFolder(folderId, newParentId or None) + except ValueError as e: + raise HTTPException(status_code=400, detail=str(e)) + except PermissionError as e: + raise HTTPException(status_code=403, detail=str(e)) + except interfaceDbManagement.FileNotFoundError as e: + raise HTTPException(status_code=404, detail=str(e)) + except HTTPException: + raise + except Exception as e: + logger.error(f"move_folder error: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +@router.delete("/folders/{folderId}") +@limiter.limit("30/minute") +def delete_folder( + request: Request, + folderId: str = Path(...), + cascade: bool = Query(True, description="Cascade delete sub-folders and files"), + currentUser: User = Depends(getCurrentUser), + context: RequestContext = Depends(getRequestContext), +): + try: + managementInterface = interfaceDbManagement.getInterface( + currentUser, + mandateId=str(context.mandateId) if context.mandateId else None, + featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None, + ) + return managementInterface.deleteFolderCascade(folderId) + except PermissionError as e: + raise HTTPException(status_code=403, detail=str(e)) + except interfaceDbManagement.FileNotFoundError as e: + raise HTTPException(status_code=404, detail=str(e)) + except HTTPException: + raise + except Exception as e: + logger.error(f"delete_folder error: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +@router.patch("/folders/{folderId}/scope") +@limiter.limit("30/minute") +def patch_folder_scope( + request: Request, + folderId: str = Path(...), + body: Dict[str, Any] = 
Body(...), + currentUser: User = Depends(getCurrentUser), + context: RequestContext = Depends(getRequestContext), +): + try: + scope = body.get("scope") + if not scope: + raise HTTPException(status_code=400, detail="scope is required") + cascadeToFiles = body.get("cascadeToFiles", False) + managementInterface = interfaceDbManagement.getInterface( + currentUser, + mandateId=str(context.mandateId) if context.mandateId else None, + featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None, + ) + return managementInterface.patchFolderScope(folderId, scope, cascadeToFiles) + except ValueError as e: + raise HTTPException(status_code=400, detail=str(e)) + except PermissionError as e: + raise HTTPException(status_code=403, detail=str(e)) + except interfaceDbManagement.FileNotFoundError as e: + raise HTTPException(status_code=404, detail=str(e)) + except HTTPException: + raise + except Exception as e: + logger.error(f"patch_folder_scope error: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +@router.patch("/folders/{folderId}/neutralize") +@limiter.limit("30/minute") +def patch_folder_neutralize( + request: Request, + folderId: str = Path(...), + body: Dict[str, Any] = Body(...), + currentUser: User = Depends(getCurrentUser), + context: RequestContext = Depends(getRequestContext), +): + try: + neutralize = body.get("neutralize") + if neutralize is None: + raise HTTPException(status_code=400, detail="neutralize is required") + managementInterface = interfaceDbManagement.getInterface( + currentUser, + mandateId=str(context.mandateId) if context.mandateId else None, + featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None, + ) + return managementInterface.patchFolderNeutralize(folderId, bool(neutralize)) + except PermissionError as e: + raise HTTPException(status_code=403, detail=str(e)) + except interfaceDbManagement.FileNotFoundError as e: + raise HTTPException(status_code=404, detail=str(e)) + 
except HTTPException: + raise + except Exception as e: + logger.error(f"patch_folder_neutralize error: {e}") + raise HTTPException(status_code=500, detail=str(e)) + @router.get("/list") @limiter.limit("120/minute") def get_files( @@ -462,6 +673,8 @@ async def upload_file( fileName=fileItem.fileName, mimeType=fileItem.mimeType, user=currentUser, + mandateId=str(context.mandateId) if context.mandateId else None, + featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None, )) except Exception as indexErr: logger.warning(f"Auto-index trigger failed (non-blocking): {indexErr}") @@ -526,6 +739,110 @@ def batch_delete_items( raise HTTPException(status_code=500, detail=str(e)) +@router.post("/batch-download") +@limiter.limit("10/minute") +def batchDownload( + request: Request, + body: Dict[str, Any] = Body(...), + currentUser: User = Depends(getCurrentUser), + context: RequestContext = Depends(getRequestContext), +): + """Download multiple files and/or folders as a single ZIP archive, + preserving the folder hierarchy as ZIP paths.""" + import io, zipfile + + fileIds = body.get("fileIds") or [] + folderIds = body.get("folderIds") or [] + + if not fileIds and not folderIds: + raise HTTPException(status_code=400, detail="fileIds or folderIds required") + + try: + mgmt = interfaceDbManagement.getInterface( + currentUser, + mandateId=str(context.mandateId) if context.mandateId else None, + featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None, + ) + + folderCache: dict[str, dict] = {} + + def _getFolder(fid: str): + if fid not in folderCache: + f = mgmt.getFolder(fid) + folderCache[fid] = f if f else {} + return folderCache[fid] + + def _folderPath(fid: str) -> str: + """Build the full path for a folder by walking up parentId.""" + parts: list[str] = [] + current = fid + visited: set[str] = set() + while current and current not in visited: + visited.add(current) + folder = _getFolder(current) + if not folder: + 
break + parts.append(folder.get("name", current)) + current = folder.get("parentId") + parts.reverse() + return "/".join(parts) + + # Collect files from requested folders (recursive) + fileEntries: list[tuple[str, str]] = [] + seenFileIds: set[str] = set() + + for fid in folderIds: + childFolderIds = mgmt._collectChildFolderIds(fid) + for cfid in childFolderIds: + prefix = _folderPath(cfid) + items = mgmt.db.getRecordset(FileItem, recordFilter={"folderId": cfid}) + for item in items: + itemId = item.get("id") if isinstance(item, dict) else getattr(item, "id", None) + if itemId and itemId not in seenFileIds: + seenFileIds.add(itemId) + fileEntries.append((itemId, prefix)) + + # Loose files (not via folder selection) + for fid in fileIds: + if fid in seenFileIds: + continue + seenFileIds.add(fid) + fileMeta = mgmt.getFile(fid) + if not fileMeta: + continue + fileFolderId = fileMeta.get("folderId") if isinstance(fileMeta, dict) else getattr(fileMeta, "folderId", None) + prefix = _folderPath(fileFolderId) if fileFolderId else "" + fileEntries.append((fid, prefix)) + + if not fileEntries: + raise HTTPException(status_code=404, detail="No downloadable files found") + + buf = io.BytesIO() + with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf: + for fid, prefix in fileEntries: + try: + fileMeta = mgmt.getFile(fid) + fileData = mgmt.getFileData(fid) + if fileMeta and fileData: + name = (fileMeta.get("fileName") if isinstance(fileMeta, dict) else getattr(fileMeta, "fileName", fid)) or fid + zipPath = f"{prefix}/{name}" if prefix else name + zf.writestr(zipPath, fileData) + except Exception as fe: + logger.warning(f"batch_download: skipping file {fid}: {fe}") + buf.seek(0) + from fastapi.responses import StreamingResponse + return StreamingResponse( + buf, + media_type="application/zip", + headers={"Content-Disposition": 'attachment; filename="download.zip"'}, + ) + except HTTPException: + raise + except Exception as e: + logger.error(f"batch_download error: {e}") + 
raise HTTPException(status_code=500, detail=str(e)) + + # ── Group bulk endpoints ────────────────────────────────────────────────────── def _get_group_item_ids(contextKey: str, groupId: str, appInterface) -> set: @@ -759,7 +1076,11 @@ def updateFileScope( async def _runReindexAfterScopeChange(): try: - await _autoIndexFile(fileId=fileId, fileName=fn, mimeType=mt, user=context.user) + await _autoIndexFile( + fileId=fileId, fileName=fn, mimeType=mt, user=context.user, + mandateId=str(context.mandateId) if context.mandateId else None, + featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None, + ) except Exception as ex: logger.warning("Re-index after scope change failed for %s: %s", fileId, ex) @@ -837,7 +1158,11 @@ def updateFileNeutralize( async def _runReindexAfterNeutralizeToggle(): try: - await _autoIndexFile(fileId=fileId, fileName=fn, mimeType=mt, user=context.user) + await _autoIndexFile( + fileId=fileId, fileName=fn, mimeType=mt, user=context.user, + mandateId=str(context.mandateId) if context.mandateId else None, + featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None, + ) except Exception as ex: logger.error("Re-index after neutralize toggle failed for %s: %s (file has NO index until next re-index)", fileId, ex) @@ -909,7 +1234,7 @@ def update_file( ) -> FileItem: """Update file info""" try: - _EDITABLE_FIELDS = {"fileName", "scope", "tags", "description", "neutralize"} + _EDITABLE_FIELDS = {"fileName", "folderId", "scope", "tags", "description", "neutralize"} safeData = {k: v for k, v in file_info.items() if k in _EDITABLE_FIELDS} if not safeData: raise HTTPException(status_code=400, detail=routeApiMsg("No editable fields provided")) diff --git a/modules/routes/routeSharepoint.py b/modules/routes/routeSharepoint.py index e42611ac..1ee21900 100644 --- a/modules/routes/routeSharepoint.py +++ b/modules/routes/routeSharepoint.py @@ -128,7 +128,7 @@ async def 
getSharepointFolderOptionsByReference( # Set access token on SharePoint service if not services.sharepoint.setAccessTokenFromConnection(connection): raise HTTPException( - status_code=status.HTTP_401_UNAUTHORIZED, + status_code=status.HTTP_502_BAD_GATEWAY, detail=routeApiMsg("Failed to set SharePoint access token. Connection may be expired or invalid.") ) diff --git a/modules/serviceCenter/services/serviceAgent/actionToolAdapter.py b/modules/serviceCenter/services/serviceAgent/actionToolAdapter.py index cee81618..56ba791a 100644 --- a/modules/serviceCenter/services/serviceAgent/actionToolAdapter.py +++ b/modules/serviceCenter/services/serviceAgent/actionToolAdapter.py @@ -3,7 +3,7 @@ """ActionToolAdapter: wraps existing workflow actions (dynamicMode=True) as agent tools.""" import logging -from typing import Dict, Any, List +from typing import Dict, Any, List, Optional from modules.serviceCenter.services.serviceAgent.datamodelAgent import ( ToolDefinition, ToolResult @@ -44,7 +44,7 @@ class ActionToolAdapter: compoundName = f"{shortName}_{actionName}" toolDef = _buildToolDefinition(compoundName, actionDef, actionInfo) - handler = _createDispatchHandler(self._actionExecutor, shortName, actionName) + handler = _createDispatchHandler(self._actionExecutor, shortName, actionName, self._actionExecutor.services) toolRegistry.registerFromDefinition(toolDef, handler) self._registeredTools.append(compoundName) registered += 1 @@ -186,7 +186,7 @@ def _catalogTypeToJsonSchema(typeStr: str, _depth: int = 0) -> Dict[str, Any]: return {"type": "string", "description": f"unknown type '{typeStr}' (defaulted to string)"} -def _createDispatchHandler(actionExecutor, methodName: str, actionName: str): +def _createDispatchHandler(actionExecutor, methodName: str, actionName: str, services=None): """Create an async handler that dispatches to the ActionExecutor. 
Parameter validation and Ref-payload normalization (collapsing @@ -204,7 +204,7 @@ def _createDispatchHandler(actionExecutor, methodName: str, actionName: str): if "mandateId" not in args and context.get("mandateId"): args["mandateId"] = context["mandateId"] result = await actionExecutor.executeAction(methodName, actionName, args) - data = _formatActionResult(result) + data = _formatActionResult(result, services, context) return ToolResult( toolCallId="", toolName=f"{methodName}_{actionName}", @@ -223,9 +223,65 @@ def _createDispatchHandler(actionExecutor, methodName: str, actionName: str): return _handler -def _formatActionResult(result) -> str: - """Format an ActionResult into a text representation for the agent.""" +_INLINE_CONTENT_LIMIT = 2000 + + +def _persistLargeDocument(doc, services, context: Dict[str, Any]) -> Optional[str]: + """Save an ActionDocument with large content as a workspace file. + + Returns a formatted result line (with file id + docItem ref) or None + if persistence is not possible. 
+ """ + if not services: + return None + chatService = getattr(services, "chat", None) + if not chatService: + return None + docData = getattr(doc, "documentData", None) + if not docData or not isinstance(docData, str): + return None + docName = getattr(doc, "documentName", "unnamed") + docBytes = docData.encode("utf-8") + try: + fileItem, _ = chatService.interfaceDbComponent.saveUploadedFile(docBytes, docName) + fiId = context.get("featureInstanceId") or getattr(services, "featureInstanceId", "") + if fiId: + chatService.interfaceDbComponent.updateFile(fileItem.id, {"featureInstanceId": fiId}) + + from modules.serviceCenter.services.serviceAgent.coreTools._helpers import ( + _attachFileAsChatDocument, + _formatToolFileResult, + _getOrCreateTempFolder, + ) + tempFolderId = _getOrCreateTempFolder(chatService) + if tempFolderId: + chatService.interfaceDbComponent.updateFile(fileItem.id, {"folderId": tempFolderId}) + + chatDocId = _attachFileAsChatDocument( + services, fileItem, + label=f"action_doc:{docName}", + userMessage=f"Action document: {docName}", + ) + return _formatToolFileResult( + fileItem=fileItem, + chatDocId=chatDocId, + actionLabel="Produced", + extraInfo="Use readFile to read the content.", + ) + except Exception as e: + logger.warning(f"_persistLargeDocument failed for {docName}: {e}") + return None + + +def _formatActionResult(result, services=None, context: Optional[Dict[str, Any]] = None) -> str: + """Format an ActionResult into a text representation for the agent. + + Documents whose content exceeds the inline limit are persisted as + workspace files so the agent can access them via readFile / + ai_process / searchInFileContent. 
+ """ parts = [] + ctx = context or {} if result.resultLabel: parts.append(f"Result: {result.resultLabel}") @@ -238,10 +294,19 @@ def _formatActionResult(result) -> str: for doc in result.documents: docName = getattr(doc, "documentName", "unnamed") docType = getattr(doc, "mimeType", "unknown") - parts.append(f" - {docName} ({docType})") docData = getattr(doc, "documentData", None) - if docData and isinstance(docData, str) and len(docData) < 2000: - parts.append(f" Content: {docData[:2000]}") + + isLarge = docData and isinstance(docData, str) and len(docData) >= _INLINE_CONTENT_LIMIT + if isLarge: + persistedLine = _persistLargeDocument(doc, services, ctx) + if persistedLine: + parts.append(f" - {docName} ({docType})") + parts.append(f" {persistedLine}") + continue + + parts.append(f" - {docName} ({docType})") + if docData and isinstance(docData, str) and len(docData) < _INLINE_CONTENT_LIMIT: + parts.append(f" Content: {docData[:_INLINE_CONTENT_LIMIT]}") if not parts: parts.append("Action completed successfully." 
if result.success else "Action failed.") diff --git a/modules/serviceCenter/services/serviceAgent/coreTools/_dataSourceTools.py b/modules/serviceCenter/services/serviceAgent/coreTools/_dataSourceTools.py index 96ee31bb..c1191c1f 100644 --- a/modules/serviceCenter/services/serviceAgent/coreTools/_dataSourceTools.py +++ b/modules/serviceCenter/services/serviceAgent/coreTools/_dataSourceTools.py @@ -198,7 +198,10 @@ def _registerDataSourceTools(registry: ToolRegistry, services): if isinstance(result, _DR): fileBytes = result.data - fileName = result.fileName or fileName + resolvedName = result.fileName or fileName + if resolvedName != fileName: + logger.debug(f"downloadFromDataSource: connector fileName={result.fileName!r} overrides arg fileName={fileName!r}") + fileName = resolvedName else: fileBytes = result diff --git a/modules/serviceCenter/services/serviceAgent/coreTools/_mediaTools.py b/modules/serviceCenter/services/serviceAgent/coreTools/_mediaTools.py index 7b071996..adb79ecf 100644 --- a/modules/serviceCenter/services/serviceAgent/coreTools/_mediaTools.py +++ b/modules/serviceCenter/services/serviceAgent/coreTools/_mediaTools.py @@ -836,7 +836,7 @@ def _registerMediaTools(registry: ToolRegistry, services): return ToolResult(toolCallId="", toolName="executeCode", success=False, error=f"Language '{language}' not supported. 
Only 'python' is available.") try: from modules.serviceCenter.services.serviceAgent.sandboxExecutor import executePython - result = await executePython(code) + result = await executePython(code, services=services) if result.get("success"): output = result.get("output", "(no output)") return ToolResult(toolCallId="", toolName="executeCode", success=True, data=output) @@ -886,12 +886,17 @@ def _registerMediaTools(registry: ToolRegistry, services): readOnly=True ) + from modules.serviceCenter.services.serviceAgent.sandboxExecutor import SANDBOX_ALLOWED_MODULES + moduleList = ", ".join(sorted(SANDBOX_ALLOWED_MODULES | {"io"})) registry.register( "executeCode", _executeCode, description=( - "Execute Python code in a sandboxed environment for calculations and data analysis. " - "Available modules: math, statistics, json, csv, re, datetime, collections, itertools, functools, decimal, fractions, random. " - "No file system, network, or OS access. Max 30s execution time. " + f"Execute Python code in a sandboxed environment for calculations and data analysis. " + f"Available modules: {moduleList}. " + "io is restricted to StringIO and BytesIO only (no file access). " + "Built-in readFile(fileId) returns UTF-8 content of a workspace file by its file ID " + "(use the 'file id' from tool outputs, e.g. data = readFile('019af...')). " + "No other file system, network, or OS access. Max 30s execution time. " "Use print() to produce output." 
), parameters={ diff --git a/modules/serviceCenter/services/serviceAgent/mainServiceAgent.py b/modules/serviceCenter/services/serviceAgent/mainServiceAgent.py index 372ec5b2..17eb83e4 100644 --- a/modules/serviceCenter/services/serviceAgent/mainServiceAgent.py +++ b/modules/serviceCenter/services/serviceAgent/mainServiceAgent.py @@ -69,7 +69,15 @@ class _ServicesAdapter: @property def workflow(self): - return self._context.workflow + return getattr(self, "_workflow_override", None) or self._context.workflow + + @workflow.setter + def workflow(self, value): + self._workflow_override = value + try: + self._context.workflow = value + except (AttributeError, TypeError): + pass @property def ai(self): @@ -95,6 +103,13 @@ class _ServicesAdapter: def extraction(self): return self._getService("extraction") + @property + def interfaceDbComponent(self): + try: + return self.chat.interfaceDbComponent + except Exception: + return None + @property def rbac(self): """Same RbacClass as workflow hub (MethodBase permission checks during discoverMethods).""" diff --git a/modules/serviceCenter/services/serviceAgent/sandboxExecutor.py b/modules/serviceCenter/services/serviceAgent/sandboxExecutor.py index 15362e65..e4671a70 100644 --- a/modules/serviceCenter/services/serviceAgent/sandboxExecutor.py +++ b/modules/serviceCenter/services/serviceAgent/sandboxExecutor.py @@ -10,8 +10,8 @@ from typing import Dict, Any logger = logging.getLogger(__name__) -_PYTHON_ALLOWED_MODULES = { - "math", "statistics", "json", "csv", "re", "datetime", +SANDBOX_ALLOWED_MODULES = { + "math", "statistics", "json", "csv", "re", "datetime", "time", "collections", "itertools", "functools", "decimal", "fractions", "random", "string", "textwrap", "operator", "copy", } @@ -19,17 +19,33 @@ _PYTHON_ALLOWED_MODULES = { _PYTHON_BLOCKED_BUILTINS = { "open", "exec", "eval", "compile", "__import__", "globals", "locals", "getattr", "setattr", "delattr", "breakpoint", "exit", "quit", - "input", "memoryview", "type", + 
"input", "memoryview", } _MAX_EXECUTION_TIME_S = 30 _MAX_OUTPUT_CHARS = 50000 +_RESTRICTED_IO = None + +def _getRestrictedIo(): + """Return a restricted ``io`` module exposing only StringIO/BytesIO.""" + global _RESTRICTED_IO + if _RESTRICTED_IO is None: + import types + m = types.ModuleType("io") + m.StringIO = io.StringIO + m.BytesIO = io.BytesIO + _RESTRICTED_IO = m + return _RESTRICTED_IO + + def _safeImport(name, *args, **kwargs): """Restricted import that only allows whitelisted modules.""" - if name not in _PYTHON_ALLOWED_MODULES: - raise ImportError(f"Module '{name}' is not allowed. Permitted: {', '.join(sorted(_PYTHON_ALLOWED_MODULES))}") + if name == "io": + return _getRestrictedIo() + if name not in SANDBOX_ALLOWED_MODULES: + raise ImportError(f"Module '{name}' is not allowed. Permitted: io (StringIO/BytesIO only), {', '.join(sorted(SANDBOX_ALLOWED_MODULES))}") return __builtins__["__import__"](name, *args, **kwargs) if isinstance(__builtins__, dict) else __import__(name, *args, **kwargs) @@ -48,7 +64,7 @@ def _buildRestrictedGlobals() -> Dict[str, Any]: safeBuiltins["__name__"] = "__sandbox__" safeBuiltins["__builtins__"] = safeBuiltins - for modName in _PYTHON_ALLOWED_MODULES: + for modName in SANDBOX_ALLOWED_MODULES: try: safeBuiltins[modName] = __import__(modName) except ImportError: @@ -57,12 +73,27 @@ def _buildRestrictedGlobals() -> Dict[str, Any]: return {"__builtins__": safeBuiltins} -async def executePython(code: str) -> Dict[str, Any]: +def _makeReadFile(services): + """Create a readFile(fileId) closure bound to the current services context.""" + def readFile(fileId: str) -> str: + mgmt = getattr(services, 'interfaceDbComponent', None) if services else None + if not mgmt: + raise RuntimeError("readFile: no file store available in this session") + data = mgmt.getFileData(str(fileId)) + if data is None: + raise FileNotFoundError(f"File '{fileId}' not found in workspace") + return data.decode("utf-8") + return readFile + + +async def 
executePython(code: str, *, services=None) -> Dict[str, Any]: """Execute Python code in a restricted sandbox. Returns {success, output, error}.""" import asyncio def _run(): restrictedGlobals = _buildRestrictedGlobals() + if services: + restrictedGlobals["__builtins__"]["readFile"] = _makeReadFile(services) capturedOutput = io.StringIO() oldStdout = sys.stdout oldStderr = sys.stderr diff --git a/modules/serviceCenter/services/serviceClickup/mainServiceClickup.py b/modules/serviceCenter/services/serviceClickup/mainServiceClickup.py index 6093e1bd..5bcd1d52 100644 --- a/modules/serviceCenter/services/serviceClickup/mainServiceClickup.py +++ b/modules/serviceCenter/services/serviceClickup/mainServiceClickup.py @@ -166,12 +166,28 @@ class ClickupService: page: int = 0, include_closed: bool = False, subtasks: bool = True, + dateCreatedGt: Optional[int] = None, + dateCreatedLt: Optional[int] = None, + dateUpdatedGt: Optional[int] = None, + dateUpdatedLt: Optional[int] = None, + customFields: Optional[List[Dict[str, Any]]] = None, ) -> Dict[str, Any]: params: Dict[str, Any] = { "page": page, "subtasks": str(subtasks).lower(), "include_closed": str(include_closed).lower(), } + if dateCreatedGt is not None: + params["date_created_gt"] = dateCreatedGt + if dateCreatedLt is not None: + params["date_created_lt"] = dateCreatedLt + if dateUpdatedGt is not None: + params["date_updated_gt"] = dateUpdatedGt + if dateUpdatedLt is not None: + params["date_updated_lt"] = dateUpdatedLt + if customFields: + import json as _json + params["custom_fields"] = _json.dumps(customFields) return await self._request("GET", f"/list/{list_id}/task", params=params) async def getTask(self, task_id: str, *, include_subtasks: bool = True) -> Dict[str, Any]: diff --git a/modules/serviceCenter/services/serviceGeneration/renderers/rendererCodeCsv.py b/modules/serviceCenter/services/serviceGeneration/renderers/rendererCodeCsv.py index 962b8f04..cb6d77ca 100644 --- 
a/modules/serviceCenter/services/serviceGeneration/renderers/rendererCodeCsv.py +++ b/modules/serviceCenter/services/serviceGeneration/renderers/rendererCodeCsv.py @@ -79,7 +79,7 @@ class RendererCodeCsv(BaseCodeRenderer): return renderedDocs - async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]: + async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]: """ Render method for document generation compatibility. Delegates to document renderer if needed, or handles code files directly. diff --git a/modules/serviceCenter/services/serviceGeneration/renderers/rendererCodeJson.py b/modules/serviceCenter/services/serviceGeneration/renderers/rendererCodeJson.py index 924ba861..dff849ef 100644 --- a/modules/serviceCenter/services/serviceGeneration/renderers/rendererCodeJson.py +++ b/modules/serviceCenter/services/serviceGeneration/renderers/rendererCodeJson.py @@ -91,7 +91,7 @@ class RendererCodeJson(BaseCodeRenderer): return renderedDocs - async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]: + async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]: """ Render method for document generation compatibility. Delegates to document renderer if needed, or handles code files directly. 
diff --git a/modules/serviceCenter/services/serviceGeneration/renderers/rendererCodeXml.py b/modules/serviceCenter/services/serviceGeneration/renderers/rendererCodeXml.py index edab8f8e..6967f746 100644 --- a/modules/serviceCenter/services/serviceGeneration/renderers/rendererCodeXml.py +++ b/modules/serviceCenter/services/serviceGeneration/renderers/rendererCodeXml.py @@ -78,7 +78,7 @@ class RendererCodeXml(BaseCodeRenderer): return renderedDocs - async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]: + async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]: """ Render method for document generation compatibility. For XML, we only support code generation (no document renderer exists yet). diff --git a/modules/serviceCenter/services/serviceGeneration/renderers/rendererCsv.py b/modules/serviceCenter/services/serviceGeneration/renderers/rendererCsv.py index 91312299..f5ee252b 100644 --- a/modules/serviceCenter/services/serviceGeneration/renderers/rendererCsv.py +++ b/modules/serviceCenter/services/serviceGeneration/renderers/rendererCsv.py @@ -39,7 +39,7 @@ class RendererCsv(BaseRenderer): """ return ["table", "code_block"] - async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]: + async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]: """Render extracted JSON content to CSV format. 
Produces one CSV file per table section.""" try: # Validate JSON structure diff --git a/modules/serviceCenter/services/serviceGeneration/renderers/rendererImage.py b/modules/serviceCenter/services/serviceGeneration/renderers/rendererImage.py index 2aff559f..8141b798 100644 --- a/modules/serviceCenter/services/serviceGeneration/renderers/rendererImage.py +++ b/modules/serviceCenter/services/serviceGeneration/renderers/rendererImage.py @@ -43,7 +43,7 @@ class RendererImage(BaseRenderer): """ return ["image"] - async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]: + async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]: """Render extracted JSON content to image format using AI image generation.""" try: # Generate AI image from content diff --git a/modules/serviceCenter/services/serviceGeneration/renderers/rendererJson.py b/modules/serviceCenter/services/serviceGeneration/renderers/rendererJson.py index 076210bc..470d4543 100644 --- a/modules/serviceCenter/services/serviceGeneration/renderers/rendererJson.py +++ b/modules/serviceCenter/services/serviceGeneration/renderers/rendererJson.py @@ -42,7 +42,7 @@ class RendererJson(BaseRenderer): # Return all types except image return [st for st in supportedSectionTypes if st != "image"] - async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]: + async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]: """Render extracted JSON content to JSON format.""" try: # The extracted content should already be JSON from the AI diff --git a/modules/serviceCenter/services/serviceGeneration/renderers/rendererMarkdown.py 
b/modules/serviceCenter/services/serviceGeneration/renderers/rendererMarkdown.py index a3b8b5b3..552266e9 100644 --- a/modules/serviceCenter/services/serviceGeneration/renderers/rendererMarkdown.py +++ b/modules/serviceCenter/services/serviceGeneration/renderers/rendererMarkdown.py @@ -40,7 +40,7 @@ class RendererMarkdown(BaseRenderer): from modules.datamodels.datamodelJson import supportedSectionTypes return [st for st in supportedSectionTypes if st != "image"] - async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]: + async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]: """Render extracted JSON content to Markdown format.""" try: # Generate markdown from JSON structure diff --git a/modules/serviceCenter/services/serviceGeneration/renderers/rendererText.py b/modules/serviceCenter/services/serviceGeneration/renderers/rendererText.py index 15a7161c..94400df9 100644 --- a/modules/serviceCenter/services/serviceGeneration/renderers/rendererText.py +++ b/modules/serviceCenter/services/serviceGeneration/renderers/rendererText.py @@ -76,7 +76,7 @@ class RendererText(BaseRenderer): # Text renderer accepts all types except images return [st for st in supportedSectionTypes if st != "image"] - async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]: + async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None, *, style: Dict[str, Any] = None) -> List[RenderedDocument]: """Render extracted JSON content to plain text format.""" try: # Generate text from JSON structure diff --git a/modules/workflows/methods/methodAi/actions/process.py b/modules/workflows/methods/methodAi/actions/process.py index 50500929..2af480e7 100644 --- 
a/modules/workflows/methods/methodAi/actions/process.py +++ b/modules/workflows/methods/methodAi/actions/process.py @@ -75,8 +75,10 @@ def _action_docs_to_content_parts(services, docs: List[Any]) -> List[ContentPart def _resolve_file_refs_to_content_parts(services, fileIdRefs) -> List[ContentPart]: """Fetch files by ID from the file store and extract content. - Used for automation2 workflows where documents are file-store references, - not chat message attachments.""" + Used ONLY for automation2 workflows where documents are file-store + references, not chat message attachments. In the agent/chat context, + ``DocumentItemReference`` holds ChatDocument IDs that must be resolved + via ``getChatDocumentsFromDocumentList`` instead.""" from modules.datamodels.datamodelExtraction import ExtractionOptions, MergeStrategy mgmt = getattr(services, 'interfaceDbComponent', None) @@ -171,16 +173,24 @@ async def process(self, parameters: Dict[str, Any]) -> ActionResult: f"to DocumentReferenceList with {len(documentList.references)} references" ) - # Resolve DocumentItemReferences (file-ID refs from automation2) directly - # from the file store. These cannot be resolved via chat messages. + # DocumentItemReferences carry either file-store IDs (automation2) + # or ChatDocument IDs (agent context with docItem: refs). + # Route based on context: if a chat workflow with messages exists, + # let getChatDocumentsFromDocumentList handle them (it resolves + # docItem:uuid via workflow.messages). Otherwise fall through to + # the file-store path for automation2. 
from modules.datamodels.datamodelDocref import DocumentItemReference fileIdRefs = [r for r in documentList.references if isinstance(r, DocumentItemReference)] if fileIdRefs: - extractedParts = _resolve_file_refs_to_content_parts(self.services, fileIdRefs) - if extractedParts: - inline_content_parts = (inline_content_parts or []) + extractedParts - remaining = [r for r in documentList.references if not isinstance(r, DocumentItemReference)] - documentList = DocumentReferenceList(references=remaining) + chatService = getattr(self.services, 'chat', None) + workflow = getattr(chatService, '_workflow', None) if chatService else None + hasChatContext = workflow and getattr(workflow, 'messages', None) + if not hasChatContext: + extractedParts = _resolve_file_refs_to_content_parts(self.services, fileIdRefs) + if extractedParts: + inline_content_parts = (inline_content_parts or []) + extractedParts + remaining = [r for r in documentList.references if not isinstance(r, DocumentItemReference)] + documentList = DocumentReferenceList(references=remaining) # Optional: if omitted, formats determined from prompt. Default "txt" is validation fallback only. 
resultType = parameters.get("resultType") diff --git a/modules/workflows/methods/methodClickup/actions/list_tasks.py b/modules/workflows/methods/methodClickup/actions/list_tasks.py index 4caf9e31..9ae57f94 100644 --- a/modules/workflows/methods/methodClickup/actions/list_tasks.py +++ b/modules/workflows/methods/methodClickup/actions/list_tasks.py @@ -31,8 +31,30 @@ async def list_tasks(self, parameters: Dict[str, Any]) -> ActionResult: page = int(parameters.get("page") or 0) include_closed = bool(parameters.get("includeClosed", False)) + + dateFilters = {} + for key in ("dateCreatedGt", "dateCreatedLt", "dateUpdatedGt", "dateUpdatedLt"): + val = parameters.get(key) + if val is not None and str(val).strip(): + try: + dateFilters[key] = int(val) + except (ValueError, TypeError): + pass + + rawCustomFields = parameters.get("customFields") + customFields = None + if rawCustomFields: + if isinstance(rawCustomFields, str): + try: + customFields = json.loads(rawCustomFields) + except json.JSONDecodeError: + return ActionResult.isFailure(error="customFields must be valid JSON array") + elif isinstance(rawCustomFields, list): + customFields = rawCustomFields + data = await self.services.clickup.getTasksInList( - list_id, page=page, include_closed=include_closed, subtasks=True + list_id, page=page, include_closed=include_closed, subtasks=True, + **dateFilters, customFields=customFields, ) if isinstance(data, dict) and data.get("error"): return ActionResult.isFailure(error=str(data.get("error")) + (data.get("body") or "")) diff --git a/modules/workflows/methods/methodClickup/methodClickup.py b/modules/workflows/methods/methodClickup/methodClickup.py index 17f42300..725929dd 100644 --- a/modules/workflows/methods/methodClickup/methodClickup.py +++ b/modules/workflows/methods/methodClickup/methodClickup.py @@ -66,6 +66,41 @@ class MethodClickup(MethodBase): default=False, description="Include closed tasks", ), + "dateCreatedGt": WorkflowActionParameter( + name="dateCreatedGt", 
+ type="int", + frontendType=FrontendType.NUMBER, + required=False, + description="Filter: created after this Unix ms timestamp", + ), + "dateCreatedLt": WorkflowActionParameter( + name="dateCreatedLt", + type="int", + frontendType=FrontendType.NUMBER, + required=False, + description="Filter: created before this Unix ms timestamp", + ), + "dateUpdatedGt": WorkflowActionParameter( + name="dateUpdatedGt", + type="int", + frontendType=FrontendType.NUMBER, + required=False, + description="Filter: updated after this Unix ms timestamp", + ), + "dateUpdatedLt": WorkflowActionParameter( + name="dateUpdatedLt", + type="int", + frontendType=FrontendType.NUMBER, + required=False, + description="Filter: updated before this Unix ms timestamp", + ), + "customFields": WorkflowActionParameter( + name="customFields", + type="str", + frontendType=FrontendType.TEXTAREA, + required=False, + description='JSON array of custom field filters per ClickUp API, e.g. [{"field_id":"abc","operator":"=","value":"123"}]', + ), }, execute=list_tasks.__get__(self, self.__class__), ), diff --git a/scripts/stage0_filefolder_schema_check.py b/scripts/stage0_filefolder_schema_check.py new file mode 100644 index 00000000..861d8671 --- /dev/null +++ b/scripts/stage0_filefolder_schema_check.py @@ -0,0 +1,58 @@ +"""Stage 0: verify FileFolder table + FileItem.folderId column in management DB. 
+ +Run from the gateway directory (same as uvicorn): + python -m scripts.stage0_filefolder_schema_check +""" +from modules.connectors.connectorDbPostgre import getCachedConnector +from modules.shared.configuration import APP_CONFIG + +managementDatabase = "poweron_management" + +dbHost = APP_CONFIG.get("DB_HOST", "_no_config_default_data") +dbUser = APP_CONFIG.get("DB_USER") +dbPassword = APP_CONFIG.get("DB_PASSWORD_SECRET") +dbPort = int(APP_CONFIG.get("DB_PORT", 5432)) + +c = getCachedConnector( + dbHost=dbHost, + dbDatabase=managementDatabase, + dbUser=dbUser, + dbPassword=dbPassword, + dbPort=dbPort, + userId=None, +) +if not c or not c.connection: + print("STAGE0: DB_CONNECTION=none (check config.ini / .env)") + raise SystemExit(2) + +cur = c.connection.cursor() + + +def _scalar(cur): + row = cur.fetchone() + if row is None: + return None + if isinstance(row, dict): + return next(iter(row.values())) + return row[0] + + +cur.execute( + """ + SELECT EXISTS ( + SELECT 1 FROM information_schema.tables + WHERE table_name = 'FileFolder' + ) AS ok + """ +) +print("STAGE0: FileFolder_table=", _scalar(cur)) +cur.execute( + """ + SELECT EXISTS ( + SELECT 1 FROM information_schema.columns + WHERE table_name = 'FileItem' AND column_name = 'folderId' + ) AS ok + """ +) +print("STAGE0: FileItem_folderId_column=", _scalar(cur)) +cur.close() diff --git a/tests/unit/interfaces/test_folderRbac.py b/tests/unit/interfaces/test_folderRbac.py new file mode 100644 index 00000000..049f392d --- /dev/null +++ b/tests/unit/interfaces/test_folderRbac.py @@ -0,0 +1,327 @@ +# Copyright (c) 2026 Patrick Motsch +# All rights reserved. 
+"""Unit tests for folder RBAC two-user matrix (ownership & scope visibility).""" + +import uuid +import pytest +from unittest.mock import Mock, patch, MagicMock +from typing import Dict, Any, List, Optional + +from modules.datamodels.datamodelFiles import FileFolder, FileItem +from modules.datamodels.datamodelUam import User, UserPermissions, AccessLevel +from modules.interfaces.interfaceDbManagement import ComponentObjects, FileNotFoundError + + +_MANDATE_ID = "mandate-test-1" +_FEATURE_INSTANCE_ID = "fi-test-1" +_USER_A = "user-a-id" +_USER_B = "user-b-id" + + +# ── Fakes & helpers ────────────────────────────────────────────────────────── + +class _FakeDb: + """In-memory database mock.""" + + def __init__(self): + self._tables: Dict[str, Dict[str, Dict[str, Any]]] = {} + self.connection = MagicMock() + + def getRecordset(self, modelClass, recordFilter=None): + tableName = modelClass.__name__ + records = list(self._tables.get(tableName, {}).values()) + if not recordFilter: + return records + return [ + r for r in records + if all(r.get(k) == v for k, v in recordFilter.items()) + ] + + def recordCreate(self, modelClass, data): + tableName = modelClass.__name__ + self._tables.setdefault(tableName, {}) + rec = data.model_dump() if hasattr(data, "model_dump") else dict(data) + rec.setdefault("id", str(uuid.uuid4())) + self._tables[tableName][rec["id"]] = rec + return rec + + def recordModify(self, modelClass, recordId, updates): + tbl = self._tables.get(modelClass.__name__, {}) + if recordId in tbl: + tbl[recordId].update(updates) + return True + return False + + def recordDelete(self, modelClass, recordId): + tbl = self._tables.get(modelClass.__name__, {}) + if recordId in tbl: + del tbl[recordId] + return True + return False + + def updateContext(self, userId): + pass + + def _ensure_connection(self): + pass + + def _ensureTableExists(self, modelClass): + return True + + def seed(self, modelClass, record: Dict[str, Any]): + tableName = modelClass.__name__ + 
self._tables.setdefault(tableName, {}) + self._tables[tableName][record["id"]] = dict(record) + + +def _makeUser(userId, username="testuser"): + return User(id=userId, username=username, language="en") + + +def _makeRbac( + createLevel=AccessLevel.ALL, + readLevel=AccessLevel.ALL, + updateLevel=AccessLevel.MY, + deleteLevel=AccessLevel.MY, +): + """Default: regular user can read all, but write only own records.""" + rbac = Mock() + perms = UserPermissions( + view=True, + read=readLevel, + create=createLevel, + update=updateLevel, + delete=deleteLevel, + ) + rbac.getUserPermissions.return_value = perms + return rbac + + +def _buildComponent(userId, fakeDb, rbac=None): + with patch.object(ComponentObjects, "__init__", lambda self: None): + comp = ComponentObjects() + comp.db = fakeDb + comp.currentUser = _makeUser(userId) + comp.userId = userId + comp.mandateId = _MANDATE_ID + comp.featureInstanceId = _FEATURE_INSTANCE_ID + comp.rbac = rbac or _makeRbac() + comp.userLanguage = "en" + return comp + + +def _makeFolder( + folderId=None, name="Folder", parentId=None, + userId=_USER_A, scope="personal", neutralize=False, +): + return { + "id": folderId or str(uuid.uuid4()), + "name": name, + "parentId": parentId, + "mandateId": _MANDATE_ID, + "featureInstanceId": _FEATURE_INSTANCE_ID, + "scope": scope, + "neutralize": neutralize, + "sysCreatedBy": userId, + "sysCreatedAt": 1700000000.0, + "sysModifiedAt": 1700000000.0, + "sysModifiedBy": None, + } + + +def _makeFile(fileId=None, folderId=None, userId=_USER_A, scope="personal"): + return { + "id": fileId or str(uuid.uuid4()), + "fileName": "test.txt", + "mimeType": "text/plain", + "fileHash": "abc123", + "fileSize": 100, + "folderId": folderId, + "mandateId": _MANDATE_ID, + "featureInstanceId": _FEATURE_INSTANCE_ID, + "scope": scope, + "neutralize": False, + "sysCreatedBy": userId, + "sysCreatedAt": 1700000000.0, + "sysModifiedAt": 1700000000.0, + "sysModifiedBy": None, + "tags": None, + "description": None, + "status": 
None, + } + + +def _scopeAwareMock(fakeDb): + """Side-effect for getRecordsetWithRBAC that simulates scope-based visibility. + + Visibility rules: + - Owner (sysCreatedBy == currentUser.id) always sees the record + - scope='global' -> visible to everyone + - scope='mandate' -> visible when mandateId matches + - scope='featureInstance' -> visible when featureInstanceId matches + - scope='personal' -> owner only (already covered above) + """ + def _fn(connector, modelClass, currentUser, recordFilter=None, **kwargs): + requestMandateId = kwargs.get("mandateId", _MANDATE_ID) + requestFiId = kwargs.get("featureInstanceId", _FEATURE_INSTANCE_ID) + allRecords = fakeDb.getRecordset(modelClass, recordFilter=recordFilter) + visible = [] + for rec in allRecords: + if rec.get("sysCreatedBy") == currentUser.id: + visible.append(rec) + continue + scope = rec.get("scope", "personal") + if scope == "global": + visible.append(rec) + elif scope == "mandate" and rec.get("mandateId") == requestMandateId: + visible.append(rec) + elif scope == "featureInstance" and rec.get("featureInstanceId") == requestFiId: + visible.append(rec) + return visible + return _fn + + +# ── Test class ─────────────────────────────────────────────────────────────── + +@patch("modules.interfaces.interfaceDbManagement.getRecordsetWithRBAC") +class TestFolderRbac: + """Two-user matrix: ownership, scope visibility, and write-access guards.""" + + # ── 1. 
Ownership visibility ─────────────────────────────────────────── + + def testUserAFolderInOwnTreeNotInUserBOwnTree(self, mockRbacGet): + """User A's personal folder appears in A's own tree, not in B's.""" + fakeDb = _FakeDb() + fakeDb.seed(FileFolder, _makeFolder(folderId="fa-1", name="A-Folder", userId=_USER_A)) + mockRbacGet.side_effect = _scopeAwareMock(fakeDb) + + compA = _buildComponent(_USER_A, fakeDb) + ownA = compA.getOwnFolderTree() + assert any(f["id"] == "fa-1" for f in ownA) + + compB = _buildComponent(_USER_B, fakeDb) + ownB = compB.getOwnFolderTree() + assert not any(f["id"] == "fa-1" for f in ownB) + + # ── 2. Scope change -> shared visibility ────────────────────────────── + + def testScopeChangeToMandateMakesVisibleToUserB(self, mockRbacGet): + """Changing scope from personal to mandate makes the folder appear + in User B's shared tree.""" + fakeDb = _FakeDb() + fakeDb.seed(FileFolder, _makeFolder(folderId="fa-1", scope="personal", userId=_USER_A)) + mockRbacGet.side_effect = _scopeAwareMock(fakeDb) + + compB = _buildComponent(_USER_B, fakeDb) + sharedBefore = compB.getSharedFolderTree() + assert not any(f["id"] == "fa-1" for f in sharedBefore) + + fakeDb.recordModify(FileFolder, "fa-1", {"scope": "mandate"}) + + sharedAfter = compB.getSharedFolderTree() + assert any(f["id"] == "fa-1" for f in sharedAfter) + + # ── 3-7. 
Non-owner cannot mutate ────────────────────────────────────── + + def testUserBCannotRenameFolderOfUserA(self, mockRbacGet): + fakeDb = _FakeDb() + fakeDb.seed(FileFolder, _makeFolder(folderId="fa-1", scope="mandate", userId=_USER_A)) + mockRbacGet.side_effect = _scopeAwareMock(fakeDb) + + compB = _buildComponent(_USER_B, fakeDb) + with pytest.raises(PermissionError): + compB.renameFolder("fa-1", "Hijacked") + + def testUserBCannotMoveFolderOfUserA(self, mockRbacGet): + fakeDb = _FakeDb() + fakeDb.seed(FileFolder, _makeFolder(folderId="fa-1", scope="mandate", userId=_USER_A)) + fakeDb.seed(FileFolder, _makeFolder(folderId="fb-1", scope="mandate", userId=_USER_B)) + mockRbacGet.side_effect = _scopeAwareMock(fakeDb) + + compB = _buildComponent(_USER_B, fakeDb) + with pytest.raises(PermissionError): + compB.moveFolder("fa-1", "fb-1") + + def testUserBCannotDeleteFolderOfUserA(self, mockRbacGet): + fakeDb = _FakeDb() + fakeDb.seed(FileFolder, _makeFolder(folderId="fa-1", scope="mandate", userId=_USER_A)) + mockRbacGet.side_effect = _scopeAwareMock(fakeDb) + + compB = _buildComponent(_USER_B, fakeDb) + with pytest.raises(PermissionError): + compB.deleteFolderCascade("fa-1") + + def testUserBCannotPatchScopeOnFolderOfUserA(self, mockRbacGet): + fakeDb = _FakeDb() + fakeDb.seed(FileFolder, _makeFolder(folderId="fa-1", scope="mandate", userId=_USER_A)) + mockRbacGet.side_effect = _scopeAwareMock(fakeDb) + + compB = _buildComponent(_USER_B, fakeDb) + with pytest.raises(PermissionError): + compB.patchFolderScope("fa-1", "personal") + + def testUserBCannotPatchNeutralizeOnFolderOfUserA(self, mockRbacGet): + fakeDb = _FakeDb() + fakeDb.seed(FileFolder, _makeFolder(folderId="fa-1", scope="mandate", userId=_USER_A)) + mockRbacGet.side_effect = _scopeAwareMock(fakeDb) + + compB = _buildComponent(_USER_B, fakeDb) + with pytest.raises(PermissionError): + compB.patchFolderNeutralize("fa-1", True) + + # ── 8. 
contextOrphan ────────────────────────────────────────────────── + + def testContextOrphanWhenParentFolderNotShared(self, mockRbacGet): + """User A's parent folder is personal, child folder is mandate. + User B sees only the child, flagged as contextOrphan.""" + fakeDb = _FakeDb() + fakeDb.seed(FileFolder, _makeFolder( + folderId="parent-f", name="Private Parent", userId=_USER_A, scope="personal", + )) + fakeDb.seed(FileFolder, _makeFolder( + folderId="child-f", name="Shared Child", userId=_USER_A, + parentId="parent-f", scope="mandate", + )) + mockRbacGet.side_effect = _scopeAwareMock(fakeDb) + + compB = _buildComponent(_USER_B, fakeDb) + shared = compB.getSharedFolderTree() + + assert len(shared) == 1 + assert shared[0]["id"] == "child-f" + assert shared[0]["contextOrphan"] is True + + # ── 9. Shared folder children visible ───────────────────────────────── + + def testSharedFolderMakesChildrenVisible(self, mockRbacGet): + """When User A shares a folder tree (scope=mandate), all child folders + become visible in User B's shared tree.""" + fakeDb = _FakeDb() + fakeDb.seed(FileFolder, _makeFolder( + folderId="root-f", name="Root", userId=_USER_A, scope="mandate", + )) + fakeDb.seed(FileFolder, _makeFolder( + folderId="child1-f", name="Child 1", userId=_USER_A, + parentId="root-f", scope="mandate", + )) + fakeDb.seed(FileFolder, _makeFolder( + folderId="child2-f", name="Child 2", userId=_USER_A, + parentId="root-f", scope="mandate", + )) + fakeDb.seed(FileFolder, _makeFolder( + folderId="grandchild-f", name="Grandchild", userId=_USER_A, + parentId="child1-f", scope="mandate", + )) + mockRbacGet.side_effect = _scopeAwareMock(fakeDb) + + compB = _buildComponent(_USER_B, fakeDb) + shared = compB.getSharedFolderTree() + + sharedIds = {f["id"] for f in shared} + assert sharedIds == {"root-f", "child1-f", "child2-f", "grandchild-f"} + + byId = {f["id"]: f for f in shared} + assert byId["root-f"]["contextOrphan"] is False + assert byId["child1-f"]["contextOrphan"] is False 
+ assert byId["child2-f"]["contextOrphan"] is False + assert byId["grandchild-f"]["contextOrphan"] is False diff --git a/tests/unit/routes/test_folder_crud.py b/tests/unit/routes/test_folder_crud.py new file mode 100644 index 00000000..86eaf480 --- /dev/null +++ b/tests/unit/routes/test_folder_crud.py @@ -0,0 +1,392 @@ +# Copyright (c) 2026 Patrick Motsch +# All rights reserved. +"""Unit tests for folder CRUD operations in ComponentObjects.""" + +import uuid +import pytest +from unittest.mock import Mock, patch, MagicMock +from typing import Dict, Any, List, Optional + +from modules.datamodels.datamodelFiles import FileFolder, FileItem +from modules.datamodels.datamodelUam import User, UserPermissions, AccessLevel +from modules.interfaces.interfaceDbManagement import ComponentObjects, FileNotFoundError + + +_MANDATE_ID = "mandate-test-1" +_FEATURE_INSTANCE_ID = "fi-test-1" +_USER_ID = "user-a-id" + + +# ── Fakes & helpers ────────────────────────────────────────────────────────── + +class _FakeDb: + """In-memory database mock that mimics DatabaseConnector for unit tests.""" + + def __init__(self): + self._tables: Dict[str, Dict[str, Dict[str, Any]]] = {} + self.connection = MagicMock() + + def getRecordset(self, modelClass, recordFilter=None): + tableName = modelClass.__name__ + records = list(self._tables.get(tableName, {}).values()) + if not recordFilter: + return records + return [ + r for r in records + if all(r.get(k) == v for k, v in recordFilter.items()) + ] + + def recordCreate(self, modelClass, data): + tableName = modelClass.__name__ + self._tables.setdefault(tableName, {}) + rec = data.model_dump() if hasattr(data, "model_dump") else dict(data) + rec.setdefault("id", str(uuid.uuid4())) + self._tables[tableName][rec["id"]] = rec + return rec + + def recordModify(self, modelClass, recordId, updates): + tableName = modelClass.__name__ + tbl = self._tables.get(tableName, {}) + if recordId in tbl: + tbl[recordId].update(updates) + return True + return False + 
+ def recordDelete(self, modelClass, recordId): + tableName = modelClass.__name__ + tbl = self._tables.get(tableName, {}) + if recordId in tbl: + del tbl[recordId] + return True + return False + + def updateContext(self, userId): + pass + + def _ensure_connection(self): + pass + + def _ensureTableExists(self, modelClass): + return True + + def seed(self, modelClass, record: Dict[str, Any]): + tableName = modelClass.__name__ + self._tables.setdefault(tableName, {}) + self._tables[tableName][record["id"]] = dict(record) + + +def _makeUser(userId=_USER_ID, username="testuser"): + return User(id=userId, username=username, language="en") + + +def _makeRbac( + createLevel=AccessLevel.ALL, + readLevel=AccessLevel.ALL, + updateLevel=AccessLevel.ALL, + deleteLevel=AccessLevel.ALL, +): + rbac = Mock() + perms = UserPermissions( + view=True, + read=readLevel, + create=createLevel, + update=updateLevel, + delete=deleteLevel, + ) + rbac.getUserPermissions.return_value = perms + return rbac + + +def _buildComponent( + userId=_USER_ID, + fakeDb=None, + rbac=None, + mandateId=_MANDATE_ID, + featureInstanceId=_FEATURE_INSTANCE_ID, +): + """Construct a ComponentObjects with mocked internals (no real DB).""" + with patch.object(ComponentObjects, "__init__", lambda self: None): + comp = ComponentObjects() + comp.db = fakeDb or _FakeDb() + comp.currentUser = _makeUser(userId) + comp.userId = userId + comp.mandateId = mandateId + comp.featureInstanceId = featureInstanceId + comp.rbac = rbac or _makeRbac() + comp.userLanguage = "en" + return comp + + +def _rbacFromFakeDb(fakeDb): + """Side-effect for getRecordsetWithRBAC that delegates to _FakeDb.""" + def _fn(connector, modelClass, currentUser, recordFilter=None, **kwargs): + return fakeDb.getRecordset(modelClass, recordFilter=recordFilter) + return _fn + + +def _makeFolder( + folderId=None, name="Folder", parentId=None, + userId=_USER_ID, scope="personal", neutralize=False, +): + return { + "id": folderId or str(uuid.uuid4()), + 
"name": name, + "parentId": parentId, + "mandateId": _MANDATE_ID, + "featureInstanceId": _FEATURE_INSTANCE_ID, + "scope": scope, + "neutralize": neutralize, + "sysCreatedBy": userId, + "sysCreatedAt": 1700000000.0, + "sysModifiedAt": 1700000000.0, + "sysModifiedBy": None, + } + + +def _makeFile(fileId=None, folderId=None, userId=_USER_ID, scope="personal"): + return { + "id": fileId or str(uuid.uuid4()), + "fileName": "test.txt", + "mimeType": "text/plain", + "fileHash": "abc123", + "fileSize": 100, + "folderId": folderId, + "mandateId": _MANDATE_ID, + "featureInstanceId": _FEATURE_INSTANCE_ID, + "scope": scope, + "neutralize": False, + "sysCreatedBy": userId, + "sysCreatedAt": 1700000000.0, + "sysModifiedAt": 1700000000.0, + "sysModifiedBy": None, + "tags": None, + "description": None, + "status": None, + } + + +# ── Test class ─────────────────────────────────────────────────────────────── + +@patch("modules.interfaces.interfaceDbManagement.getRecordsetWithRBAC") +class TestFolderCrud: + """Tests for folder create / rename / move / delete / patch operations.""" + + # ── Create ──────────────────────────────────────────────────────────── + + def testCreateFolderHappyPath(self, mockRbacGet): + fakeDb = _FakeDb() + comp = _buildComponent(fakeDb=fakeDb) + mockRbacGet.side_effect = _rbacFromFakeDb(fakeDb) + + result = comp.createFolder("Test Folder") + + assert result["name"] == "Test Folder" + assert result["scope"] == "personal" + assert result["parentId"] is None + assert result["mandateId"] == _MANDATE_ID + + def testCreateFolderWithParent(self, mockRbacGet): + fakeDb = _FakeDb() + fakeDb.seed(FileFolder, _makeFolder(folderId="parent-1", name="Parent")) + comp = _buildComponent(fakeDb=fakeDb) + mockRbacGet.side_effect = _rbacFromFakeDb(fakeDb) + + result = comp.createFolder("Child Folder", parentId="parent-1") + + assert result["name"] == "Child Folder" + assert result["parentId"] == "parent-1" + + def testCreateFolderMissingNameNoInterfaceValidation(self, 
mockRbacGet): + """Interface does not validate empty name; the route layer returns 400.""" + fakeDb = _FakeDb() + comp = _buildComponent(fakeDb=fakeDb) + mockRbacGet.side_effect = _rbacFromFakeDb(fakeDb) + + result = comp.createFolder("") + assert result["name"] == "" + + # ── Rename ──────────────────────────────────────────────────────────── + + def testRenameFolderHappyPath(self, mockRbacGet): + fakeDb = _FakeDb() + fakeDb.seed(FileFolder, _makeFolder(folderId="f-1", name="Old Name")) + comp = _buildComponent(fakeDb=fakeDb) + mockRbacGet.side_effect = _rbacFromFakeDb(fakeDb) + + result = comp.renameFolder("f-1", "New Name") + + assert result["name"] == "New Name" + assert fakeDb.getRecordset(FileFolder, {"id": "f-1"})[0]["name"] == "New Name" + + def testRenameFolderNotFound(self, mockRbacGet): + fakeDb = _FakeDb() + comp = _buildComponent(fakeDb=fakeDb) + mockRbacGet.side_effect = _rbacFromFakeDb(fakeDb) + + with pytest.raises(FileNotFoundError): + comp.renameFolder("nonexistent", "New Name") + + # ── Move ────────────────────────────────────────────────────────────── + + def testMoveFolderHappyPath(self, mockRbacGet): + fakeDb = _FakeDb() + fakeDb.seed(FileFolder, _makeFolder(folderId="f-1", name="Movable")) + fakeDb.seed(FileFolder, _makeFolder(folderId="t-1", name="Target")) + comp = _buildComponent(fakeDb=fakeDb) + mockRbacGet.side_effect = _rbacFromFakeDb(fakeDb) + + result = comp.moveFolder("f-1", "t-1") + + assert result["parentId"] == "t-1" + assert fakeDb.getRecordset(FileFolder, {"id": "f-1"})[0]["parentId"] == "t-1" + + def testMoveFolderToRoot(self, mockRbacGet): + fakeDb = _FakeDb() + fakeDb.seed(FileFolder, _makeFolder(folderId="f-1", name="Nested", parentId="old")) + comp = _buildComponent(fakeDb=fakeDb) + mockRbacGet.side_effect = _rbacFromFakeDb(fakeDb) + + result = comp.moveFolder("f-1", None) + + assert result["parentId"] is None + + def testMoveFolderCircularReference(self, mockRbacGet): + """A -> B -> C: moving A under C creates a cycle.""" 
+ fakeDb = _FakeDb() + fakeDb.seed(FileFolder, _makeFolder(folderId="a", name="A", parentId=None)) + fakeDb.seed(FileFolder, _makeFolder(folderId="b", name="B", parentId="a")) + fakeDb.seed(FileFolder, _makeFolder(folderId="c", name="C", parentId="b")) + comp = _buildComponent(fakeDb=fakeDb) + mockRbacGet.side_effect = _rbacFromFakeDb(fakeDb) + + with pytest.raises(ValueError, match="circular reference"): + comp.moveFolder("a", "c") + + # ── Delete cascade ──────────────────────────────────────────────────── + + def testDeleteFolderCascade(self, mockRbacGet): + """Deleting root folder removes root + child + their files.""" + fakeDb = _FakeDb() + fakeDb.seed(FileFolder, _makeFolder(folderId="root", name="Root")) + fakeDb.seed(FileFolder, _makeFolder(folderId="child", name="Child", parentId="root")) + fakeDb.seed(FileItem, _makeFile(fileId="file-1", folderId="root")) + fakeDb.seed(FileItem, _makeFile(fileId="file-2", folderId="child")) + comp = _buildComponent(fakeDb=fakeDb) + mockRbacGet.side_effect = _rbacFromFakeDb(fakeDb) + + result = comp.deleteFolderCascade("root") + + assert result["deletedFolders"] == 2 + assert result["deletedFiles"] == 2 + + def testDeleteFolderNotFound(self, mockRbacGet): + fakeDb = _FakeDb() + comp = _buildComponent(fakeDb=fakeDb) + mockRbacGet.side_effect = _rbacFromFakeDb(fakeDb) + + with pytest.raises(FileNotFoundError): + comp.deleteFolderCascade("nonexistent") + + # ── Patch scope ─────────────────────────────────────────────────────── + + def testPatchScopeNoCascade(self, mockRbacGet): + """Change folder scope without cascading to files.""" + fakeDb = _FakeDb() + fakeDb.seed(FileFolder, _makeFolder(folderId="f-1", scope="personal")) + fakeDb.seed(FileItem, _makeFile(fileId="file-1", folderId="f-1")) + comp = _buildComponent(fakeDb=fakeDb) + mockRbacGet.side_effect = _rbacFromFakeDb(fakeDb) + + result = comp.patchFolderScope("f-1", "mandate", cascadeToFiles=False) + + assert result["scope"] == "mandate" + assert 
result["filesUpdated"] == 0 + assert fakeDb.getRecordset(FileFolder, {"id": "f-1"})[0]["scope"] == "mandate" + assert fakeDb.getRecordset(FileItem, {"id": "file-1"})[0]["scope"] == "personal" + + def testPatchScopeWithCascade(self, mockRbacGet): + """cascadeToFiles=True updates only owned files in the folder.""" + fakeDb = _FakeDb() + fakeDb.seed(FileFolder, _makeFolder(folderId="f-1", scope="personal")) + fakeDb.seed(FileItem, _makeFile(fileId="own-file", folderId="f-1")) + fakeDb.seed(FileItem, _makeFile(fileId="other-file", folderId="f-1", userId="user-b")) + comp = _buildComponent(fakeDb=fakeDb) + mockRbacGet.side_effect = _rbacFromFakeDb(fakeDb) + + result = comp.patchFolderScope("f-1", "mandate", cascadeToFiles=True) + + assert result["filesUpdated"] == 1 + assert fakeDb.getRecordset(FileItem, {"id": "own-file"})[0]["scope"] == "mandate" + assert fakeDb.getRecordset(FileItem, {"id": "other-file"})[0]["scope"] == "personal" + + def testPatchScopeInvalid(self, mockRbacGet): + fakeDb = _FakeDb() + fakeDb.seed(FileFolder, _makeFolder(folderId="f-1")) + comp = _buildComponent(fakeDb=fakeDb) + mockRbacGet.side_effect = _rbacFromFakeDb(fakeDb) + + with pytest.raises(ValueError, match="Invalid scope"): + comp.patchFolderScope("f-1", "invalid_scope") + + # ── Patch neutralize ────────────────────────────────────────────────── + + def testPatchNeutralizeToggle(self, mockRbacGet): + fakeDb = _FakeDb() + fakeDb.seed(FileFolder, _makeFolder(folderId="f-1", neutralize=False)) + fakeDb.seed(FileItem, _makeFile(fileId="file-1", folderId="f-1")) + comp = _buildComponent(fakeDb=fakeDb) + mockRbacGet.side_effect = _rbacFromFakeDb(fakeDb) + + resultOn = comp.patchFolderNeutralize("f-1", True) + assert resultOn["neutralize"] is True + assert resultOn["filesUpdated"] == 1 + assert fakeDb.getRecordset(FileFolder, {"id": "f-1"})[0]["neutralize"] is True + assert fakeDb.getRecordset(FileItem, {"id": "file-1"})[0]["neutralize"] is True + + resultOff = comp.patchFolderNeutralize("f-1", 
False) + assert resultOff["neutralize"] is False + assert fakeDb.getRecordset(FileItem, {"id": "file-1"})[0]["neutralize"] is False + + # ── Tree queries ────────────────────────────────────────────────────── + + def testGetOwnFolderTree(self, mockRbacGet): + fakeDb = _FakeDb() + fakeDb.seed(FileFolder, _makeFolder(folderId="own-1", name="Mine")) + fakeDb.seed(FileFolder, _makeFolder(folderId="other-1", name="Theirs", userId="user-b")) + comp = _buildComponent(fakeDb=fakeDb) + mockRbacGet.side_effect = _rbacFromFakeDb(fakeDb) + + result = comp.getOwnFolderTree() + + assert len(result) == 1 + assert result[0]["id"] == "own-1" + + def testGetSharedFolderTreeWithContextOrphan(self, mockRbacGet): + fakeDb = _FakeDb() + fakeDb.seed(FileFolder, _makeFolder(folderId="own", name="Own")) + fakeDb.seed(FileFolder, _makeFolder( + folderId="shared-root", name="Shared Root", userId="user-b", scope="mandate", + )) + fakeDb.seed(FileFolder, _makeFolder( + folderId="shared-child", name="Shared Child", userId="user-b", + parentId="shared-root", scope="mandate", + )) + fakeDb.seed(FileFolder, _makeFolder( + folderId="orphan", name="Orphan", userId="user-b", + parentId="invisible-parent", scope="mandate", + )) + comp = _buildComponent(fakeDb=fakeDb) + mockRbacGet.side_effect = _rbacFromFakeDb(fakeDb) + + result = comp.getSharedFolderTree() + + ids = {r["id"] for r in result} + assert "own" not in ids + assert "shared-root" in ids + assert "shared-child" in ids + assert "orphan" in ids + + byId = {r["id"]: r for r in result} + assert byId["shared-root"]["contextOrphan"] is False + assert byId["shared-child"]["contextOrphan"] is False + assert byId["orphan"]["contextOrphan"] is True From e93ce7117449772afb10d7d5e47353ca9c0c1e2d Mon Sep 17 00:00:00 2001 From: ValueOn AG Date: Sun, 3 May 2026 22:19:23 +0200 Subject: [PATCH 18/18] fixed ux for expand object scrolling --- .../workflows/automation2/executors/actionNodeExecutor.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) 
diff --git a/modules/workflows/automation2/executors/actionNodeExecutor.py b/modules/workflows/automation2/executors/actionNodeExecutor.py index 6162aa2d..163ed3b2 100644 --- a/modules/workflows/automation2/executors/actionNodeExecutor.py +++ b/modules/workflows/automation2/executors/actionNodeExecutor.py @@ -377,7 +377,11 @@ class ActionNodeExecutor: if nodeType.startswith("ai."): out["prompt"] = promptText out["response"] = extractedContext - out["context"] = f"{promptText}\n\n{extractedContext}" if promptText and extractedContext else (extractedContext or promptText) + inputContext = resolvedParams.get("context") + if inputContext is not None: + out["context"] = inputContext if isinstance(inputContext, str) else json.dumps(inputContext, ensure_ascii=False, default=str) + else: + out["context"] = "" # Structured output if extractedContext: try: