P0: ingestion facade

Ida 2026-04-21 11:27:44 +02:00
parent ba21005401
commit 9d82d3d353
7 changed files with 536 additions and 29 deletions

View file

@@ -174,14 +174,26 @@ async def indexSessionData(
             for c in chunks
         ]
-        await knowledgeService.indexFile(
-            fileId=syntheticFileId,
-            fileName=f"coaching-session-{sessionId[:8]}",
-            mimeType="application/x-coaching-session",
-            userId=userId,
-            featureInstanceId=featureInstanceId,
-            mandateId=mandateId,
-            contentObjects=contentObjects,
+        from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
+        await knowledgeService.requestIngestion(
+            IngestionJob(
+                sourceKind="coaching_session",
+                sourceId=syntheticFileId,
+                fileName=f"coaching-session-{sessionId[:8]}",
+                mimeType="application/x-coaching-session",
+                userId=userId,
+                featureInstanceId=featureInstanceId,
+                mandateId=mandateId,
+                contentObjects=contentObjects,
+                provenance={
+                    "lane": "feature",
+                    "feature": "commcoach",
+                    "sessionId": sessionId,
+                    "contextId": contextId,
+                    "messageCount": len(messages or []),
+                },
+            )
         )
         logger.info(f"Successfully indexed coaching session {sessionId} ({len(chunks)} chunks)")
     except Exception as e:

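A short sketch of what this call site gains from the facade, assuming a job built exactly as in the hunk above; the handle type (IngestionHandle), the jobId format, and the "duplicate" status come from the service diff further down.

# Sketch only; requestIngestion and the handle fields are defined later in this commit.
async def indexSessionSketch(knowledgeService, job, sessionId, logger):
    handle = await knowledgeService.requestIngestion(job)
    # handle.jobId == f"coaching_session:{job.sourceId}"
    if handle.status == "duplicate":
        # Unchanged content hash: chunks and embeddings were left as-is.
        logger.info(f"Coaching session {sessionId} unchanged, skipped re-embedding")
    return handle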
View file

@@ -77,7 +77,7 @@ async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user):
     """Background task: pre-scan + extraction + knowledge indexing.
     Step 1: Structure Pre-Scan (AI-free) -> FileContentIndex (persisted)
     Step 2: Content extraction via runExtraction -> ContentParts
-    Step 3: KnowledgeService.indexFile -> chunking + embedding -> Knowledge Store"""
+    Step 3: KnowledgeService.requestIngestion -> idempotent chunking + embedding -> Knowledge Store"""
     userId = user.id if hasattr(user, "id") else str(user)
     try:
         mgmtInterface = interfaceDbManagement.getInterface(user)
@@ -181,15 +181,21 @@ async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user):
         )
         knowledgeService = getService("knowledge", ctx)
-        await knowledgeService.indexFile(
-            fileId=fileId,
-            fileName=fileName,
-            mimeType=mimeType,
-            userId=userId,
-            featureInstanceId=str(feature_instance_id) if feature_instance_id else "",
-            mandateId=str(mandate_id) if mandate_id else "",
-            contentObjects=contentObjects,
-            structure=contentIndex.structure,
+        from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
+        await knowledgeService.requestIngestion(
+            IngestionJob(
+                sourceKind="file",
+                sourceId=fileId,
+                fileName=fileName,
+                mimeType=mimeType,
+                userId=userId,
+                featureInstanceId=str(feature_instance_id) if feature_instance_id else "",
+                mandateId=str(mandate_id) if mandate_id else "",
+                contentObjects=contentObjects,
+                structure=contentIndex.structure,
+                provenance={"lane": "upload", "route": "routeDataFiles._autoIndexFile"},
+            )
         )
         # Re-acquire interface after await to avoid stale user context from the singleton

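The docstring's three steps reduce to roughly the following shape. A hedged sketch: preScan and extract stand in for the real pre-scan and extraction calls, whose signatures this diff does not show; only IngestionJob and requestIngestion are taken from this commit.

# Sketch of the _autoIndexFile pipeline described in the docstring above.
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob

async def autoIndexSketch(fileId, fileName, mimeType, userId, knowledgeService, preScan, extract):
    structure = preScan(fileId)             # Step 1: AI-free structure pre-scan (helper assumed)
    contentObjects = await extract(fileId)  # Step 2: extraction -> content objects (signature assumed)
    return await knowledgeService.requestIngestion(  # Step 3: idempotent indexing via the facade
        IngestionJob(
            sourceKind="file",
            sourceId=fileId,
            fileName=fileName,
            mimeType=mimeType,
            userId=userId,
            contentObjects=contentObjects,
            structure=structure,
            provenance={"lane": "upload", "route": "routeDataFiles._autoIndexFile"},
        )
    )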
View file

@@ -434,11 +434,19 @@ def _registerDocumentTools(registry: ToolRegistry, services):
         if contentObjects:
             _diFiId, _diMId = _resolveFileScope(fileId, context)
-            await knowledgeService.indexFile(
-                fileId=fileId, fileName=fileName, mimeType=fileMime,
-                userId=context.get("userId", ""), contentObjects=contentObjects,
-                featureInstanceId=_diFiId,
-                mandateId=_diMId,
+            from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
+            await knowledgeService.requestIngestion(
+                IngestionJob(
+                    sourceKind="agent_tool",
+                    sourceId=fileId,
+                    fileName=fileName,
+                    mimeType=fileMime,
+                    userId=context.get("userId", ""),
+                    contentObjects=contentObjects,
+                    featureInstanceId=_diFiId,
+                    mandateId=_diMId,
+                    provenance={"lane": "agent", "tool": "describeImage"},
+                )
             )
             chunks = knowledgeService._knowledgeDb.getContentChunks(fileId)

View file

@@ -132,11 +132,19 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
         try:
             userId = context.get("userId", "")
             _fiId, _mId = _resolveFileScope(fileId, context)
-            await knowledgeService.indexFile(
-                fileId=fileId, fileName=fileName, mimeType=mimeType,
-                userId=userId, contentObjects=contentObjects,
-                featureInstanceId=_fiId,
-                mandateId=_mId,
+            from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
+            await knowledgeService.requestIngestion(
+                IngestionJob(
+                    sourceKind="agent_tool",
+                    sourceId=fileId,
+                    fileName=fileName,
+                    mimeType=mimeType,
+                    userId=userId,
+                    contentObjects=contentObjects,
+                    featureInstanceId=_fiId,
+                    mandateId=_mId,
+                    provenance={"lane": "agent", "tool": "readFile"},
+                )
             )
         except Exception as e:
             logger.warning(f"readFile: knowledge indexing failed for {fileId}: {e}")

View file

@@ -2,9 +2,13 @@
 # All rights reserved.
 """Knowledge service: 3-tier RAG with indexing, semantic search, and context building."""
 import hashlib
+import json
 import logging
 import re
-from typing import Any, Callable, Dict, List, Optional
+import time
+from dataclasses import dataclass, field
+from typing import Any, Callable, Dict, List, Optional, Union
 from modules.datamodels.datamodelKnowledge import (
     FileContentIndex, ContentChunk, WorkflowMemory,
@@ -20,6 +24,65 @@ DEFAULT_CHUNK_TOKENS = 400
 DEFAULT_CONTEXT_BUDGET = 12000
+
+# =============================================================================
+# Ingestion façade (P0 of unified-knowledge-indexing concept)
+# =============================================================================
+@dataclass
+class IngestionJob:
+    """One request to add or refresh content in the unified knowledge store.
+
+    Callers from any lane (routes, feature hooks, agent tools, connector sync)
+    describe the work they want done via this object; idempotency, scope
+    resolution, and embedding are handled by KnowledgeService.requestIngestion.
+    """
+
+    sourceKind: str
+    sourceId: str
+    fileName: str
+    mimeType: str
+    userId: str
+    contentObjects: List[Dict[str, Any]] = field(default_factory=list)
+    featureInstanceId: str = ""
+    mandateId: str = ""
+    structure: Optional[Dict[str, Any]] = None
+    containerPath: Optional[str] = None
+    contentVersion: Optional[str] = None
+    provenance: Optional[Dict[str, Any]] = None
+
+
+@dataclass
+class IngestionHandle:
+    """Result of requestIngestion. Stable across in-process and future queue impls."""
+
+    jobId: str
+    status: str
+    contentHash: str
+    fileId: str
+    index: Optional[FileContentIndex] = None
+    error: Optional[str] = None
+
+
+def _computeIngestionHash(contentObjects: List[Dict[str, Any]]) -> str:
+    """Deterministic SHA256 over (contentObjectId, contentType, data) tuples.
+
+    Sorted by contentObjectId so re-ordering of extractor output does not
+    invalidate the cache; text whitespace is preserved intentionally because
+    chunk boundaries depend on it.
+    """
+    normalized = sorted(
+        (
+            (
+                str(o.get("contentObjectId", "") or ""),
+                str(o.get("contentType", "text") or "text"),
+                o.get("data", "") or "",
+            )
+            for o in (contentObjects or [])
+        ),
+        key=lambda t: t[0],
+    )
+    payload = json.dumps(normalized, ensure_ascii=False, separators=(",", ":"))
+    return hashlib.sha256(payload.encode("utf-8")).hexdigest()
+
+
 class KnowledgeService:
     """Service for Knowledge Store operations: indexing, retrieval, and context building."""
@@ -46,6 +109,196 @@ class KnowledgeService:
         results = await self._embed([text])
         return results[0] if results else []
+
+    # =========================================================================
+    # Ingestion façade (single entry point for all lanes)
+    # =========================================================================
+    async def requestIngestion(self, job: IngestionJob) -> IngestionHandle:
+        """Unified entry point for filling the knowledge corpus.
+
+        Applies idempotency based on a content hash (or caller-supplied
+        `contentVersion`) persisted in `FileContentIndex.structure._ingestion`.
+        Re-runs indexing only when the hash differs or the previous run did
+        not reach `indexed` state. Runs embedding synchronously for now
+        (callers already schedule background tasks where needed).
+        """
+        jobId = f"{job.sourceKind}:{job.sourceId}"
+        startMs = time.time()
+        contentHash = job.contentVersion or _computeIngestionHash(job.contentObjects)
+
+        # 1. Check for duplicate via existing FileContentIndex row.
+        existing = None
+        try:
+            existing = self._knowledgeDb.getFileContentIndex(job.sourceId)
+        except Exception:
+            existing = None
+        if existing:
+            existingStructure = (
+                existing.get("structure") if isinstance(existing, dict)
+                else getattr(existing, "structure", {})
+            ) or {}
+            existingMeta = existingStructure.get("_ingestion", {}) or {}
+            existingStatus = (
+                existing.get("status") if isinstance(existing, dict)
+                else getattr(existing, "status", "")
+            ) or ""
+            if existingMeta.get("hash") == contentHash and existingStatus == "indexed":
+                logger.info(
+                    "ingestion.skipped.duplicate sourceKind=%s sourceId=%s hash=%s",
+                    job.sourceKind, job.sourceId, contentHash[:12],
+                    extra={
+                        "event": "ingestion.skipped.duplicate",
+                        "jobId": jobId,
+                        "sourceKind": job.sourceKind,
+                        "sourceId": job.sourceId,
+                        "hash": contentHash,
+                        "durationMs": int((time.time() - startMs) * 1000),
+                    },
+                )
+                return IngestionHandle(
+                    jobId=jobId,
+                    status="duplicate",
+                    contentHash=contentHash,
+                    fileId=job.sourceId,
+                    index=None,
+                )
+
+        # 2. Prepare ingestion metadata; stays in structure._ingestion so
+        # later connector revoke/purge can filter chunks by sourceKind /
+        # provenance.connectionId without a schema migration.
+        ingestionMeta = {
+            "hash": contentHash,
+            "sourceKind": job.sourceKind,
+            "sourceId": job.sourceId,
+            "contentVersion": job.contentVersion,
+            "indexedAt": getUtcTimestamp(),
+            "provenance": dict(job.provenance or {}),
+        }
+        structure = dict(job.structure or {})
+        structure["_ingestion"] = ingestionMeta
+        logger.info(
+            "ingestion.queued sourceKind=%s sourceId=%s objects=%d hash=%s",
+            job.sourceKind, job.sourceId, len(job.contentObjects or []), contentHash[:12],
+            extra={
+                "event": "ingestion.queued",
+                "jobId": jobId,
+                "sourceKind": job.sourceKind,
+                "sourceId": job.sourceId,
+                "hash": contentHash,
+                "objectCount": len(job.contentObjects or []),
+            },
+        )
+
+        # 3. Run real indexing.
+        try:
+            index = await self._indexFileInternal(
+                fileId=job.sourceId,
+                fileName=job.fileName,
+                mimeType=job.mimeType,
+                userId=job.userId,
+                featureInstanceId=job.featureInstanceId,
+                mandateId=job.mandateId,
+                contentObjects=job.contentObjects or [],
+                structure=structure,
+                containerPath=job.containerPath,
+            )
+        except Exception as exc:
+            logger.error(
+                "ingestion.failed sourceKind=%s sourceId=%s error=%s",
+                job.sourceKind, job.sourceId, exc,
+                exc_info=True,
+                extra={
+                    "event": "ingestion.failed",
+                    "jobId": jobId,
+                    "sourceKind": job.sourceKind,
+                    "sourceId": job.sourceId,
+                    "hash": contentHash,
+                    "error": str(exc),
+                    "durationMs": int((time.time() - startMs) * 1000),
+                },
+            )
+            try:
+                self._knowledgeDb.updateFileStatus(job.sourceId, "failed")
+            except Exception:
+                pass
+            return IngestionHandle(
+                jobId=jobId,
+                status="failed",
+                contentHash=contentHash,
+                fileId=job.sourceId,
+                index=None,
+                error=str(exc),
+            )
+
+        logger.info(
+            "ingestion.indexed sourceKind=%s sourceId=%s objects=%d durationMs=%d",
+            job.sourceKind, job.sourceId, len(job.contentObjects or []),
+            int((time.time() - startMs) * 1000),
+            extra={
+                "event": "ingestion.indexed",
+                "jobId": jobId,
+                "sourceKind": job.sourceKind,
+                "sourceId": job.sourceId,
+                "hash": contentHash,
+                "objectCount": len(job.contentObjects or []),
+                "durationMs": int((time.time() - startMs) * 1000),
+            },
+        )
+        return IngestionHandle(
+            jobId=jobId,
+            status="indexed",
+            contentHash=contentHash,
+            fileId=job.sourceId,
+            index=index,
+        )
+
+    def getIngestionStatus(
+        self, handleOrJobId: Union[IngestionHandle, str]
+    ) -> Dict[str, Any]:
+        """Map a handle or `sourceKind:sourceId` jobId to a status snapshot."""
+        if isinstance(handleOrJobId, IngestionHandle):
+            sourceId = handleOrJobId.fileId
+            jobId = handleOrJobId.jobId
+        elif isinstance(handleOrJobId, str) and ":" in handleOrJobId:
+            jobId = handleOrJobId
+            sourceId = handleOrJobId.split(":", 1)[1]
+        else:
+            jobId = str(handleOrJobId)
+            sourceId = str(handleOrJobId)
+        row = None
+        try:
+            row = self._knowledgeDb.getFileContentIndex(sourceId)
+        except Exception:
+            row = None
+        if not row:
+            return {
+                "jobId": jobId,
+                "sourceId": sourceId,
+                "status": "unknown",
+                "contentHash": None,
+            }
+        structure = (
+            row.get("structure") if isinstance(row, dict)
+            else getattr(row, "structure", {})
+        ) or {}
+        meta = structure.get("_ingestion", {}) or {}
+        status = (
+            row.get("status") if isinstance(row, dict)
+            else getattr(row, "status", "")
+        ) or "unknown"
+        return {
+            "jobId": jobId,
+            "sourceId": sourceId,
+            "status": status,
+            "contentHash": meta.get("hash"),
+            "sourceKind": meta.get("sourceKind"),
+            "indexedAt": meta.get("indexedAt"),
+        }
+
     # =========================================================================
     # File Indexing (called after extraction, before embedding)
     # =========================================================================
@@ -61,6 +314,54 @@ class KnowledgeService:
         contentObjects: List[Dict[str, Any]] = None,
         structure: Dict[str, Any] = None,
         containerPath: str = None,
     ) -> Optional[FileContentIndex]:
+        """Backward-compatible wrapper delegating to requestIngestion.
+
+        Existing callers that still invoke `indexFile` directly automatically
+        participate in the idempotency/metrics layer. New callers should
+        prefer `requestIngestion` so they can pass `sourceKind` and
+        `provenance` for connector revoke/purge later.
+        """
+        job = IngestionJob(
+            sourceKind="file",
+            sourceId=fileId,
+            fileName=fileName,
+            mimeType=mimeType,
+            userId=userId,
+            featureInstanceId=featureInstanceId,
+            mandateId=mandateId,
+            contentObjects=list(contentObjects or []),
+            structure=structure,
+            containerPath=containerPath,
+        )
+        handle = await self.requestIngestion(job)
+        if handle.index is not None:
+            return handle.index
+        if handle.status == "duplicate":
+            row = None
+            try:
+                row = self._knowledgeDb.getFileContentIndex(fileId)
+            except Exception:
+                row = None
+            if isinstance(row, dict):
+                try:
+                    return FileContentIndex(**row)
+                except Exception:
+                    return None
+            return row
+        return None
+
+    async def _indexFileInternal(
+        self,
+        fileId: str,
+        fileName: str,
+        mimeType: str,
+        userId: str,
+        featureInstanceId: str = "",
+        mandateId: str = "",
+        contentObjects: List[Dict[str, Any]] = None,
+        structure: Dict[str, Any] = None,
+        containerPath: str = None,
+    ) -> FileContentIndex:
         """Index a file's content objects and create embeddings for text chunks.

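Two properties of the facade worth calling out, shown as a short sketch using only names from this diff (both are also exercised by the new unit tests below): the content hash ignores content-object ordering, and a status snapshot can be fetched from either the handle or the "sourceKind:sourceId" job id.

# Sketch: hash stability and status lookup against the facade defined above.
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import (
    IngestionJob, _computeIngestionHash,
)

objs = [
    {"contentObjectId": "c1", "contentType": "text", "data": "alpha"},
    {"contentObjectId": "c2", "contentType": "text", "data": "beta"},
]
# Reordering extractor output does not change the hash ...
assert _computeIngestionHash(objs) == _computeIngestionHash(list(reversed(objs)))
# ... but dropping or editing data does.
assert _computeIngestionHash(objs) != _computeIngestionHash(objs[:1])

async def showStatus(knowledgeService):
    job = IngestionJob(
        sourceKind="file", sourceId="file-123", fileName="notes.txt",
        mimeType="text/plain", userId="u1", contentObjects=objs,
    )
    handle = await knowledgeService.requestIngestion(job)
    # getIngestionStatus accepts either the handle or the "sourceKind:sourceId" jobId.
    snapshot = knowledgeService.getIngestionStatus("file:file-123")
    assert snapshot["contentHash"] == handle.contentHash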
View file

View file

@@ -0,0 +1,172 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Unit tests for the P0 ingestion facade on KnowledgeService.

Covers acceptance criteria AC4 (idempotent ingestion for unchanged content)
and hash stability. The knowledge DB interface and AI embedding service are
stubbed so the test runs without any external dependency.
"""
from unittest.mock import MagicMock, patch

import pytest

from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import (
    IngestionJob,
    KnowledgeService,
    _computeIngestionHash,
)


class _StubKnowledgeDb:
    """Minimal in-memory stand-in for interfaceDbKnowledge."""

    def __init__(self):
        self.index = None
        self.upsertIndexCalls = 0
        self.upsertChunkCalls = 0

    def upsertFileContentIndex(self, index):
        self.index = index.model_dump() if hasattr(index, "model_dump") else dict(index)
        self.upsertIndexCalls += 1

    def upsertContentChunk(self, chunk):
        self.upsertChunkCalls += 1

    def updateFileStatus(self, fileId, status):
        if self.index is not None:
            self.index["status"] = status

    def getFileContentIndex(self, fileId):
        return self.index


def _makeService():
    """Create a KnowledgeService with stubbed db and ai dependencies."""
    stubDb = _StubKnowledgeDb()
    aiService = MagicMock()

    async def _callEmbedding(texts):
        return MagicMock(
            errorCount=0,
            content="",
            metadata={"embeddings": [[0.0] * 4 for _ in texts]},
        )

    aiService.callEmbedding = _callEmbedding

    def getService(name):
        if name == "ai":
            return aiService
        raise KeyError(name)

    context = MagicMock()
    context.user = MagicMock()
    # Return a non-empty but empty-dict record so the FileItem lookup branch
    # in _indexFileInternal resolves without touching a real DB.
    context.interfaceDbComponent = MagicMock()
    context.interfaceDbComponent.getRecordset = MagicMock(return_value=[{}])
    with patch(
        "modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge.getKnowledgeInterface",
        return_value=stubDb,
    ):
        service = KnowledgeService(context, getService)
    return service, stubDb


@pytest.mark.asyncio
async def test_duplicate_skipped():
    service, db = _makeService()
    job = IngestionJob(
        sourceKind="file",
        sourceId="file-123",
        fileName="a.txt",
        mimeType="text/plain",
        userId="u1",
        contentObjects=[
            {"contentObjectId": "c1", "contentType": "text", "data": "hello world"}
        ],
    )
    first = await service.requestIngestion(job)
    assert first.status == "indexed"
    chunksAfterFirst = db.upsertChunkCalls
    assert chunksAfterFirst >= 1

    second = await service.requestIngestion(job)
    assert second.status == "duplicate"
    assert second.contentHash == first.contentHash
    # No additional embedding work.
    assert db.upsertChunkCalls == chunksAfterFirst


@pytest.mark.asyncio
async def test_reindex_on_content_change():
    service, db = _makeService()
    base = IngestionJob(
        sourceKind="file",
        sourceId="file-123",
        fileName="a.txt",
        mimeType="text/plain",
        userId="u1",
        contentObjects=[
            {"contentObjectId": "c1", "contentType": "text", "data": "hello world"}
        ],
    )
    first = await service.requestIngestion(base)
    assert first.status == "indexed"
    chunksAfterFirst = db.upsertChunkCalls

    changed = IngestionJob(
        sourceKind="file",
        sourceId="file-123",
        fileName="a.txt",
        mimeType="text/plain",
        userId="u1",
        contentObjects=[
            {"contentObjectId": "c1", "contentType": "text", "data": "hello universe"}
        ],
    )
    second = await service.requestIngestion(changed)
    assert second.status == "indexed"
    assert second.contentHash != first.contentHash
    assert db.upsertChunkCalls > chunksAfterFirst


def test_hash_stable_under_reordering():
    a = [
        {"contentObjectId": "c1", "contentType": "text", "data": "alpha"},
        {"contentObjectId": "c2", "contentType": "text", "data": "beta"},
    ]
    b = list(reversed(a))
    assert _computeIngestionHash(a) == _computeIngestionHash(b)


def test_hash_changes_on_data_edit():
    a = [{"contentObjectId": "c1", "contentType": "text", "data": "alpha"}]
    b = [{"contentObjectId": "c1", "contentType": "text", "data": "alpha!"}]
    assert _computeIngestionHash(a) != _computeIngestionHash(b)


@pytest.mark.asyncio
async def test_get_ingestion_status_after_index():
    service, _db = _makeService()
    job = IngestionJob(
        sourceKind="coaching_session",
        sourceId="coaching-session:abc",
        fileName="session",
        mimeType="application/x-coaching-session",
        userId="u1",
        contentObjects=[
            {"contentObjectId": "m0", "contentType": "text", "data": "User: hi"}
        ],
        provenance={"lane": "feature", "feature": "commcoach"},
    )
    handle = await service.requestIngestion(job)
    status = service.getIngestionStatus(handle)
    assert status["status"] == "indexed"
    assert status["sourceKind"] == "coaching_session"
    assert status["contentHash"] == handle.contentHash
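
One forward-looking note: the `_ingestion` metadata persisted in `FileContentIndex.structure` is what the planned connector revoke/purge would filter on (see the comment in `requestIngestion`). Purge is not part of this commit; the sketch below only illustrates the intended filter shape, with `listFileContentIndexes` and `deleteChunksForFile` as hypothetical DB helpers.

# Hypothetical purge pass (NOT in this commit); both DB helpers are assumed names.
def purgeConnectorContent(knowledgeDb, connectionId: str) -> int:
    removed = 0
    for row in knowledgeDb.listFileContentIndexes():  # hypothetical listing helper
        meta = ((row.get("structure") or {}).get("_ingestion") or {})
        provenance = meta.get("provenance") or {}
        if meta.get("sourceKind") == "connector" and provenance.get("connectionId") == connectionId:
            knowledgeDb.deleteChunksForFile(row["fileId"])  # hypothetical delete helper
            removed += 1
    return removed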