fix(rag): stable ingestion idempotency across re-extractions (AC4)

Re-indexing the same file always triggered a full embedding run —
ingestion.skipped.duplicate never fired. Two independent causes:

1. _computeIngestionHash included contentObjectId in its payload, but
   extractors generate a fresh uuid4() per run, making the hash a
   per-run nonce. Now hashed over (contentType, data) in extractor
   order — stable across re-extractions, sensitive to content,
   ordering, and type changes (see the sketch after this list).
2. _autoIndexFile upserted the fresh pre-scan FileContentIndex before
   requestIngestion's duplicate check, wiping structure._ingestion
   and status=indexed from the prior run. The pre-upsert now merges
   the existing _ingestion metadata and preserves the indexed status.
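
A minimal standalone sketch of cause 1 (toy data only, not the service
code; the helper names are illustrative):

    import hashlib, json, uuid

    def extract(pages):  # simulates one extraction run: fresh UUID per part
        return [{"contentObjectId": str(uuid.uuid4()),
                 "contentType": "text", "data": p} for p in pages]

    def hashWithIds(objs):  # old payload shape -> per-run nonce
        payload = json.dumps([[o["contentObjectId"], o["contentType"], o["data"]]
                              for o in objs], separators=(",", ":"))
        return hashlib.sha256(payload.encode("utf-8")).hexdigest()

    def hashContentOnly(objs):  # new payload shape -> stable across runs
        payload = json.dumps([[o["contentType"], o["data"]] for o in objs],
                             separators=(",", ":"))
        return hashlib.sha256(payload.encode("utf-8")).hexdigest()

    a, b = extract(["page 1", "page 2"]), extract(["page 1", "page 2"])
    assert hashWithIds(a) != hashWithIds(b)          # old: duplicate never detected
    assert hashContentOnly(a) == hashContentOnly(b)  # new: duplicate detected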

Verified end-to-end: a second PATCH /scope on an already-indexed file
logs ingestion.skipped.duplicate and returns in ~2s with zero
embedding API calls.

Adds test_ingestion_hash_stability.py (5 cases).
Ida 2026-04-21 12:42:40 +02:00
parent a7f4055130
commit dff3d41845
5 changed files with 128 additions and 172 deletions

View file

@@ -122,9 +122,30 @@ async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user):
         f"{contentIndex.totalObjects} objects"
     )
-    # Persist FileContentIndex immediately
+    # Persist FileContentIndex immediately.
+    # IMPORTANT: preserve `_ingestion` metadata and `status="indexed"` from any
+    # prior successful run — otherwise this upsert wipes the idempotency cache
+    # and requestIngestion cannot detect duplicates (AC4 breaks).
     from modules.interfaces.interfaceDbKnowledge import getInterface as getKnowledgeInterface
     knowledgeDb = getKnowledgeInterface()
+    try:
+        _existing = knowledgeDb.getFileContentIndex(fileId)
+    except Exception:
+        _existing = None
+    if _existing:
+        _existingStruct = (
+            _existing.get("structure") if isinstance(_existing, dict)
+            else getattr(_existing, "structure", {})
+        ) or {}
+        _existingStatus = (
+            _existing.get("status") if isinstance(_existing, dict)
+            else getattr(_existing, "status", "")
+        ) or ""
+        if "_ingestion" in _existingStruct:
+            contentIndex.structure = dict(contentIndex.structure or {})
+            contentIndex.structure["_ingestion"] = _existingStruct["_ingestion"]
+        if _existingStatus == "indexed":
+            contentIndex.status = "indexed"
     knowledgeDb.upsertFileContentIndex(contentIndex)
     # Step 2: Content extraction (AI-free, produces ContentParts)
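
For context: requestIngestion's duplicate check itself is not part of this
diff, but it presumably compares a hash stored under structure["_ingestion"]
against the hash of the incoming job. A hypothetical sketch, assuming the
value lives under an _ingestion["hash"] key:

    def _isDuplicate(knowledgeDb, job) -> bool:
        # Assumed shape: structure["_ingestion"]["hash"] holds the hash of the
        # last successfully ingested payload ("hash" key is an assumption).
        existing = knowledgeDb.getFileContentIndex(job.sourceId)
        if not existing:
            return False
        struct = (existing.get("structure") if isinstance(existing, dict)
                  else getattr(existing, "structure", {})) or {}
        prior = (struct.get("_ingestion") or {}).get("hash")
        return bool(prior) and prior == _computeIngestionHash(job.contentObjects)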

View file

@@ -11,8 +11,6 @@ from modules.serviceCenter.services.serviceAgent.toolRegistry import ToolRegistry
 from modules.serviceCenter.services.serviceAgent.coreTools._helpers import (
     _getOrCreateTempFolder,
-    _looksLikeBinary,
-    _resolveFileScope,
     _MAX_TOOL_RESULT_CHARS,
 )
@@ -392,74 +390,7 @@ def _registerDocumentTools(registry: ToolRegistry, services):
                 if chunkMime:
                     mimeType = chunkMime
-        # 2) File not yet indexed -> trigger extraction via ExtractionService, then retry
-        if not imageData and knowledgeService and not knowledgeService.isFileIndexed(fileId):
-            try:
-                chatService = services.chat
-                fileInfo = chatService.getFileInfo(fileId)
-                fileContent = chatService.getFileContent(fileId)
-                if fileContent and fileInfo:
-                    rawData = fileContent.get("data", "")
-                    if isinstance(rawData, str) and len(rawData) > 100:
-                        rawBytes = _b64.b64decode(rawData)
-                    elif isinstance(rawData, bytes):
-                        rawBytes = rawData
-                    else:
-                        rawBytes = None
-                    if rawBytes:
-                        from modules.serviceCenter.services.serviceExtraction.subRegistry import ExtractorRegistry
-                        from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
-                        from modules.datamodels.datamodelExtraction import ExtractionOptions
-                        fileMime = fileInfo.get("mimeType", "application/octet-stream")
-                        fileName = fileInfo.get("fileName", fileId)
-                        extracted = runExtraction(
-                            ExtractorRegistry(), None,
-                            rawBytes, fileName, fileMime,
-                            ExtractionOptions(mergeStrategy=None),
-                        )
-                        contentObjects = []
-                        for part in extracted.parts:
-                            tg = (part.typeGroup or "").lower()
-                            ct = "image" if tg == "image" else "text"
-                            if not part.data or not part.data.strip():
-                                continue
-                            contentObjects.append({
-                                "contentObjectId": part.id,
-                                "contentType": ct,
-                                "data": part.data,
-                                "contextRef": {"containerPath": fileName, "location": part.label, **(part.metadata or {})},
-                            })
-                        if contentObjects:
-                            _diFiId, _diMId = _resolveFileScope(fileId, context)
-                            from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
-                            await knowledgeService.requestIngestion(
-                                IngestionJob(
-                                    sourceKind="agent_tool",
-                                    sourceId=fileId,
-                                    fileName=fileName,
-                                    mimeType=fileMime,
-                                    userId=context.get("userId", ""),
-                                    contentObjects=contentObjects,
-                                    featureInstanceId=_diFiId,
-                                    mandateId=_diMId,
-                                    provenance={"lane": "agent", "tool": "describeImage"},
-                                )
-                            )
-                        chunks = knowledgeService._knowledgeDb.getContentChunks(fileId)
-                        imageChunks = [c for c in (chunks or []) if c.get("contentType") == "image"]
-                        if pageIndex is not None:
-                            imageChunks = [c for c in imageChunks if c.get("contextRef", {}).get("pageIndex") == pageIndex]
-                        if imageChunks:
-                            imageData = imageChunks[0].get("data", "")
-            except Exception as extractErr:
-                logger.warning(f"describeImage: on-demand extraction failed: {extractErr}")
-        # 3) Direct image file (not a container) - use raw file data
+        # 2) Direct image file (not a container) - use raw file data
         if not imageData:
             chatService = services.chat
             fileContent = chatService.getFileContent(fileId)
@@ -469,7 +400,7 @@ def _registerDocumentTools(registry: ToolRegistry, services):
                 imageData = fileContent.get("data", "")
                 mimeType = fileMimeType
-        # 4) PDF page rendering: render the requested page as an image via PyMuPDF
+        # 3) PDF page rendering: render the requested page as an image via PyMuPDF
         if not imageData:
             chatService = services.chat
             fileInfo = chatService.getFileInfo(fileId) if hasattr(chatService, "getFileInfo") else None

View file

@@ -14,7 +14,6 @@ from modules.serviceCenter.services.serviceAgent.coreTools._helpers import (
     _getOrCreateInstanceFolder,
     _getOrCreateTempFolder,
     _looksLikeBinary,
-    _resolveFileScope,
     _MAX_TOOL_RESULT_CHARS,
 )
@@ -50,6 +49,7 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
             return ToolResult(toolCallId="", toolName="readFile", success=False, error="fileId is required")
         try:
             knowledgeService = services.getService("knowledge") if hasattr(services, "getService") else None
+            fileStatus = None
             # 1) Knowledge Store: return already-extracted text chunks
             if knowledgeService:
@@ -77,7 +77,8 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
                     data=f"[File {fileId} is currently being processed (status: {fileStatus}). Try again shortly.]",
                 )
-            # 2) Not indexed yet: try on-demand extraction
+            # 2) Not indexed yet: inspect file type to decide how to serve the agent
+            #    (binary -> instruct agent to wait / re-upload; text -> decode raw bytes inline)
             chatService = services.chat
             fileInfo = chatService.getFileInfo(fileId)
             if not fileInfo:
@@ -100,92 +101,14 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
             isBinary = _looksLikeBinary(rawBytes)
             if isBinary:
-                try:
-                    from modules.serviceCenter.services.serviceExtraction.subRegistry import ExtractorRegistry, ChunkerRegistry
-                    from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
-                    from modules.datamodels.datamodelExtraction import ExtractionOptions
-                    extracted = runExtraction(
-                        ExtractorRegistry(), ChunkerRegistry(),
-                        rawBytes, fileName, mimeType,
-                        ExtractionOptions(mergeStrategy=None),
-                    )
-                    contentObjects = []
-                    for part in extracted.parts:
-                        tg = (part.typeGroup or "").lower()
-                        ct = "image" if tg == "image" else "text"
-                        if not part.data or not part.data.strip():
-                            continue
-                        contentObjects.append({
-                            "contentObjectId": part.id,
-                            "contentType": ct,
-                            "data": part.data,
-                            "contextRef": {
-                                "containerPath": fileName,
-                                "location": part.label or "file",
-                                **(part.metadata or {}),
-                            },
-                        })
-                    if contentObjects:
-                        if knowledgeService:
-                            try:
-                                userId = context.get("userId", "")
-                                _fiId, _mId = _resolveFileScope(fileId, context)
-                                from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
-                                await knowledgeService.requestIngestion(
-                                    IngestionJob(
-                                        sourceKind="agent_tool",
-                                        sourceId=fileId,
-                                        fileName=fileName,
-                                        mimeType=mimeType,
-                                        userId=userId,
-                                        contentObjects=contentObjects,
-                                        featureInstanceId=_fiId,
-                                        mandateId=_mId,
-                                        provenance={"lane": "agent", "tool": "readFile"},
-                                    )
-                                )
-                            except Exception as e:
-                                logger.warning(f"readFile: knowledge indexing failed for {fileId}: {e}")
-                        joined = ""
-                        if knowledgeService:
-                            _chunks = knowledgeService._knowledgeDb.getContentChunks(fileId)
-                            _textChunks = [
-                                c for c in (_chunks or [])
-                                if c.get("contentType") != "image" and c.get("data")
-                            ]
-                            if _textChunks:
-                                joined = "\n\n".join(c["data"] for c in _textChunks)
-                        if not joined:
-                            textParts = [o["data"] for o in contentObjects if o["contentType"] != "image"]
-                            joined = "\n\n".join(textParts) if textParts else ""
-                        if joined:
-                            chunked = _applyOffsetLimit(joined, offset, limit)
-                            if chunked is not None:
-                                return ToolResult(toolCallId="", toolName="readFile", success=True, data=chunked)
-                            if len(joined) > _MAX_TOOL_RESULT_CHARS:
-                                joined = joined[:_MAX_TOOL_RESULT_CHARS] + f"\n\n[Truncated showing first {_MAX_TOOL_RESULT_CHARS} chars of {len(joined)}. Use offset/limit to read specific sections.]"
-                            return ToolResult(
-                                toolCallId="", toolName="readFile", success=True,
-                                data=joined,
-                            )
-                        imgCount = sum(1 for o in contentObjects if o["contentType"] == "image")
-                        return ToolResult(
-                            toolCallId="", toolName="readFile", success=True,
-                            data=f"[Extracted {len(contentObjects)} content objects from '{fileName}' "
-                            f"({imgCount} images, no readable text). "
-                            f"Use describeImage(fileId='{fileId}') to analyze visual content.]",
-                        )
-                except Exception as extractErr:
-                    logger.warning(f"readFile extraction failed for {fileId} ({fileName}): {extractErr}")
                 return ToolResult(
                     toolCallId="", toolName="readFile", success=True,
-                    data=f"[Binary file: '{fileName}', type={mimeType}, size={len(rawBytes)} bytes. "
-                    f"Text extraction not available. Use describeImage for images.]",
+                    data=(
+                        f"[File '{fileName}' ({mimeType}) is not yet indexed "
+                        f"(status: {fileStatus or 'unknown'}). Indexing runs automatically "
+                        f"on upload. Please wait a few seconds and retry, or re-upload the file. "
+                        f"For visual content use describeImage(fileId='{fileId}').]"
+                    ),
                 )
             # 3) Text file: decode raw bytes

View file

@@ -62,23 +62,23 @@ class IngestionHandle:
 def _computeIngestionHash(contentObjects: List[Dict[str, Any]]) -> str:
-    """Deterministic SHA256 over (contentObjectId, contentType, data) tuples.
-    Sorted by contentObjectId so re-ordering of extractor output does not
-    invalidate the cache; text whitespace is preserved intentionally because
-    chunk boundaries depend on it.
+    """Deterministic SHA256 over (contentType, data) tuples in extractor order.
+    `contentObjectId` is intentionally excluded because extractors generate
+    fresh UUIDs per run (`uuid.uuid4()`), which would make the hash unstable
+    across re-extractions of the same source, defeating idempotency.
+    Order is preserved (no sort) because two different documents can share the
+    same multiset of parts but differ in arrangement (e.g. swapped pages).
+    Text whitespace is preserved intentionally because chunk boundaries
+    depend on it.
     """
-    normalized = sorted(
-        (
-            (
-                str(o.get("contentObjectId", "") or ""),
-                str(o.get("contentType", "text") or "text"),
-                o.get("data", "") or "",
-            )
-            for o in (contentObjects or [])
-        ),
-        key=lambda t: t[0],
-    )
+    normalized = [
+        (
+            str(o.get("contentType", "text") or "text"),
+            o.get("data", "") or "",
+        )
+        for o in (contentObjects or [])
+    ]
     payload = json.dumps(normalized, ensure_ascii=False, separators=(",", ":"))
     return hashlib.sha256(payload.encode("utf-8")).hexdigest()

View file

@@ -0,0 +1,81 @@
#!/usr/bin/env python3
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Test that _computeIngestionHash is stable across re-extractions of the same source.
Extractors generate fresh contentObjectIds (uuid.uuid4()) per run. The ingestion
hash MUST therefore be derived from content (contentType + data + order) only;
otherwise idempotency (AC4) silently fails: every re-extraction looks "new" and
triggers full re-embedding.
"""
import os
import sys
import uuid
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import (
_computeIngestionHash,
)
def _makeObjects(seed: str = "alpha"):
"""Build a synthetic contentObjects list as routeDataFiles._autoIndexFile would."""
return [
{
"contentObjectId": str(uuid.uuid4()),
"contentType": "text",
"data": f"Page 1 of {seed}",
},
{
"contentObjectId": str(uuid.uuid4()),
"contentType": "text",
"data": f"Page 2 of {seed}",
},
{
"contentObjectId": str(uuid.uuid4()),
"contentType": "binary",
"data": "<image-bytes-as-b64>",
},
]
def test_hash_stable_across_uuid_regeneration():
"""Same content + different contentObjectIds → same hash."""
a = _makeObjects("alpha")
b = _makeObjects("alpha") # identical data, fresh UUIDs
assert [o["contentObjectId"] for o in a] != [o["contentObjectId"] for o in b]
assert _computeIngestionHash(a) == _computeIngestionHash(b)
def test_hash_changes_when_data_changes():
a = _makeObjects("alpha")
b = _makeObjects("beta")
assert _computeIngestionHash(a) != _computeIngestionHash(b)
def test_hash_is_order_sensitive():
"""Reordered pages produce a different hash (different document)."""
a = _makeObjects("alpha")
b = list(reversed(a))
assert _computeIngestionHash(a) != _computeIngestionHash(b)
def test_hash_distinguishes_text_vs_binary_with_same_payload():
a = [{"contentObjectId": "x", "contentType": "text", "data": "hello"}]
b = [{"contentObjectId": "x", "contentType": "binary", "data": "hello"}]
assert _computeIngestionHash(a) != _computeIngestionHash(b)
def test_hash_handles_empty_input():
    """Empty input must not crash, hash deterministically, and differ from non-empty."""
    assert _computeIngestionHash([]) == _computeIngestionHash([])
    assert _computeIngestionHash([]) != _computeIngestionHash(_makeObjects())
if __name__ == "__main__":
test_hash_stable_across_uuid_regeneration()
test_hash_changes_when_data_changes()
test_hash_is_order_sensitive()
test_hash_distinguishes_text_vs_binary_with_same_payload()
test_hash_handles_empty_input()
print("OK — all 5 ingestion-hash stability tests passed")