fix(rag): stable ingestion idempotency across re-extractions (AC4)

Re-indexing the same file always triggered a full embedding run —
ingestion.skipped.duplicate never fired. Two independent causes:

1. _computeIngestionHash included contentObjectId in its payload, but
   extractors generate a fresh uuid4() per run, making the hash a
   per-run nonce. Now hashed over (contentType, data) in extractor
   order — stable across re-extractions, sensitive to content,
   ordering, and type changes (see the sketch after this list).
2. _autoIndexFile upserted the fresh pre-scan FileContentIndex before
   requestIngestion's duplicate check, wiping structure._ingestion
   and status=indexed from the prior run. The pre-upsert now merges
   the existing _ingestion metadata and preserves the indexed status.
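
A minimal standalone sketch of cause 1 (toy data only, not the service
code; the helper names are illustrative):

    import hashlib, json, uuid

    def extract(pages):  # simulates one extraction run: fresh UUID per part
        return [{"contentObjectId": str(uuid.uuid4()),
                 "contentType": "text", "data": p} for p in pages]

    def hashWithIds(objs):  # old payload shape -> per-run nonce
        payload = json.dumps([[o["contentObjectId"], o["contentType"], o["data"]]
                              for o in objs], separators=(",", ":"))
        return hashlib.sha256(payload.encode("utf-8")).hexdigest()

    def hashContentOnly(objs):  # new payload shape -> stable across runs
        payload = json.dumps([[o["contentType"], o["data"]] for o in objs],
                             separators=(",", ":"))
        return hashlib.sha256(payload.encode("utf-8")).hexdigest()

    a, b = extract(["page 1", "page 2"]), extract(["page 1", "page 2"])
    assert hashWithIds(a) != hashWithIds(b)          # old: duplicate never detected
    assert hashContentOnly(a) == hashContentOnly(b)  # new: duplicate detected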

Verified end-to-end: a second PATCH /scope on an already-indexed file
logs ingestion.skipped.duplicate and returns in ~2s with zero
embedding API calls.

Adds test_ingestion_hash_stability.py (5 cases).
Ida 2026-04-21 12:42:40 +02:00
parent a7f4055130
commit dff3d41845
5 changed files with 128 additions and 172 deletions

View file

@@ -122,9 +122,30 @@ async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user):
         f"{contentIndex.totalObjects} objects"
     )
-    # Persist FileContentIndex immediately
+    # Persist FileContentIndex immediately.
+    # IMPORTANT: preserve `_ingestion` metadata and `status="indexed"` from any
+    # prior successful run — otherwise this upsert wipes the idempotency cache
+    # and requestIngestion cannot detect duplicates (AC4 breaks).
     from modules.interfaces.interfaceDbKnowledge import getInterface as getKnowledgeInterface
     knowledgeDb = getKnowledgeInterface()
+    try:
+        _existing = knowledgeDb.getFileContentIndex(fileId)
+    except Exception:
+        _existing = None
+    if _existing:
+        _existingStruct = (
+            _existing.get("structure") if isinstance(_existing, dict)
+            else getattr(_existing, "structure", {})
+        ) or {}
+        _existingStatus = (
+            _existing.get("status") if isinstance(_existing, dict)
+            else getattr(_existing, "status", "")
+        ) or ""
+        if "_ingestion" in _existingStruct:
+            contentIndex.structure = dict(contentIndex.structure or {})
+            contentIndex.structure["_ingestion"] = _existingStruct["_ingestion"]
+        if _existingStatus == "indexed":
+            contentIndex.status = "indexed"
     knowledgeDb.upsertFileContentIndex(contentIndex)
     # Step 2: Content extraction (AI-free, produces ContentParts)
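
For context: requestIngestion's duplicate check itself is not part of this
diff, but it presumably compares a hash stored under structure["_ingestion"]
against the hash of the incoming job. A hypothetical sketch, assuming the
value lives under an _ingestion["hash"] key:

    def _isDuplicate(knowledgeDb, job) -> bool:
        # Assumed shape: structure["_ingestion"]["hash"] holds the hash of the
        # last successfully ingested payload ("hash" key is an assumption).
        existing = knowledgeDb.getFileContentIndex(job.sourceId)
        if not existing:
            return False
        struct = (existing.get("structure") if isinstance(existing, dict)
                  else getattr(existing, "structure", {})) or {}
        prior = (struct.get("_ingestion") or {}).get("hash")
        return bool(prior) and prior == _computeIngestionHash(job.contentObjects)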

View file

@@ -11,8 +11,6 @@ from modules.serviceCenter.services.serviceAgent.toolRegistry import ToolRegistry
 from modules.serviceCenter.services.serviceAgent.coreTools._helpers import (
     _getOrCreateTempFolder,
-    _looksLikeBinary,
-    _resolveFileScope,
     _MAX_TOOL_RESULT_CHARS,
 )
@@ -392,74 +390,7 @@ def _registerDocumentTools(registry: ToolRegistry, services):
                 if chunkMime:
                     mimeType = chunkMime
-        # 2) File not yet indexed -> trigger extraction via ExtractionService, then retry
-        if not imageData and knowledgeService and not knowledgeService.isFileIndexed(fileId):
-            try:
-                chatService = services.chat
-                fileInfo = chatService.getFileInfo(fileId)
-                fileContent = chatService.getFileContent(fileId)
-                if fileContent and fileInfo:
-                    rawData = fileContent.get("data", "")
-                    if isinstance(rawData, str) and len(rawData) > 100:
-                        rawBytes = _b64.b64decode(rawData)
-                    elif isinstance(rawData, bytes):
-                        rawBytes = rawData
-                    else:
-                        rawBytes = None
-                    if rawBytes:
-                        from modules.serviceCenter.services.serviceExtraction.subRegistry import ExtractorRegistry
-                        from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
-                        from modules.datamodels.datamodelExtraction import ExtractionOptions
-                        fileMime = fileInfo.get("mimeType", "application/octet-stream")
-                        fileName = fileInfo.get("fileName", fileId)
-                        extracted = runExtraction(
-                            ExtractorRegistry(), None,
-                            rawBytes, fileName, fileMime,
-                            ExtractionOptions(mergeStrategy=None),
-                        )
-                        contentObjects = []
-                        for part in extracted.parts:
-                            tg = (part.typeGroup or "").lower()
-                            ct = "image" if tg == "image" else "text"
-                            if not part.data or not part.data.strip():
-                                continue
-                            contentObjects.append({
-                                "contentObjectId": part.id,
-                                "contentType": ct,
-                                "data": part.data,
-                                "contextRef": {"containerPath": fileName, "location": part.label, **(part.metadata or {})},
-                            })
-                        if contentObjects:
-                            _diFiId, _diMId = _resolveFileScope(fileId, context)
-                            from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
-                            await knowledgeService.requestIngestion(
-                                IngestionJob(
-                                    sourceKind="agent_tool",
-                                    sourceId=fileId,
-                                    fileName=fileName,
-                                    mimeType=fileMime,
-                                    userId=context.get("userId", ""),
-                                    contentObjects=contentObjects,
-                                    featureInstanceId=_diFiId,
-                                    mandateId=_diMId,
-                                    provenance={"lane": "agent", "tool": "describeImage"},
-                                )
-                            )
-                        chunks = knowledgeService._knowledgeDb.getContentChunks(fileId)
-                        imageChunks = [c for c in (chunks or []) if c.get("contentType") == "image"]
-                        if pageIndex is not None:
-                            imageChunks = [c for c in imageChunks if c.get("contextRef", {}).get("pageIndex") == pageIndex]
-                        if imageChunks:
-                            imageData = imageChunks[0].get("data", "")
-            except Exception as extractErr:
-                logger.warning(f"describeImage: on-demand extraction failed: {extractErr}")
-        # 3) Direct image file (not a container) - use raw file data
+        # 2) Direct image file (not a container) - use raw file data
         if not imageData:
             chatService = services.chat
             fileContent = chatService.getFileContent(fileId)
@@ -469,7 +400,7 @@ def _registerDocumentTools(registry: ToolRegistry, services):
                 imageData = fileContent.get("data", "")
                 mimeType = fileMimeType
-        # 4) PDF page rendering: render the requested page as an image via PyMuPDF
+        # 3) PDF page rendering: render the requested page as an image via PyMuPDF
         if not imageData:
             chatService = services.chat
             fileInfo = chatService.getFileInfo(fileId) if hasattr(chatService, "getFileInfo") else None

View file

@@ -14,7 +14,6 @@ from modules.serviceCenter.services.serviceAgent.coreTools._helpers import (
     _getOrCreateInstanceFolder,
     _getOrCreateTempFolder,
     _looksLikeBinary,
-    _resolveFileScope,
     _MAX_TOOL_RESULT_CHARS,
 )
@@ -50,6 +49,7 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
             return ToolResult(toolCallId="", toolName="readFile", success=False, error="fileId is required")
         try:
             knowledgeService = services.getService("knowledge") if hasattr(services, "getService") else None
+            fileStatus = None
             # 1) Knowledge Store: return already-extracted text chunks
             if knowledgeService:
@@ -77,7 +77,8 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
                     data=f"[File {fileId} is currently being processed (status: {fileStatus}). Try again shortly.]",
                 )
-            # 2) Not indexed yet: try on-demand extraction
+            # 2) Not indexed yet: inspect file type to decide how to serve the agent
+            #    (binary -> instruct agent to wait / re-upload; text -> decode raw bytes inline)
             chatService = services.chat
             fileInfo = chatService.getFileInfo(fileId)
             if not fileInfo:
@@ -100,92 +101,14 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
             isBinary = _looksLikeBinary(rawBytes)
             if isBinary:
-                try:
-                    from modules.serviceCenter.services.serviceExtraction.subRegistry import ExtractorRegistry, ChunkerRegistry
-                    from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
-                    from modules.datamodels.datamodelExtraction import ExtractionOptions
-                    extracted = runExtraction(
-                        ExtractorRegistry(), ChunkerRegistry(),
-                        rawBytes, fileName, mimeType,
-                        ExtractionOptions(mergeStrategy=None),
-                    )
-                    contentObjects = []
-                    for part in extracted.parts:
-                        tg = (part.typeGroup or "").lower()
-                        ct = "image" if tg == "image" else "text"
-                        if not part.data or not part.data.strip():
-                            continue
-                        contentObjects.append({
-                            "contentObjectId": part.id,
-                            "contentType": ct,
-                            "data": part.data,
-                            "contextRef": {
-                                "containerPath": fileName,
-                                "location": part.label or "file",
-                                **(part.metadata or {}),
-                            },
-                        })
-                    if contentObjects:
-                        if knowledgeService:
-                            try:
-                                userId = context.get("userId", "")
-                                _fiId, _mId = _resolveFileScope(fileId, context)
-                                from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
-                                await knowledgeService.requestIngestion(
-                                    IngestionJob(
-                                        sourceKind="agent_tool",
-                                        sourceId=fileId,
-                                        fileName=fileName,
-                                        mimeType=mimeType,
-                                        userId=userId,
-                                        contentObjects=contentObjects,
-                                        featureInstanceId=_fiId,
-                                        mandateId=_mId,
-                                        provenance={"lane": "agent", "tool": "readFile"},
-                                    )
-                                )
-                            except Exception as e:
-                                logger.warning(f"readFile: knowledge indexing failed for {fileId}: {e}")
-                        joined = ""
-                        if knowledgeService:
-                            _chunks = knowledgeService._knowledgeDb.getContentChunks(fileId)
-                            _textChunks = [
-                                c for c in (_chunks or [])
-                                if c.get("contentType") != "image" and c.get("data")
-                            ]
-                            if _textChunks:
-                                joined = "\n\n".join(c["data"] for c in _textChunks)
-                        if not joined:
-                            textParts = [o["data"] for o in contentObjects if o["contentType"] != "image"]
-                            joined = "\n\n".join(textParts) if textParts else ""
-                        if joined:
-                            chunked = _applyOffsetLimit(joined, offset, limit)
-                            if chunked is not None:
-                                return ToolResult(toolCallId="", toolName="readFile", success=True, data=chunked)
-                            if len(joined) > _MAX_TOOL_RESULT_CHARS:
-                                joined = joined[:_MAX_TOOL_RESULT_CHARS] + f"\n\n[Truncated showing first {_MAX_TOOL_RESULT_CHARS} chars of {len(joined)}. Use offset/limit to read specific sections.]"
-                            return ToolResult(
-                                toolCallId="", toolName="readFile", success=True,
-                                data=joined,
-                            )
-                        imgCount = sum(1 for o in contentObjects if o["contentType"] == "image")
-                        return ToolResult(
-                            toolCallId="", toolName="readFile", success=True,
-                            data=f"[Extracted {len(contentObjects)} content objects from '{fileName}' "
-                            f"({imgCount} images, no readable text). "
-                            f"Use describeImage(fileId='{fileId}') to analyze visual content.]",
-                        )
-                except Exception as extractErr:
-                    logger.warning(f"readFile extraction failed for {fileId} ({fileName}): {extractErr}")
                 return ToolResult(
                     toolCallId="", toolName="readFile", success=True,
-                    data=f"[Binary file: '{fileName}', type={mimeType}, size={len(rawBytes)} bytes. "
-                    f"Text extraction not available. Use describeImage for images.]",
+                    data=(
+                        f"[File '{fileName}' ({mimeType}) is not yet indexed "
+                        f"(status: {fileStatus or 'unknown'}). Indexing runs automatically "
+                        f"on upload. Please wait a few seconds and retry, or re-upload the file. "
+                        f"For visual content use describeImage(fileId='{fileId}').]"
+                    ),
                 )
             # 3) Text file: decode raw bytes

View file

@@ -62,23 +62,23 @@ class IngestionHandle:
 def _computeIngestionHash(contentObjects: List[Dict[str, Any]]) -> str:
-    """Deterministic SHA256 over (contentObjectId, contentType, data) tuples.
-    Sorted by contentObjectId so re-ordering of extractor output does not
-    invalidate the cache; text whitespace is preserved intentionally because
-    chunk boundaries depend on it.
+    """Deterministic SHA256 over (contentType, data) tuples in extractor order.
+    `contentObjectId` is intentionally excluded because extractors generate
+    fresh UUIDs per run (`uuid.uuid4()`), which would make the hash unstable
+    across re-extractions of the same source, defeating idempotency.
+    Order is preserved (no sort) because two different documents can share the
+    same multiset of parts but differ in arrangement (e.g. swapped pages).
+    Text whitespace is preserved intentionally because chunk boundaries
+    depend on it.
     """
-    normalized = sorted(
-        (
-            (
-                str(o.get("contentObjectId", "") or ""),
-                str(o.get("contentType", "text") or "text"),
-                o.get("data", "") or "",
-            )
-            for o in (contentObjects or [])
-        ),
-        key=lambda t: t[0],
-    )
+    normalized = [
+        (
+            str(o.get("contentType", "text") or "text"),
+            o.get("data", "") or "",
+        )
+        for o in (contentObjects or [])
+    ]
     payload = json.dumps(normalized, ensure_ascii=False, separators=(",", ":"))
     return hashlib.sha256(payload.encode("utf-8")).hexdigest()

View file

@@ -0,0 +1,81 @@
#!/usr/bin/env python3
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Test that _computeIngestionHash is stable across re-extractions of the same source.
Extractors generate fresh contentObjectIds (uuid.uuid4()) per run. The ingestion
hash MUST therefore be derived from content (contentType + data + order) only;
otherwise idempotency (AC4) silently fails: every re-extraction looks "new" and
triggers full re-embedding.
"""
import os
import sys
import uuid
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import (
_computeIngestionHash,
)
def _makeObjects(seed: str = "alpha"):
"""Build a synthetic contentObjects list as routeDataFiles._autoIndexFile would."""
return [
{
"contentObjectId": str(uuid.uuid4()),
"contentType": "text",
"data": f"Page 1 of {seed}",
},
{
"contentObjectId": str(uuid.uuid4()),
"contentType": "text",
"data": f"Page 2 of {seed}",
},
{
"contentObjectId": str(uuid.uuid4()),
"contentType": "binary",
"data": "<image-bytes-as-b64>",
},
]
def test_hash_stable_across_uuid_regeneration():
"""Same content + different contentObjectIds → same hash."""
a = _makeObjects("alpha")
b = _makeObjects("alpha") # identical data, fresh UUIDs
assert [o["contentObjectId"] for o in a] != [o["contentObjectId"] for o in b]
assert _computeIngestionHash(a) == _computeIngestionHash(b)
def test_hash_changes_when_data_changes():
a = _makeObjects("alpha")
b = _makeObjects("beta")
assert _computeIngestionHash(a) != _computeIngestionHash(b)
def test_hash_is_order_sensitive():
"""Reordered pages produce a different hash (different document)."""
a = _makeObjects("alpha")
b = list(reversed(a))
assert _computeIngestionHash(a) != _computeIngestionHash(b)
def test_hash_distinguishes_text_vs_binary_with_same_payload():
a = [{"contentObjectId": "x", "contentType": "text", "data": "hello"}]
b = [{"contentObjectId": "x", "contentType": "binary", "data": "hello"}]
assert _computeIngestionHash(a) != _computeIngestionHash(b)
def test_hash_handles_empty_input():
    """Empty input must not crash, hash deterministically, and differ from non-empty."""
    assert _computeIngestionHash([]) == _computeIngestionHash([])
    assert _computeIngestionHash([]) != _computeIngestionHash(_makeObjects())
if __name__ == "__main__":
test_hash_stable_across_uuid_regeneration()
test_hash_changes_when_data_changes()
test_hash_is_order_sensitive()
test_hash_distinguishes_text_vs_binary_with_same_payload()
test_hash_handles_empty_input()
print("OK — all 5 ingestion-hash stability tests passed")