P0: ingestion facade
This commit is contained in:
parent
ba21005401
commit
9d82d3d353
7 changed files with 536 additions and 29 deletions
|
|
@ -174,14 +174,26 @@ async def indexSessionData(
|
||||||
for c in chunks
|
for c in chunks
|
||||||
]
|
]
|
||||||
|
|
||||||
await knowledgeService.indexFile(
|
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
|
||||||
fileId=syntheticFileId,
|
|
||||||
fileName=f"coaching-session-{sessionId[:8]}",
|
await knowledgeService.requestIngestion(
|
||||||
mimeType="application/x-coaching-session",
|
IngestionJob(
|
||||||
userId=userId,
|
sourceKind="coaching_session",
|
||||||
featureInstanceId=featureInstanceId,
|
sourceId=syntheticFileId,
|
||||||
mandateId=mandateId,
|
fileName=f"coaching-session-{sessionId[:8]}",
|
||||||
contentObjects=contentObjects,
|
mimeType="application/x-coaching-session",
|
||||||
|
userId=userId,
|
||||||
|
featureInstanceId=featureInstanceId,
|
||||||
|
mandateId=mandateId,
|
||||||
|
contentObjects=contentObjects,
|
||||||
|
provenance={
|
||||||
|
"lane": "feature",
|
||||||
|
"feature": "commcoach",
|
||||||
|
"sessionId": sessionId,
|
||||||
|
"contextId": contextId,
|
||||||
|
"messageCount": len(messages or []),
|
||||||
|
},
|
||||||
|
)
|
||||||
)
|
)
|
||||||
logger.info(f"Successfully indexed coaching session {sessionId} ({len(chunks)} chunks)")
|
logger.info(f"Successfully indexed coaching session {sessionId} ({len(chunks)} chunks)")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
|
||||||
|
|
@ -77,7 +77,7 @@ async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user):
|
||||||
"""Background task: pre-scan + extraction + knowledge indexing.
|
"""Background task: pre-scan + extraction + knowledge indexing.
|
||||||
Step 1: Structure Pre-Scan (AI-free) -> FileContentIndex (persisted)
|
Step 1: Structure Pre-Scan (AI-free) -> FileContentIndex (persisted)
|
||||||
Step 2: Content extraction via runExtraction -> ContentParts
|
Step 2: Content extraction via runExtraction -> ContentParts
|
||||||
Step 3: KnowledgeService.indexFile -> chunking + embedding -> Knowledge Store"""
|
Step 3: KnowledgeService.requestIngestion -> idempotent chunking + embedding -> Knowledge Store"""
|
||||||
userId = user.id if hasattr(user, "id") else str(user)
|
userId = user.id if hasattr(user, "id") else str(user)
|
||||||
try:
|
try:
|
||||||
mgmtInterface = interfaceDbManagement.getInterface(user)
|
mgmtInterface = interfaceDbManagement.getInterface(user)
|
||||||
|
|
@ -181,15 +181,21 @@ async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user):
|
||||||
)
|
)
|
||||||
knowledgeService = getService("knowledge", ctx)
|
knowledgeService = getService("knowledge", ctx)
|
||||||
|
|
||||||
await knowledgeService.indexFile(
|
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
|
||||||
fileId=fileId,
|
|
||||||
fileName=fileName,
|
await knowledgeService.requestIngestion(
|
||||||
mimeType=mimeType,
|
IngestionJob(
|
||||||
userId=userId,
|
sourceKind="file",
|
||||||
featureInstanceId=str(feature_instance_id) if feature_instance_id else "",
|
sourceId=fileId,
|
||||||
mandateId=str(mandate_id) if mandate_id else "",
|
fileName=fileName,
|
||||||
contentObjects=contentObjects,
|
mimeType=mimeType,
|
||||||
structure=contentIndex.structure,
|
userId=userId,
|
||||||
|
featureInstanceId=str(feature_instance_id) if feature_instance_id else "",
|
||||||
|
mandateId=str(mandate_id) if mandate_id else "",
|
||||||
|
contentObjects=contentObjects,
|
||||||
|
structure=contentIndex.structure,
|
||||||
|
provenance={"lane": "upload", "route": "routeDataFiles._autoIndexFile"},
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
# Re-acquire interface after await to avoid stale user context from the singleton
|
# Re-acquire interface after await to avoid stale user context from the singleton
|
||||||
|
|
|
||||||
|
|
@ -434,11 +434,19 @@ def _registerDocumentTools(registry: ToolRegistry, services):
|
||||||
|
|
||||||
if contentObjects:
|
if contentObjects:
|
||||||
_diFiId, _diMId = _resolveFileScope(fileId, context)
|
_diFiId, _diMId = _resolveFileScope(fileId, context)
|
||||||
await knowledgeService.indexFile(
|
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
|
||||||
fileId=fileId, fileName=fileName, mimeType=fileMime,
|
await knowledgeService.requestIngestion(
|
||||||
userId=context.get("userId", ""), contentObjects=contentObjects,
|
IngestionJob(
|
||||||
featureInstanceId=_diFiId,
|
sourceKind="agent_tool",
|
||||||
mandateId=_diMId,
|
sourceId=fileId,
|
||||||
|
fileName=fileName,
|
||||||
|
mimeType=fileMime,
|
||||||
|
userId=context.get("userId", ""),
|
||||||
|
contentObjects=contentObjects,
|
||||||
|
featureInstanceId=_diFiId,
|
||||||
|
mandateId=_diMId,
|
||||||
|
provenance={"lane": "agent", "tool": "describeImage"},
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
chunks = knowledgeService._knowledgeDb.getContentChunks(fileId)
|
chunks = knowledgeService._knowledgeDb.getContentChunks(fileId)
|
||||||
|
|
|
||||||
|
|
@ -132,11 +132,19 @@ def _registerWorkspaceTools(registry: ToolRegistry, services):
|
||||||
try:
|
try:
|
||||||
userId = context.get("userId", "")
|
userId = context.get("userId", "")
|
||||||
_fiId, _mId = _resolveFileScope(fileId, context)
|
_fiId, _mId = _resolveFileScope(fileId, context)
|
||||||
await knowledgeService.indexFile(
|
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
|
||||||
fileId=fileId, fileName=fileName, mimeType=mimeType,
|
await knowledgeService.requestIngestion(
|
||||||
userId=userId, contentObjects=contentObjects,
|
IngestionJob(
|
||||||
featureInstanceId=_fiId,
|
sourceKind="agent_tool",
|
||||||
mandateId=_mId,
|
sourceId=fileId,
|
||||||
|
fileName=fileName,
|
||||||
|
mimeType=mimeType,
|
||||||
|
userId=userId,
|
||||||
|
contentObjects=contentObjects,
|
||||||
|
featureInstanceId=_fiId,
|
||||||
|
mandateId=_mId,
|
||||||
|
provenance={"lane": "agent", "tool": "readFile"},
|
||||||
|
)
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"readFile: knowledge indexing failed for {fileId}: {e}")
|
logger.warning(f"readFile: knowledge indexing failed for {fileId}: {e}")
|
||||||
|
|
|
||||||
|
|
@ -2,9 +2,13 @@
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
"""Knowledge service: 3-tier RAG with indexing, semantic search, and context building."""
|
"""Knowledge service: 3-tier RAG with indexing, semantic search, and context building."""
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
import json
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
from typing import Any, Callable, Dict, List, Optional
|
import time
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Any, Callable, Dict, List, Optional, Union
|
||||||
|
|
||||||
from modules.datamodels.datamodelKnowledge import (
|
from modules.datamodels.datamodelKnowledge import (
|
||||||
FileContentIndex, ContentChunk, WorkflowMemory,
|
FileContentIndex, ContentChunk, WorkflowMemory,
|
||||||
|
|
@ -20,6 +24,65 @@ DEFAULT_CHUNK_TOKENS = 400
|
||||||
DEFAULT_CONTEXT_BUDGET = 12000
|
DEFAULT_CONTEXT_BUDGET = 12000
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Ingestion façade (P0 of unified-knowledge-indexing concept)
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
@dataclass
class IngestionJob:
    """One request to add or refresh content in the unified knowledge store.

    Callers from any lane (routes, feature hooks, agent tools, connector sync)
    describe the work they want done via this object; idempotency, scope
    resolution, and embedding are handled by KnowledgeService.requestIngestion.
    """

    # Lane/category of the source, e.g. "file", "coaching_session", "agent_tool".
    sourceKind: str
    # Identifier of the source; requestIngestion uses it as the index file id.
    sourceId: str
    fileName: str
    mimeType: str
    userId: str
    # Extracted content objects; each is read via the keys "contentObjectId",
    # "contentType" and "data" when the content hash is computed.
    contentObjects: List[Dict[str, Any]] = field(default_factory=list)
    featureInstanceId: str = ""
    mandateId: str = ""
    # Optional structure dict; requestIngestion copies it and adds "_ingestion".
    structure: Optional[Dict[str, Any]] = None
    containerPath: Optional[str] = None
    # When set, used as the idempotency key instead of the computed content hash.
    contentVersion: Optional[str] = None
    # Free-form origin metadata, persisted under structure["_ingestion"]["provenance"].
    provenance: Optional[Dict[str, Any]] = None

    def __post_init__(self) -> None:
        """Normalize an explicit ``contentObjects=None`` to an empty list."""
        # Callers that forward an optional list may pass None; every consumer
        # already guards with `or []`, so make the stored value a real list.
        if self.contentObjects is None:
            self.contentObjects = []
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class IngestionHandle:
    """Result of requestIngestion. Stable across in-process and future queue impls.

    Returned for every outcome; requestIngestion reports indexing failures
    through `status`/`error` instead of raising.
    """

    # "<sourceKind>:<sourceId>" as built by requestIngestion.
    jobId: str
    # Outcome set by requestIngestion: "indexed", "duplicate" or "failed".
    status: str
    # The job's contentVersion when supplied, otherwise the computed content hash.
    contentHash: str
    # Same value as the job's sourceId.
    fileId: str
    # Index returned by the indexing run; None for "duplicate" and "failed".
    index: Optional[FileContentIndex] = None
    # str() of the caught exception when status is "failed".
    error: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
def _computeIngestionHash(contentObjects: List[Dict[str, Any]]) -> str:
    """Deterministic SHA256 over (contentObjectId, contentType, data) tuples.

    Objects are put into a canonical order (by id, then type, then serialized
    data) so re-ordering of extractor output does not invalidate the cache,
    even when ids are missing or duplicated. Text whitespace is preserved
    intentionally because chunk boundaries depend on it.

    Args:
        contentObjects: Extracted content objects; ``None`` is treated as empty.

    Returns:
        Hex digest of the SHA256 over the canonical JSON payload.

    Raises:
        TypeError: If ``data`` holds a value that is neither JSON-serializable
            nor bytes-like.
    """

    def _encodeFallback(value: Any) -> Any:
        # Binary payloads are not JSON-serializable; hash them instead of
        # embedding the raw bytes so the payload stays small and stable.
        if isinstance(value, (bytes, bytearray, memoryview)):
            return {"__bytes_sha256__": hashlib.sha256(bytes(value)).hexdigest()}
        raise TypeError(
            f"Object of type {type(value).__name__} is not hashable for ingestion"
        )

    def _dump(value: Any) -> str:
        return json.dumps(
            value, ensure_ascii=False, separators=(",", ":"), default=_encodeFallback
        )

    rows = [
        (
            str(o.get("contentObjectId", "") or ""),
            str(o.get("contentType", "text") or "text"),
            o.get("data", "") or "",
        )
        for o in (contentObjects or [])
    ]
    # Tie-break on type and serialized data: sorting on the id alone leaves
    # objects with equal/missing ids in input order, which made the hash
    # depend on extractor ordering. Unique-id inputs sort exactly as before.
    rows.sort(key=lambda row: (row[0], row[1], _dump(row[2])))
    payload = _dump(rows)
    return hashlib.sha256(payload.encode("utf-8")).hexdigest()
|
||||||
|
|
||||||
|
|
||||||
class KnowledgeService:
|
class KnowledgeService:
|
||||||
"""Service for Knowledge Store operations: indexing, retrieval, and context building."""
|
"""Service for Knowledge Store operations: indexing, retrieval, and context building."""
|
||||||
|
|
||||||
|
|
@ -46,6 +109,196 @@ class KnowledgeService:
|
||||||
results = await self._embed([text])
|
results = await self._embed([text])
|
||||||
return results[0] if results else []
|
return results[0] if results else []
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# Ingestion façade (single entry point for all lanes)
|
||||||
|
# =========================================================================
|
||||||
|
|
||||||
|
async def requestIngestion(self, job: IngestionJob) -> IngestionHandle:
    """Unified entry point for filling the knowledge corpus.

    Applies idempotency based on a content hash (or caller-supplied
    `contentVersion`) persisted in `FileContentIndex.structure._ingestion`.
    Re-runs indexing only when the hash differs or the previous run did
    not reach `indexed` state. Runs embedding synchronously for now
    (callers already schedule background tasks where needed).

    Args:
        job: Description of the content to ingest.

    Returns:
        An IngestionHandle whose status is "duplicate" (unchanged content,
        nothing done), "indexed" (indexing ran; `index` is set) or "failed"
        (`error` holds the exception text). Hashing and indexing errors are
        reported through the handle, not raised.
    """
    jobId = f"{job.sourceKind}:{job.sourceId}"
    contentObjects = job.contentObjects or []
    objectCount = len(contentObjects)
    # Monotonic clock: durations must not be affected by wall-clock adjustments.
    startedAt = time.monotonic()

    def _elapsedMs() -> int:
        return int((time.monotonic() - startedAt) * 1000)

    def _rowField(row: Any, name: str, default: Any) -> Any:
        # The DB layer may hand back either a dict or a model object.
        value = row.get(name) if isinstance(row, dict) else getattr(row, name, default)
        return value or default

    def _failed(exc: Exception, failedHash: str) -> IngestionHandle:
        logger.error(
            "ingestion.failed sourceKind=%s sourceId=%s error=%s",
            job.sourceKind, job.sourceId, exc,
            exc_info=exc,
            extra={
                "event": "ingestion.failed",
                "jobId": jobId,
                "sourceKind": job.sourceKind,
                "sourceId": job.sourceId,
                "hash": failedHash,
                "error": str(exc),
                "durationMs": _elapsedMs(),
            },
        )
        return IngestionHandle(
            jobId=jobId,
            status="failed",
            contentHash=failedHash,
            fileId=job.sourceId,
            index=None,
            error=str(exc),
        )

    # Hashing can raise on unserializable content; report it the same way as
    # an indexing failure so callers only ever have to inspect the handle.
    try:
        contentHash = job.contentVersion or _computeIngestionHash(contentObjects)
    except Exception as exc:
        return _failed(exc, "")

    # 1. Check for duplicate via existing FileContentIndex row.
    existing = None
    try:
        existing = self._knowledgeDb.getFileContentIndex(job.sourceId)
    except Exception:
        # Best effort: an unreadable row is treated as "not indexed yet".
        logger.warning(
            "ingestion.lookup.failed sourceKind=%s sourceId=%s",
            job.sourceKind, job.sourceId,
            exc_info=True,
        )

    if existing:
        existingStructure = _rowField(existing, "structure", {})
        existingMeta = existingStructure.get("_ingestion", {}) or {}
        existingStatus = _rowField(existing, "status", "")
        if existingMeta.get("hash") == contentHash and existingStatus == "indexed":
            logger.info(
                "ingestion.skipped.duplicate sourceKind=%s sourceId=%s hash=%s",
                job.sourceKind, job.sourceId, contentHash[:12],
                extra={
                    "event": "ingestion.skipped.duplicate",
                    "jobId": jobId,
                    "sourceKind": job.sourceKind,
                    "sourceId": job.sourceId,
                    "hash": contentHash,
                    "durationMs": _elapsedMs(),
                },
            )
            return IngestionHandle(
                jobId=jobId,
                status="duplicate",
                contentHash=contentHash,
                fileId=job.sourceId,
                index=None,
            )

    # 2. Prepare ingestion metadata; stays in structure._ingestion so
    #    later connector revoke/purge can filter chunks by sourceKind /
    #    provenance.connectionId without a schema migration.
    structure = dict(job.structure or {})
    structure["_ingestion"] = {
        "hash": contentHash,
        "sourceKind": job.sourceKind,
        "sourceId": job.sourceId,
        "contentVersion": job.contentVersion,
        "indexedAt": getUtcTimestamp(),
        "provenance": dict(job.provenance or {}),
    }

    logger.info(
        "ingestion.queued sourceKind=%s sourceId=%s objects=%d hash=%s",
        job.sourceKind, job.sourceId, objectCount, contentHash[:12],
        extra={
            "event": "ingestion.queued",
            "jobId": jobId,
            "sourceKind": job.sourceKind,
            "sourceId": job.sourceId,
            "hash": contentHash,
            "objectCount": objectCount,
        },
    )

    # 3. Run real indexing.
    try:
        index = await self._indexFileInternal(
            fileId=job.sourceId,
            fileName=job.fileName,
            mimeType=job.mimeType,
            userId=job.userId,
            featureInstanceId=job.featureInstanceId,
            mandateId=job.mandateId,
            contentObjects=contentObjects,
            structure=structure,
            containerPath=job.containerPath,
        )
    except Exception as exc:
        handle = _failed(exc, contentHash)
        try:
            self._knowledgeDb.updateFileStatus(job.sourceId, "failed")
        except Exception:
            # The failure is already reported via the handle; only note that
            # the persisted status could not be updated.
            logger.warning(
                "ingestion.status_update.failed sourceKind=%s sourceId=%s",
                job.sourceKind, job.sourceId,
                exc_info=True,
            )
        return handle

    durationMs = _elapsedMs()
    logger.info(
        "ingestion.indexed sourceKind=%s sourceId=%s objects=%d durationMs=%d",
        job.sourceKind, job.sourceId, objectCount, durationMs,
        extra={
            "event": "ingestion.indexed",
            "jobId": jobId,
            "sourceKind": job.sourceKind,
            "sourceId": job.sourceId,
            "hash": contentHash,
            "objectCount": objectCount,
            "durationMs": durationMs,
        },
    )
    return IngestionHandle(
        jobId=jobId,
        status="indexed",
        contentHash=contentHash,
        fileId=job.sourceId,
        index=index,
    )
|
||||||
|
|
||||||
|
def getIngestionStatus(
    self, handleOrJobId: Union[IngestionHandle, str]
) -> Dict[str, Any]:
    """Map a handle or `sourceKind:sourceId` jobId to a status snapshot.

    A plain string containing ":" is first interpreted as a jobId (the part
    after the first ":" is the sourceId). If no row exists for that id, the
    whole string is tried as a raw sourceId, because source ids may contain
    ":" themselves (e.g. "coaching-session:abc").

    Args:
        handleOrJobId: Handle returned by requestIngestion, a jobId, or a
            raw sourceId.

    Returns:
        Dict with "jobId", "sourceId", "status" and "contentHash"; when a row
        exists it also carries "sourceKind" and "indexedAt". Status is
        "unknown" when no row is found or the lookup fails.
    """
    if isinstance(handleOrJobId, IngestionHandle):
        jobId = handleOrJobId.jobId
        candidates = [handleOrJobId.fileId]
    elif isinstance(handleOrJobId, str) and ":" in handleOrJobId:
        jobId = handleOrJobId
        # jobId interpretation first (previous behavior), raw id as fallback.
        candidates = [handleOrJobId.split(":", 1)[1], handleOrJobId]
    else:
        jobId = str(handleOrJobId)
        candidates = [jobId]

    sourceId = candidates[0]
    row = None
    for candidate in candidates:
        try:
            row = self._knowledgeDb.getFileContentIndex(candidate)
        except Exception:
            # Best effort: a failing lookup is reported as "unknown".
            logger.warning(
                "ingestion.status.lookup_failed sourceId=%s", candidate, exc_info=True
            )
            row = None
        if row:
            sourceId = candidate
            break

    if not row:
        return {
            "jobId": jobId,
            "sourceId": sourceId,
            "status": "unknown",
            "contentHash": None,
        }

    def _rowField(name: str, default: Any) -> Any:
        # The DB layer may hand back either a dict or a model object.
        value = row.get(name) if isinstance(row, dict) else getattr(row, name, default)
        return value

    structure = _rowField("structure", {}) or {}
    meta = structure.get("_ingestion", {}) or {}
    status = _rowField("status", "") or "unknown"
    return {
        "jobId": jobId,
        "sourceId": sourceId,
        "status": status,
        "contentHash": meta.get("hash"),
        "sourceKind": meta.get("sourceKind"),
        "indexedAt": meta.get("indexedAt"),
    }
|
||||||
|
|
||||||
# =========================================================================
|
# =========================================================================
|
||||||
# File Indexing (called after extraction, before embedding)
|
# File Indexing (called after extraction, before embedding)
|
||||||
# =========================================================================
|
# =========================================================================
|
||||||
|
|
@ -61,6 +314,54 @@ class KnowledgeService:
|
||||||
contentObjects: List[Dict[str, Any]] = None,
|
contentObjects: List[Dict[str, Any]] = None,
|
||||||
structure: Dict[str, Any] = None,
|
structure: Dict[str, Any] = None,
|
||||||
containerPath: str = None,
|
containerPath: str = None,
|
||||||
|
) -> Optional[FileContentIndex]:
|
||||||
|
"""Backward-compatible wrapper delegating to requestIngestion.
|
||||||
|
|
||||||
|
Existing callers that still invoke `indexFile` directly automatically
|
||||||
|
participate in the idempotency/metrics layer. New callers should
|
||||||
|
prefer `requestIngestion` so they can pass `sourceKind` and
|
||||||
|
`provenance` for connector revoke/purge later.
|
||||||
|
"""
|
||||||
|
job = IngestionJob(
|
||||||
|
sourceKind="file",
|
||||||
|
sourceId=fileId,
|
||||||
|
fileName=fileName,
|
||||||
|
mimeType=mimeType,
|
||||||
|
userId=userId,
|
||||||
|
featureInstanceId=featureInstanceId,
|
||||||
|
mandateId=mandateId,
|
||||||
|
contentObjects=list(contentObjects or []),
|
||||||
|
structure=structure,
|
||||||
|
containerPath=containerPath,
|
||||||
|
)
|
||||||
|
handle = await self.requestIngestion(job)
|
||||||
|
if handle.index is not None:
|
||||||
|
return handle.index
|
||||||
|
if handle.status == "duplicate":
|
||||||
|
row = None
|
||||||
|
try:
|
||||||
|
row = self._knowledgeDb.getFileContentIndex(fileId)
|
||||||
|
except Exception:
|
||||||
|
row = None
|
||||||
|
if isinstance(row, dict):
|
||||||
|
try:
|
||||||
|
return FileContentIndex(**row)
|
||||||
|
except Exception:
|
||||||
|
return None
|
||||||
|
return row
|
||||||
|
return None
|
||||||
|
|
||||||
|
async def _indexFileInternal(
|
||||||
|
self,
|
||||||
|
fileId: str,
|
||||||
|
fileName: str,
|
||||||
|
mimeType: str,
|
||||||
|
userId: str,
|
||||||
|
featureInstanceId: str = "",
|
||||||
|
mandateId: str = "",
|
||||||
|
contentObjects: List[Dict[str, Any]] = None,
|
||||||
|
structure: Dict[str, Any] = None,
|
||||||
|
containerPath: str = None,
|
||||||
) -> FileContentIndex:
|
) -> FileContentIndex:
|
||||||
"""Index a file's content objects and create embeddings for text chunks.
|
"""Index a file's content objects and create embeddings for text chunks.
|
||||||
|
|
||||||
|
|
|
||||||
0
tests/unit/serviceKnowledge/__init__.py
Normal file
0
tests/unit/serviceKnowledge/__init__.py
Normal file
172
tests/unit/serviceKnowledge/test_requestIngestion.py
Normal file
172
tests/unit/serviceKnowledge/test_requestIngestion.py
Normal file
|
|
@ -0,0 +1,172 @@
|
||||||
|
# Copyright (c) 2025 Patrick Motsch
|
||||||
|
# All rights reserved.
|
||||||
|
"""Unit tests for the P0 ingestion facade on KnowledgeService.
|
||||||
|
|
||||||
|
Covers acceptance criteria AC4 (idempotent ingestion for unchanged content)
|
||||||
|
and hash stability. The knowledge DB interface and AI embedding service are
|
||||||
|
stubbed so the test runs without any external dependency.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import (
|
||||||
|
IngestionJob,
|
||||||
|
KnowledgeService,
|
||||||
|
_computeIngestionHash,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class _StubKnowledgeDb:
    """In-memory fake of the knowledge DB interface.

    Holds a single index row (regardless of file id) and counts upsert calls
    so tests can assert how much indexing work was performed.
    """

    def __init__(self):
        self.upsertChunkCalls = 0
        self.upsertIndexCalls = 0
        self.index = None

    def getFileContentIndex(self, fileId):
        # Single-row store: the file id is not consulted.
        return self.index

    def updateFileStatus(self, fileId, status):
        if self.index is None:
            return
        self.index["status"] = status

    def upsertContentChunk(self, chunk):
        # Chunks are only counted, never stored.
        self.upsertChunkCalls += 1

    def upsertFileContentIndex(self, index):
        # Store a plain-dict snapshot so status updates can mutate it.
        if hasattr(index, "model_dump"):
            snapshot = index.model_dump()
        else:
            snapshot = dict(index)
        self.index = snapshot
        self.upsertIndexCalls += 1
|
||||||
|
|
||||||
|
|
||||||
|
def _makeService():
    """Build a KnowledgeService wired to an in-memory db stub and a fake AI service.

    Returns:
        Tuple of (service, stubDb) so tests can inspect the stub's counters.
    """

    async def _fakeEmbedding(texts):
        # One 4-dimensional zero vector per input text.
        vectors = [[0.0] * 4 for _ in texts]
        return MagicMock(errorCount=0, content="", metadata={"embeddings": vectors})

    fakeAi = MagicMock()
    fakeAi.callEmbedding = _fakeEmbedding

    def _resolveService(name):
        if name != "ai":
            raise KeyError(name)
        return fakeAi

    ctx = MagicMock()
    ctx.user = MagicMock()
    # Return a non-empty but empty-dict record so the FileItem lookup branch
    # in _indexFileInternal resolves without touching a real DB.
    ctx.interfaceDbComponent = MagicMock()
    ctx.interfaceDbComponent.getRecordset = MagicMock(return_value=[{}])

    fakeDb = _StubKnowledgeDb()
    patchTarget = (
        "modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge.getKnowledgeInterface"
    )
    with patch(patchTarget, return_value=fakeDb):
        service = KnowledgeService(ctx, _resolveService)

    return service, fakeDb
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_duplicate_skipped():
    """Re-submitting identical content yields "duplicate" and writes no new chunks."""
    service, db = _makeService()
    payload = [{"contentObjectId": "c1", "contentType": "text", "data": "hello world"}]
    job = IngestionJob(
        sourceKind="file",
        sourceId="file-123",
        fileName="a.txt",
        mimeType="text/plain",
        userId="u1",
        contentObjects=payload,
    )

    initial = await service.requestIngestion(job)
    assert initial.status == "indexed"
    chunkWritesAfterInitial = db.upsertChunkCalls
    assert chunkWritesAfterInitial >= 1

    repeat = await service.requestIngestion(job)
    assert repeat.status == "duplicate"
    assert repeat.contentHash == initial.contentHash
    # No additional embedding work.
    assert db.upsertChunkCalls == chunkWritesAfterInitial
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_reindex_on_content_change():
    """Changed data for the same source produces a new hash and a fresh indexing run."""
    service, db = _makeService()

    def _jobWithText(text):
        return IngestionJob(
            sourceKind="file",
            sourceId="file-123",
            fileName="a.txt",
            mimeType="text/plain",
            userId="u1",
            contentObjects=[
                {"contentObjectId": "c1", "contentType": "text", "data": text}
            ],
        )

    initial = await service.requestIngestion(_jobWithText("hello world"))
    assert initial.status == "indexed"
    chunkWritesAfterInitial = db.upsertChunkCalls

    updated = await service.requestIngestion(_jobWithText("hello universe"))
    assert updated.status == "indexed"
    assert updated.contentHash != initial.contentHash
    assert db.upsertChunkCalls > chunkWritesAfterInitial
|
||||||
|
|
||||||
|
|
||||||
|
def test_hash_stable_under_reordering():
    """The ingestion hash does not depend on the order of the content objects."""
    forward = [
        {"contentObjectId": "c1", "contentType": "text", "data": "alpha"},
        {"contentObjectId": "c2", "contentType": "text", "data": "beta"},
    ]
    backward = forward[::-1]
    assert _computeIngestionHash(forward) == _computeIngestionHash(backward)
|
||||||
|
|
||||||
|
|
||||||
|
def test_hash_changes_on_data_edit():
    """Editing an object's data changes the ingestion hash."""

    def _single(text):
        return [{"contentObjectId": "c1", "contentType": "text", "data": text}]

    originalHash = _computeIngestionHash(_single("alpha"))
    editedHash = _computeIngestionHash(_single("alpha!"))
    assert originalHash != editedHash
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_get_ingestion_status_after_index():
    """getIngestionStatus reflects status, source kind and hash of a finished run."""
    service, _db = _makeService()
    sessionJob = IngestionJob(
        sourceKind="coaching_session",
        sourceId="coaching-session:abc",
        fileName="session",
        mimeType="application/x-coaching-session",
        userId="u1",
        contentObjects=[
            {"contentObjectId": "m0", "contentType": "text", "data": "User: hi"}
        ],
        provenance={"lane": "feature", "feature": "commcoach"},
    )

    handle = await service.requestIngestion(sessionJob)
    snapshot = service.getIngestionStatus(handle)

    assert snapshot["status"] == "indexed"
    assert snapshot["sourceKind"] == "coaching_session"
    assert snapshot["contentHash"] == handle.contentHash
|
||||||
Loading…
Reference in a new issue