rag stats

This commit is contained in:
ValueOn AG 2026-03-23 00:17:59 +01:00
parent 3934cdd3ee
commit da096bb6e2
4 changed files with 259 additions and 1 deletion

View file

@ -31,6 +31,15 @@ UI_OBJECTS = [
"label": {"en": "Settings", "de": "Einstellungen", "fr": "Parametres"}, "label": {"en": "Settings", "de": "Einstellungen", "fr": "Parametres"},
"meta": {"area": "settings"} "meta": {"area": "settings"}
}, },
{
"objectKey": "ui.feature.workspace.rag-insights",
"label": {
"en": "Knowledge insights",
"de": "Wissens-Insights",
"fr": "Aperçu des connaissances",
},
"meta": {"area": "rag-insights"},
},
] ]
RESOURCE_OBJECTS = [ RESOURCE_OBJECTS = [
@ -83,6 +92,7 @@ TEMPLATE_ROLES = [
{"context": "UI", "item": "ui.feature.workspace.dashboard", "view": True}, {"context": "UI", "item": "ui.feature.workspace.dashboard", "view": True},
{"context": "UI", "item": "ui.feature.workspace.editor", "view": True}, {"context": "UI", "item": "ui.feature.workspace.editor", "view": True},
{"context": "UI", "item": "ui.feature.workspace.settings", "view": True}, {"context": "UI", "item": "ui.feature.workspace.settings", "view": True},
{"context": "UI", "item": "ui.feature.workspace.rag-insights", "view": True},
{"context": "DATA", "item": None, "view": True, "read": "m", "create": "n", "update": "n", "delete": "n"}, {"context": "DATA", "item": None, "view": True, "read": "m", "create": "n", "update": "n", "delete": "n"},
] ]
}, },
@ -97,6 +107,7 @@ TEMPLATE_ROLES = [
{"context": "UI", "item": "ui.feature.workspace.dashboard", "view": True}, {"context": "UI", "item": "ui.feature.workspace.dashboard", "view": True},
{"context": "UI", "item": "ui.feature.workspace.editor", "view": True}, {"context": "UI", "item": "ui.feature.workspace.editor", "view": True},
{"context": "UI", "item": "ui.feature.workspace.settings", "view": True}, {"context": "UI", "item": "ui.feature.workspace.settings", "view": True},
{"context": "UI", "item": "ui.feature.workspace.rag-insights", "view": True},
{"context": "RESOURCE", "item": "resource.feature.workspace.start", "view": True}, {"context": "RESOURCE", "item": "resource.feature.workspace.start", "view": True},
{"context": "RESOURCE", "item": "resource.feature.workspace.stop", "view": True}, {"context": "RESOURCE", "item": "resource.feature.workspace.stop", "view": True},
{"context": "RESOURCE", "item": "resource.feature.workspace.files", "view": True}, {"context": "RESOURCE", "item": "resource.feature.workspace.files", "view": True},

View file

@ -24,6 +24,7 @@ from modules.serviceCenter.services.serviceSubscription.mainServiceSubscription
) )
from modules.interfaces import interfaceDbChat, interfaceDbManagement from modules.interfaces import interfaceDbChat, interfaceDbManagement
from modules.features.workspace import interfaceFeatureWorkspace from modules.features.workspace import interfaceFeatureWorkspace
from modules.interfaces.interfaceDbKnowledge import getInterface as getKnowledgeInterface
from modules.interfaces.interfaceAiObjects import AiObjects from modules.interfaces.interfaceAiObjects import AiObjects
from modules.serviceCenter.core.serviceStreaming import get_event_manager from modules.serviceCenter.core.serviceStreaming import get_event_manager
from modules.serviceCenter.services.serviceAgent.datamodelAgent import AgentEventTypeEnum, PendingFileEdit from modules.serviceCenter.services.serviceAgent.datamodelAgent import AgentEventTypeEnum, PendingFileEdit
@ -1906,3 +1907,53 @@ async def updateGeneralSettings(
wsInterface.saveWorkspaceUserSettings(data) wsInterface.saveWorkspaceUserSettings(data)
return await getGeneralSettings(request, instanceId, context) return await getGeneralSettings(request, instanceId, context)
# =========================================================================
# RAG / Knowledge — anonymised instance statistics (presentation / KPIs)
# =========================================================================
def _collectWorkspaceFileIdsForStats(instanceId: str, mandateId: Optional[str]) -> List[str]:
    """Collect all FileItem ids belonging to one feature instance (any user).

    Knowledge rows are often stored without ``featureInstanceId``; callers
    therefore correlate knowledge data with the workspace by file ids resolved
    from the Management DB instead.

    Args:
        instanceId: Feature instance whose files should be listed.
        mandateId: Optional mandate scope. Rows tagged with a *different*
            mandate are skipped; rows with an empty mandate are kept.

    Returns:
        List of file id strings (possibly empty).
    """
    from modules.datamodels.datamodelFiles import FileItem
    from modules.interfaces.interfaceDbManagement import ComponentObjects

    def _field(row, name):
        # Recordsets may yield plain dicts or model objects; read either shape.
        # Normalizes consistently (the original coerced None to "" only on the
        # attribute branch due to operator precedence).
        return row.get(name) if isinstance(row, dict) else getattr(row, name, None)

    co = ComponentObjects()
    rows = co.db.getRecordset(FileItem, recordFilter={"featureInstanceId": instanceId})
    wanted_mandate = str(mandateId) if mandateId else ""
    out: List[str] = []
    for row in rows or []:
        rid = _field(row, "id")
        if not rid:
            continue
        if wanted_mandate:
            row_mandate = _field(row, "mandateId") or ""
            # Keep untagged rows; drop rows that belong to another mandate.
            if row_mandate and row_mandate != wanted_mandate:
                continue
        out.append(str(rid))
    return out
@router.get("/{instanceId}/rag-statistics")
@limiter.limit("60/minute")
async def getRagStatistics(
    request: Request,
    instanceId: str = Path(...),
    days: int = Query(90, ge=7, le=365, description="Timeline window in days"),
    context: RequestContext = Depends(getRequestContext),
):
    """Return aggregated, non-identifying knowledge-store metrics for one workspace instance."""
    mandateId, _instanceConfig = _validateInstanceAccess(instanceId, context)
    fileIds = _collectWorkspaceFileIdsForStats(instanceId, mandateId)
    knowledge = getKnowledgeInterface(context.user)
    result = knowledge.getRagStatisticsForInstance(
        featureInstanceId=instanceId,
        mandateId=str(mandateId) if mandateId else "",
        timelineDays=days,
        workspaceFileIds=fileIds,
    )
    # Surface how many workspace file ids were resolved so the client can
    # judge coverage of the statistics.
    if isinstance(result, dict):
        scope = result.setdefault("scope", {})
        scope["workspaceFileIdsResolved"] = len(fileIds)
    return JSONResponse(result)

View file

@ -288,6 +288,183 @@ class KnowledgeObjects:
minScore=minScore, minScore=minScore,
) )
def getRagStatisticsForInstance(
    self,
    featureInstanceId: str,
    mandateId: str,
    timelineDays: int = 90,
    workspaceFileIds: Optional[List[str]] = None,
) -> Dict[str, Any]:
    """Aggregate anonymised RAG / knowledge-store metrics for one workspace instance.

    No file names, user identifiers, or chunk text are returned — only counts
    and distributions suitable for dashboards and presentations.

    Args:
        featureInstanceId: Workspace feature instance to report on (required).
        mandateId: Mandate scope; when set, mandate-shared files are included.
        timelineDays: Window in days for the indexed-documents timeline.
        workspaceFileIds: Optional FileItem ids for this feature instance
            (from the Management DB). Index pipelines often stored rows with an
            empty featureInstanceId; linking by file id fixes the statistics.

    Returns:
        Dict with "scope", "kpis", distribution maps, a capped timeline, and a
        generation timestamp — or ``{"error": ...}`` when featureInstanceId is
        missing.
    """
    if not featureInstanceId:
        return {"error": "featureInstanceId required"}

    ws_ids = [x for x in (workspaceFileIds or []) if x]
    ws_id_set = set(ws_ids)

    # --- Resolve candidate file rows: instance-tagged + mandate-shared +
    # explicitly supplied workspace file ids, de-duplicated by id.
    # NOTE: guard recordsets with `or []` — a None result must not crash
    # aggregation (matches the defensive pattern used by the stats caller).
    files_inst = self.db.getRecordset(
        FileContentIndex,
        recordFilter={"featureInstanceId": featureInstanceId},
    ) or []
    files_shared: List[Dict[str, Any]] = []
    if mandateId:
        files_shared = self.db.getRecordset(
            FileContentIndex,
            recordFilter={"mandateId": mandateId, "isShared": True},
        ) or []
    by_id: Dict[str, Dict[str, Any]] = {}
    for row in files_inst + files_shared:
        rid = row.get("id")
        if rid and rid not in by_id:
            by_id[rid] = row
    for fid in ws_ids:
        if fid in by_id:
            continue
        row = self.getFileContentIndex(fid)
        if row:
            by_id[fid] = row
    files = list(by_id.values())

    # --- Collect chunks: instance-tagged first, then per known file id, then
    # per resolved file row not yet covered. De-duplicated by chunk id.
    chunks_by_id: Dict[str, Dict[str, Any]] = {}
    inst_chunks = self.db.getRecordset(
        ContentChunk,
        recordFilter={"featureInstanceId": featureInstanceId},
    ) or []
    for c in inst_chunks:
        cid = c.get("id")
        if cid:
            chunks_by_id[cid] = c
    for fid in ws_id_set:
        for c in self.getContentChunks(fid):
            cid = c.get("id")
            if cid and cid not in chunks_by_id:
                chunks_by_id[cid] = c
    covered_file_ids = {c.get("fileId") for c in chunks_by_id.values() if c.get("fileId")}
    for row in files:
        fid = row.get("id")
        if fid and fid not in covered_file_ids:
            for c in self.getContentChunks(fid):
                cid = c.get("id")
                if cid and cid not in chunks_by_id:
                    chunks_by_id[cid] = c
    chunks = list(chunks_by_id.values())

    def _mimeCategory(mime: str) -> str:
        """Map a MIME type to a coarse, presentation-friendly category."""
        m = (mime or "").lower()
        if "pdf" in m:
            return "pdf"
        if "wordprocessing" in m or "msword" in m or "officedocument.wordprocessing" in m:
            return "office_doc"
        if "spreadsheet" in m or "excel" in m or "officedocument.spreadsheet" in m:
            return "office_sheet"
        if "presentation" in m or "officedocument.presentation" in m:
            return "office_slides"
        if m.startswith("text/"):
            return "text"
        if m.startswith("image/"):
            return "image"
        if "html" in m:
            return "html"
        return "other"

    def _utcDay(ts: Any) -> str:
        """Format an epoch timestamp as a UTC YYYY-MM-DD day, or "" if invalid."""
        if ts is None:
            return ""
        try:
            return datetime.fromtimestamp(float(ts), tz=timezone.utc).strftime("%Y-%m-%d")
        except (TypeError, ValueError, OSError):
            return ""

    # --- Per-file aggregates: status / MIME distributions, timeline buckets,
    # byte total, and the distinct-contributor count (ids never leave here).
    status_counts: Dict[str, int] = defaultdict(int)
    mime_counts: Dict[str, int] = defaultdict(int)
    extracted_by_day: Dict[str, int] = defaultdict(int)
    total_bytes = 0
    user_ids = set()
    for row in files:
        st = row.get("status") or "unknown"
        status_counts[st] += 1
        mime_counts[_mimeCategory(row.get("mimeType") or "")] += 1
        day = _utcDay(row.get("extractedAt"))
        if day:
            extracted_by_day[day] += 1
        try:
            total_bytes += int(row.get("totalSize") or 0)
        except (TypeError, ValueError):
            pass
        uid = row.get("userId")
        if uid:
            user_ids.add(str(uid))

    # --- Per-chunk aggregates: content-type distribution and embedding
    # coverage. An embedding counts when it is a non-empty list or a
    # serialized string longer than 10 chars (filters "[]"/placeholder values).
    content_type_counts: Dict[str, int] = defaultdict(int)
    chunks_with_embedding = 0
    for c in chunks:
        ct = c.get("contentType") or "other"
        content_type_counts[ct] += 1
        emb = c.get("embedding")
        if emb is not None and (
            (isinstance(emb, list) and len(emb) > 0)
            or (isinstance(emb, str) and len(emb) > 10)
        ):
            chunks_with_embedding += 1

    wf_mem = self.db.getRecordset(
        WorkflowMemory,
        recordFilter={"featureInstanceId": featureInstanceId},
    ) or []

    # --- Timeline limited to the requested window, capped at 120 points.
    cutoff = datetime.now(timezone.utc) - timedelta(days=max(1, int(timelineDays)))
    cutoff_ts = cutoff.timestamp()
    timeline: List[Dict[str, Any]] = []
    for day in sorted(extracted_by_day.keys()):
        try:
            d = datetime.strptime(day, "%Y-%m-%d").replace(tzinfo=timezone.utc)
        except ValueError:
            continue
        if d.timestamp() >= cutoff_ts:
            timeline.append({"date": day, "indexedDocuments": extracted_by_day[day]})
    if len(timeline) > 120:
        timeline = timeline[-120:]

    total_chunks = len(chunks)
    embedding_pct = round(100.0 * chunks_with_embedding / total_chunks, 1) if total_chunks else 0.0
    return {
        "scope": {
            "featureInstanceId": featureInstanceId,
            "mandateScopedShared": bool(mandateId),
        },
        "kpis": {
            "indexedDocuments": len(files),
            "indexedBytesTotal": total_bytes,
            "contributorUsers": len(user_ids),
            "contentChunks": total_chunks,
            "chunksWithEmbedding": chunks_with_embedding,
            "embeddingCoveragePercent": embedding_pct,
            "workflowEntities": len(wf_mem),
        },
        "indexedDocumentsByStatus": dict(sorted(status_counts.items())),
        "documentsByMimeCategory": dict(sorted(mime_counts.items(), key=lambda x: -x[1])),
        "chunksByContentType": dict(sorted(content_type_counts.items())),
        "timelineIndexedDocuments": timeline,
        "generatedAtUtc": datetime.now(timezone.utc).isoformat(),
    }
def getInterface(currentUser: Optional[User] = None) -> KnowledgeObjects: def getInterface(currentUser: Optional[User] = None) -> KnowledgeObjects:
"""Get or create a KnowledgeObjects singleton.""" """Get or create a KnowledgeObjects singleton."""

View file

@ -37,6 +37,17 @@ async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user):
mgmtInterface.updateFile(fileId, {"status": "active"}) mgmtInterface.updateFile(fileId, {"status": "active"})
return return
file_meta = mgmtInterface.getFile(fileId)
feature_instance_id = ""
mandate_id = ""
if file_meta:
if isinstance(file_meta, dict):
feature_instance_id = file_meta.get("featureInstanceId") or ""
mandate_id = file_meta.get("mandateId") or ""
else:
feature_instance_id = getattr(file_meta, "featureInstanceId", None) or ""
mandate_id = getattr(file_meta, "mandateId", None) or ""
logger.info(f"Auto-index starting for {fileName} ({len(rawBytes)} bytes, {mimeType})") logger.info(f"Auto-index starting for {fileName} ({len(rawBytes)} bytes, {mimeType})")
# Step 1: Structure Pre-Scan (AI-free) # Step 1: Structure Pre-Scan (AI-free)
@ -47,6 +58,8 @@ async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user):
fileId=fileId, fileId=fileId,
fileName=fileName, fileName=fileName,
userId=userId, userId=userId,
featureInstanceId=str(feature_instance_id) if feature_instance_id else "",
mandateId=str(mandate_id) if mandate_id else "",
) )
logger.info( logger.info(
f"Pre-scan complete for {fileName}: " f"Pre-scan complete for {fileName}: "
@ -105,7 +118,11 @@ async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user):
from modules.serviceCenter import getService from modules.serviceCenter import getService
from modules.serviceCenter.context import ServiceCenterContext from modules.serviceCenter.context import ServiceCenterContext
ctx = ServiceCenterContext(user=user, mandate_id="", feature_instance_id="") ctx = ServiceCenterContext(
user=user,
mandate_id=str(mandate_id) if mandate_id else "",
feature_instance_id=str(feature_instance_id) if feature_instance_id else "",
)
knowledgeService = getService("knowledge", ctx) knowledgeService = getService("knowledge", ctx)
await knowledgeService.indexFile( await knowledgeService.indexFile(
@ -113,6 +130,8 @@ async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user):
fileName=fileName, fileName=fileName,
mimeType=mimeType, mimeType=mimeType,
userId=userId, userId=userId,
featureInstanceId=str(feature_instance_id) if feature_instance_id else "",
mandateId=str(mandate_id) if mandate_id else "",
contentObjects=contentObjects, contentObjects=contentObjects,
structure=contentIndex.structure, structure=contentIndex.structure,
) )