rag stats
This commit is contained in:
parent
3934cdd3ee
commit
da096bb6e2
4 changed files with 259 additions and 1 deletions
|
|
@ -31,6 +31,15 @@ UI_OBJECTS = [
|
|||
"label": {"en": "Settings", "de": "Einstellungen", "fr": "Parametres"},
|
||||
"meta": {"area": "settings"}
|
||||
},
|
||||
{
|
||||
"objectKey": "ui.feature.workspace.rag-insights",
|
||||
"label": {
|
||||
"en": "Knowledge insights",
|
||||
"de": "Wissens-Insights",
|
||||
"fr": "Aperçu des connaissances",
|
||||
},
|
||||
"meta": {"area": "rag-insights"},
|
||||
},
|
||||
]
|
||||
|
||||
RESOURCE_OBJECTS = [
|
||||
|
|
@ -83,6 +92,7 @@ TEMPLATE_ROLES = [
|
|||
{"context": "UI", "item": "ui.feature.workspace.dashboard", "view": True},
|
||||
{"context": "UI", "item": "ui.feature.workspace.editor", "view": True},
|
||||
{"context": "UI", "item": "ui.feature.workspace.settings", "view": True},
|
||||
{"context": "UI", "item": "ui.feature.workspace.rag-insights", "view": True},
|
||||
{"context": "DATA", "item": None, "view": True, "read": "m", "create": "n", "update": "n", "delete": "n"},
|
||||
]
|
||||
},
|
||||
|
|
@ -97,6 +107,7 @@ TEMPLATE_ROLES = [
|
|||
{"context": "UI", "item": "ui.feature.workspace.dashboard", "view": True},
|
||||
{"context": "UI", "item": "ui.feature.workspace.editor", "view": True},
|
||||
{"context": "UI", "item": "ui.feature.workspace.settings", "view": True},
|
||||
{"context": "UI", "item": "ui.feature.workspace.rag-insights", "view": True},
|
||||
{"context": "RESOURCE", "item": "resource.feature.workspace.start", "view": True},
|
||||
{"context": "RESOURCE", "item": "resource.feature.workspace.stop", "view": True},
|
||||
{"context": "RESOURCE", "item": "resource.feature.workspace.files", "view": True},
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@ from modules.serviceCenter.services.serviceSubscription.mainServiceSubscription
|
|||
)
|
||||
from modules.interfaces import interfaceDbChat, interfaceDbManagement
|
||||
from modules.features.workspace import interfaceFeatureWorkspace
|
||||
from modules.interfaces.interfaceDbKnowledge import getInterface as getKnowledgeInterface
|
||||
from modules.interfaces.interfaceAiObjects import AiObjects
|
||||
from modules.serviceCenter.core.serviceStreaming import get_event_manager
|
||||
from modules.serviceCenter.services.serviceAgent.datamodelAgent import AgentEventTypeEnum, PendingFileEdit
|
||||
|
|
@ -1906,3 +1907,53 @@ async def updateGeneralSettings(
|
|||
wsInterface.saveWorkspaceUserSettings(data)
|
||||
|
||||
return await getGeneralSettings(request, instanceId, context)
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# RAG / Knowledge — anonymised instance statistics (presentation / KPIs)
|
||||
# =========================================================================
|
||||
|
||||
def _collectWorkspaceFileIdsForStats(instanceId: str, mandateId: Optional[str]) -> List[str]:
    """Collect all FileItem ids belonging to this feature instance (any user).

    Knowledge rows are often stored without a featureInstanceId, so statistics
    correlate by file id taken from the Management DB instead.
    """
    from modules.datamodels.datamodelFiles import FileItem
    from modules.interfaces.interfaceDbManagement import ComponentObjects

    def _field(record, key, default=None):
        # Recordset rows may come back as plain dicts or as model objects.
        return record.get(key) if isinstance(record, dict) else getattr(record, key, default)

    component = ComponentObjects()
    records = component.db.getRecordset(FileItem, recordFilter={"featureInstanceId": instanceId})
    wanted_mandate = str(mandateId) if mandateId else ""

    file_ids: List[str] = []
    for record in records or []:
        record_id = _field(record, "id")
        if not record_id:
            continue
        if wanted_mandate:
            # Rows without a mandate are kept; only a conflicting mandate excludes.
            record_mandate = _field(record, "mandateId", "") or ""
            if record_mandate and record_mandate != wanted_mandate:
                continue
        file_ids.append(str(record_id))
    return file_ids
|
||||
|
||||
|
||||
@router.get("/{instanceId}/rag-statistics")
@limiter.limit("60/minute")
async def getRagStatistics(
    request: Request,
    instanceId: str = Path(...),
    days: int = Query(90, ge=7, le=365, description="Timeline window in days"),
    context: RequestContext = Depends(getRequestContext),
):
    """Aggregated, non-identifying knowledge-store metrics for this workspace instance."""
    # Access check first; also resolves the mandate this instance belongs to.
    mandateId, _instanceConfig = _validateInstanceAccess(instanceId, context)

    # File ids from the Management DB let the knowledge layer correlate rows
    # that were indexed without a featureInstanceId.
    workspaceFileIds = _collectWorkspaceFileIdsForStats(instanceId, mandateId)

    knowledgeDb = getKnowledgeInterface(context.user)
    stats = knowledgeDb.getRagStatisticsForInstance(
        featureInstanceId=instanceId,
        mandateId=str(mandateId) if mandateId else "",
        timelineDays=days,
        workspaceFileIds=workspaceFileIds,
    )

    # Surface how many file ids were resolved so the UI can explain coverage.
    if isinstance(stats, dict):
        scope = stats.setdefault("scope", {})
        scope["workspaceFileIdsResolved"] = len(workspaceFileIds)

    return JSONResponse(stats)
|
||||
|
|
|
|||
|
|
@ -288,6 +288,183 @@ class KnowledgeObjects:
|
|||
minScore=minScore,
|
||||
)
|
||||
|
||||
def getRagStatisticsForInstance(
    self,
    featureInstanceId: str,
    mandateId: str,
    timelineDays: int = 90,
    workspaceFileIds: Optional[List[str]] = None,
) -> Dict[str, Any]:
    """Aggregate anonymised RAG / knowledge-store metrics for one workspace instance.

    No file names, user identifiers, or chunk text are returned — only counts and
    distributions suitable for dashboards and presentations.

    Args:
        featureInstanceId: Workspace feature instance to aggregate for (required).
        mandateId: When non-empty, mandate-shared file rows are included in scope.
        timelineDays: Window in days for the indexed-documents timeline (min 1).
        workspaceFileIds: Optional FileItem ids for this feature instance (from the
            Management DB). Index pipelines often stored rows with an empty
            featureInstanceId; linking by file id fixes the statistics.

    Returns:
        Dict with "scope", "kpis", distribution maps, a bounded timeline, and a
        UTC generation timestamp — or {"error": ...} when featureInstanceId is
        missing (no exception is raised).
    """
    if not featureInstanceId:
        return {"error": "featureInstanceId required"}

    ws_ids = [x for x in (workspaceFileIds or []) if x]
    ws_id_set = set(ws_ids)

    # --- File index rows: instance-scoped, mandate-shared, workspace-linked ---
    # getRecordset can return None; guard every read (same `... or []`
    # convention the file-id helper in this codebase uses).
    files_inst = self.db.getRecordset(
        FileContentIndex,
        recordFilter={"featureInstanceId": featureInstanceId},
    ) or []
    files_shared: List[Dict[str, Any]] = []
    if mandateId:
        files_shared = self.db.getRecordset(
            FileContentIndex,
            recordFilter={"mandateId": mandateId, "isShared": True},
        ) or []

    # Deduplicate by id; instance rows win over shared rows.
    by_id: Dict[str, Dict[str, Any]] = {}
    for row in files_inst + files_shared:
        rid = row.get("id")
        if rid and rid not in by_id:
            by_id[rid] = row

    # Pull any workspace-linked files the two queries above missed.
    for fid in ws_ids:
        if fid in by_id:
            continue
        row = self.getFileContentIndex(fid)
        if row:
            by_id[fid] = row

    files = list(by_id.values())

    # --- Chunks: by instance id, by workspace file id, then per remaining file ---
    chunks_by_id: Dict[str, Dict[str, Any]] = {}
    inst_chunks = self.db.getRecordset(
        ContentChunk,
        recordFilter={"featureInstanceId": featureInstanceId},
    ) or []
    for c in inst_chunks:
        cid = c.get("id")
        if cid:
            chunks_by_id[cid] = c

    for fid in ws_id_set:
        for c in self.getContentChunks(fid) or []:
            cid = c.get("id")
            if cid and cid not in chunks_by_id:
                chunks_by_id[cid] = c

    # Files in scope whose chunks were not found yet (e.g. rows stored with an
    # empty featureInstanceId) get one more lookup by file id.
    covered_file_ids = {c.get("fileId") for c in chunks_by_id.values() if c.get("fileId")}
    for row in files:
        fid = row.get("id")
        if fid and fid not in covered_file_ids:
            for c in self.getContentChunks(fid) or []:
                cid = c.get("id")
                if cid and cid not in chunks_by_id:
                    chunks_by_id[cid] = c

    chunks = list(chunks_by_id.values())

    def _mimeCategory(mime: str) -> str:
        # Bucket raw MIME types into a small set of presentation categories.
        m = (mime or "").lower()
        if "pdf" in m:
            return "pdf"
        # "wordprocessing" already matches "officedocument.wordprocessingml".
        if "wordprocessing" in m or "msword" in m:
            return "office_doc"
        if "spreadsheet" in m or "excel" in m:
            return "office_sheet"
        if "presentation" in m:
            return "office_slides"
        if m.startswith("text/"):
            return "text"
        if m.startswith("image/"):
            return "image"
        if "html" in m:
            return "html"
        return "other"

    def _utcDay(ts: Any) -> str:
        # Epoch seconds -> "YYYY-MM-DD" (UTC); empty string when unparsable.
        if ts is None:
            return ""
        try:
            return datetime.fromtimestamp(float(ts), tz=timezone.utc).strftime("%Y-%m-%d")
        except (TypeError, ValueError, OSError):
            return ""

    # --- File-level aggregates ---
    status_counts: Dict[str, int] = defaultdict(int)
    mime_counts: Dict[str, int] = defaultdict(int)
    extracted_by_day: Dict[str, int] = defaultdict(int)
    total_bytes = 0
    user_ids = set()  # only counted, never returned (anonymisation)

    for row in files:
        st = row.get("status") or "unknown"
        status_counts[st] += 1
        mime_counts[_mimeCategory(row.get("mimeType") or "")] += 1
        day = _utcDay(row.get("extractedAt"))
        if day:
            extracted_by_day[day] += 1
        try:
            total_bytes += int(row.get("totalSize") or 0)
        except (TypeError, ValueError):
            pass  # malformed size: skip the row rather than fail the report
        uid = row.get("userId")
        if uid:
            user_ids.add(str(uid))

    # --- Chunk-level aggregates ---
    content_type_counts: Dict[str, int] = defaultdict(int)
    chunks_with_embedding = 0
    for c in chunks:
        ct = c.get("contentType") or "other"
        content_type_counts[ct] += 1
        emb = c.get("embedding")
        # An embedding counts when it is a non-empty list or a serialized
        # string longer than a trivial placeholder.
        if emb is not None and (
            (isinstance(emb, list) and len(emb) > 0)
            or (isinstance(emb, str) and len(emb) > 10)
        ):
            chunks_with_embedding += 1

    wf_mem = self.db.getRecordset(
        WorkflowMemory,
        recordFilter={"featureInstanceId": featureInstanceId},
    ) or []

    # --- Timeline limited to the requested window, capped at 120 points ---
    cutoff = datetime.now(timezone.utc) - timedelta(days=max(1, int(timelineDays)))
    cutoff_ts = cutoff.timestamp()

    timeline: List[Dict[str, Any]] = []
    for day in sorted(extracted_by_day.keys()):
        try:
            d = datetime.strptime(day, "%Y-%m-%d").replace(tzinfo=timezone.utc)
        except ValueError:
            continue
        if d.timestamp() >= cutoff_ts:
            timeline.append({"date": day, "indexedDocuments": extracted_by_day[day]})

    if len(timeline) > 120:
        timeline = timeline[-120:]

    total_chunks = len(chunks)
    embedding_pct = round(100.0 * chunks_with_embedding / total_chunks, 1) if total_chunks else 0.0

    return {
        "scope": {
            "featureInstanceId": featureInstanceId,
            "mandateScopedShared": bool(mandateId),
        },
        "kpis": {
            "indexedDocuments": len(files),
            "indexedBytesTotal": total_bytes,
            "contributorUsers": len(user_ids),
            "contentChunks": total_chunks,
            "chunksWithEmbedding": chunks_with_embedding,
            "embeddingCoveragePercent": embedding_pct,
            "workflowEntities": len(wf_mem),
        },
        "indexedDocumentsByStatus": dict(sorted(status_counts.items())),
        "documentsByMimeCategory": dict(sorted(mime_counts.items(), key=lambda x: -x[1])),
        "chunksByContentType": dict(sorted(content_type_counts.items())),
        "timelineIndexedDocuments": timeline,
        "generatedAtUtc": datetime.now(timezone.utc).isoformat(),
    }
|
||||
|
||||
|
||||
def getInterface(currentUser: Optional[User] = None) -> KnowledgeObjects:
|
||||
"""Get or create a KnowledgeObjects singleton."""
|
||||
|
|
|
|||
|
|
@ -37,6 +37,17 @@ async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user):
|
|||
mgmtInterface.updateFile(fileId, {"status": "active"})
|
||||
return
|
||||
|
||||
file_meta = mgmtInterface.getFile(fileId)
|
||||
feature_instance_id = ""
|
||||
mandate_id = ""
|
||||
if file_meta:
|
||||
if isinstance(file_meta, dict):
|
||||
feature_instance_id = file_meta.get("featureInstanceId") or ""
|
||||
mandate_id = file_meta.get("mandateId") or ""
|
||||
else:
|
||||
feature_instance_id = getattr(file_meta, "featureInstanceId", None) or ""
|
||||
mandate_id = getattr(file_meta, "mandateId", None) or ""
|
||||
|
||||
logger.info(f"Auto-index starting for {fileName} ({len(rawBytes)} bytes, {mimeType})")
|
||||
|
||||
# Step 1: Structure Pre-Scan (AI-free)
|
||||
|
|
@ -47,6 +58,8 @@ async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user):
|
|||
fileId=fileId,
|
||||
fileName=fileName,
|
||||
userId=userId,
|
||||
featureInstanceId=str(feature_instance_id) if feature_instance_id else "",
|
||||
mandateId=str(mandate_id) if mandate_id else "",
|
||||
)
|
||||
logger.info(
|
||||
f"Pre-scan complete for {fileName}: "
|
||||
|
|
@ -105,7 +118,11 @@ async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user):
|
|||
from modules.serviceCenter import getService
|
||||
from modules.serviceCenter.context import ServiceCenterContext
|
||||
|
||||
ctx = ServiceCenterContext(user=user, mandate_id="", feature_instance_id="")
|
||||
ctx = ServiceCenterContext(
|
||||
user=user,
|
||||
mandate_id=str(mandate_id) if mandate_id else "",
|
||||
feature_instance_id=str(feature_instance_id) if feature_instance_id else "",
|
||||
)
|
||||
knowledgeService = getService("knowledge", ctx)
|
||||
|
||||
await knowledgeService.indexFile(
|
||||
|
|
@ -113,6 +130,8 @@ async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user):
|
|||
fileName=fileName,
|
||||
mimeType=mimeType,
|
||||
userId=userId,
|
||||
featureInstanceId=str(feature_instance_id) if feature_instance_id else "",
|
||||
mandateId=str(mandate_id) if mandate_id else "",
|
||||
contentObjects=contentObjects,
|
||||
structure=contentIndex.structure,
|
||||
)
|
||||
|
|
|
|||
Loading…
Reference in a new issue