diff --git a/modules/features/workspace/mainWorkspace.py b/modules/features/workspace/mainWorkspace.py index 81526414..353129cc 100644 --- a/modules/features/workspace/mainWorkspace.py +++ b/modules/features/workspace/mainWorkspace.py @@ -31,6 +31,15 @@ UI_OBJECTS = [ "label": {"en": "Settings", "de": "Einstellungen", "fr": "Parametres"}, "meta": {"area": "settings"} }, + { + "objectKey": "ui.feature.workspace.rag-insights", + "label": { + "en": "Knowledge insights", + "de": "Wissens-Insights", + "fr": "Aperçu des connaissances", + }, + "meta": {"area": "rag-insights"}, + }, ] RESOURCE_OBJECTS = [ @@ -83,6 +92,7 @@ TEMPLATE_ROLES = [ {"context": "UI", "item": "ui.feature.workspace.dashboard", "view": True}, {"context": "UI", "item": "ui.feature.workspace.editor", "view": True}, {"context": "UI", "item": "ui.feature.workspace.settings", "view": True}, + {"context": "UI", "item": "ui.feature.workspace.rag-insights", "view": True}, {"context": "DATA", "item": None, "view": True, "read": "m", "create": "n", "update": "n", "delete": "n"}, ] }, @@ -97,6 +107,7 @@ TEMPLATE_ROLES = [ {"context": "UI", "item": "ui.feature.workspace.dashboard", "view": True}, {"context": "UI", "item": "ui.feature.workspace.editor", "view": True}, {"context": "UI", "item": "ui.feature.workspace.settings", "view": True}, + {"context": "UI", "item": "ui.feature.workspace.rag-insights", "view": True}, {"context": "RESOURCE", "item": "resource.feature.workspace.start", "view": True}, {"context": "RESOURCE", "item": "resource.feature.workspace.stop", "view": True}, {"context": "RESOURCE", "item": "resource.feature.workspace.files", "view": True}, diff --git a/modules/features/workspace/routeFeatureWorkspace.py b/modules/features/workspace/routeFeatureWorkspace.py index dd8481ff..6b8c529b 100644 --- a/modules/features/workspace/routeFeatureWorkspace.py +++ b/modules/features/workspace/routeFeatureWorkspace.py @@ -24,6 +24,7 @@ from modules.serviceCenter.services.serviceSubscription.mainServiceSubscription ) from modules.interfaces import interfaceDbChat, interfaceDbManagement from modules.features.workspace import interfaceFeatureWorkspace +from modules.interfaces.interfaceDbKnowledge import getInterface as getKnowledgeInterface from modules.interfaces.interfaceAiObjects import AiObjects from modules.serviceCenter.core.serviceStreaming import get_event_manager from modules.serviceCenter.services.serviceAgent.datamodelAgent import AgentEventTypeEnum, PendingFileEdit @@ -1906,3 +1907,53 @@ async def updateGeneralSettings( wsInterface.saveWorkspaceUserSettings(data) return await getGeneralSettings(request, instanceId, context) + + +# ========================================================================= +# RAG / Knowledge — anonymised instance statistics (presentation / KPIs) +# ========================================================================= + +def _collectWorkspaceFileIdsForStats(instanceId: str, mandateId: Optional[str]) -> List[str]: + """All FileItem ids for this feature instance (any user). Knowledge rows are often stored + without featureInstanceId; we correlate by file id from the Management DB.""" + from modules.datamodels.datamodelFiles import FileItem + from modules.interfaces.interfaceDbManagement import ComponentObjects + + co = ComponentObjects() + rows = co.db.getRecordset(FileItem, recordFilter={"featureInstanceId": instanceId}) + out: List[str] = [] + m = str(mandateId) if mandateId else "" + for r in rows or []: + rid = r.get("id") if isinstance(r, dict) else getattr(r, "id", None) + if not rid: + continue + if m: + mid = r.get("mandateId") if isinstance(r, dict) else getattr(r, "mandateId", "") or "" + if mid and mid != m: + continue + out.append(str(rid)) + return out + + +@router.get("/{instanceId}/rag-statistics") +@limiter.limit("60/minute") +async def getRagStatistics( + request: Request, + instanceId: str = Path(...), + days: int = Query(90, ge=7, le=365, description="Timeline window in days"), + context: RequestContext = Depends(getRequestContext), +): + """Aggregated, non-identifying knowledge-store metrics for this workspace instance.""" + mandateId, _instanceConfig = _validateInstanceAccess(instanceId, context) + workspaceFileIds = _collectWorkspaceFileIdsForStats(instanceId, mandateId) + kdb = getKnowledgeInterface(context.user) + stats = kdb.getRagStatisticsForInstance( + featureInstanceId=instanceId, + mandateId=str(mandateId) if mandateId else "", + timelineDays=days, + workspaceFileIds=workspaceFileIds, + ) + if isinstance(stats, dict): + stats.setdefault("scope", {}) + stats["scope"]["workspaceFileIdsResolved"] = len(workspaceFileIds) + return JSONResponse(stats) diff --git a/modules/interfaces/interfaceDbKnowledge.py b/modules/interfaces/interfaceDbKnowledge.py index ae822db8..adf8ed0a 100644 --- a/modules/interfaces/interfaceDbKnowledge.py +++ b/modules/interfaces/interfaceDbKnowledge.py @@ -288,6 +288,183 @@ class KnowledgeObjects: minScore=minScore, ) + def getRagStatisticsForInstance( + self, + featureInstanceId: str, + mandateId: str, + timelineDays: int = 90, + workspaceFileIds: Optional[List[str]] = None, + ) -> Dict[str, Any]: + """Aggregate anonymised RAG / knowledge-store metrics for one workspace instance. + + No file names, user identifiers, or chunk text are returned — only counts and + distributions suitable for dashboards and presentations. + + workspaceFileIds: optional list of FileItem ids for this feature instance (from Management DB). + Index pipelines often stored rows with empty featureInstanceId; linking by file id fixes stats. + """ + if not featureInstanceId: + return {"error": "featureInstanceId required"} + + ws_ids = [x for x in (workspaceFileIds or []) if x] + ws_id_set = set(ws_ids) + + files_inst = self.db.getRecordset( + FileContentIndex, + recordFilter={"featureInstanceId": featureInstanceId}, + ) + files_shared: List[Dict[str, Any]] = [] + if mandateId: + files_shared = self.db.getRecordset( + FileContentIndex, + recordFilter={"mandateId": mandateId, "isShared": True}, + ) + + by_id: Dict[str, Dict[str, Any]] = {} + for row in files_inst + files_shared: + rid = row.get("id") + if rid and rid not in by_id: + by_id[rid] = row + + for fid in ws_ids: + if fid in by_id: + continue + row = self.getFileContentIndex(fid) + if row: + by_id[fid] = row + + files = list(by_id.values()) + + chunks_by_id: Dict[str, Dict[str, Any]] = {} + inst_chunks = self.db.getRecordset( + ContentChunk, + recordFilter={"featureInstanceId": featureInstanceId}, + ) + for c in inst_chunks: + cid = c.get("id") + if cid: + chunks_by_id[cid] = c + + for fid in ws_id_set: + for c in self.getContentChunks(fid): + cid = c.get("id") + if cid and cid not in chunks_by_id: + chunks_by_id[cid] = c + + covered_file_ids = {c.get("fileId") for c in chunks_by_id.values() if c.get("fileId")} + for row in files: + fid = row.get("id") + if fid and fid not in covered_file_ids: + for c in self.getContentChunks(fid): + cid = c.get("id") + if cid and cid not in chunks_by_id: + chunks_by_id[cid] = c + + chunks = list(chunks_by_id.values()) + + def _mimeCategory(mime: str) -> str: + m = (mime or "").lower() + if "pdf" in m: + return "pdf" + if "wordprocessing" in m or "msword" in m or "officedocument.wordprocessing" in m: + return "office_doc" + if "spreadsheet" in m or "excel" in m or "officedocument.spreadsheet" in m: + return "office_sheet" + if "presentation" in m or "officedocument.presentation" in m: + return "office_slides" + if m.startswith("text/"): + return "text" + if m.startswith("image/"): + return "image" + if "html" in m: + return "html" + return "other" + + def _utcDay(ts: Any) -> str: + if ts is None: + return "" + try: + return datetime.fromtimestamp(float(ts), tz=timezone.utc).strftime("%Y-%m-%d") + except (TypeError, ValueError, OSError): + return "" + + status_counts: Dict[str, int] = defaultdict(int) + mime_counts: Dict[str, int] = defaultdict(int) + extracted_by_day: Dict[str, int] = defaultdict(int) + total_bytes = 0 + user_ids = set() + + for row in files: + st = row.get("status") or "unknown" + status_counts[st] += 1 + mime_counts[_mimeCategory(row.get("mimeType") or "")] += 1 + day = _utcDay(row.get("extractedAt")) + if day: + extracted_by_day[day] += 1 + try: + total_bytes += int(row.get("totalSize") or 0) + except (TypeError, ValueError): + pass + uid = row.get("userId") + if uid: + user_ids.add(str(uid)) + + content_type_counts: Dict[str, int] = defaultdict(int) + chunks_with_embedding = 0 + for c in chunks: + ct = c.get("contentType") or "other" + content_type_counts[ct] += 1 + emb = c.get("embedding") + if emb is not None and ( + (isinstance(emb, list) and len(emb) > 0) + or (isinstance(emb, str) and len(emb) > 10) + ): + chunks_with_embedding += 1 + + wf_mem = self.db.getRecordset( + WorkflowMemory, + recordFilter={"featureInstanceId": featureInstanceId}, + ) + + cutoff = datetime.now(timezone.utc) - timedelta(days=max(1, int(timelineDays))) + cutoff_ts = cutoff.timestamp() + + timeline: List[Dict[str, Any]] = [] + for day in sorted(extracted_by_day.keys()): + try: + d = datetime.strptime(day, "%Y-%m-%d").replace(tzinfo=timezone.utc) + except ValueError: + continue + if d.timestamp() >= cutoff_ts: + timeline.append({"date": day, "indexedDocuments": extracted_by_day[day]}) + + if len(timeline) > 120: + timeline = timeline[-120:] + + total_chunks = len(chunks) + embedding_pct = round(100.0 * chunks_with_embedding / total_chunks, 1) if total_chunks else 0.0 + + return { + "scope": { + "featureInstanceId": featureInstanceId, + "mandateScopedShared": bool(mandateId), + }, + "kpis": { + "indexedDocuments": len(files), + "indexedBytesTotal": total_bytes, + "contributorUsers": len(user_ids), + "contentChunks": total_chunks, + "chunksWithEmbedding": chunks_with_embedding, + "embeddingCoveragePercent": embedding_pct, + "workflowEntities": len(wf_mem), + }, + "indexedDocumentsByStatus": dict(sorted(status_counts.items())), + "documentsByMimeCategory": dict(sorted(mime_counts.items(), key=lambda x: -x[1])), + "chunksByContentType": dict(sorted(content_type_counts.items())), + "timelineIndexedDocuments": timeline, + "generatedAtUtc": datetime.now(timezone.utc).isoformat(), + } + def getInterface(currentUser: Optional[User] = None) -> KnowledgeObjects: """Get or create a KnowledgeObjects singleton.""" diff --git a/modules/routes/routeDataFiles.py b/modules/routes/routeDataFiles.py index bb77ffc5..999d07df 100644 --- a/modules/routes/routeDataFiles.py +++ b/modules/routes/routeDataFiles.py @@ -37,6 +37,17 @@ async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user): mgmtInterface.updateFile(fileId, {"status": "active"}) return + file_meta = mgmtInterface.getFile(fileId) + feature_instance_id = "" + mandate_id = "" + if file_meta: + if isinstance(file_meta, dict): + feature_instance_id = file_meta.get("featureInstanceId") or "" + mandate_id = file_meta.get("mandateId") or "" + else: + feature_instance_id = getattr(file_meta, "featureInstanceId", None) or "" + mandate_id = getattr(file_meta, "mandateId", None) or "" + logger.info(f"Auto-index starting for {fileName} ({len(rawBytes)} bytes, {mimeType})") # Step 1: Structure Pre-Scan (AI-free) @@ -47,6 +58,8 @@ async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user): fileId=fileId, fileName=fileName, userId=userId, + featureInstanceId=str(feature_instance_id) if feature_instance_id else "", + mandateId=str(mandate_id) if mandate_id else "", ) logger.info( f"Pre-scan complete for {fileName}: " @@ -105,7 +118,11 @@ async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user): from modules.serviceCenter import getService from modules.serviceCenter.context import ServiceCenterContext - ctx = ServiceCenterContext(user=user, mandate_id="", feature_instance_id="") + ctx = ServiceCenterContext( + user=user, + mandate_id=str(mandate_id) if mandate_id else "", + feature_instance_id=str(feature_instance_id) if feature_instance_id else "", + ) knowledgeService = getService("knowledge", ctx) await knowledgeService.indexFile( @@ -113,6 +130,8 @@ async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user): fileName=fileName, mimeType=mimeType, userId=userId, + featureInstanceId=str(feature_instance_id) if feature_instance_id else "", + mandateId=str(mandate_id) if mandate_id else "", contentObjects=contentObjects, structure=contentIndex.structure, )