# Copyright (c) 2025 Patrick Motsch
# All rights reserved.

"""
Interface to the Knowledge Store database (poweron_knowledge).

Provides CRUD for FileContentIndex, ContentChunk, RoundMemory, and
WorkflowMemory, plus semantic search via pgvector.
"""

import logging
from collections import defaultdict
from datetime import datetime, timezone, timedelta
from typing import Dict, Any, List, Optional

from modules.connectors.connectorDbPostgre import _get_cached_connector
from modules.datamodels.datamodelKnowledge import FileContentIndex, ContentChunk, RoundMemory, WorkflowMemory
from modules.datamodels.datamodelUam import User
from modules.shared.configuration import APP_CONFIG

logger = logging.getLogger(__name__)

_instances: Dict[str, "KnowledgeObjects"] = {}


class KnowledgeObjects:
    """Interface to the Knowledge Store database.

    Manages FileContentIndex, ContentChunk, RoundMemory, and WorkflowMemory,
    with semantic search via pgvector.
    """

    def __init__(self):
        self.currentUser: Optional[User] = None
        self.userId: Optional[str] = None
        self._scopeCache: Dict[str, List[str]] = {}
        self._initializeDatabase()

    def _initializeDatabase(self):
        dbHost = APP_CONFIG.get("DB_HOST", "_no_config_default_data")
        dbDatabase = "poweron_knowledge"
        dbUser = APP_CONFIG.get("DB_USER")
        dbPassword = APP_CONFIG.get("DB_PASSWORD_SECRET")
        dbPort = int(APP_CONFIG.get("DB_PORT", 5432))

        self.db = _get_cached_connector(
            dbHost=dbHost,
            dbDatabase=dbDatabase,
            dbUser=dbUser,
            dbPassword=dbPassword,
            dbPort=dbPort,
            userId=self.userId,
        )
        logger.info("Knowledge Store database initialized")

    def setUserContext(self, user: Optional[User]):
        self.currentUser = user
        self.userId = user.id if user else None
        self._scopeCache = {}
        if self.userId:
            self.db.updateContext(self.userId)

    # =========================================================================
    # FileContentIndex CRUD
    # =========================================================================

    def upsertFileContentIndex(self, index: FileContentIndex) -> Dict[str, Any]:
        """Create or update a FileContentIndex entry."""
        data = index.model_dump()
        existing = self.db._loadRecord(FileContentIndex, index.id)
        if existing:
            return self.db.recordModify(FileContentIndex, index.id, data)
        return self.db.recordCreate(FileContentIndex, data)
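
    # Example (sketch, not executed): registering a freshly extracted file.
    # The field names below are the ones this module reads elsewhere
    # (status, scope, mandateId, ...); any further required fields depend on
    # the FileContentIndex model definition.
    #
    #   index = FileContentIndex(
    #       id=fileId,
    #       userId=user.id,
    #       featureInstanceId=instanceId,
    #       mandateId=mandateId,
    #       scope="personal",
    #       status="extracted",
    #   )
    #   getInterface(user).upsertFileContentIndex(index)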

    def getFileContentIndex(self, fileId: str) -> Optional[Dict[str, Any]]:
        """Get a FileContentIndex by file ID."""
        return self.db._loadRecord(FileContentIndex, fileId)

    def getFileContentIndexByUser(
        self, userId: str, featureInstanceId: Optional[str] = None
    ) -> List[Dict[str, Any]]:
        """Get all FileContentIndex entries for a user, optionally narrowed to one feature instance."""
        recordFilter = {"userId": userId}
        if featureInstanceId:
            recordFilter["featureInstanceId"] = featureInstanceId
        return self.db.getRecordset(FileContentIndex, recordFilter=recordFilter)

    def updateFileStatus(self, fileId: str, status: str) -> bool:
        """Update the processing status of a FileContentIndex."""
        existing = self.db._loadRecord(FileContentIndex, fileId)
        if not existing:
            return False
        self.db.recordModify(FileContentIndex, fileId, {"status": status})
        return True

    def deleteFileContentIndex(self, fileId: str) -> bool:
        """Delete a FileContentIndex and all associated ContentChunks."""
        existing = self.getFileContentIndex(fileId)
        mandateId = (existing or {}).get("mandateId") or ""
        chunks = self.db.getRecordset(ContentChunk, recordFilter={"fileId": fileId})
        for chunk in chunks:
            self.db.recordDelete(ContentChunk, chunk["id"])
        ok = self.db.recordDelete(FileContentIndex, fileId)
        if ok and mandateId:
            try:
                # Local import so the billing interface is not pulled in at module load time.
                from modules.interfaces.interfaceDbBilling import _getRootInterface

                _getRootInterface().reconcileMandateStorageBilling(str(mandateId))
            except Exception as ex:
                logger.warning("reconcileMandateStorageBilling after delete failed: %s", ex)
        return ok

    # =========================================================================
    # ContentChunk CRUD
    # =========================================================================

    def upsertContentChunk(self, chunk: ContentChunk) -> Dict[str, Any]:
        """Create or update a ContentChunk."""
        data = chunk.model_dump()
        existing = self.db._loadRecord(ContentChunk, chunk.id)
        if existing:
            return self.db.recordModify(ContentChunk, chunk.id, data)
        return self.db.recordCreate(ContentChunk, data)

    def upsertContentChunks(self, chunks: List[ContentChunk]) -> int:
        """Batch upsert multiple ContentChunks. Returns the count of upserted chunks.

        Note: chunks are upserted one by one (one load plus one write each),
        so large batches cost two round trips per chunk."""
        count = 0
        for chunk in chunks:
            self.upsertContentChunk(chunk)
            count += 1
        return count

    def getContentChunks(self, fileId: str) -> List[Dict[str, Any]]:
        """Get all ContentChunks for a file."""
        return self.db.getRecordset(ContentChunk, recordFilter={"fileId": fileId})

    def deleteContentChunks(self, fileId: str) -> int:
        """Delete all ContentChunks for a file. Returns the count of deleted chunks."""
        chunks = self.db.getRecordset(ContentChunk, recordFilter={"fileId": fileId})
        count = 0
        for chunk in chunks:
            if self.db.recordDelete(ContentChunk, chunk["id"]):
                count += 1
        return count
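
    # Example (sketch, not executed): indexing extracted chunks for the file
    # registered above. `embedText` is a hypothetical embedding helper; the
    # exact ContentChunk fields depend on the model definition.
    #
    #   chunks = [
    #       ContentChunk(
    #           id=f"{fileId}:{i}",
    #           fileId=fileId,
    #           contentType="text",
    #           embedding=embedText(text),
    #       )
    #       for i, text in enumerate(extractedTexts)
    #   ]
    #   getInterface(user).upsertContentChunks(chunks)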

    # =========================================================================
    # RoundMemory CRUD
    # =========================================================================

    def storeRoundMemory(self, memory: RoundMemory) -> Dict[str, Any]:
        """Create or update a RoundMemory entry (upsert by id)."""
        data = memory.model_dump()
        existing = self.db._loadRecord(RoundMemory, memory.id)
        if existing:
            return self.db.recordModify(RoundMemory, memory.id, data)
        return self.db.recordCreate(RoundMemory, data)

    def getRoundMemories(self, workflowId: str) -> List[Dict[str, Any]]:
        """Get all RoundMemory entries for a workflow, sorted by roundNumber."""
        records = self.db.getRecordset(RoundMemory, recordFilter={"workflowId": workflowId})
        records.sort(key=lambda r: r.get("roundNumber", 0))
        return records

    def getRoundMemoriesByType(
        self, workflowId: str, memoryType: str
    ) -> List[Dict[str, Any]]:
        """Get RoundMemory entries filtered by type (e.g. 'file_ref')."""
        return self.db.getRecordset(
            RoundMemory, recordFilter={"workflowId": workflowId, "memoryType": memoryType}
        )

    def semanticSearchRoundMemory(
        self,
        queryVector: List[float],
        workflowId: str,
        limit: int = 10,
        minScore: Optional[float] = None,
    ) -> List[Dict[str, Any]]:
        """Semantic search across RoundMemory entries for a workflow."""
        return self.db.semanticSearch(
            modelClass=RoundMemory,
            vectorColumn="embedding",
            queryVector=queryVector,
            limit=limit,
            recordFilter={"workflowId": workflowId},
            minScore=minScore,
        )

    def deleteRoundMemories(self, workflowId: str) -> int:
        """Delete all RoundMemory entries for a workflow. Returns the count."""
        entries = self.db.getRecordset(RoundMemory, recordFilter={"workflowId": workflowId})
        count = 0
        for entry in entries:
            if self.db.recordDelete(RoundMemory, entry["id"]):
                count += 1
        return count
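
    # Example (sketch, not executed): persisting one conversation round and
    # replaying a workflow's history in order. Field names beyond id,
    # workflowId, roundNumber, and memoryType are assumptions about the
    # RoundMemory model.
    #
    #   memory = RoundMemory(
    #       id=f"{workflowId}:{roundNumber}",
    #       workflowId=workflowId,
    #       roundNumber=roundNumber,
    #       memoryType="file_ref",
    #   )
    #   getInterface(user).storeRoundMemory(memory)
    #   history = getInterface(user).getRoundMemories(workflowId)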

    # =========================================================================
    # WorkflowMemory CRUD
    # =========================================================================

    def upsertWorkflowMemory(self, memory: WorkflowMemory) -> Dict[str, Any]:
        """Create or update a WorkflowMemory entry."""
        data = memory.model_dump()
        existing = self.db._loadRecord(WorkflowMemory, memory.id)
        if existing:
            return self.db.recordModify(WorkflowMemory, memory.id, data)
        return self.db.recordCreate(WorkflowMemory, data)

    def getWorkflowEntities(self, workflowId: str) -> List[Dict[str, Any]]:
        """Get all WorkflowMemory entries for a workflow."""
        return self.db.getRecordset(WorkflowMemory, recordFilter={"workflowId": workflowId})

    def getWorkflowEntity(self, workflowId: str, key: str) -> Optional[Dict[str, Any]]:
        """Get a specific WorkflowMemory entry by workflow and key."""
        results = self.db.getRecordset(
            WorkflowMemory, recordFilter={"workflowId": workflowId, "key": key}
        )
        return results[0] if results else None

    def deleteWorkflowMemory(self, workflowId: str) -> int:
        """Delete all WorkflowMemory entries for a workflow. Returns the count."""
        entries = self.db.getRecordset(WorkflowMemory, recordFilter={"workflowId": workflowId})
        count = 0
        for entry in entries:
            if self.db.recordDelete(WorkflowMemory, entry["id"]):
                count += 1
        return count

    # =========================================================================
    # Semantic Search
    # =========================================================================

    def _buildScopeFilter(
        self,
        userId: Optional[str] = None,
        featureInstanceId: Optional[str] = None,
        mandateId: Optional[str] = None,
    ) -> dict:
        """Build a simple equality filter from the given identifiers for RAG queries.

        Note: unset identifiers are included as None; callers that need a
        pruned filter should drop the None entries themselves."""
        return {
            "userId": userId,
            "featureInstanceId": featureInstanceId,
            "mandateId": mandateId,
        }

    def _getScopedFileIds(
        self,
        userId: Optional[str] = None,
        featureInstanceId: Optional[str] = None,
        mandateId: Optional[str] = None,
        isSysAdmin: bool = False,
    ) -> List[str]:
        """Collect FileContentIndex IDs visible under the scope union:
        - scope=personal AND userId matches
        - scope=featureInstance AND featureInstanceId matches
        - scope=mandate AND mandateId matches
        - scope=global (only if isSysAdmin)
        """
        cacheKey = f"{userId}:{featureInstanceId}:{mandateId}:{isSysAdmin}"
        if cacheKey in self._scopeCache:
            return self._scopeCache[cacheKey]

        # One recordset query per applicable scope; the results are unioned.
        scopeFilters: List[Dict[str, Any]] = []
        if isSysAdmin:
            scopeFilters.append({"scope": "global"})
        if userId:
            scopeFilters.append({"scope": "personal", "userId": userId})
        if featureInstanceId:
            scopeFilters.append({"scope": "featureInstance", "featureInstanceId": featureInstanceId})
        if mandateId:
            scopeFilters.append({"scope": "mandate", "mandateId": mandateId})

        allIds: set = set()
        for recordFilter in scopeFilters:
            for idx in self.db.getRecordset(FileContentIndex, recordFilter=recordFilter):
                fid = idx.get("id") if isinstance(idx, dict) else getattr(idx, "id", None)
                if fid:
                    allIds.add(fid)

        self._scopeCache[cacheKey] = list(allIds)
        return self._scopeCache[cacheKey]

    def semanticSearch(
        self,
        queryVector: List[float],
        userId: Optional[str] = None,
        featureInstanceId: Optional[str] = None,
        mandateId: Optional[str] = None,
        isShared: Optional[bool] = None,
        scope: Optional[str] = None,
        limit: int = 10,
        minScore: Optional[float] = None,
        contentType: Optional[str] = None,
        isSysAdmin: bool = False,
    ) -> List[Dict[str, Any]]:
        """Semantic search across ContentChunks using pgvector cosine similarity.

        Args:
            queryVector: Query embedding vector.
            userId: Filter by user (Instance Layer).
            featureInstanceId: Filter by feature instance.
            mandateId: Filter by mandate (for Shared Layer lookups).
            isShared: If True, search the Shared Layer via a FileContentIndex join.
            scope: If provided, filter by this specific scope value.
                If not provided, use the scope-union approach
                (personal + featureInstance + mandate + global).
            limit: Max results.
            minScore: Minimum cosine similarity (0.0 - 1.0).
            contentType: Filter by content type (text, image, etc.).
            isSysAdmin: If True, include global-scope files in the scope union.

        Returns:
            List of ContentChunk records with a _score field, sorted by relevance.
        """
        recordFilter = {}
        if contentType:
            recordFilter["contentType"] = contentType

        if scope:
            scopedIndexes = self.db.getRecordset(
                FileContentIndex, recordFilter={"scope": scope}
            )
            fileIds = [
                idx.get("id") if isinstance(idx, dict) else getattr(idx, "id", None)
                for idx in scopedIndexes
            ]
            fileIds = [fid for fid in fileIds if fid]
            if not fileIds:
                return []
            recordFilter["fileId"] = fileIds
        elif isShared and mandateId:
            sharedIndexes = self.db.getRecordset(
                FileContentIndex,
                recordFilter={"mandateId": mandateId, "isShared": True},
            )
            sharedFileIds = [
                idx.get("id") if isinstance(idx, dict) else getattr(idx, "id", None)
                for idx in sharedIndexes
            ]
            sharedFileIds = [fid for fid in sharedFileIds if fid]
            if not sharedFileIds:
                return []
            recordFilter["fileId"] = sharedFileIds
        elif userId or featureInstanceId or mandateId:
            scopedFileIds = self._getScopedFileIds(
                userId=userId,
                featureInstanceId=featureInstanceId,
                mandateId=mandateId,
                isSysAdmin=isSysAdmin,
            )
            if not scopedFileIds:
                return []
            recordFilter["fileId"] = scopedFileIds

        return self.db.semanticSearch(
            modelClass=ContentChunk,
            vectorColumn="embedding",
            queryVector=queryVector,
            limit=limit,
            recordFilter=recordFilter if recordFilter else None,
            minScore=minScore,
        )
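
    # Example (sketch, not executed): a scope-union RAG query. `embedText` is
    # a hypothetical embedding helper matching the vector dimension of the
    # ContentChunk embedding column.
    #
    #   hits = getInterface(user).semanticSearch(
    #       queryVector=embedText("termination clauses in the supplier contract"),
    #       userId=user.id,
    #       featureInstanceId=instanceId,
    #       mandateId=mandateId,
    #       limit=5,
    #       minScore=0.7,
    #   )
    #   for hit in hits:
    #       print(hit["_score"], hit["fileId"])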

    def semanticSearchWorkflowMemory(
        self,
        queryVector: List[float],
        workflowId: str,
        limit: int = 5,
        minScore: Optional[float] = None,
    ) -> List[Dict[str, Any]]:
        """Semantic search across WorkflowMemory entries."""
        return self.db.semanticSearch(
            modelClass=WorkflowMemory,
            vectorColumn="embedding",
            queryVector=queryVector,
            limit=limit,
            recordFilter={"workflowId": workflowId},
            minScore=minScore,
        )

    def getRagStatisticsForInstance(
        self,
        featureInstanceId: str,
        mandateId: str,
        timelineDays: int = 90,
        workspaceFileIds: Optional[List[str]] = None,
    ) -> Dict[str, Any]:
        """Aggregate anonymised RAG / knowledge-store metrics for one workspace instance.

        No file names, user identifiers, or chunk text are returned: only counts
        and distributions suitable for dashboards and presentations.

        workspaceFileIds: optional list of FileItem ids for this feature instance
        (from the Management DB). Earlier index pipelines sometimes stored rows
        with an empty featureInstanceId; linking by file id corrects the statistics.
        """
        if not featureInstanceId:
            return {"error": "featureInstanceId required"}

        ws_ids = [x for x in (workspaceFileIds or []) if x]
        ws_id_set = set(ws_ids)

        # Collect files: instance-scoped rows, mandate-shared rows, and any
        # workspace file ids not already covered by either query.
        files_inst = self.db.getRecordset(
            FileContentIndex,
            recordFilter={"featureInstanceId": featureInstanceId},
        )
        files_shared: List[Dict[str, Any]] = []
        if mandateId:
            files_shared = self.db.getRecordset(
                FileContentIndex,
                recordFilter={"mandateId": mandateId, "isShared": True},
            )

        by_id: Dict[str, Dict[str, Any]] = {}
        for row in files_inst + files_shared:
            rid = row.get("id")
            if rid and rid not in by_id:
                by_id[rid] = row

        for fid in ws_ids:
            if fid in by_id:
                continue
            row = self.getFileContentIndex(fid)
            if row:
                by_id[fid] = row

        files = list(by_id.values())

        # Collect chunks the same way, deduplicated by chunk id.
        chunks_by_id: Dict[str, Dict[str, Any]] = {}
        inst_chunks = self.db.getRecordset(
            ContentChunk,
            recordFilter={"featureInstanceId": featureInstanceId},
        )
        for c in inst_chunks:
            cid = c.get("id")
            if cid:
                chunks_by_id[cid] = c

        for fid in ws_id_set:
            for c in self.getContentChunks(fid):
                cid = c.get("id")
                if cid and cid not in chunks_by_id:
                    chunks_by_id[cid] = c

        covered_file_ids = {c.get("fileId") for c in chunks_by_id.values() if c.get("fileId")}
        for row in files:
            fid = row.get("id")
            if fid and fid not in covered_file_ids:
                for c in self.getContentChunks(fid):
                    cid = c.get("id")
                    if cid and cid not in chunks_by_id:
                        chunks_by_id[cid] = c

        chunks = list(chunks_by_id.values())

        def _mimeCategory(mime: str) -> str:
            m = (mime or "").lower()
            if "pdf" in m:
                return "pdf"
            if "wordprocessing" in m or "msword" in m:
                return "office_doc"
            if "spreadsheet" in m or "excel" in m:
                return "office_sheet"
            if "presentation" in m:
                return "office_slides"
            if m.startswith("text/"):
                return "text"
            if m.startswith("image/"):
                return "image"
            if "html" in m:
                return "html"
            return "other"

        def _utcDay(ts: Any) -> str:
            if ts is None:
                return ""
            try:
                return datetime.fromtimestamp(float(ts), tz=timezone.utc).strftime("%Y-%m-%d")
            except (TypeError, ValueError, OSError):
                return ""

        status_counts: Dict[str, int] = defaultdict(int)
        mime_counts: Dict[str, int] = defaultdict(int)
        extracted_by_day: Dict[str, int] = defaultdict(int)
        total_bytes = 0
        user_ids = set()

        for row in files:
            st = row.get("status") or "unknown"
            status_counts[st] += 1
            mime_counts[_mimeCategory(row.get("mimeType") or "")] += 1
            day = _utcDay(row.get("extractedAt"))
            if day:
                extracted_by_day[day] += 1
            try:
                total_bytes += int(row.get("totalSize") or 0)
            except (TypeError, ValueError):
                pass
            uid = row.get("userId")
            if uid:
                user_ids.add(str(uid))

        content_type_counts: Dict[str, int] = defaultdict(int)
        chunks_with_embedding = 0
        for c in chunks:
            ct = c.get("contentType") or "other"
            content_type_counts[ct] += 1
            emb = c.get("embedding")
            if emb is not None and (
                (isinstance(emb, list) and len(emb) > 0)
                or (isinstance(emb, str) and len(emb) > 10)
            ):
                chunks_with_embedding += 1

        wf_mem = self.db.getRecordset(
            WorkflowMemory,
            recordFilter={"featureInstanceId": featureInstanceId},
        )

        cutoff = datetime.now(timezone.utc) - timedelta(days=max(1, int(timelineDays)))
        cutoff_ts = cutoff.timestamp()

        timeline: List[Dict[str, Any]] = []
        for day in sorted(extracted_by_day.keys()):
            try:
                d = datetime.strptime(day, "%Y-%m-%d").replace(tzinfo=timezone.utc)
            except ValueError:
                continue
            if d.timestamp() >= cutoff_ts:
                timeline.append({"date": day, "indexedDocuments": extracted_by_day[day]})

        if len(timeline) > 120:
            timeline = timeline[-120:]

        total_chunks = len(chunks)
        embedding_pct = round(100.0 * chunks_with_embedding / total_chunks, 1) if total_chunks else 0.0

        return {
            "scope": {
                "featureInstanceId": featureInstanceId,
                "mandateScopedShared": bool(mandateId),
            },
            "kpis": {
                "indexedDocuments": len(files),
                "indexedBytesTotal": total_bytes,
                "contributorUsers": len(user_ids),
                "contentChunks": total_chunks,
                "chunksWithEmbedding": chunks_with_embedding,
                "embeddingCoveragePercent": embedding_pct,
                "workflowEntities": len(wf_mem),
            },
            "indexedDocumentsByStatus": dict(sorted(status_counts.items())),
            "documentsByMimeCategory": dict(sorted(mime_counts.items(), key=lambda x: -x[1])),
            "chunksByContentType": dict(sorted(content_type_counts.items())),
            "timelineIndexedDocuments": timeline,
            "generatedAtUtc": datetime.now(timezone.utc).isoformat(),
        }
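
    # Example (sketch, not executed): feeding a dashboard widget.
    #
    #   stats = getInterface(user).getRagStatisticsForInstance(
    #       featureInstanceId=instanceId,
    #       mandateId=mandateId,
    #       timelineDays=30,
    #   )
    #   coverage = stats["kpis"]["embeddingCoveragePercent"]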


def getInterface(currentUser: Optional[User] = None) -> KnowledgeObjects:
    """Get or create a KnowledgeObjects singleton."""
    if "default" not in _instances:
        _instances["default"] = KnowledgeObjects()

    interface = _instances["default"]
    if currentUser:
        interface.setUserContext(currentUser)

    return interface
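
# Note: the singleton holds a single user context per process. A minimal
# usage sketch, assuming a valid User instance:
#
#   interface = getInterface(currentUser=user)
#   myFiles = interface.getFileContentIndexByUser(user.id)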