gateway/modules/interfaces/interfaceDbKnowledge.py

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Interface to the Knowledge Store database (poweron_knowledge).
Provides CRUD for FileContentIndex, ContentChunk, WorkflowMemory
and semantic search via pgvector.
"""

import logging
from collections import defaultdict
from datetime import datetime, timezone, timedelta
from typing import Dict, Any, List, Optional

from modules.connectors.connectorDbPostgre import _get_cached_connector
from modules.datamodels.datamodelKnowledge import FileContentIndex, ContentChunk, RoundMemory, WorkflowMemory
from modules.datamodels.datamodelUam import User
from modules.shared.configuration import APP_CONFIG
from modules.shared.timeUtils import getUtcTimestamp

logger = logging.getLogger(__name__)

_instances: Dict[str, "KnowledgeObjects"] = {}


class KnowledgeObjects:
    """Interface to the Knowledge Store database.
    Manages FileContentIndex, ContentChunk, and WorkflowMemory with semantic search."""

    def __init__(self):
        self.currentUser: Optional[User] = None
        self.userId: Optional[str] = None
        self._scopeCache: Dict[str, List[str]] = {}
        self._initializeDatabase()

    def _initializeDatabase(self):
        dbHost = APP_CONFIG.get("DB_HOST", "_no_config_default_data")
        dbDatabase = "poweron_knowledge"
        dbUser = APP_CONFIG.get("DB_USER")
        dbPassword = APP_CONFIG.get("DB_PASSWORD_SECRET")
        dbPort = int(APP_CONFIG.get("DB_PORT", 5432))

        self.db = _get_cached_connector(
            dbHost=dbHost,
            dbDatabase=dbDatabase,
            dbUser=dbUser,
            dbPassword=dbPassword,
            dbPort=dbPort,
            userId=self.userId,
        )
        logger.info("Knowledge Store database initialized")

    def setUserContext(self, user: User):
        self.currentUser = user
        self.userId = user.id if user else None
        self._scopeCache = {}
        if self.userId:
            self.db.updateContext(self.userId)

    # =========================================================================
    # FileContentIndex CRUD
    # =========================================================================

    def upsertFileContentIndex(self, index: FileContentIndex) -> Dict[str, Any]:
        """Create or update a FileContentIndex entry."""
        data = index.model_dump()
        existing = self.db._loadRecord(FileContentIndex, index.id)
        if existing:
            return self.db.recordModify(FileContentIndex, index.id, data)
        return self.db.recordCreate(FileContentIndex, data)

    def getFileContentIndex(self, fileId: str) -> Optional[Dict[str, Any]]:
        """Get a FileContentIndex by file ID."""
        return self.db._loadRecord(FileContentIndex, fileId)

    def getFileContentIndexByUser(
        self, userId: str, featureInstanceId: str = None
    ) -> List[Dict[str, Any]]:
        """Get all FileContentIndex entries for a user."""
        recordFilter = {"userId": userId}
        if featureInstanceId:
            recordFilter["featureInstanceId"] = featureInstanceId
        return self.db.getRecordset(FileContentIndex, recordFilter=recordFilter)

    def updateFileStatus(self, fileId: str, status: str) -> bool:
        """Update the processing status of a FileContentIndex."""
        existing = self.db._loadRecord(FileContentIndex, fileId)
        if not existing:
            return False
        self.db.recordModify(FileContentIndex, fileId, {"status": status})
        return True

    def deleteFileContentIndex(self, fileId: str) -> bool:
        """Delete a FileContentIndex and all associated ContentChunks."""
        existing = self.getFileContentIndex(fileId)
        mandateId = (existing or {}).get("mandateId") or ""
        chunks = self.db.getRecordset(ContentChunk, recordFilter={"fileId": fileId})
        for chunk in chunks:
            self.db.recordDelete(ContentChunk, chunk["id"])
        ok = self.db.recordDelete(FileContentIndex, fileId)
        if ok and mandateId:
            try:
                from modules.interfaces.interfaceDbBilling import _getRootInterface

                _getRootInterface().reconcileMandateStorageBilling(str(mandateId))
            except Exception as ex:
                logger.warning("reconcileMandateStorageBilling after delete failed: %s", ex)
        return ok

    # =========================================================================
    # ContentChunk CRUD
    # =========================================================================

    def upsertContentChunk(self, chunk: ContentChunk) -> Dict[str, Any]:
        """Create or update a ContentChunk."""
        data = chunk.model_dump()
        existing = self.db._loadRecord(ContentChunk, chunk.id)
        if existing:
            return self.db.recordModify(ContentChunk, chunk.id, data)
        return self.db.recordCreate(ContentChunk, data)

    def upsertContentChunks(self, chunks: List[ContentChunk]) -> int:
        """Batch upsert multiple ContentChunks. Returns count of upserted chunks."""
        count = 0
        for chunk in chunks:
            self.upsertContentChunk(chunk)
            count += 1
        return count

    def getContentChunks(self, fileId: str) -> List[Dict[str, Any]]:
        """Get all ContentChunks for a file."""
        return self.db.getRecordset(ContentChunk, recordFilter={"fileId": fileId})

    def deleteContentChunks(self, fileId: str) -> int:
        """Delete all ContentChunks for a file. Returns count of deleted chunks."""
        chunks = self.db.getRecordset(ContentChunk, recordFilter={"fileId": fileId})
        count = 0
        for chunk in chunks:
            if self.db.recordDelete(ContentChunk, chunk["id"]):
                count += 1
        return count

    # =========================================================================
    # RoundMemory CRUD
    # =========================================================================

    def storeRoundMemory(self, memory: RoundMemory) -> Dict[str, Any]:
        """Create or update a RoundMemory entry (upsert by id)."""
        data = memory.model_dump()
        existing = self.db._loadRecord(RoundMemory, memory.id)
        if existing:
            return self.db.recordModify(RoundMemory, memory.id, data)
        return self.db.recordCreate(RoundMemory, data)

    def getRoundMemories(self, workflowId: str) -> List[Dict[str, Any]]:
        """Get all RoundMemory entries for a workflow, sorted by roundNumber."""
        records = self.db.getRecordset(RoundMemory, recordFilter={"workflowId": workflowId})
        records.sort(key=lambda r: r.get("roundNumber", 0))
        return records

    def getRoundMemoriesByType(
        self, workflowId: str, memoryType: str
    ) -> List[Dict[str, Any]]:
        """Get RoundMemory entries filtered by type (e.g. 'file_ref')."""
        return self.db.getRecordset(
            RoundMemory, recordFilter={"workflowId": workflowId, "memoryType": memoryType}
        )

    def semanticSearchRoundMemory(
        self,
        queryVector: List[float],
        workflowId: str,
        limit: int = 10,
        minScore: float = None,
    ) -> List[Dict[str, Any]]:
        """Semantic search across RoundMemory entries for a workflow."""
        return self.db.semanticSearch(
            modelClass=RoundMemory,
            vectorColumn="embedding",
            queryVector=queryVector,
            limit=limit,
            recordFilter={"workflowId": workflowId},
            minScore=minScore,
        )

    def deleteRoundMemories(self, workflowId: str) -> int:
        """Delete all RoundMemory entries for a workflow. Returns count."""
        entries = self.db.getRecordset(RoundMemory, recordFilter={"workflowId": workflowId})
        count = 0
        for entry in entries:
            if self.db.recordDelete(RoundMemory, entry["id"]):
                count += 1
        return count

    # =========================================================================
    # WorkflowMemory CRUD
    # =========================================================================

    def upsertWorkflowMemory(self, memory: WorkflowMemory) -> Dict[str, Any]:
        """Create or update a WorkflowMemory entry."""
        data = memory.model_dump()
        existing = self.db._loadRecord(WorkflowMemory, memory.id)
        if existing:
            return self.db.recordModify(WorkflowMemory, memory.id, data)
        return self.db.recordCreate(WorkflowMemory, data)

    def getWorkflowEntities(self, workflowId: str) -> List[Dict[str, Any]]:
        """Get all WorkflowMemory entries for a workflow."""
        return self.db.getRecordset(WorkflowMemory, recordFilter={"workflowId": workflowId})

    def getWorkflowEntity(self, workflowId: str, key: str) -> Optional[Dict[str, Any]]:
        """Get a specific WorkflowMemory entry by workflow and key."""
        results = self.db.getRecordset(
            WorkflowMemory, recordFilter={"workflowId": workflowId, "key": key}
        )
        return results[0] if results else None

    def deleteWorkflowMemory(self, workflowId: str) -> int:
        """Delete all WorkflowMemory entries for a workflow. Returns count."""
        entries = self.db.getRecordset(WorkflowMemory, recordFilter={"workflowId": workflowId})
        count = 0
        for entry in entries:
            if self.db.recordDelete(WorkflowMemory, entry["id"]):
                count += 1
        return count

    # =========================================================================
    # Semantic Search
    # =========================================================================

    def _buildScopeFilter(self, userId: str = None, featureInstanceId: str = None, mandateId: str = None) -> dict:
        """Build a scope-aware filter for RAG queries.
        Returns a filter dict that includes records visible to this user context."""
        return {
            "userId": userId,
            "featureInstanceId": featureInstanceId,
            "mandateId": mandateId,
        }

    def _getScopedFileIds(self, userId: str = None, featureInstanceId: str = None, mandateId: str = None, isSysAdmin: bool = False) -> List[str]:
        """Collect FileContentIndex IDs visible under the scope union:
        - scope=personal AND userId matches
        - scope=featureInstance AND featureInstanceId matches
        - scope=mandate AND mandateId matches
        - scope=global (only if isSysAdmin)
        """
        _cacheKey = f"{userId}:{featureInstanceId}:{mandateId}:{isSysAdmin}"
        if _cacheKey in self._scopeCache:
            return self._scopeCache[_cacheKey]

        allIds: set = set()

        if isSysAdmin:
            globalIndexes = self.db.getRecordset(
                FileContentIndex, recordFilter={"scope": "global"}
            )
            for idx in globalIndexes:
                fid = idx.get("id") if isinstance(idx, dict) else getattr(idx, "id", None)
                if fid:
                    allIds.add(fid)

        if userId:
            personalIndexes = self.db.getRecordset(
                FileContentIndex, recordFilter={"scope": "personal", "userId": userId}
            )
            for idx in personalIndexes:
                fid = idx.get("id") if isinstance(idx, dict) else getattr(idx, "id", None)
                if fid:
                    allIds.add(fid)

        if featureInstanceId:
            instanceIndexes = self.db.getRecordset(
                FileContentIndex, recordFilter={"scope": "featureInstance", "featureInstanceId": featureInstanceId}
            )
            for idx in instanceIndexes:
                fid = idx.get("id") if isinstance(idx, dict) else getattr(idx, "id", None)
                if fid:
                    allIds.add(fid)

        if mandateId:
            mandateIndexes = self.db.getRecordset(
                FileContentIndex, recordFilter={"scope": "mandate", "mandateId": mandateId}
            )
            for idx in mandateIndexes:
                fid = idx.get("id") if isinstance(idx, dict) else getattr(idx, "id", None)
                if fid:
                    allIds.add(fid)

        self._scopeCache[_cacheKey] = list(allIds)
        return self._scopeCache[_cacheKey]

    def semanticSearch(
        self,
        queryVector: List[float],
        userId: str = None,
        featureInstanceId: str = None,
        mandateId: str = None,
        scope: str = None,
        limit: int = 10,
        minScore: float = None,
        contentType: str = None,
        isSysAdmin: bool = False,
    ) -> List[Dict[str, Any]]:
        """Semantic search across ContentChunks using pgvector cosine similarity.

        Args:
            queryVector: Query embedding vector.
            userId: Filter by user (personal scope).
            featureInstanceId: Filter by feature instance.
            mandateId: Filter by mandate (scope=mandate means visible to all mandate users).
            scope: If provided, filter by this specific scope value.
                   If not provided, use scope-union approach (personal + featureInstance + mandate + global).
            limit: Max results.
            minScore: Minimum cosine similarity (0.0 - 1.0).
            contentType: Filter by content type (text, image, etc.).

        Returns:
            List of ContentChunk records with _score field, sorted by relevance.
        """
        recordFilter = {}
        if contentType:
            recordFilter["contentType"] = contentType

        if scope:
            scopeFilter: Dict[str, Any] = {"scope": scope}
            if mandateId:
                scopeFilter["mandateId"] = mandateId
            if featureInstanceId:
                scopeFilter["featureInstanceId"] = featureInstanceId
            scopedFileIds = self.db.getRecordset(
                FileContentIndex, recordFilter=scopeFilter
            )
            fileIds = [
                idx.get("id") if isinstance(idx, dict) else getattr(idx, "id", None)
                for idx in scopedFileIds
            ]
            fileIds = [fid for fid in fileIds if fid]
            if not fileIds:
                return []
            recordFilter["fileId"] = fileIds
        elif userId or featureInstanceId or mandateId:
            scopedFileIds = self._getScopedFileIds(
                userId=userId,
                featureInstanceId=featureInstanceId,
                mandateId=mandateId,
                isSysAdmin=isSysAdmin,
            )
            if not scopedFileIds:
                return []
            recordFilter["fileId"] = scopedFileIds

        return self.db.semanticSearch(
            modelClass=ContentChunk,
            vectorColumn="embedding",
            queryVector=queryVector,
            limit=limit,
            recordFilter=recordFilter if recordFilter else None,
            minScore=minScore,
        )

    def semanticSearchWorkflowMemory(
        self,
        queryVector: List[float],
        workflowId: str,
        limit: int = 5,
        minScore: float = None,
    ) -> List[Dict[str, Any]]:
        """Semantic search across WorkflowMemory entries."""
        return self.db.semanticSearch(
            modelClass=WorkflowMemory,
            vectorColumn="embedding",
            queryVector=queryVector,
            limit=limit,
            recordFilter={"workflowId": workflowId},
            minScore=minScore,
        )

    def getRagStatisticsForInstance(
        self,
        featureInstanceId: str,
        mandateId: str,
        timelineDays: int = 90,
        workspaceFileIds: Optional[List[str]] = None,
    ) -> Dict[str, Any]:
        """Aggregate anonymised RAG / knowledge-store metrics for one workspace instance.

        No file names, user identifiers, or chunk text are returned — only counts and
        distributions suitable for dashboards and presentations.

        workspaceFileIds: optional list of FileItem ids for this feature instance (from Management DB).
        Index pipelines often stored rows with empty featureInstanceId; linking by file id fixes stats.
        """
        if not featureInstanceId:
            return {"error": "featureInstanceId required"}

        ws_ids = [x for x in (workspaceFileIds or []) if x]
        ws_id_set = set(ws_ids)

        files_inst = self.db.getRecordset(
            FileContentIndex,
            recordFilter={"featureInstanceId": featureInstanceId},
        )
        files_shared: List[Dict[str, Any]] = []
        if mandateId:
            files_shared = self.db.getRecordset(
                FileContentIndex,
                recordFilter={"mandateId": mandateId, "scope": "mandate"},
            )

        by_id: Dict[str, Dict[str, Any]] = {}
        for row in files_inst + files_shared:
            rid = row.get("id")
            if rid and rid not in by_id:
                by_id[rid] = row

        for fid in ws_ids:
            if fid in by_id:
                continue
            row = self.getFileContentIndex(fid)
            if row:
                by_id[fid] = row

        files = list(by_id.values())

        chunks_by_id: Dict[str, Dict[str, Any]] = {}
        inst_chunks = self.db.getRecordset(
            ContentChunk,
            recordFilter={"featureInstanceId": featureInstanceId},
        )
        for c in inst_chunks:
            cid = c.get("id")
            if cid:
                chunks_by_id[cid] = c

        for fid in ws_id_set:
            for c in self.getContentChunks(fid):
                cid = c.get("id")
                if cid and cid not in chunks_by_id:
                    chunks_by_id[cid] = c

        covered_file_ids = {c.get("fileId") for c in chunks_by_id.values() if c.get("fileId")}
        for row in files:
            fid = row.get("id")
            if fid and fid not in covered_file_ids:
                for c in self.getContentChunks(fid):
                    cid = c.get("id")
                    if cid and cid not in chunks_by_id:
                        chunks_by_id[cid] = c

        chunks = list(chunks_by_id.values())

        def _mimeCategory(mime: str) -> str:
            m = (mime or "").lower()
            if "pdf" in m:
                return "pdf"
            if "wordprocessing" in m or "msword" in m or "officedocument.wordprocessing" in m:
                return "office_doc"
            if "spreadsheet" in m or "excel" in m or "officedocument.spreadsheet" in m:
                return "office_sheet"
            if "presentation" in m or "officedocument.presentation" in m:
                return "office_slides"
            if m.startswith("text/"):
                return "text"
            if m.startswith("image/"):
                return "image"
            if "html" in m:
                return "html"
            return "other"

        def _utcDay(ts: Any) -> str:
            if ts is None:
                return ""
            try:
                return datetime.fromtimestamp(float(ts), tz=timezone.utc).strftime("%Y-%m-%d")
            except (TypeError, ValueError, OSError):
                return ""

        status_counts: Dict[str, int] = defaultdict(int)
        mime_counts: Dict[str, int] = defaultdict(int)
        extracted_by_day: Dict[str, int] = defaultdict(int)
        total_bytes = 0
        user_ids = set()

        for row in files:
            st = row.get("status") or "unknown"
            status_counts[st] += 1
            mime_counts[_mimeCategory(row.get("mimeType") or "")] += 1
            day = _utcDay(row.get("extractedAt"))
            if day:
                extracted_by_day[day] += 1
            try:
                total_bytes += int(row.get("totalSize") or 0)
            except (TypeError, ValueError):
                pass
            uid = row.get("userId")
            if uid:
                user_ids.add(str(uid))

        content_type_counts: Dict[str, int] = defaultdict(int)
        chunks_with_embedding = 0
        for c in chunks:
            ct = c.get("contentType") or "other"
            content_type_counts[ct] += 1
            emb = c.get("embedding")
            if emb is not None and (
                (isinstance(emb, list) and len(emb) > 0)
                or (isinstance(emb, str) and len(emb) > 10)
            ):
                chunks_with_embedding += 1

        wf_mem = self.db.getRecordset(
            WorkflowMemory,
            recordFilter={"featureInstanceId": featureInstanceId},
        )

        cutoff = datetime.now(timezone.utc) - timedelta(days=max(1, int(timelineDays)))
        cutoff_ts = cutoff.timestamp()

        timeline: List[Dict[str, Any]] = []
        for day in sorted(extracted_by_day.keys()):
            try:
                d = datetime.strptime(day, "%Y-%m-%d").replace(tzinfo=timezone.utc)
            except ValueError:
                continue
            if d.timestamp() >= cutoff_ts:
                timeline.append({"date": day, "indexedDocuments": extracted_by_day[day]})

        if len(timeline) > 120:
            timeline = timeline[-120:]

        total_chunks = len(chunks)
        embedding_pct = round(100.0 * chunks_with_embedding / total_chunks, 1) if total_chunks else 0.0

        sorted_files = sorted(
            files,
            key=lambda r: float(r.get("extractedAt") or 0),
            reverse=True,
        )
        recently_indexed = []
        for row in sorted_files[:10]:
            recently_indexed.append({
                "fileName": row.get("fileName", ""),
                "mimeType": row.get("mimeType", ""),
                "status": row.get("status", "unknown"),
                "extractedAt": row.get("extractedAt"),
                "totalSize": row.get("totalSize", 0),
            })

        return {
            "scope": {
                "featureInstanceId": featureInstanceId,
                "mandateScopedShared": bool(mandateId),
            },
            "kpis": {
                "indexedDocuments": len(files),
                "indexedBytesTotal": total_bytes,
                "contributorUsers": len(user_ids),
                "contentChunks": total_chunks,
                "chunksWithEmbedding": chunks_with_embedding,
                "embeddingCoveragePercent": embedding_pct,
                "workflowEntities": len(wf_mem),
            },
            "indexedDocumentsByStatus": dict(sorted(status_counts.items())),
            "documentsByMimeCategory": dict(sorted(mime_counts.items(), key=lambda x: -x[1])),
            "chunksByContentType": dict(sorted(content_type_counts.items())),
            "timelineIndexedDocuments": timeline,
            "recentlyIndexedDocuments": recently_indexed,
            "generatedAtUtc": datetime.now(timezone.utc).isoformat(),
        }


def aggregateMandateRagTotalBytes(mandateId: str) -> int:
    """Sum FileContentIndex.totalSize for a mandate.

    Primary strategy (relies on correct scope fields on FileContentIndex):
      1. FileContentIndex rows with mandateId on the index
      2. FileContentIndex rows with featureInstanceId of any mandate FeatureInstance
    Deduplicates by id.
    """
    if not mandateId:
        return 0
    from modules.datamodels.datamodelFeatures import FeatureInstance
    from modules.interfaces.interfaceDbApp import getRootInterface

    knowDb = getInterface(None).db
    appDb = getRootInterface().db
    byId: Dict[str, Dict[str, Any]] = {}

    for row in knowDb.getRecordset(FileContentIndex, recordFilter={"mandateId": mandateId}):
        rid = row.get("id")
        if rid:
            byId[str(rid)] = row

    instances = appDb.getRecordset(FeatureInstance, recordFilter={"mandateId": mandateId})
    instIds = [str(inst.get("id", "")) for inst in instances if inst.get("id")]

    for instId in instIds:
        for row in knowDb.getRecordset(FileContentIndex, recordFilter={"featureInstanceId": instId}):
            rid = row.get("id")
            if rid and str(rid) not in byId:
                byId[str(rid)] = row

    # DEPRECATED: file-ID-correlation fallback from poweron_management.
    # Only needed for pre-migration data where mandateId/featureInstanceId on the
    # FileContentIndex are empty. Safe to remove once all environments are migrated.
    _fallbackCount = 0
    try:
        from modules.datamodels.datamodelFiles import FileItem
        from modules.interfaces.interfaceDbManagement import ComponentObjects
        mgmtDb = ComponentObjects().db
        knowledgeIf = getInterface(None)

        fileIds: set = set()
        for f in mgmtDb.getRecordset(FileItem, recordFilter={"mandateId": mandateId}):
            fid = f.get("id") if isinstance(f, dict) else getattr(f, "id", None)
            if fid:
                fileIds.add(str(fid))
        for instId in instIds:
            for f in mgmtDb.getRecordset(FileItem, recordFilter={"featureInstanceId": instId}):
                fid = f.get("id") if isinstance(f, dict) else getattr(f, "id", None)
                if fid:
                    fileIds.add(str(fid))

        for fid in fileIds:
            if fid in byId:
                continue
            row = knowledgeIf.getFileContentIndex(fid)
            if row:
                byId[fid] = row
                _fallbackCount += 1
    except Exception as e:
        logger.warning("aggregateMandateRagTotalBytes fallback failed: %s", e)

    total = sum(int(r.get("totalSize") or 0) for r in byId.values())
    logger.info(
        "aggregateMandateRagTotalBytes(%s): %d indexes, %d bytes (fallback: %d)",
        mandateId, len(byId), total, _fallbackCount,
    )
    return total


def getInterface(currentUser: Optional[User] = None) -> KnowledgeObjects:
    """Get or create a KnowledgeObjects singleton."""
    if "default" not in _instances:
        _instances["default"] = KnowledgeObjects()

    interface = _instances["default"]
    if currentUser:
        interface.setUserContext(currentUser)

    return interface