gateway/modules/serviceCenter/services/serviceKnowledge/mainServiceKnowledge.py
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""Knowledge service: 3-tier RAG with indexing, semantic search, and context building."""
import logging
import re
from typing import Any, Callable, Dict, List, Optional
from modules.datamodels.datamodelKnowledge import (
FileContentIndex, ContentChunk, WorkflowMemory,
)
from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum
from modules.interfaces.interfaceDbKnowledge import getInterface as getKnowledgeInterface
from modules.shared.timeUtils import getUtcTimestamp
logger = logging.getLogger(__name__)
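# Sizing heuristics: roughly 4 characters per token, ~400-token chunks for
# embedding, and a 12k-character default budget for assembled agent context.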
CHARS_PER_TOKEN = 4
DEFAULT_CHUNK_TOKENS = 400
DEFAULT_CONTEXT_BUDGET = 12000
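# Illustrative usage (a sketch; the concrete ``context`` object and the
# ``get_service`` accessor are supplied by the surrounding service center):
#
#     knowledge = KnowledgeService(context, getService)
#     await knowledge.indexFile(
#         fileId="file-1", fileName="report.pdf", mimeType="application/pdf",
#         userId="user-1", contentObjects=extractedObjects,
#     )
#     ragContext = await knowledge.buildAgentContext(
#         currentPrompt=prompt, workflowId=workflowId, userId="user-1",
#     )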
class KnowledgeService:
"""Service for Knowledge Store operations: indexing, retrieval, and context building."""
def __init__(self, context, get_service: Callable[[str], Any]):
self._context = context
self._getService = get_service
self._knowledgeDb = getKnowledgeInterface(context.user)
# =========================================================================
# Embedding helper
# =========================================================================
async def _embed(self, texts: List[str]) -> List[List[float]]:
"""Embed texts via AiService (respects allowedProviders)."""
aiService = self._getService("ai")
response = await aiService.callEmbedding(texts)
if response.errorCount > 0:
logger.error(f"Embedding failed: {response.content}")
return []
return (response.metadata or {}).get("embeddings", [])
async def _embedSingle(self, text: str) -> List[float]:
"""Embed a single text. Returns empty list on failure."""
results = await self._embed([text])
return results[0] if results else []
# =========================================================================
# File Indexing (called after extraction, before embedding)
# =========================================================================
async def indexFile(
self,
fileId: str,
fileName: str,
mimeType: str,
userId: str,
featureInstanceId: str = "",
mandateId: str = "",
        contentObjects: Optional[List[Dict[str, Any]]] = None,
        structure: Optional[Dict[str, Any]] = None,
        containerPath: Optional[str] = None,
) -> FileContentIndex:
"""Index a file's content objects and create embeddings for text chunks.
This is the main entry point after non-AI extraction has produced content objects.
Args:
fileId: The file ID.
fileName: Original file name.
mimeType: MIME type.
userId: Owner user.
featureInstanceId: Feature instance scope.
mandateId: Mandate scope.
contentObjects: List of extracted content objects, each with keys:
contentType (str), data (str), contextRef (dict), contentObjectId (str).
structure: Structural overview of the file.
containerPath: Path within container if applicable.
Returns:
The created FileContentIndex.
"""
contentObjects = contentObjects or []
# 1. Create FileContentIndex
index = FileContentIndex(
id=fileId,
userId=userId,
featureInstanceId=featureInstanceId,
mandateId=mandateId,
fileName=fileName,
mimeType=mimeType,
containerPath=containerPath,
totalObjects=len(contentObjects),
totalSize=sum(len(obj.get("data", "").encode("utf-8")) for obj in contentObjects),
structure=structure or {},
objectSummary=[
{
"id": obj.get("contentObjectId", ""),
"type": obj.get("contentType", "other"),
"size": len(obj.get("data", "").encode("utf-8")),
"ref": obj.get("contextRef", {}),
}
for obj in contentObjects
],
status="extracted",
)
self._knowledgeDb.upsertFileContentIndex(index)
# 2. Chunk text content objects and create embeddings
textObjects = [o for o in contentObjects if o.get("contentType") == "text"]
# Read FileItem attributes for index metadata and neutralization
_shouldNeutralize = False
try:
from modules.datamodels.datamodelFiles import FileItem as _FileItem
_dbComponent = getattr(self._context, 'interfaceDbComponent', None)
_fileRecords = _dbComponent.getRecordset(_FileItem, recordFilter={"id": fileId}) if _dbComponent else []
if _fileRecords:
_fileRecord = _fileRecords[0]
_get = (lambda k, d=None: _fileRecord.get(k, d)) if isinstance(_fileRecord, dict) else (lambda k, d=None: getattr(_fileRecord, k, d))
_shouldNeutralize = bool(_get("neutralize", False))
_fileScope = _get("scope")
if _fileScope:
index.scope = _fileScope
_fileCreatedBy = _get("_createdBy")
if _fileCreatedBy:
index.userId = str(_fileCreatedBy)
except Exception:
pass
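        # Fail-safe: when neutralization is required, only text objects that were
        # successfully neutralized are indexed; failures are skipped rather than
        # stored with raw content.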
if _shouldNeutralize and textObjects:
_neutralizedObjects = []
try:
_neutralSvc = self._getService("neutralization")
except Exception:
_neutralSvc = None
if _neutralSvc:
for _obj in textObjects:
_textContent = (_obj.get("data", "") or "").strip()
if not _textContent:
continue
try:
_neutralResult = _neutralSvc.processText(
_textContent, userId=userId, featureInstanceId=featureInstanceId
)
if _neutralResult and _neutralResult.get("neutralized_text"):
_obj["data"] = _neutralResult["neutralized_text"]
_neutralizedObjects.append(_obj)
else:
logger.warning(f"Neutralization failed for file {fileId}, skipping text object (fail-safe)")
except Exception as e:
logger.warning(f"Neutralization error for file {fileId}: {e}, skipping text object (fail-safe)")
textObjects = _neutralizedObjects
else:
logger.warning(f"Neutralization required for file {fileId} but service unavailable, skipping text indexing")
textObjects = []
if textObjects:
self._knowledgeDb.updateFileStatus(fileId, "embedding")
chunks = _chunkForEmbedding(textObjects, maxTokens=DEFAULT_CHUNK_TOKENS)
texts = [c["data"] for c in chunks]
totalChars = sum(len(t) for t in texts)
estTokens = totalChars // CHARS_PER_TOKEN
logger.info(
f"Embedding file {fileId}: {len(textObjects)} text objects -> "
f"{len(chunks)} chunks, ~{estTokens} tokens total"
)
embeddings = await self._embed(texts) if texts else []
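            # _embed returns an empty list on failure; chunks beyond that point
            # are stored without an embedding.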
for i, chunk in enumerate(chunks):
embedding = embeddings[i] if i < len(embeddings) else None
contentChunk = ContentChunk(
contentObjectId=chunk["contentObjectId"],
fileId=fileId,
userId=userId,
featureInstanceId=featureInstanceId,
contentType="text",
data=chunk["data"],
contextRef=chunk["contextRef"],
embedding=embedding,
)
self._knowledgeDb.upsertContentChunk(contentChunk)
# 3. Store non-text content objects (images, etc.) without embedding
nonTextObjects = [o for o in contentObjects if o.get("contentType") != "text"]
for obj in nonTextObjects:
contentChunk = ContentChunk(
contentObjectId=obj.get("contentObjectId", ""),
fileId=fileId,
userId=userId,
featureInstanceId=featureInstanceId,
contentType=obj.get("contentType", "other"),
data=obj.get("data", ""),
contextRef=obj.get("contextRef", {}),
embedding=None,
)
self._knowledgeDb.upsertContentChunk(contentChunk)
self._knowledgeDb.updateFileStatus(fileId, "indexed")
index.status = "indexed"
if _shouldNeutralize:
try:
index.neutralizationStatus = "completed"
index.isNeutralized = True
self._knowledgeDb.upsertFileContentIndex(index)
except Exception as e:
logger.debug(f"Could not set neutralizationStatus for file {fileId}: {e}")
logger.info(f"Indexed file {fileId} ({fileName}): {len(contentObjects)} objects, {len(textObjects)} text chunks")
return index
# =========================================================================
# RAG Context Building (3-tier search)
# =========================================================================
async def buildAgentContext(
self,
currentPrompt: str,
workflowId: str,
userId: str,
featureInstanceId: str = "",
mandateId: str = "",
contextBudget: int = DEFAULT_CONTEXT_BUDGET,
        workflowHintItems: Optional[List[Dict[str, Any]]] = None,
isSysAdmin: bool = False,
) -> str:
"""Build RAG context for an agent round by searching all layers.
Layer priority:
0 - File refs from RoundMemory (always included so the agent knows
which files exist in this workflow)
1 - Instance documents (user's own indexed files)
1.5 - Semantically relevant RoundMemory entries
2 - Workflow entities
3 - Shared knowledge
4 - Cross-workflow hint (other conversations in this workspace)
Args:
currentPrompt: The current user prompt to find relevant context for.
workflowId: Current workflow ID.
userId: Current user.
featureInstanceId: Feature instance scope.
mandateId: Mandate scope.
contextBudget: Maximum characters for the context string.
            workflowHintItems: Optional pre-built list of other workflow summaries
                for the cross-workflow hint layer.
            isSysAdmin: Whether the caller has system-admin visibility; forwarded
                to the semantic search scope filters.
Returns:
Formatted context string for injection into the agent's system prompt.
"""
queryVector = await self._embedSingle(currentPrompt)
if not queryVector:
return ""
builder = _ContextBuilder(budget=contextBudget)
# Layer 0: File references from RoundMemory (always included)
fileRefMemories = self._knowledgeDb.getRoundMemoriesByType(workflowId, "file_ref")
if fileRefMemories:
refItems = [
{"key": m.get("key", ""), "value": m.get("summary", "")[:300]}
for m in fileRefMemories
]
builder.add(
priority=0,
label="Known Files",
items=refItems,
isKeyValue=True,
maxChars=2000,
)
# Layer 1: Scope-based document search (personal + instance + mandate + global)
instanceChunks = self._knowledgeDb.semanticSearch(
queryVector=queryVector,
userId=userId,
featureInstanceId=featureInstanceId,
mandateId=mandateId,
limit=15,
minScore=0.65,
isSysAdmin=isSysAdmin,
)
if instanceChunks:
builder.add(priority=1, label="Relevant Documents", items=instanceChunks, maxChars=4000)
# Layer 1.5: Semantically relevant RoundMemory entries
roundMemories = self._knowledgeDb.semanticSearchRoundMemory(
queryVector=queryVector,
workflowId=workflowId,
limit=10,
minScore=0.55,
)
if roundMemories:
memItems = []
for m in roundMemories:
data = m.get("fullData") or m.get("summary", "")
memItems.append({
"data": data,
"contextRef": {
"type": m.get("memoryType", ""),
"key": m.get("key", ""),
"round": m.get("roundNumber", 0),
},
})
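            # Skip entries already covered by the always-included file_ref layer above.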
seen = {m.get("key") for m in fileRefMemories} if fileRefMemories else set()
memItems = [
mi for mi in memItems if mi["contextRef"].get("key") not in seen
]
if memItems:
builder.add(
priority=2,
label="Previous Round Context",
items=memItems,
maxChars=4000,
)
# Layer 2: Workflow Layer (current workflow entities & memory)
entities = self._knowledgeDb.getWorkflowEntities(workflowId)
if entities:
builder.add(priority=3, label="Workflow Context", items=entities, isKeyValue=True, maxChars=2000)
# Layer 3: Shared Layer (mandate-wide shared documents)
sharedChunks = self._knowledgeDb.semanticSearch(
queryVector=queryVector,
mandateId=mandateId,
isShared=True,
limit=10,
minScore=0.7,
isSysAdmin=isSysAdmin,
)
if sharedChunks:
builder.add(priority=4, label="Shared Knowledge", items=sharedChunks, maxChars=2000)
# Layer 4: Cross-workflow hint (other conversations in this workspace)
if workflowHintItems:
builder.add(
priority=5,
label="Other Conversations",
items=workflowHintItems,
isKeyValue=True,
maxChars=500,
)
return builder.build()
# =========================================================================
# Workflow Memory
# =========================================================================
async def storeEntity(
self,
workflowId: str,
userId: str,
featureInstanceId: str,
key: str,
value: str,
source: str = "extraction",
) -> WorkflowMemory:
"""Store a key-value entity in workflow memory with optional embedding."""
embedding = await self._embedSingle(f"{key}: {value}")
memory = WorkflowMemory(
workflowId=workflowId,
userId=userId,
featureInstanceId=featureInstanceId,
key=key,
value=value,
source=source,
embedding=embedding if embedding else None,
)
self._knowledgeDb.upsertWorkflowMemory(memory)
return memory
def getEntities(self, workflowId: str) -> List[Dict[str, Any]]:
"""Get all entities for a workflow."""
return self._knowledgeDb.getWorkflowEntities(workflowId)
# =========================================================================
# File Status
# =========================================================================
def getFileStatus(self, fileId: str) -> Optional[str]:
"""Get the indexing status of a file."""
index = self._knowledgeDb.getFileContentIndex(fileId)
return index.get("status") if index else None
def isFileIndexed(self, fileId: str) -> bool:
"""Check if a file has been fully indexed."""
return self.getFileStatus(fileId) == "indexed"
# =========================================================================
# On-Demand Extraction (Smart Document Handling)
# =========================================================================
async def readSection(self, fileId: str, sectionId: str) -> List[Dict[str, Any]]:
"""Read content objects for a specific section. Uses cache if available.
Args:
fileId: Source file ID.
sectionId: Section identifier from the FileContentIndex structure.
Returns:
List of content object dicts with data and contextRef.
"""
cached = self._knowledgeDb.getContentChunks(fileId)
sectionChunks = [
c for c in (cached or [])
if (c.get("contextRef", {}).get("sectionId") == sectionId)
]
if sectionChunks:
return sectionChunks
index = self._knowledgeDb.getFileContentIndex(fileId)
if not index:
return []
structure = index.get("structure", {}) if isinstance(index, dict) else getattr(index, "structure", {})
sections = structure.get("sections", [])
section = next((s for s in sections if s.get("id") == sectionId), None)
if not section:
return []
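        # Cache miss: extract the section's page range on demand and cache the result.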
startPage = section.get("startPage", 0)
endPage = section.get("endPage", startPage)
return await self._extractPagesOnDemand(fileId, startPage, endPage, sectionId)
async def readContentObjects(
        self, fileId: str, filter: Optional[Dict[str, Any]] = None
) -> List[Dict[str, Any]]:
"""Read content objects with optional filters (pageIndex, contentType, sectionId).
Args:
fileId: Source file ID.
filter: Optional dict with keys pageIndex (list[int]), contentType (str), sectionId (str).
Returns:
Filtered list of content chunk dicts.
"""
filter = filter or {}
chunks = self._knowledgeDb.getContentChunks(fileId) or []
if "pageIndex" in filter:
targetPages = filter["pageIndex"]
if isinstance(targetPages, int):
targetPages = [targetPages]
chunks = [
c for c in chunks
if c.get("contextRef", {}).get("pageIndex") in targetPages
]
if "contentType" in filter:
chunks = [c for c in chunks if c.get("contentType") == filter["contentType"]]
if "sectionId" in filter:
chunks = [
c for c in chunks
if c.get("contextRef", {}).get("sectionId") == filter["sectionId"]
]
return chunks
async def extractContainerItem(
self, fileId: str, containerPath: str
) -> Optional[Dict[str, Any]]:
"""On-demand extraction of a specific item within a container.
        If the item is already indexed for this containerPath, the existing index
        data is returned; otherwise the request is only logged and None is returned.
Args:
fileId: The container file ID.
containerPath: Path within the container (e.g. "folder/report.pdf").
Returns:
FileContentIndex dict for the extracted item, or None.
"""
existing = self._knowledgeDb.getFileContentIndex(fileId)
if existing:
existingPath = existing.get("containerPath") if isinstance(existing, dict) else getattr(existing, "containerPath", None)
if existingPath == containerPath:
return existing
logger.info(f"On-demand extraction for {containerPath} in file {fileId}")
return None
async def _extractPagesOnDemand(
self, fileId: str, startPage: int, endPage: int, sectionId: str
) -> List[Dict[str, Any]]:
"""Extract specific pages from a file and cache in knowledge store."""
try:
chatService = self._getService("chat")
fileContent = chatService.getFileContent(fileId)
if not fileContent:
return []
fileData = fileContent.get("data", b"")
mimeType = fileContent.get("mimeType", "")
fileName = fileContent.get("fileName", "")
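            # File content may arrive base64-encoded as a string; decode to raw bytes.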
if isinstance(fileData, str):
import base64
fileData = base64.b64decode(fileData)
if mimeType != "application/pdf":
return []
try:
import fitz
except ImportError:
return []
doc = fitz.open(stream=fileData, filetype="pdf")
results = []
for pageIdx in range(startPage, min(endPage + 1, len(doc))):
page = doc[pageIdx]
text = page.get_text() or ""
if not text.strip():
continue
chunk = ContentChunk(
contentObjectId=f"page-{pageIdx}",
fileId=fileId,
userId=self._context.user.id if self._context.user else "",
featureInstanceId=self._context.feature_instance_id or "",
contentType="text",
data=text,
contextRef={
"containerPath": fileName,
"location": f"page:{pageIdx+1}",
"pageIndex": pageIdx,
"sectionId": sectionId,
},
)
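                # Embed only the first 2000 characters of the page to keep the call bounded.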
embedding = await self._embedSingle(text[:2000])
if embedding:
chunk.embedding = embedding
self._knowledgeDb.upsertContentChunk(chunk)
results.append(chunk.model_dump())
doc.close()
return results
except Exception as e:
logger.error(f"On-demand page extraction failed: {e}")
return []
def getFileContentIndex(self, fileId: str) -> Optional[Dict[str, Any]]:
"""Get the FileContentIndex for a file."""
return self._knowledgeDb.getFileContentIndex(fileId)
# =============================================================================
# Internal helpers
# =============================================================================
def _estimateTokens(text: str) -> int:
"""Estimate token count using character-based heuristic (1 token ~ 4 chars)."""
return max(1, len(text) // CHARS_PER_TOKEN)
def _splitSentences(text: str) -> List[str]:
"""Split text into sentences at common boundaries (.!?) followed by whitespace."""
parts = re.split(r'(?<=[.!?])\s+', text.replace("\n", " ").strip())
return [p for p in parts if p.strip()]
def _hardSplitByTokens(text: str, maxTokens: int) -> List[str]:
"""Force-split text into pieces that each fit within maxTokens.
Used as safety net when sentence splitting produces oversized segments.
Splits at word boundaries where possible.
"""
maxChars = maxTokens * CHARS_PER_TOKEN
pieces = []
while len(text) > maxChars:
splitAt = text.rfind(" ", 0, maxChars)
if splitAt <= 0:
splitAt = maxChars
pieces.append(text[:splitAt].strip())
text = text[splitAt:].strip()
if text:
pieces.append(text)
return pieces
def _chunkForEmbedding(
textObjects: List[Dict[str, Any]], maxTokens: int = DEFAULT_CHUNK_TOKENS
) -> List[Dict[str, Any]]:
"""Split text content objects into token-aware chunks suitable for embedding.
Each chunk preserves the contextRef from its source object.
Splits at sentence boundaries; applies hard-cap if a single sentence exceeds maxTokens.
"""
chunks = []
for obj in textObjects:
text = (obj.get("data", "") or "").strip()
if not text:
continue
contentObjectId = obj.get("contentObjectId", "")
contextRef = obj.get("contextRef", {})
if _estimateTokens(text) <= maxTokens:
chunks.append({"data": text, "contentObjectId": contentObjectId, "contextRef": contextRef})
continue
sentences = _splitSentences(text)
currentChunk = ""
for sentence in sentences:
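            # A single sentence can exceed the chunk size; flush the current chunk,
            # then hard-split the oversized sentence at word boundaries.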
if _estimateTokens(sentence) > maxTokens:
if currentChunk.strip():
chunks.append({"data": currentChunk.strip(), "contentObjectId": contentObjectId, "contextRef": contextRef})
currentChunk = ""
for piece in _hardSplitByTokens(sentence, maxTokens):
chunks.append({"data": piece, "contentObjectId": contentObjectId, "contextRef": contextRef})
continue
candidate = f"{currentChunk} {sentence}".strip() if currentChunk else sentence
if _estimateTokens(candidate) > maxTokens:
if currentChunk.strip():
chunks.append({"data": currentChunk.strip(), "contentObjectId": contentObjectId, "contextRef": contextRef})
currentChunk = sentence
else:
currentChunk = candidate
if currentChunk.strip():
chunks.append({"data": currentChunk.strip(), "contentObjectId": contentObjectId, "contextRef": contextRef})
return chunks
class _ContextBuilder:
"""Assembles RAG context from multiple sources respecting a character budget."""
def __init__(self, budget: int):
self._budget = budget
self._sections: List[Dict[str, Any]] = []
def add(
self,
priority: int,
label: str,
items: List[Dict[str, Any]],
isKeyValue: bool = False,
maxChars: int = 0,
):
self._sections.append({
"priority": priority,
"label": label,
"items": items,
"isKeyValue": isKeyValue,
"maxChars": maxChars,
})
def build(self) -> str:
self._sections.sort(key=lambda s: s["priority"])
parts = []
remaining = self._budget
for section in self._sections:
if remaining <= 0:
break
sectionCap = section.get("maxChars") or remaining
sectionRemaining = min(sectionCap, remaining)
header = f"### {section['label']}\n"
sectionText = header
sectionRemaining -= len(header)
for item in section["items"]:
if sectionRemaining <= 0:
break
if section["isKeyValue"]:
line = f"- {item.get('key', '')}: {item.get('value', '')}\n"
else:
data = item.get("data", "")
ref = item.get("contextRef", {})
refStr = f" [{ref}]" if ref else ""
line = f"{data}{refStr}\n"
if len(line) <= sectionRemaining:
sectionText += line
sectionRemaining -= len(line)
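            # Charge only what this section actually emitted against the global budget.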
consumed = min(sectionCap, remaining) - sectionRemaining
remaining -= consumed
parts.append(sectionText)
return "\n".join(parts).strip()