# Copyright (c) 2025 Patrick Motsch
# All rights reserved.

"""
CommCoach Session Indexer.

Indexes coaching session data into the knowledge store (pgvector) for RAG-based long-term memory.

Called after session completion to ensure semantic searchability across 20+ sessions.
"""

import logging
import uuid
import json
from typing import List, Dict, Any, Optional

logger = logging.getLogger(__name__)

_COACHING_FILE_PREFIX = "coaching-session:"
async def indexSessionData(
    sessionId: str,
    contextId: str,
    userId: str,
    featureInstanceId: str,
    mandateId: str,
    messages: List[Dict[str, Any]],
    summary: Optional[str],
    keyTopics: Optional[str],
    goals: Optional[List[Any]],
    insights: Optional[List[Any]],
    tasks: Optional[List[Dict[str, Any]]],
    contextTitle: str = "",
    knowledgeService=None,
):
    """Index a completed coaching session into the knowledge store.

    Creates ContentChunks with embeddings for:
    - Each User+Assistant message pair (maximum detail depth)
    - Session summary
    - Key topics (individually, for precise retrieval)
    - Current goals
    - New insights
    - Tasks (open + done)

    All chunks share one synthetic file id derived from ``sessionId``, so the
    session can be (re)indexed as a single unit via ``knowledgeService.indexFile``.

    Args:
        sessionId: Id of the completed coaching session; used in all chunk ids.
        contextId: Owning context id, stored in each chunk's contextRef.
        userId / featureInstanceId / mandateId: Tenancy scope passed through
            to the knowledge service.
        messages: Chat transcript as role/content dicts.
        summary: Optional session summary text.
        keyTopics: Optional JSON string (or pre-parsed list) of topic strings.
        goals / insights: Optional lists of strings or ``{"text": ...}`` dicts.
        tasks: Optional list of dicts with ``status``/``title`` keys.
        contextTitle: Human-readable context title embedded in chunk payloads.
        knowledgeService: Service exposing ``indexFile``; no-op when absent.

    Returns:
        None. Indexing failures are logged, never raised to the caller.
    """
    if not knowledgeService:
        logger.warning("No knowledge service available for coaching indexer")
        return

    syntheticFileId = f"{_COACHING_FILE_PREFIX}{sessionId}"

    def _chunk(objectSuffix: str, data: str, location: str, chunkType: str) -> Dict[str, Any]:
        # Every chunk shares the same session-scoped contextRef; only the
        # object id suffix, payload text, location and type differ.
        return {
            "contentObjectId": f"{sessionId}:{objectSuffix}",
            "data": data,
            "contextRef": {
                "containerPath": f"session:{sessionId}",
                "location": location,
                "type": chunkType,
                "contextId": contextId,
                "sessionId": sessionId,
                "contextTitle": contextTitle,
            },
        }

    chunks: List[Dict[str, Any]] = []

    # 1. Message pairs (User + Assistant) as individual chunks
    for idx, pair in enumerate(_extractMessagePairs(messages)):
        chunks.append(
            _chunk(f"msg-pair:{idx}", pair["text"], f"message-pair-{idx}", "coaching-message-pair")
        )

    # 2. Session summary
    if summary:
        chunks.append(
            _chunk(
                "summary",
                f"Session-Zusammenfassung ({contextTitle}): {summary}",
                "summary",
                "coaching-session-summary",
            )
        )

    # 3. Key topics (each as separate chunk for precise retrieval)
    for tidx, topic in enumerate(_parseJsonSafe(keyTopics, [])):
        topicStr = str(topic).strip()
        if topicStr:
            chunks.append(
                _chunk(
                    f"topic:{tidx}",
                    f"Coaching-Thema ({contextTitle}): {topicStr}",
                    f"topic-{tidx}",
                    "coaching-key-topic",
                )
            )

    # 4. Goals — accept plain strings or {"text": ...} dicts
    if goals:
        goalTexts = [g.get("text", g) if isinstance(g, dict) else str(g) for g in goals if g]
        if goalTexts:
            goalsStr = "\n".join(f"- {g}" for g in goalTexts)
            chunks.append(
                _chunk("goals", f"Coaching-Ziele ({contextTitle}):\n{goalsStr}", "goals", "coaching-goals")
            )

    # 5. Insights — same shape tolerance as goals
    if insights:
        insightTexts = [i.get("text", i) if isinstance(i, dict) else str(i) for i in insights if i]
        if insightTexts:
            insightsStr = "\n".join(f"- {t}" for t in insightTexts)
            chunks.append(
                _chunk(
                    "insights",
                    f"Coaching-Erkenntnisse ({contextTitle}):\n{insightsStr}",
                    "insights",
                    "coaching-insights",
                )
            )

    # 6. Tasks — one combined chunk listing "[status] title" per task
    if tasks:
        taskLines = []
        for t in tasks:
            status = t.get("status", "open")
            title = t.get("title", "")
            if title:
                taskLines.append(f"- [{status}] {title}")
        if taskLines:
            tasksStr = "\n".join(taskLines)
            chunks.append(
                _chunk("tasks", f"Coaching-Aufgaben ({contextTitle}):\n{tasksStr}", "tasks", "coaching-tasks")
            )

    if not chunks:
        logger.info("No chunks to index for session %s", sessionId)
        return

    logger.info("Indexing %d chunks for coaching session %s", len(chunks), sessionId)

    try:
        contentObjects = [
            {
                "contentObjectId": c["contentObjectId"],
                "contentType": "text",
                "data": c["data"],
                "contextRef": c["contextRef"],
            }
            for c in chunks
        ]

        await knowledgeService.indexFile(
            fileId=syntheticFileId,
            fileName=f"coaching-session-{sessionId[:8]}",
            mimeType="application/x-coaching-session",
            userId=userId,
            featureInstanceId=featureInstanceId,
            mandateId=mandateId,
            contentObjects=contentObjects,
        )
        logger.info("Successfully indexed coaching session %s (%d chunks)", sessionId, len(chunks))
    except Exception:
        # Best-effort: indexing must never break session completion upstream.
        logger.exception("Failed to index coaching session %s", sessionId)
def _extractMessagePairs(messages: List[Dict[str, Any]]) -> List[Dict[str, str]]:
|
|
"""Extract User+Assistant pairs from message list."""
|
|
pairs = []
|
|
i = 0
|
|
while i < len(messages):
|
|
msg = messages[i]
|
|
if msg.get("role") == "user":
|
|
userText = (msg.get("content") or "").strip()
|
|
assistantText = ""
|
|
if i + 1 < len(messages) and messages[i + 1].get("role") == "assistant":
|
|
assistantText = (messages[i + 1].get("content") or "").strip()
|
|
i += 2
|
|
else:
|
|
i += 1
|
|
if userText:
|
|
text = f"Benutzer: {userText}"
|
|
if assistantText:
|
|
text += f"\nCoach: {assistantText}"
|
|
pairs.append({"text": text})
|
|
else:
|
|
i += 1
|
|
return pairs
|
|
|
|
|
|
def _parseJsonSafe(value, fallback):
|
|
if not value:
|
|
return fallback
|
|
if isinstance(value, (list, dict)):
|
|
return value
|
|
try:
|
|
return json.loads(value)
|
|
except (json.JSONDecodeError, TypeError):
|
|
return fallback
|