# gateway/modules/features/commcoach/serviceCommcoachIndexer.py
# (viewer metadata: 2026-04-29 14:39:40 +02:00 · 235 lines · 8 KiB · Python)
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
CommCoach Session Indexer.
Indexes coaching session data into the knowledge store (pgvector) for RAG-based long-term memory.
Called after session completion to ensure semantic searchability across 20+ sessions.
"""
import logging
import uuid
import json
from typing import List, Dict, Any, Optional
# Module-level logger, stdlib convention (one logger per module).
logger = logging.getLogger(__name__)
# Prefix for the synthetic "file" id under which a session is ingested
# into the knowledge store (see syntheticFileId in indexSessionData).
_COACHING_FILE_PREFIX = "coaching-session:"
async def indexSessionData(
    sessionId: str,
    contextId: str,
    userId: str,
    featureInstanceId: str,
    mandateId: str,
    messages: List[Dict[str, Any]],
    summary: Optional[str],
    keyTopics: Optional[str],
    goals: Optional[List[Any]],
    insights: Optional[List[Any]],
    tasks: Optional[List[Dict[str, Any]]],
    contextTitle: str = "",
    knowledgeService=None,
):
    """Index a completed coaching session into the knowledge store.

    Creates ContentChunks with embeddings for:
    - Each User+Assistant message pair (maximum detail depth)
    - Session summary
    - Key topics (individually, for precise retrieval)
    - Current goals
    - New insights
    - Tasks (open + done)

    Args:
        sessionId: Session identifier; every chunk id is derived from it.
        contextId: Coaching context the session belongs to.
        userId: Owner of the ingested content.
        featureInstanceId: Feature instance scoping the ingestion job.
        mandateId: Mandate scoping the ingestion job.
        messages: Transcript as dicts with "role" and "content" keys.
        summary: Optional free-text session summary.
        keyTopics: Key topics as a list or a JSON-encoded list string.
        goals: Goal entries; dicts with a "text" key or plain values.
        insights: Insight entries; dicts with a "text" key or plain values.
        tasks: Task dicts with "status" and "title" keys.
        contextTitle: Human-readable title embedded into each chunk text.
        knowledgeService: Service exposing ``requestIngestion``; when absent
            the call is a no-op (logged as a warning).

    Ingestion failures are logged and swallowed: indexing is best-effort
    and must never break session completion.
    """
    if not knowledgeService:
        logger.warning("No knowledge service available for coaching indexer")
        return
    syntheticFileId = f"{_COACHING_FILE_PREFIX}{sessionId}"

    def _chunk(idSuffix: str, data: str, location: str, chunkType: str) -> Dict[str, Any]:
        # All chunks share an identical contextRef scaffold; only the object
        # id, payload text, location and type differ between sections.
        return {
            "contentObjectId": f"{sessionId}:{idSuffix}",
            "data": data,
            "contextRef": {
                "containerPath": f"session:{sessionId}",
                "location": location,
                "type": chunkType,
                "contextId": contextId,
                "sessionId": sessionId,
                "contextTitle": contextTitle,
            },
        }

    chunks: List[Dict[str, Any]] = []

    # 1. Message pairs (User + Assistant) as individual chunks
    for idx, pair in enumerate(_extractMessagePairs(messages)):
        chunks.append(_chunk(
            f"msg-pair:{idx}",
            pair["text"],
            f"message-pair-{idx}",
            "coaching-message-pair",
        ))

    # 2. Session summary
    if summary:
        chunks.append(_chunk(
            "summary",
            f"Session-Zusammenfassung ({contextTitle}): {summary}",
            "summary",
            "coaching-session-summary",
        ))

    # 3. Key topics (each as separate chunk for precise retrieval)
    for tidx, topic in enumerate(_parseJsonSafe(keyTopics, [])):
        topicStr = str(topic).strip()
        if topicStr:
            chunks.append(_chunk(
                f"topic:{tidx}",
                f"Coaching-Thema ({contextTitle}): {topicStr}",
                f"topic-{tidx}",
                "coaching-key-topic",
            ))

    # 4. Goals (one bullet-list chunk)
    if goals:
        goalTexts = [g.get("text", g) if isinstance(g, dict) else str(g) for g in goals if g]
        if goalTexts:
            goalsStr = "\n".join(f"- {g}" for g in goalTexts)
            chunks.append(_chunk(
                "goals",
                f"Coaching-Ziele ({contextTitle}):\n{goalsStr}",
                "goals",
                "coaching-goals",
            ))

    # 5. Insights (one bullet-list chunk)
    if insights:
        insightTexts = [i.get("text", i) if isinstance(i, dict) else str(i) for i in insights if i]
        if insightTexts:
            insightsStr = "\n".join(f"- {t}" for t in insightTexts)
            chunks.append(_chunk(
                "insights",
                f"Coaching-Erkenntnisse ({contextTitle}):\n{insightsStr}",
                "insights",
                "coaching-insights",
            ))

    # 6. Tasks (open + done, one line per titled task)
    if tasks:
        taskLines = [
            f"- [{t.get('status', 'open')}] {t.get('title', '')}"
            for t in tasks
            if t.get("title", "")
        ]
        if taskLines:
            chunks.append(_chunk(
                "tasks",
                f"Coaching-Aufgaben ({contextTitle}):\n" + "\n".join(taskLines),
                "tasks",
                "coaching-tasks",
            ))

    if not chunks:
        logger.info("No chunks to index for session %s", sessionId)
        return

    logger.info("Indexing %d chunks for coaching session %s", len(chunks), sessionId)
    try:
        contentObjects = [
            {
                "contentObjectId": c["contentObjectId"],
                "contentType": "text",
                "data": c["data"],
                "contextRef": c["contextRef"],
            }
            for c in chunks
        ]
        # Imported here rather than at module top — presumably to avoid an
        # import cycle / hard startup dependency; confirm before moving it.
        from modules.serviceCenter.services.serviceKnowledge.mainServiceKnowledge import IngestionJob
        await knowledgeService.requestIngestion(
            IngestionJob(
                sourceKind="coaching_session",
                sourceId=syntheticFileId,
                fileName=f"coaching-session-{sessionId[:8]}",
                mimeType="application/x-coaching-session",
                userId=userId,
                featureInstanceId=featureInstanceId,
                mandateId=mandateId,
                contentObjects=contentObjects,
                provenance={
                    "lane": "feature",
                    "feature": "commcoach",
                    "sessionId": sessionId,
                    "contextId": contextId,
                    "messageCount": len(messages or []),
                },
            )
        )
        logger.info("Successfully indexed coaching session %s (%d chunks)", sessionId, len(chunks))
    except Exception as e:
        # Best-effort contract: never let an indexing failure propagate.
        logger.error("Failed to index coaching session %s: %s", sessionId, e, exc_info=True)
def _extractMessagePairs(messages: List[Dict[str, Any]]) -> List[Dict[str, str]]:
"""Extract User+Assistant pairs from message list."""
pairs = []
i = 0
while i < len(messages):
msg = messages[i]
if msg.get("role") == "user":
userText = (msg.get("content") or "").strip()
assistantText = ""
if i + 1 < len(messages) and messages[i + 1].get("role") == "assistant":
assistantText = (messages[i + 1].get("content") or "").strip()
i += 2
else:
i += 1
if userText:
text = f"Benutzer: {userText}"
if assistantText:
text += f"\nCoach: {assistantText}"
pairs.append({"text": text})
else:
i += 1
return pairs
def _parseJsonSafe(value, fallback):
if not value:
return fallback
if isinstance(value, (list, dict)):
return value
try:
return json.loads(value)
except (json.JSONDecodeError, TypeError):
return fallback