From 12273247031d0941a45ab81097dc104618864a7f Mon Sep 17 00:00:00 2001
From: patrick-motsch
Date: Wed, 18 Feb 2026 17:50:31 +0100
Subject: [PATCH] feat: differential transcript, fix bot name in WS handler, AI
commands support
Co-authored-by: Cursor
---
.../features/teamsbot/datamodelTeamsbot.py | 9 +-
.../teamsbot/interfaceFeatureTeamsbot.py | 4 +
.../features/teamsbot/routeFeatureTeamsbot.py | 27 ++-
modules/features/teamsbot/service.py | 207 +++++++++++++++---
modules/services/serviceAi/mainServiceAi.py | 12 +-
5 files changed, 220 insertions(+), 39 deletions(-)
diff --git a/modules/features/teamsbot/datamodelTeamsbot.py b/modules/features/teamsbot/datamodelTeamsbot.py
index 737f360e..dd823c2e 100644
--- a/modules/features/teamsbot/datamodelTeamsbot.py
+++ b/modules/features/teamsbot/datamodelTeamsbot.py
@@ -234,12 +234,19 @@ class TeamsbotConfigUpdateRequest(BaseModel):
# SPEECH_TEAMS AI Response Model
# ============================================================================
+class TeamsbotCommand(BaseModel):
+ """A structured command the AI can issue to control Teams meeting actions."""
+ action: str = Field(description="Command action: toggleTranscript, sendChat, readAloud, changeLanguage, toggleMic, toggleCamera")
+ params: Optional[Dict[str, Any]] = Field(default=None, description="Action-specific parameters")
+
+
class SpeechTeamsResponse(BaseModel):
"""Structured response from the SPEECH_TEAMS AI handler."""
shouldRespond: bool = Field(description="Whether the bot should respond")
responseText: Optional[str] = Field(default=None, description="The bot's response text (only if shouldRespond=True)")
reasoning: str = Field(default="", description="Reasoning for the decision (for logging/debug)")
- detectedIntent: str = Field(default="none", description="Detected intent: addressed, question, proactive, none")
+ detectedIntent: str = Field(default="none", description="Detected intent: addressed, question, proactive, stop, none")
+ commands: Optional[List[TeamsbotCommand]] = Field(default=None, description="Optional list of commands to execute (e.g. toggle transcript, send chat, change language)")
# ============================================================================
diff --git a/modules/features/teamsbot/interfaceFeatureTeamsbot.py b/modules/features/teamsbot/interfaceFeatureTeamsbot.py
index 3a2caf47..4640b45b 100644
--- a/modules/features/teamsbot/interfaceFeatureTeamsbot.py
+++ b/modules/features/teamsbot/interfaceFeatureTeamsbot.py
@@ -148,6 +148,10 @@ class TeamsbotObjects:
transcriptData["creationDate"] = getIsoTimestamp()
return self.db.recordCreate(TeamsbotTranscript, transcriptData)
+ def updateTranscript(self, transcriptId: str, updates: Dict[str, Any]) -> Optional[Dict[str, Any]]:
+ """Update an existing transcript segment (used for differential writing)."""
+ return self.db.recordModify(TeamsbotTranscript, transcriptId, updates)
+
def _deleteTranscriptsBySession(self, sessionId: str) -> int:
"""Delete all transcripts for a session."""
records = self.db.getRecordset(TeamsbotTranscript, recordFilter={"sessionId": sessionId})
diff --git a/modules/features/teamsbot/routeFeatureTeamsbot.py b/modules/features/teamsbot/routeFeatureTeamsbot.py
index a441d461..abac3ef0 100644
--- a/modules/features/teamsbot/routeFeatureTeamsbot.py
+++ b/modules/features/teamsbot/routeFeatureTeamsbot.py
@@ -1110,8 +1110,33 @@ async def botWebsocket(
logger.warning(f"Could not load original user {startedByUserId}, falling back to system user")
originalUser = systemUser
+ # Build effective config with the session's actual bot name.
+ # The session stores the resolved bot name (from system bot or user override).
+ # Without this, the default config botName (e.g. "AI Assistant") is used,
+ # which is wrong for registered system bots.
+ sessionBotName = session.get("botName") if session else None
+ if sessionBotName:
+ config = config.model_copy(update={"botName": sessionBotName})
+ logger.info(f"Browser Bot WebSocket: Using session botName '{sessionBotName}' (not default '{_getInstanceConfig(instanceId).botName}')")
+
+ # Also merge user-specific settings if available
+ if startedByUserId:
+ interface = interfaceDb.getInterface(originalUser, mandateId=mandateId, featureInstanceId=instanceId)
+ userSettings = interface.getUserSettings(startedByUserId, instanceId)
+ if userSettings:
+ overrides = {}
+ for field in ["aiSystemPrompt", "responseMode", "responseChannel", "transferMode",
+ "language", "voiceId", "triggerIntervalSeconds", "triggerCooldownSeconds",
+ "contextWindowSegments"]:
+ value = userSettings.get(field)
+ if value is not None:
+ overrides[field] = value
+ if overrides:
+ config = config.model_copy(update=overrides)
+ logger.info(f"Browser Bot WebSocket: Applied user settings overrides: {list(overrides.keys())}")
+
service = TeamsbotService(originalUser, mandateId, instanceId, config)
- logger.info(f"Browser Bot WebSocket service created: session={sessionId}, mandateId={mandateId}, user={originalUser.id}")
+ logger.info(f"Browser Bot WebSocket service created: session={sessionId}, mandateId={mandateId}, user={originalUser.id}, botName={config.botName}")
await service.handleBotWebSocket(websocket, sessionId)
except WebSocketDisconnect:
diff --git a/modules/features/teamsbot/service.py b/modules/features/teamsbot/service.py
index 5457758e..6482c073 100644
--- a/modules/features/teamsbot/service.py
+++ b/modules/features/teamsbot/service.py
@@ -27,6 +27,7 @@ from .datamodelTeamsbot import (
TeamsbotResponseMode,
TeamsbotResponseChannel,
SpeechTeamsResponse,
+ TeamsbotCommand,
)
from .browserBotConnector import BrowserBotConnector
@@ -81,6 +82,12 @@ class TeamsbotService:
self._sessionContext: Optional[str] = None # User-provided background context
self._contextSummary: Optional[str] = None # AI-generated summary of long context
+ # Differential transcript tracking: only write new text, update existing
+ # record when the same speaker continues speaking
+ self._lastTranscriptSpeaker: Optional[str] = None
+ self._lastTranscriptText: Optional[str] = None
+ self._lastTranscriptId: Optional[str] = None
+
# =========================================================================
# Session Lifecycle
# =========================================================================
@@ -438,61 +445,99 @@ class TeamsbotService:
websocket: WebSocket,
source: str = "caption",
):
- """Process a transcript segment from captions or chat messages."""
+ """Process a transcript segment from captions or chat messages.
+
+ Differential writing: When the same speaker continues (text grows
+ incrementally as captions stream), we UPDATE the existing DB record
+ instead of creating a cascade of near-duplicate rows. A new record
+ is only created when the speaker changes or the text is not a
+ continuation of the previous segment.
+ """
text = text.strip()
if not text:
return
# Filter out the bot's own speech from AI triggering.
- # The bot hears itself via captions — these should be stored in the
- # transcript for the record, but must NOT trigger AI analysis (feedback loop).
isBotSpeaker = self._isBotSpeaker(speaker)
- # Store transcript segment
- transcriptData = TeamsbotTranscript(
- sessionId=sessionId,
- speaker=speaker,
- text=text,
- timestamp=getIsoTimestamp(),
- confidence=1.0, # Captions don't have confidence scores
- language=self.config.language,
- isFinal=isFinal,
- ).model_dump()
+ # Differential transcript writing:
+ # If the same speaker is still talking and the new text is a
+ # continuation (starts with the previous text), UPDATE the existing
+ # record instead of creating a new one. This avoids cascading rows like:
+ # "Der AHV"
+ # "Der AHV Fonds"
+ # "Der AHV Fonds hat 2025"
+ # and instead keeps a single row that grows until the speaker changes.
+        isContinuation = bool(  # and-chain yields None/"" on a falsy operand; the SSE payload needs a real bool
+            self._lastTranscriptSpeaker == speaker
+            and self._lastTranscriptText
+            and self._lastTranscriptId
+            and text.startswith(self._lastTranscriptText)
+            and source == "caption"  # only for captions, chat messages are always new
+        )
- createdTranscript = interface.createTranscript(transcriptData)
+ if isContinuation:
+ interface.updateTranscript(self._lastTranscriptId, {
+ "text": text,
+ "isFinal": isFinal,
+ })
+ self._lastTranscriptText = text
+ createdTranscript = {"id": self._lastTranscriptId}
- # Update context buffer (mark source for chat messages)
- self._contextBuffer.append({
- "speaker": speaker or "Unknown",
- "text": text,
- "timestamp": getUtcTimestamp(),
- "source": source,
- })
- # Keep only last N segments
- maxSegments = self.config.contextWindowSegments
- if len(self._contextBuffer) > maxSegments:
- # When buffer overflows, summarize the older half to preserve context
- # without losing information. The summary replaces the old segments.
- if not self._contextSummary and len(self._contextBuffer) > maxSegments * 1.5:
- asyncio.create_task(self._summarizeContextBuffer(sessionId))
- self._contextBuffer = self._contextBuffer[-maxSegments:]
+            # Update context buffer: replace last entry for same speaker (entries store speaker or "Unknown", so compare alike)
+            if self._contextBuffer and self._contextBuffer[-1].get("speaker") == (speaker or "Unknown"):
+                self._contextBuffer[-1]["text"] = text
+ else:
+ # New speaker or non-continuation → create a new record
+ transcriptData = TeamsbotTranscript(
+ sessionId=sessionId,
+ speaker=speaker,
+ text=text,
+ timestamp=getIsoTimestamp(),
+ confidence=1.0,
+ language=self.config.language,
+ isFinal=isFinal,
+ ).model_dump()
- # Emit SSE event for live transcript
+ createdTranscript = interface.createTranscript(transcriptData)
+
+ # Track for differential writing
+ self._lastTranscriptSpeaker = speaker
+ self._lastTranscriptText = text
+ self._lastTranscriptId = createdTranscript.get("id")
+
+ # Append to context buffer
+ self._contextBuffer.append({
+ "speaker": speaker or "Unknown",
+ "text": text,
+ "timestamp": getUtcTimestamp(),
+ "source": source,
+ })
+
+ # Keep only last N segments
+ maxSegments = self.config.contextWindowSegments
+ if len(self._contextBuffer) > maxSegments:
+ if not self._contextSummary and len(self._contextBuffer) > maxSegments * 1.5:
+ asyncio.create_task(self._summarizeContextBuffer(sessionId))
+ self._contextBuffer = self._contextBuffer[-maxSegments:]
+
+ # Update session transcript count (only for new records)
+ session = interface.getSession(sessionId)
+ if session:
+ count = session.get("transcriptSegmentCount", 0) + 1
+ interface.updateSession(sessionId, {"transcriptSegmentCount": count})
+
+ # Emit SSE event for live transcript (always, for UI updates)
await _emitSessionEvent(sessionId, "transcript", {
"id": createdTranscript.get("id"),
"speaker": speaker,
"text": text,
"confidence": 1.0,
"timestamp": getIsoTimestamp(),
+ "isContinuation": isContinuation,
})
- # Update session transcript count
- session = interface.getSession(sessionId)
- if session:
- count = session.get("transcriptSegmentCount", 0) + 1
- interface.updateSession(sessionId, {"transcriptSegmentCount": count})
-
# Skip AI analysis for bot's own speech (prevents feedback loop)
if isBotSpeaker:
logger.debug(f"Session {sessionId}: Skipping AI trigger for bot's own speech: [{speaker}] {text[:60]}...")
@@ -805,10 +850,100 @@ class TeamsbotService:
logger.info(f"Bot responded in session {sessionId}: intent={speechResult.detectedIntent}")
+ # Step 5: Execute AI-issued commands (if any)
+ if speechResult.commands:
+ await self._executeCommands(sessionId, speechResult.commands, voiceInterface, websocket)
+
except Exception as e:
logger.error(f"SPEECH_TEAMS analysis failed for session {sessionId}: {type(e).__name__}: {e}", exc_info=True)
await _emitSessionEvent(sessionId, "error", {"message": f"AI analysis failed: {type(e).__name__}: {str(e)}"})
+ # =========================================================================
+ # AI Command Execution
+ # =========================================================================
+
+ async def _executeCommands(
+ self,
+ sessionId: str,
+ commands: List[TeamsbotCommand],
+ voiceInterface,
+ websocket: WebSocket,
+ ):
+ """Execute structured commands returned by the AI.
+
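+        toggleTranscript/toggleMic/toggleCamera are sent to the browser bot as 'botCommand' messages
+        (the bot's TeamsActionsService handles the Teams UI interaction); sendChat goes out as
+        'sendChatMessage', readAloud as TTS 'playAudio'; changeLanguage only updates self.config and emits 'languageChanged'.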
+ """
+ for cmd in commands:
+ action = cmd.action
+ params = cmd.params or {}
+ logger.info(f"Session {sessionId}: Executing command '{action}' with params {params}")
+
+ try:
+ if action == "toggleTranscript":
+ enable = params.get("enable", True)
+ if websocket:
+ await websocket.send_text(json.dumps({
+ "type": "botCommand",
+ "sessionId": sessionId,
+ "command": "toggleTranscript",
+ "params": {"enable": enable},
+ }))
+
+ elif action == "sendChat":
+ chatText = params.get("text", "")
+ if chatText and websocket:
+ await websocket.send_text(json.dumps({
+ "type": "sendChatMessage",
+ "sessionId": sessionId,
+ "text": chatText,
+ }))
+
+ elif action == "readAloud":
+ readText = params.get("text", "")
+ if readText and voiceInterface:
+ ttsResult = await voiceInterface.textToSpeech(
+ text=readText,
+ languageCode=self.config.language,
+ voiceName=self.config.voiceId,
+ )
+ if ttsResult and isinstance(ttsResult, dict):
+ audioContent = ttsResult.get("audioContent")
+ if audioContent and websocket:
+ await websocket.send_text(json.dumps({
+ "type": "playAudio",
+ "sessionId": sessionId,
+ "audio": {
+ "data": base64.b64encode(
+ audioContent if isinstance(audioContent, bytes) else audioContent.encode()
+ ).decode(),
+ "format": "mp3",
+ },
+ }))
+
+ elif action == "changeLanguage":
+ newLang = params.get("language", "")
+ if newLang:
+ self.config = self.config.model_copy(update={"language": newLang})
+ logger.info(f"Session {sessionId}: Language changed to '{newLang}'")
+ await _emitSessionEvent(sessionId, "languageChanged", {"language": newLang})
+
+ elif action in ("toggleMic", "toggleCamera"):
+ if websocket:
+ await websocket.send_text(json.dumps({
+ "type": "botCommand",
+ "sessionId": sessionId,
+ "command": action,
+ "params": params,
+ }))
+
+ else:
+ logger.warning(f"Session {sessionId}: Unknown command '{action}'")
+
+ except Exception as cmdErr:
+ logger.warning(f"Session {sessionId}: Command '{action}' failed: {cmdErr}")
+
# =========================================================================
# Context Summarization (for long sessions)
# =========================================================================
diff --git a/modules/services/serviceAi/mainServiceAi.py b/modules/services/serviceAi/mainServiceAi.py
index ab45b49f..3537fe43 100644
--- a/modules/services/serviceAi/mainServiceAi.py
+++ b/modules/services/serviceAi/mainServiceAi.py
@@ -379,12 +379,22 @@ shouldRespond auf false. Du musst NICHT antworten wenn jemand dich stoppt."""
basePrompt += f"""
+KOMMANDOS: Du kannst optionale Aktions-Kommandos ausfuehren lassen.
+Verfuegbare Kommandos (im "commands" Array):
+- {{"action": "toggleTranscript", "params": {{"enable": true/false}}}} — Transkription ein-/ausschalten
+- {{"action": "sendChat", "params": {{"text": "Nachricht"}}}} — Zusaetzliche Nachricht in den Chat schreiben (unabhaengig von responseText)
+- {{"action": "readAloud", "params": {{"text": "Text zum Vorlesen"}}}} — Einen bestimmten Text vorlesen (unabhaengig von responseText)
+- {{"action": "changeLanguage", "params": {{"language": "en-US"}}}} — Kommunikationssprache aendern (z.B. "de-DE", "en-US", "fr-FR")
+
+Verwende Kommandos NUR wenn explizit darum gebeten wird (z.B. "schalte die Transkription ein", "schreib das in den Chat", "lies das vor", "sprich Englisch").
+
WICHTIG: Antworte IMMER als valides JSON in exakt diesem Format:
{{
"shouldRespond": true/false,
"responseText": "Deine Antwort hier" oder null,
"reasoning": "Kurze Begruendung deiner Entscheidung",
- "detectedIntent": "addressed" | "question" | "proactive" | "stop" | "none"
+ "detectedIntent": "addressed" | "question" | "proactive" | "stop" | "none",
+ "commands": [] oder null
}}
detectedIntent-Werte: