From 12273247031d0941a45ab81097dc104618864a7f Mon Sep 17 00:00:00 2001
From: patrick-motsch <p.motsch@valueon.ch>
Date: Wed, 18 Feb 2026 17:50:31 +0100
Subject: [PATCH] feat: differential transcript, fix bot name in WS handler, AI
 commands support

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../features/teamsbot/datamodelTeamsbot.py    |   9 +-
 .../teamsbot/interfaceFeatureTeamsbot.py      |   4 +
 .../features/teamsbot/routeFeatureTeamsbot.py |  27 ++-
 modules/features/teamsbot/service.py          | 207 +++++++++++++++---
 modules/services/serviceAi/mainServiceAi.py   |  12 +-
 5 files changed, 220 insertions(+), 39 deletions(-)

diff --git a/modules/features/teamsbot/datamodelTeamsbot.py b/modules/features/teamsbot/datamodelTeamsbot.py
index 737f360e..dd823c2e 100644
--- a/modules/features/teamsbot/datamodelTeamsbot.py
+++ b/modules/features/teamsbot/datamodelTeamsbot.py
@@ -234,12 +234,19 @@ class TeamsbotConfigUpdateRequest(BaseModel):
 # SPEECH_TEAMS AI Response Model
 # ============================================================================
 
+class TeamsbotCommand(BaseModel):
+    """A structured command the AI can issue to control Teams meeting actions."""
+    action: str = Field(description="Command action: toggleTranscript, sendChat, readAloud, changeLanguage, toggleMic, toggleCamera")
+    params: Optional[Dict[str, Any]] = Field(default=None, description="Action-specific parameters")
+
+
 class SpeechTeamsResponse(BaseModel):
     """Structured response from the SPEECH_TEAMS AI handler."""
     shouldRespond: bool = Field(description="Whether the bot should respond")
     responseText: Optional[str] = Field(default=None, description="The bot's response text (only if shouldRespond=True)")
     reasoning: str = Field(default="", description="Reasoning for the decision (for logging/debug)")
-    detectedIntent: str = Field(default="none", description="Detected intent: addressed, question, proactive, none")
+    detectedIntent: str = Field(default="none", description="Detected intent: addressed, question, proactive, stop, none")
+    commands: Optional[List[TeamsbotCommand]] = Field(default=None, description="Optional list of commands to execute (e.g. toggle transcript, send chat, change language)")
 
 
 # ============================================================================
diff --git a/modules/features/teamsbot/interfaceFeatureTeamsbot.py b/modules/features/teamsbot/interfaceFeatureTeamsbot.py
index 3a2caf47..4640b45b 100644
--- a/modules/features/teamsbot/interfaceFeatureTeamsbot.py
+++ b/modules/features/teamsbot/interfaceFeatureTeamsbot.py
@@ -148,6 +148,10 @@ class TeamsbotObjects:
         transcriptData["creationDate"] = getIsoTimestamp()
         return self.db.recordCreate(TeamsbotTranscript, transcriptData)
 
+    def updateTranscript(self, transcriptId: str, updates: Dict[str, Any]) -> Optional[Dict[str, Any]]:
+        """Update an existing transcript segment (used for differential writing)."""
+        return self.db.recordModify(TeamsbotTranscript, transcriptId, updates)
+
     def _deleteTranscriptsBySession(self, sessionId: str) -> int:
         """Delete all transcripts for a session."""
         records = self.db.getRecordset(TeamsbotTranscript, recordFilter={"sessionId": sessionId})
diff --git a/modules/features/teamsbot/routeFeatureTeamsbot.py b/modules/features/teamsbot/routeFeatureTeamsbot.py
index a441d461..abac3ef0 100644
--- a/modules/features/teamsbot/routeFeatureTeamsbot.py
+++ b/modules/features/teamsbot/routeFeatureTeamsbot.py
@@ -1110,8 +1110,33 @@ async def botWebsocket(
             logger.warning(f"Could not load original user {startedByUserId}, falling back to system user")
             originalUser = systemUser
         
+        # Build effective config with the session's actual bot name.
+        # The session stores the resolved bot name (from system bot or user override).
+        # Without this, the default config botName (e.g. "AI Assistant") is used,
+        # which is wrong for registered system bots.
+        sessionBotName = session.get("botName") if session else None
+        if sessionBotName:
+            config = config.model_copy(update={"botName": sessionBotName})
+            logger.info(f"Browser Bot WebSocket: Using session botName '{sessionBotName}' (not default '{_getInstanceConfig(instanceId).botName}')")
+        
+        # Also merge user-specific settings if available
+        if startedByUserId:
+            interface = interfaceDb.getInterface(originalUser, mandateId=mandateId, featureInstanceId=instanceId)
+            userSettings = interface.getUserSettings(startedByUserId, instanceId)
+            if userSettings:
+                overrides = {}
+                for field in ["aiSystemPrompt", "responseMode", "responseChannel", "transferMode",
+                              "language", "voiceId", "triggerIntervalSeconds", "triggerCooldownSeconds",
+                              "contextWindowSegments"]:
+                    value = userSettings.get(field)
+                    if value is not None:
+                        overrides[field] = value
+                if overrides:
+                    config = config.model_copy(update=overrides)
+                    logger.info(f"Browser Bot WebSocket: Applied user settings overrides: {list(overrides.keys())}")
+        
         service = TeamsbotService(originalUser, mandateId, instanceId, config)
-        logger.info(f"Browser Bot WebSocket service created: session={sessionId}, mandateId={mandateId}, user={originalUser.id}")
+        logger.info(f"Browser Bot WebSocket service created: session={sessionId}, mandateId={mandateId}, user={originalUser.id}, botName={config.botName}")
         
         await service.handleBotWebSocket(websocket, sessionId)
     except WebSocketDisconnect:
diff --git a/modules/features/teamsbot/service.py b/modules/features/teamsbot/service.py
index 5457758e..6482c073 100644
--- a/modules/features/teamsbot/service.py
+++ b/modules/features/teamsbot/service.py
@@ -27,6 +27,7 @@ from .datamodelTeamsbot import (
     TeamsbotResponseMode,
     TeamsbotResponseChannel,
     SpeechTeamsResponse,
+    TeamsbotCommand,
 )
 from .browserBotConnector import BrowserBotConnector
 
@@ -81,6 +82,12 @@ class TeamsbotService:
         self._sessionContext: Optional[str] = None  # User-provided background context
         self._contextSummary: Optional[str] = None  # AI-generated summary of long context
 
+        # Differential transcript tracking: only write new text, update existing
+        # record when the same speaker continues speaking
+        self._lastTranscriptSpeaker: Optional[str] = None
+        self._lastTranscriptText: Optional[str] = None
+        self._lastTranscriptId: Optional[str] = None
+
     # =========================================================================
     # Session Lifecycle
     # =========================================================================
@@ -438,61 +445,99 @@ class TeamsbotService:
         websocket: WebSocket,
         source: str = "caption",
     ):
-        """Process a transcript segment from captions or chat messages."""
+        """Process a transcript segment from captions or chat messages.
+
+        Differential writing: When the same speaker continues (text grows
+        incrementally as captions stream), we UPDATE the existing DB record
+        instead of creating a cascade of near-duplicate rows. A new record
+        is only created when the speaker changes or the text is not a
+        continuation of the previous segment.
+        """
         
         text = text.strip()
         if not text:
             return
 
         # Filter out the bot's own speech from AI triggering.
-        # The bot hears itself via captions — these should be stored in the
-        # transcript for the record, but must NOT trigger AI analysis (feedback loop).
         isBotSpeaker = self._isBotSpeaker(speaker)
 
-        # Store transcript segment
-        transcriptData = TeamsbotTranscript(
-            sessionId=sessionId,
-            speaker=speaker,
-            text=text,
-            timestamp=getIsoTimestamp(),
-            confidence=1.0,  # Captions don't have confidence scores
-            language=self.config.language,
-            isFinal=isFinal,
-        ).model_dump()
+        # Differential transcript writing:
+        # If the same speaker is still talking and the new text is a
+        # continuation (starts with the previous text), UPDATE the existing
+        # record instead of creating a new one. This avoids cascading rows like:
+        #   "Der AHV"
+        #   "Der AHV Fonds"
+        #   "Der AHV Fonds hat 2025"
+        # and instead keeps a single row that grows until the speaker changes.
+        isContinuation = bool(  # strict bool: `and` would yield None/"" and the flag is sent in the SSE payload
+            self._lastTranscriptSpeaker == speaker
+            and self._lastTranscriptText
+            and self._lastTranscriptId
+            and text.startswith(self._lastTranscriptText)
+            and source == "caption"  # only for captions, chat messages are always new
+        )
 
-        createdTranscript = interface.createTranscript(transcriptData)
+        if isContinuation:
+            interface.updateTranscript(self._lastTranscriptId, {
+                "text": text,
+                "isFinal": isFinal,
+            })
+            self._lastTranscriptText = text
+            createdTranscript = {"id": self._lastTranscriptId}
 
-        # Update context buffer (mark source for chat messages)
-        self._contextBuffer.append({
-            "speaker": speaker or "Unknown",
-            "text": text,
-            "timestamp": getUtcTimestamp(),
-            "source": source,
-        })
-        # Keep only last N segments
-        maxSegments = self.config.contextWindowSegments
-        if len(self._contextBuffer) > maxSegments:
-            # When buffer overflows, summarize the older half to preserve context
-            # without losing information. The summary replaces the old segments.
-            if not self._contextSummary and len(self._contextBuffer) > maxSegments * 1.5:
-                asyncio.create_task(self._summarizeContextBuffer(sessionId))
-            self._contextBuffer = self._contextBuffer[-maxSegments:]
+            # Update context buffer in place; entries are stored as `speaker or "Unknown"`, so compare against that
+            if self._contextBuffer and self._contextBuffer[-1].get("speaker") == (speaker or "Unknown"):
+                self._contextBuffer[-1]["text"] = text
+        else:
+            # New speaker or non-continuation → create a new record
+            transcriptData = TeamsbotTranscript(
+                sessionId=sessionId,
+                speaker=speaker,
+                text=text,
+                timestamp=getIsoTimestamp(),
+                confidence=1.0,
+                language=self.config.language,
+                isFinal=isFinal,
+            ).model_dump()
 
-        # Emit SSE event for live transcript
+            createdTranscript = interface.createTranscript(transcriptData)
+
+            # Track for differential writing
+            self._lastTranscriptSpeaker = speaker
+            self._lastTranscriptText = text
+            self._lastTranscriptId = createdTranscript.get("id")
+
+            # Append to context buffer
+            self._contextBuffer.append({
+                "speaker": speaker or "Unknown",
+                "text": text,
+                "timestamp": getUtcTimestamp(),
+                "source": source,
+            })
+
+            # Keep only last N segments
+            maxSegments = self.config.contextWindowSegments
+            if len(self._contextBuffer) > maxSegments:
+                if not self._contextSummary and len(self._contextBuffer) > maxSegments * 1.5:
+                    asyncio.create_task(self._summarizeContextBuffer(sessionId))
+                self._contextBuffer = self._contextBuffer[-maxSegments:]
+
+            # Update session transcript count (only for new records)
+            session = interface.getSession(sessionId)
+            if session:
+                count = session.get("transcriptSegmentCount", 0) + 1
+                interface.updateSession(sessionId, {"transcriptSegmentCount": count})
+
+        # Emit SSE event for live transcript (always, for UI updates)
         await _emitSessionEvent(sessionId, "transcript", {
             "id": createdTranscript.get("id"),
             "speaker": speaker,
             "text": text,
             "confidence": 1.0,
             "timestamp": getIsoTimestamp(),
+            "isContinuation": isContinuation,
         })
 
-        # Update session transcript count
-        session = interface.getSession(sessionId)
-        if session:
-            count = session.get("transcriptSegmentCount", 0) + 1
-            interface.updateSession(sessionId, {"transcriptSegmentCount": count})
-
         # Skip AI analysis for bot's own speech (prevents feedback loop)
         if isBotSpeaker:
             logger.debug(f"Session {sessionId}: Skipping AI trigger for bot's own speech: [{speaker}] {text[:60]}...")
@@ -805,10 +850,100 @@ class TeamsbotService:
 
                 logger.info(f"Bot responded in session {sessionId}: intent={speechResult.detectedIntent}")
 
+            # Step 5: Execute AI-issued commands (if any)
+            if speechResult.commands:
+                await self._executeCommands(sessionId, speechResult.commands, voiceInterface, websocket)
+
         except Exception as e:
             logger.error(f"SPEECH_TEAMS analysis failed for session {sessionId}: {type(e).__name__}: {e}", exc_info=True)
             await _emitSessionEvent(sessionId, "error", {"message": f"AI analysis failed: {type(e).__name__}: {str(e)}"})
 
+    # =========================================================================
+    # AI Command Execution
+    # =========================================================================
+
+    async def _executeCommands(
+        self,
+        sessionId: str,
+        commands: List[TeamsbotCommand],
+        voiceInterface,
+        websocket: WebSocket,
+    ):
+        """Execute structured commands returned by the AI.
+
+        Each command is sent to the browser bot via WebSocket as a
+        'botCommand' message. The bot's TeamsActionsService handles
+        the actual Teams UI interaction (checking state, toggling, etc.).
+        """
+        for cmd in commands:
+            action = cmd.action
+            params = cmd.params or {}
+            logger.info(f"Session {sessionId}: Executing command '{action}' with params {params}")
+
+            try:
+                if action == "toggleTranscript":
+                    enable = params.get("enable", True)
+                    if websocket:
+                        await websocket.send_text(json.dumps({
+                            "type": "botCommand",
+                            "sessionId": sessionId,
+                            "command": "toggleTranscript",
+                            "params": {"enable": enable},
+                        }))
+
+                elif action == "sendChat":
+                    chatText = params.get("text", "")
+                    if chatText and websocket:
+                        await websocket.send_text(json.dumps({
+                            "type": "sendChatMessage",
+                            "sessionId": sessionId,
+                            "text": chatText,
+                        }))
+
+                elif action == "readAloud":
+                    readText = params.get("text", "")
+                    if readText and voiceInterface:
+                        ttsResult = await voiceInterface.textToSpeech(
+                            text=readText,
+                            languageCode=self.config.language,
+                            voiceName=self.config.voiceId,
+                        )
+                        if ttsResult and isinstance(ttsResult, dict):
+                            audioContent = ttsResult.get("audioContent")
+                            if audioContent and websocket:
+                                await websocket.send_text(json.dumps({
+                                    "type": "playAudio",
+                                    "sessionId": sessionId,
+                                    "audio": {
+                                        "data": base64.b64encode(
+                                            audioContent if isinstance(audioContent, bytes) else audioContent.encode()
+                                        ).decode(),
+                                        "format": "mp3",
+                                    },
+                                }))
+
+                elif action == "changeLanguage":
+                    newLang = params.get("language", "")
+                    if newLang:
+                        self.config = self.config.model_copy(update={"language": newLang})
+                        logger.info(f"Session {sessionId}: Language changed to '{newLang}'")
+                        await _emitSessionEvent(sessionId, "languageChanged", {"language": newLang})
+
+                elif action in ("toggleMic", "toggleCamera"):
+                    if websocket:
+                        await websocket.send_text(json.dumps({
+                            "type": "botCommand",
+                            "sessionId": sessionId,
+                            "command": action,
+                            "params": params,
+                        }))
+
+                else:
+                    logger.warning(f"Session {sessionId}: Unknown command '{action}'")
+
+            except Exception as cmdErr:
+                logger.warning(f"Session {sessionId}: Command '{action}' failed: {cmdErr}")
+
     # =========================================================================
     # Context Summarization (for long sessions)
     # =========================================================================
diff --git a/modules/services/serviceAi/mainServiceAi.py b/modules/services/serviceAi/mainServiceAi.py
index ab45b49f..3537fe43 100644
--- a/modules/services/serviceAi/mainServiceAi.py
+++ b/modules/services/serviceAi/mainServiceAi.py
@@ -379,12 +379,22 @@ shouldRespond auf false. Du musst NICHT antworten wenn jemand dich stoppt."""
 
         basePrompt += f"""
 
+KOMMANDOS: Du kannst optionale Aktions-Kommandos ausfuehren lassen.
+Verfuegbare Kommandos (im "commands" Array):
+- {{"action": "toggleTranscript", "params": {{"enable": true/false}}}} — Transkription ein-/ausschalten
+- {{"action": "sendChat", "params": {{"text": "Nachricht"}}}} — Zusaetzliche Nachricht in den Chat schreiben (unabhaengig von responseText)
+- {{"action": "readAloud", "params": {{"text": "Text zum Vorlesen"}}}} — Einen bestimmten Text vorlesen (unabhaengig von responseText)
+- {{"action": "changeLanguage", "params": {{"language": "en-US"}}}} — Kommunikationssprache aendern (z.B. "de-DE", "en-US", "fr-FR")
+
+Verwende Kommandos NUR wenn explizit darum gebeten wird (z.B. "schalte die Transkription ein", "schreib das in den Chat", "lies das vor", "sprich Englisch").
+
 WICHTIG: Antworte IMMER als valides JSON in exakt diesem Format:
 {{
     "shouldRespond": true/false,
     "responseText": "Deine Antwort hier" oder null,
     "reasoning": "Kurze Begruendung deiner Entscheidung",
-    "detectedIntent": "addressed" | "question" | "proactive" | "stop" | "none"
+    "detectedIntent": "addressed" | "question" | "proactive" | "stop" | "none",
+    "commands": [] oder null
 }}
 
 detectedIntent-Werte: