class TeamsbotCommand(BaseModel):
    """One structured action the AI can request for the Teams meeting.

    Instances appear in the optional ``commands`` list of the AI response
    model; each one names an action and may carry parameters for it.
    """

    # Name of the requested action; the accepted values are listed in the
    # field description. Kept as a free-form string, so an unrecognized
    # value is not rejected by this model.
    action: str = Field(
        description="Command action: toggleTranscript, sendChat, readAloud, changeLanguage, toggleMic, toggleCamera",
    )

    # Free-form, action-specific arguments; None when the action needs none.
    params: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Action-specific parameters",
    )
def updateTranscript(self, transcriptId: str, updates: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Modify a stored transcript segment in place.

    Used by differential transcript writing, where an existing segment is
    updated instead of a new record being created.

    Args:
        transcriptId: Id of the transcript record to change.
        updates: Field names mapped to their new values.

    Returns:
        Whatever ``self.db.recordModify`` returns for the modified record.
    """
    modifiedRecord = self.db.recordModify(TeamsbotTranscript, transcriptId, updates)
    return modifiedRecord
+ sessionBotName = session.get("botName") if session else None + if sessionBotName: + config = config.model_copy(update={"botName": sessionBotName}) + logger.info(f"Browser Bot WebSocket: Using session botName '{sessionBotName}' (not default '{_getInstanceConfig(instanceId).botName}')") + + # Also merge user-specific settings if available + if startedByUserId: + interface = interfaceDb.getInterface(originalUser, mandateId=mandateId, featureInstanceId=instanceId) + userSettings = interface.getUserSettings(startedByUserId, instanceId) + if userSettings: + overrides = {} + for field in ["aiSystemPrompt", "responseMode", "responseChannel", "transferMode", + "language", "voiceId", "triggerIntervalSeconds", "triggerCooldownSeconds", + "contextWindowSegments"]: + value = userSettings.get(field) + if value is not None: + overrides[field] = value + if overrides: + config = config.model_copy(update=overrides) + logger.info(f"Browser Bot WebSocket: Applied user settings overrides: {list(overrides.keys())}") + service = TeamsbotService(originalUser, mandateId, instanceId, config) - logger.info(f"Browser Bot WebSocket service created: session={sessionId}, mandateId={mandateId}, user={originalUser.id}") + logger.info(f"Browser Bot WebSocket service created: session={sessionId}, mandateId={mandateId}, user={originalUser.id}, botName={config.botName}") await service.handleBotWebSocket(websocket, sessionId) except WebSocketDisconnect: diff --git a/modules/features/teamsbot/service.py b/modules/features/teamsbot/service.py index 5457758e..6482c073 100644 --- a/modules/features/teamsbot/service.py +++ b/modules/features/teamsbot/service.py @@ -27,6 +27,7 @@ from .datamodelTeamsbot import ( TeamsbotResponseMode, TeamsbotResponseChannel, SpeechTeamsResponse, + TeamsbotCommand, ) from .browserBotConnector import BrowserBotConnector @@ -81,6 +82,12 @@ class TeamsbotService: self._sessionContext: Optional[str] = None # User-provided background context self._contextSummary: Optional[str] = 
None # AI-generated summary of long context + # Differential transcript tracking: only write new text, update existing + # record when the same speaker continues speaking + self._lastTranscriptSpeaker: Optional[str] = None + self._lastTranscriptText: Optional[str] = None + self._lastTranscriptId: Optional[str] = None + # ========================================================================= # Session Lifecycle # ========================================================================= @@ -438,61 +445,99 @@ class TeamsbotService: websocket: WebSocket, source: str = "caption", ): - """Process a transcript segment from captions or chat messages.""" + """Process a transcript segment from captions or chat messages. + + Differential writing: When the same speaker continues (text grows + incrementally as captions stream), we UPDATE the existing DB record + instead of creating a cascade of near-duplicate rows. A new record + is only created when the speaker changes or the text is not a + continuation of the previous segment. + """ text = text.strip() if not text: return # Filter out the bot's own speech from AI triggering. - # The bot hears itself via captions — these should be stored in the - # transcript for the record, but must NOT trigger AI analysis (feedback loop). isBotSpeaker = self._isBotSpeaker(speaker) - # Store transcript segment - transcriptData = TeamsbotTranscript( - sessionId=sessionId, - speaker=speaker, - text=text, - timestamp=getIsoTimestamp(), - confidence=1.0, # Captions don't have confidence scores - language=self.config.language, - isFinal=isFinal, - ).model_dump() + # Differential transcript writing: + # If the same speaker is still talking and the new text is a + # continuation (starts with the previous text), UPDATE the existing + # record instead of creating a new one. This avoids cascading rows like: + # "Der AHV" + # "Der AHV Fonds" + # "Der AHV Fonds hat 2025" + # and instead keeps a single row that grows until the speaker changes. 
+ isContinuation = ( + self._lastTranscriptSpeaker == speaker + and self._lastTranscriptText + and self._lastTranscriptId + and text.startswith(self._lastTranscriptText) + and source == "caption" # only for captions, chat messages are always new + ) - createdTranscript = interface.createTranscript(transcriptData) + if isContinuation: + interface.updateTranscript(self._lastTranscriptId, { + "text": text, + "isFinal": isFinal, + }) + self._lastTranscriptText = text + createdTranscript = {"id": self._lastTranscriptId} - # Update context buffer (mark source for chat messages) - self._contextBuffer.append({ - "speaker": speaker or "Unknown", - "text": text, - "timestamp": getUtcTimestamp(), - "source": source, - }) - # Keep only last N segments - maxSegments = self.config.contextWindowSegments - if len(self._contextBuffer) > maxSegments: - # When buffer overflows, summarize the older half to preserve context - # without losing information. The summary replaces the old segments. - if not self._contextSummary and len(self._contextBuffer) > maxSegments * 1.5: - asyncio.create_task(self._summarizeContextBuffer(sessionId)) - self._contextBuffer = self._contextBuffer[-maxSegments:] + # Update context buffer: replace last entry for same speaker + if self._contextBuffer and self._contextBuffer[-1].get("speaker") == speaker: + self._contextBuffer[-1]["text"] = text + else: + # New speaker or non-continuation → create a new record + transcriptData = TeamsbotTranscript( + sessionId=sessionId, + speaker=speaker, + text=text, + timestamp=getIsoTimestamp(), + confidence=1.0, + language=self.config.language, + isFinal=isFinal, + ).model_dump() - # Emit SSE event for live transcript + createdTranscript = interface.createTranscript(transcriptData) + + # Track for differential writing + self._lastTranscriptSpeaker = speaker + self._lastTranscriptText = text + self._lastTranscriptId = createdTranscript.get("id") + + # Append to context buffer + self._contextBuffer.append({ + "speaker": 
speaker or "Unknown", + "text": text, + "timestamp": getUtcTimestamp(), + "source": source, + }) + + # Keep only last N segments + maxSegments = self.config.contextWindowSegments + if len(self._contextBuffer) > maxSegments: + if not self._contextSummary and len(self._contextBuffer) > maxSegments * 1.5: + asyncio.create_task(self._summarizeContextBuffer(sessionId)) + self._contextBuffer = self._contextBuffer[-maxSegments:] + + # Update session transcript count (only for new records) + session = interface.getSession(sessionId) + if session: + count = session.get("transcriptSegmentCount", 0) + 1 + interface.updateSession(sessionId, {"transcriptSegmentCount": count}) + + # Emit SSE event for live transcript (always, for UI updates) await _emitSessionEvent(sessionId, "transcript", { "id": createdTranscript.get("id"), "speaker": speaker, "text": text, "confidence": 1.0, "timestamp": getIsoTimestamp(), + "isContinuation": isContinuation, }) - # Update session transcript count - session = interface.getSession(sessionId) - if session: - count = session.get("transcriptSegmentCount", 0) + 1 - interface.updateSession(sessionId, {"transcriptSegmentCount": count}) - # Skip AI analysis for bot's own speech (prevents feedback loop) if isBotSpeaker: logger.debug(f"Session {sessionId}: Skipping AI trigger for bot's own speech: [{speaker}] {text[:60]}...") @@ -805,10 +850,100 @@ class TeamsbotService: logger.info(f"Bot responded in session {sessionId}: intent={speechResult.detectedIntent}") + # Step 5: Execute AI-issued commands (if any) + if speechResult.commands: + await self._executeCommands(sessionId, speechResult.commands, voiceInterface, websocket) + except Exception as e: logger.error(f"SPEECH_TEAMS analysis failed for session {sessionId}: {type(e).__name__}: {e}", exc_info=True) await _emitSessionEvent(sessionId, "error", {"message": f"AI analysis failed: {type(e).__name__}: {str(e)}"}) + # ========================================================================= + # AI 
async def _executeCommands(
    self,
    sessionId: str,
    commands: List[TeamsbotCommand],
    voiceInterface,
    websocket: WebSocket,
):
    """Execute structured commands returned by the AI.

    Commands run in list order. Browser-facing commands are serialized as
    JSON and sent over ``websocket`` (per the original note, the bot's
    TeamsActionsService performs the actual Teams UI interaction):

    - ``toggleTranscript``: sends a ``botCommand`` with ``enable``
      (defaults to True when the parameter is absent).
    - ``sendChat``: sends a ``sendChatMessage`` with ``params["text"]``.
    - ``readAloud``: synthesizes ``params["text"]`` via
      ``voiceInterface.textToSpeech`` and sends the audio as a base64
      ``playAudio`` message.
    - ``changeLanguage``: replaces ``self.config`` with a copy carrying the
      new language and emits a ``languageChanged`` session event.
    - ``toggleMic`` / ``toggleCamera``: forwarded as ``botCommand`` with the
      params unchanged.

    Execution is best-effort: a command that cannot run (no WebSocket,
    missing parameter, no TTS audio) is logged and skipped, and an
    exception in one command is logged without aborting the others.

    Args:
        sessionId: Session the commands belong to; included in every
            outgoing message and log line.
        commands: Commands to execute.
        voiceInterface: Object providing an awaitable ``textToSpeech``;
            may be falsy, in which case ``readAloud`` is skipped.
        websocket: Connection to the browser bot; may be falsy, in which
            case browser-facing commands are skipped.
    """

    async def _sendToBot(action, payload):
        """Send payload as JSON to the browser bot, logging when no socket is attached."""
        if not websocket:
            logger.warning(f"Session {sessionId}: Command '{action}' skipped: no bot WebSocket available")
            return
        await websocket.send_text(json.dumps(payload))

    for cmd in commands:
        action = cmd.action
        params = cmd.params or {}
        logger.info(f"Session {sessionId}: Executing command '{action}' with params {params}")

        try:
            if action == "toggleTranscript":
                await _sendToBot(action, {
                    "type": "botCommand",
                    "sessionId": sessionId,
                    "command": "toggleTranscript",
                    "params": {"enable": params.get("enable", True)},
                })

            elif action == "sendChat":
                chatText = params.get("text", "")
                if not chatText:
                    logger.warning(f"Session {sessionId}: Command 'sendChat' skipped: missing 'text' parameter")
                    continue
                await _sendToBot(action, {
                    "type": "sendChatMessage",
                    "sessionId": sessionId,
                    "text": chatText,
                })

            elif action == "readAloud":
                readText = params.get("text", "")
                if not readText or not voiceInterface:
                    logger.warning(
                        f"Session {sessionId}: Command 'readAloud' skipped: missing 'text' parameter or voice interface"
                    )
                    continue
                ttsResult = await voiceInterface.textToSpeech(
                    text=readText,
                    languageCode=self.config.language,
                    voiceName=self.config.voiceId,
                )
                # Only a dict result carrying audio is usable; anything else is treated as "no audio".
                audioContent = ttsResult.get("audioContent") if isinstance(ttsResult, dict) else None
                if not audioContent:
                    logger.warning(f"Session {sessionId}: Command 'readAloud' skipped: TTS returned no audio")
                    continue
                # NOTE(review): a str audioContent is encoded to bytes and then base64-encoded;
                # if the TTS backend already returns base64 text this double-encodes -- confirm.
                rawAudio = audioContent if isinstance(audioContent, bytes) else audioContent.encode()
                await _sendToBot(action, {
                    "type": "playAudio",
                    "sessionId": sessionId,
                    "audio": {
                        "data": base64.b64encode(rawAudio).decode(),
                        "format": "mp3",
                    },
                })

            elif action == "changeLanguage":
                newLang = params.get("language", "")
                if not newLang:
                    logger.warning(f"Session {sessionId}: Command 'changeLanguage' skipped: missing 'language' parameter")
                    continue
                # Config is replaced by an updated copy rather than mutated in place.
                self.config = self.config.model_copy(update={"language": newLang})
                logger.info(f"Session {sessionId}: Language changed to '{newLang}'")
                await _emitSessionEvent(sessionId, "languageChanged", {"language": newLang})

            elif action in ("toggleMic", "toggleCamera"):
                await _sendToBot(action, {
                    "type": "botCommand",
                    "sessionId": sessionId,
                    "command": action,
                    "params": params,
                })

            else:
                logger.warning(f"Session {sessionId}: Unknown command '{action}'")

        except Exception as cmdErr:
            # Best-effort: one failing command must not abort the remaining ones.
            # The exception type is logged because str(cmdErr) can be empty.
            logger.warning(f"Session {sessionId}: Command '{action}' failed: {type(cmdErr).__name__}: {cmdErr}")
+ WICHTIG: Antworte IMMER als valides JSON in exakt diesem Format: {{ "shouldRespond": true/false, "responseText": "Deine Antwort hier" oder null, "reasoning": "Kurze Begruendung deiner Entscheidung", - "detectedIntent": "addressed" | "question" | "proactive" | "stop" | "none" + "detectedIntent": "addressed" | "question" | "proactive" | "stop" | "none", + "commands": [] oder null }} detectedIntent-Werte: