From 02002f35760ea6b71fd55613ba6300ac6024f559 Mon Sep 17 00:00:00 2001 From: patrick-motsch Date: Thu, 26 Feb 2026 09:26:59 +0100 Subject: [PATCH] feat: add speaker-hint debug flag and improve TTS diagnostics Made-with: Cursor --- modules/features/teamsbot/service.py | 63 +++++++++++++++++++--------- 1 file changed, 44 insertions(+), 19 deletions(-) diff --git a/modules/features/teamsbot/service.py b/modules/features/teamsbot/service.py index 337f6ce3..c24f0388 100644 --- a/modules/features/teamsbot/service.py +++ b/modules/features/teamsbot/service.py @@ -444,13 +444,14 @@ class TeamsbotService: logger.info(f"[AudioChunk] STT result: {text[:80]}...") await self._processTranscript( sessionId=sessionId, - speaker=resolvedSpeaker, + speaker=resolvedSpeaker["speaker"], text=text, isFinal=True, interface=interface, voiceInterface=voiceInterface, websocket=websocket, source="audioCapture", + speakerResolvedFromHint=resolvedSpeaker["speakerResolvedFromHint"], ) except Exception as e: logger.error(f"[AudioChunk] STT error for session {sessionId}: {type(e).__name__}: {e}") @@ -473,19 +474,22 @@ class TeamsbotService: if len(self._recentSpeakerHints) > 20: self._recentSpeakerHints = self._recentSpeakerHints[-20:] - def _resolveSpeakerForAudioCapture(self) -> str: + def _resolveSpeakerForAudioCapture(self) -> Dict[str, Any]: """Best-effort speaker name for audio chunks using recent caption hints.""" if not self._recentSpeakerHints: - return "Meeting Audio" + return {"speaker": "Meeting Audio", "speakerResolvedFromHint": False} nowTs = time.time() # Prefer very recent hints to reduce wrong attribution for hint in reversed(self._recentSpeakerHints): hintAge = nowTs - hint.get("timestamp", 0) if hintAge <= 15: - return hint.get("speaker", "Meeting Audio") + return { + "speaker": hint.get("speaker", "Meeting Audio"), + "speakerResolvedFromHint": True, + } - return "Meeting Audio" + return {"speaker": "Meeting Audio", "speakerResolvedFromHint": False} async def _processTranscript( self, @@ -497,6 +501,7 @@ class TeamsbotService: voiceInterface, websocket: WebSocket, source: str = "caption", + speakerResolvedFromHint: Optional[bool] = None, ): """Process a transcript segment from captions or chat messages. @@ -601,6 +606,12 @@ class TeamsbotService: "confidence": 1.0, "timestamp": getIsoTimestamp(), "isContinuation": isContinuation, + "source": source, + "speakerResolvedFromHint": ( + speakerResolvedFromHint + if speakerResolvedFromHint is not None + else False + ), }) # Check if AI analysis should be triggered (only for final transcripts) @@ -837,25 +848,39 @@ class TeamsbotService: # 4a: Voice response (TTS -> Audio to bot) if sendVoice: try: + logger.info( + f"Session {sessionId}: TTS requested (websocket_available={websocket is not None})" + ) ttsResult = await voiceInterface.textToSpeech( text=speechResult.responseText, languageCode=self.config.language, voiceName=self.config.voiceId ) - - if ttsResult and isinstance(ttsResult, dict): - audioContent = ttsResult.get("audioContent") - if audioContent and websocket: - await websocket.send_text(json.dumps({ - "type": "playAudio", - "sessionId": sessionId, - "audio": { - "data": base64.b64encode(audioContent if isinstance(audioContent, bytes) else audioContent.encode()).decode(), - "format": "mp3", - }, - })) - elif audioContent and not websocket: - logger.info(f"TTS audio generated for session {sessionId} (HTTP mode - no WebSocket for playback)") + + if not ttsResult or not isinstance(ttsResult, dict): + raise RuntimeError("TTS returned invalid result payload") + + if ttsResult.get("success") is False: + raise RuntimeError(f"TTS backend error: {ttsResult.get('error', 'unknown')}") + + audioContent = ttsResult.get("audioContent") + if not audioContent: + raise RuntimeError("TTS returned no audioContent") + + if websocket: + await websocket.send_text(json.dumps({ + "type": "playAudio", + "sessionId": sessionId, + "audio": { + "data": base64.b64encode(audioContent if isinstance(audioContent, bytes) else audioContent.encode()).decode(), + "format": "mp3", + }, + })) + logger.info(f"Session {sessionId}: TTS audio dispatched to bot") + else: + logger.warning( + f"Session {sessionId}: TTS audio generated but cannot be played (bot websocket unavailable, likely fallback mode)" + ) except Exception as ttsErr: logger.warning(f"TTS failed for session {sessionId}: {ttsErr}") if not sendChat: