From 02002f35760ea6b71fd55613ba6300ac6024f559 Mon Sep 17 00:00:00 2001
From: patrick-motsch
Date: Thu, 26 Feb 2026 09:26:59 +0100
Subject: [PATCH] feat: add speaker-hint debug flag and improve TTS diagnostics
Made-with: Cursor
---
modules/features/teamsbot/service.py | 63 +++++++++++++++++++---------
1 file changed, 44 insertions(+), 19 deletions(-)
diff --git a/modules/features/teamsbot/service.py b/modules/features/teamsbot/service.py
index 337f6ce3..c24f0388 100644
--- a/modules/features/teamsbot/service.py
+++ b/modules/features/teamsbot/service.py
@@ -444,13 +444,14 @@ class TeamsbotService:
logger.info(f"[AudioChunk] STT result: {text[:80]}...")
await self._processTranscript(
sessionId=sessionId,
- speaker=resolvedSpeaker,
+ speaker=resolvedSpeaker["speaker"],
text=text,
isFinal=True,
interface=interface,
voiceInterface=voiceInterface,
websocket=websocket,
source="audioCapture",
+ speakerResolvedFromHint=resolvedSpeaker["speakerResolvedFromHint"],
)
except Exception as e:
logger.error(f"[AudioChunk] STT error for session {sessionId}: {type(e).__name__}: {e}")
@@ -473,19 +474,22 @@ class TeamsbotService:
if len(self._recentSpeakerHints) > 20:
self._recentSpeakerHints = self._recentSpeakerHints[-20:]
- def _resolveSpeakerForAudioCapture(self) -> str:
+ def _resolveSpeakerForAudioCapture(self) -> Dict[str, Any]:
"""Best-effort speaker name for audio chunks using recent caption hints."""
if not self._recentSpeakerHints:
- return "Meeting Audio"
+ return {"speaker": "Meeting Audio", "speakerResolvedFromHint": False}
nowTs = time.time()
# Prefer very recent hints to reduce wrong attribution
for hint in reversed(self._recentSpeakerHints):
hintAge = nowTs - hint.get("timestamp", 0)
if hintAge <= 15:
- return hint.get("speaker", "Meeting Audio")
+ return {
+ "speaker": hint.get("speaker", "Meeting Audio"),
+ "speakerResolvedFromHint": True,
+ }
- return "Meeting Audio"
+ return {"speaker": "Meeting Audio", "speakerResolvedFromHint": False}
async def _processTranscript(
self,
@@ -497,6 +501,7 @@ class TeamsbotService:
voiceInterface,
websocket: WebSocket,
source: str = "caption",
+ speakerResolvedFromHint: Optional[bool] = None,
):
"""Process a transcript segment from captions or chat messages.
@@ -601,6 +606,12 @@ class TeamsbotService:
"confidence": 1.0,
"timestamp": getIsoTimestamp(),
"isContinuation": isContinuation,
+ "source": source,
+ "speakerResolvedFromHint": (
+ speakerResolvedFromHint
+ if speakerResolvedFromHint is not None
+ else False
+ ),
})
# Check if AI analysis should be triggered (only for final transcripts)
@@ -837,25 +848,39 @@ class TeamsbotService:
# 4a: Voice response (TTS -> Audio to bot)
if sendVoice:
try:
+ logger.info(
+ f"Session {sessionId}: TTS requested (websocket_available={websocket is not None})"
+ )
ttsResult = await voiceInterface.textToSpeech(
text=speechResult.responseText,
languageCode=self.config.language,
voiceName=self.config.voiceId
)
-
- if ttsResult and isinstance(ttsResult, dict):
- audioContent = ttsResult.get("audioContent")
- if audioContent and websocket:
- await websocket.send_text(json.dumps({
- "type": "playAudio",
- "sessionId": sessionId,
- "audio": {
- "data": base64.b64encode(audioContent if isinstance(audioContent, bytes) else audioContent.encode()).decode(),
- "format": "mp3",
- },
- }))
- elif audioContent and not websocket:
- logger.info(f"TTS audio generated for session {sessionId} (HTTP mode - no WebSocket for playback)")
+
+ if not ttsResult or not isinstance(ttsResult, dict):
+ raise RuntimeError("TTS returned invalid result payload")
+
+ if ttsResult.get("success") is False:
+ raise RuntimeError(f"TTS backend error: {ttsResult.get('error', 'unknown')}")
+
+ audioContent = ttsResult.get("audioContent")
+ if not audioContent:
+ raise RuntimeError("TTS returned no audioContent")
+
+ if websocket:
+ await websocket.send_text(json.dumps({
+ "type": "playAudio",
+ "sessionId": sessionId,
+ "audio": {
+ "data": base64.b64encode(audioContent if isinstance(audioContent, bytes) else audioContent.encode()).decode(),
+ "format": "mp3",
+ },
+ }))
+ logger.info(f"Session {sessionId}: TTS audio dispatched to bot")
+ else:
+ logger.warning(
+ f"Session {sessionId}: TTS audio generated but cannot be played (bot websocket unavailable, likely fallback mode)"
+ )
except Exception as ttsErr:
logger.warning(f"TTS failed for session {sessionId}: {ttsErr}")
if not sendChat: