feat: add speaker-hint debug flag and improve TTS diagnostics

Made-with: Cursor
This commit is contained in:
patrick-motsch 2026-02-26 09:26:59 +01:00
parent 0cd8e9ebfa
commit 02002f3576

View file

@ -444,13 +444,14 @@ class TeamsbotService:
logger.info(f"[AudioChunk] STT result: {text[:80]}...")
await self._processTranscript(
sessionId=sessionId,
speaker=resolvedSpeaker,
speaker=resolvedSpeaker["speaker"],
text=text,
isFinal=True,
interface=interface,
voiceInterface=voiceInterface,
websocket=websocket,
source="audioCapture",
speakerResolvedFromHint=resolvedSpeaker["speakerResolvedFromHint"],
)
except Exception as e:
logger.error(f"[AudioChunk] STT error for session {sessionId}: {type(e).__name__}: {e}")
@ -473,19 +474,22 @@ class TeamsbotService:
if len(self._recentSpeakerHints) > 20:
self._recentSpeakerHints = self._recentSpeakerHints[-20:]
def _resolveSpeakerForAudioCapture(self) -> Dict[str, Any]:
    """Best-effort speaker name for audio chunks using recent caption hints.

    Walks ``self._recentSpeakerHints`` from the end of the list backwards and
    picks the first hint whose ``timestamp`` is at most 15 seconds old.

    Returns:
        A dict with two keys:

        * ``"speaker"``: the speaker name taken from the matching hint, or
          ``"Meeting Audio"`` when no hint qualifies (or the matching hint
          carries no ``"speaker"`` key).
        * ``"speakerResolvedFromHint"``: ``True`` only when a sufficiently
          recent hint was found, ``False`` for the generic fallback.
    """
    if not self._recentSpeakerHints:
        return {"speaker": "Meeting Audio", "speakerResolvedFromHint": False}

    nowTs = time.time()
    # Prefer very recent hints to reduce wrong attribution
    for hint in reversed(self._recentSpeakerHints):
        # A hint without a timestamp is treated as epoch 0, i.e. always stale.
        hintAge = nowTs - hint.get("timestamp", 0)
        if hintAge <= 15:
            return {
                "speaker": hint.get("speaker", "Meeting Audio"),
                "speakerResolvedFromHint": True,
            }

    # Every stored hint is older than the 15 second window.
    return {"speaker": "Meeting Audio", "speakerResolvedFromHint": False}
async def _processTranscript(
self,
@ -497,6 +501,7 @@ class TeamsbotService:
voiceInterface,
websocket: WebSocket,
source: str = "caption",
speakerResolvedFromHint: Optional[bool] = None,
):
"""Process a transcript segment from captions or chat messages.
@ -601,6 +606,12 @@ class TeamsbotService:
"confidence": 1.0,
"timestamp": getIsoTimestamp(),
"isContinuation": isContinuation,
"source": source,
"speakerResolvedFromHint": (
speakerResolvedFromHint
if speakerResolvedFromHint is not None
else False
),
})
# Check if AI analysis should be triggered (only for final transcripts)
@ -837,15 +848,26 @@ class TeamsbotService:
# 4a: Voice response (TTS -> Audio to bot)
if sendVoice:
try:
logger.info(
f"Session {sessionId}: TTS requested (websocket_available={websocket is not None})"
)
ttsResult = await voiceInterface.textToSpeech(
text=speechResult.responseText,
languageCode=self.config.language,
voiceName=self.config.voiceId
)
if ttsResult and isinstance(ttsResult, dict):
if not ttsResult or not isinstance(ttsResult, dict):
raise RuntimeError("TTS returned invalid result payload")
if ttsResult.get("success") is False:
raise RuntimeError(f"TTS backend error: {ttsResult.get('error', 'unknown')}")
audioContent = ttsResult.get("audioContent")
if audioContent and websocket:
if not audioContent:
raise RuntimeError("TTS returned no audioContent")
if websocket:
await websocket.send_text(json.dumps({
"type": "playAudio",
"sessionId": sessionId,
@ -854,8 +876,11 @@ class TeamsbotService:
"format": "mp3",
},
}))
elif audioContent and not websocket:
logger.info(f"TTS audio generated for session {sessionId} (HTTP mode - no WebSocket for playback)")
logger.info(f"Session {sessionId}: TTS audio dispatched to bot")
else:
logger.warning(
f"Session {sessionId}: TTS audio generated but cannot be played (bot websocket unavailable, likely fallback mode)"
)
except Exception as ttsErr:
logger.warning(f"TTS failed for session {sessionId}: {ttsErr}")
if not sendChat: