feat: add speaker-hint debug flag and improve TTS diagnostics
Made-with: Cursor
This commit is contained in:
parent
0cd8e9ebfa
commit
02002f3576
1 changed file with 44 additions and 19 deletions
|
|
@@ -444,13 +444,14 @@ class TeamsbotService:
|
||||||
logger.info(f"[AudioChunk] STT result: {text[:80]}...")
|
logger.info(f"[AudioChunk] STT result: {text[:80]}...")
|
||||||
await self._processTranscript(
|
await self._processTranscript(
|
||||||
sessionId=sessionId,
|
sessionId=sessionId,
|
||||||
speaker=resolvedSpeaker,
|
speaker=resolvedSpeaker["speaker"],
|
||||||
text=text,
|
text=text,
|
||||||
isFinal=True,
|
isFinal=True,
|
||||||
interface=interface,
|
interface=interface,
|
||||||
voiceInterface=voiceInterface,
|
voiceInterface=voiceInterface,
|
||||||
websocket=websocket,
|
websocket=websocket,
|
||||||
source="audioCapture",
|
source="audioCapture",
|
||||||
|
speakerResolvedFromHint=resolvedSpeaker["speakerResolvedFromHint"],
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"[AudioChunk] STT error for session {sessionId}: {type(e).__name__}: {e}")
|
logger.error(f"[AudioChunk] STT error for session {sessionId}: {type(e).__name__}: {e}")
|
||||||
|
|
@@ -473,19 +474,22 @@ class TeamsbotService:
|
||||||
if len(self._recentSpeakerHints) > 20:
|
if len(self._recentSpeakerHints) > 20:
|
||||||
self._recentSpeakerHints = self._recentSpeakerHints[-20:]
|
self._recentSpeakerHints = self._recentSpeakerHints[-20:]
|
||||||
|
|
||||||
def _resolveSpeakerForAudioCapture(self) -> str:
|
def _resolveSpeakerForAudioCapture(self) -> Dict[str, Any]:
|
||||||
"""Best-effort speaker name for audio chunks using recent caption hints."""
|
"""Best-effort speaker name for audio chunks using recent caption hints."""
|
||||||
if not self._recentSpeakerHints:
|
if not self._recentSpeakerHints:
|
||||||
return "Meeting Audio"
|
return {"speaker": "Meeting Audio", "speakerResolvedFromHint": False}
|
||||||
|
|
||||||
nowTs = time.time()
|
nowTs = time.time()
|
||||||
# Prefer very recent hints to reduce wrong attribution
|
# Prefer very recent hints to reduce wrong attribution
|
||||||
for hint in reversed(self._recentSpeakerHints):
|
for hint in reversed(self._recentSpeakerHints):
|
||||||
hintAge = nowTs - hint.get("timestamp", 0)
|
hintAge = nowTs - hint.get("timestamp", 0)
|
||||||
if hintAge <= 15:
|
if hintAge <= 15:
|
||||||
return hint.get("speaker", "Meeting Audio")
|
return {
|
||||||
|
"speaker": hint.get("speaker", "Meeting Audio"),
|
||||||
|
"speakerResolvedFromHint": True,
|
||||||
|
}
|
||||||
|
|
||||||
return "Meeting Audio"
|
return {"speaker": "Meeting Audio", "speakerResolvedFromHint": False}
|
||||||
|
|
||||||
async def _processTranscript(
|
async def _processTranscript(
|
||||||
self,
|
self,
|
||||||
|
|
@@ -497,6 +501,7 @@ class TeamsbotService:
|
||||||
voiceInterface,
|
voiceInterface,
|
||||||
websocket: WebSocket,
|
websocket: WebSocket,
|
||||||
source: str = "caption",
|
source: str = "caption",
|
||||||
|
speakerResolvedFromHint: Optional[bool] = None,
|
||||||
):
|
):
|
||||||
"""Process a transcript segment from captions or chat messages.
|
"""Process a transcript segment from captions or chat messages.
|
||||||
|
|
||||||
|
|
@@ -601,6 +606,12 @@ class TeamsbotService:
|
||||||
"confidence": 1.0,
|
"confidence": 1.0,
|
||||||
"timestamp": getIsoTimestamp(),
|
"timestamp": getIsoTimestamp(),
|
||||||
"isContinuation": isContinuation,
|
"isContinuation": isContinuation,
|
||||||
|
"source": source,
|
||||||
|
"speakerResolvedFromHint": (
|
||||||
|
speakerResolvedFromHint
|
||||||
|
if speakerResolvedFromHint is not None
|
||||||
|
else False
|
||||||
|
),
|
||||||
})
|
})
|
||||||
|
|
||||||
# Check if AI analysis should be triggered (only for final transcripts)
|
# Check if AI analysis should be triggered (only for final transcripts)
|
||||||
|
|
@@ -837,25 +848,39 @@ class TeamsbotService:
|
||||||
# 4a: Voice response (TTS -> Audio to bot)
|
# 4a: Voice response (TTS -> Audio to bot)
|
||||||
if sendVoice:
|
if sendVoice:
|
||||||
try:
|
try:
|
||||||
|
logger.info(
|
||||||
|
f"Session {sessionId}: TTS requested (websocket_available={websocket is not None})"
|
||||||
|
)
|
||||||
ttsResult = await voiceInterface.textToSpeech(
|
ttsResult = await voiceInterface.textToSpeech(
|
||||||
text=speechResult.responseText,
|
text=speechResult.responseText,
|
||||||
languageCode=self.config.language,
|
languageCode=self.config.language,
|
||||||
voiceName=self.config.voiceId
|
voiceName=self.config.voiceId
|
||||||
)
|
)
|
||||||
|
|
||||||
if ttsResult and isinstance(ttsResult, dict):
|
if not ttsResult or not isinstance(ttsResult, dict):
|
||||||
audioContent = ttsResult.get("audioContent")
|
raise RuntimeError("TTS returned invalid result payload")
|
||||||
if audioContent and websocket:
|
|
||||||
await websocket.send_text(json.dumps({
|
if ttsResult.get("success") is False:
|
||||||
"type": "playAudio",
|
raise RuntimeError(f"TTS backend error: {ttsResult.get('error', 'unknown')}")
|
||||||
"sessionId": sessionId,
|
|
||||||
"audio": {
|
audioContent = ttsResult.get("audioContent")
|
||||||
"data": base64.b64encode(audioContent if isinstance(audioContent, bytes) else audioContent.encode()).decode(),
|
if not audioContent:
|
||||||
"format": "mp3",
|
raise RuntimeError("TTS returned no audioContent")
|
||||||
},
|
|
||||||
}))
|
if websocket:
|
||||||
elif audioContent and not websocket:
|
await websocket.send_text(json.dumps({
|
||||||
logger.info(f"TTS audio generated for session {sessionId} (HTTP mode - no WebSocket for playback)")
|
"type": "playAudio",
|
||||||
|
"sessionId": sessionId,
|
||||||
|
"audio": {
|
||||||
|
"data": base64.b64encode(audioContent if isinstance(audioContent, bytes) else audioContent.encode()).decode(),
|
||||||
|
"format": "mp3",
|
||||||
|
},
|
||||||
|
}))
|
||||||
|
logger.info(f"Session {sessionId}: TTS audio dispatched to bot")
|
||||||
|
else:
|
||||||
|
logger.warning(
|
||||||
|
f"Session {sessionId}: TTS audio generated but cannot be played (bot websocket unavailable, likely fallback mode)"
|
||||||
|
)
|
||||||
except Exception as ttsErr:
|
except Exception as ttsErr:
|
||||||
logger.warning(f"TTS failed for session {sessionId}: {ttsErr}")
|
logger.warning(f"TTS failed for session {sessionId}: {ttsErr}")
|
||||||
if not sendChat:
|
if not sendChat:
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue