From 43e310a41df0f5897c5a46fe5a290208decc4af2 Mon Sep 17 00:00:00 2001 From: patrick-motsch Date: Thu, 26 Feb 2026 09:05:21 +0100 Subject: [PATCH] feat: map audio STT to recent caption speaker hints Made-with: Cursor --- .../features/teamsbot/routeFeatureTeamsbot.py | 2 + modules/features/teamsbot/service.py | 45 ++++++++++++++++++- 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/modules/features/teamsbot/routeFeatureTeamsbot.py b/modules/features/teamsbot/routeFeatureTeamsbot.py index f426ae1b..a764eb64 100644 --- a/modules/features/teamsbot/routeFeatureTeamsbot.py +++ b/modules/features/teamsbot/routeFeatureTeamsbot.py @@ -1073,6 +1073,7 @@ async def postTranscript( speaker = transcript.get("speaker", "Unknown") text = transcript.get("text", "") isFinal = transcript.get("isFinal", True) + source = transcript.get("source", "caption") if not text.strip(): return {"success": True, "message": "Empty transcript ignored"} @@ -1110,6 +1111,7 @@ async def postTranscript( interface=interface, voiceInterface=voiceInterface, websocket=None, # No WebSocket in HTTP mode + source=source, ) logger.info(f"HTTP transcript received: session={sessionId}, speaker={speaker}, text={text[:50]}...") diff --git a/modules/features/teamsbot/service.py b/modules/features/teamsbot/service.py index 221f3a05..1dda67db 100644 --- a/modules/features/teamsbot/service.py +++ b/modules/features/teamsbot/service.py @@ -87,6 +87,7 @@ class TeamsbotService: self._lastTranscriptSpeaker: Optional[str] = None self._lastTranscriptText: Optional[str] = None self._lastTranscriptId: Optional[str] = None + self._recentSpeakerHints: List[Dict[str, Any]] = [] # ========================================================================= # Session Lifecycle @@ -267,6 +268,7 @@ class TeamsbotService: if msgType == "transcript": transcript = message.get("transcript", {}) + source = transcript.get("source", "caption") logger.info(f"[WS] Transcript: speaker={transcript.get('speaker')}, text={transcript.get('text', '')[:60]}...") await self._processTranscript( sessionId=sessionId, @@ -276,6 +278,7 @@ class TeamsbotService: interface=interface, voiceInterface=voiceInterface, websocket=websocket, + source=source, ) elif msgType == "chatMessage": @@ -437,10 +440,11 @@ class TeamsbotService: if sttResult and sttResult.get("success") and sttResult.get("text"): text = sttResult["text"].strip() if text: + resolvedSpeaker = self._resolveSpeakerForAudioCapture() logger.info(f"[AudioChunk] STT result: {text[:80]}...") await self._processTranscript( sessionId=sessionId, - speaker="Meeting Audio", + speaker=resolvedSpeaker, text=text, isFinal=True, interface=interface, @@ -451,6 +455,38 @@ class TeamsbotService: except Exception as e: logger.error(f"[AudioChunk] STT error for session {sessionId}: {type(e).__name__}: {e}") + def _registerSpeakerHint(self, speaker: str, text: str): + """Store recent speaker hints from captions for audio-mode speaker attribution.""" + if not speaker: + return + normalizedSpeaker = speaker.strip() + if not normalizedSpeaker or self._isBotSpeaker(normalizedSpeaker): + return + + self._recentSpeakerHints.append({ + "speaker": normalizedSpeaker, + "text": (text or "").strip(), + "timestamp": time.time(), + }) + + # Keep only the latest 20 hints + if len(self._recentSpeakerHints) > 20: + self._recentSpeakerHints = self._recentSpeakerHints[-20:] + + def _resolveSpeakerForAudioCapture(self) -> str: + """Best-effort speaker name for audio chunks using recent caption hints.""" + if not self._recentSpeakerHints: + return "Meeting Audio" + + nowTs = time.time() + # Prefer very recent hints to reduce wrong attribution + for hint in reversed(self._recentSpeakerHints): + hintAge = nowTs - hint.get("timestamp", 0) + if hintAge <= 15: + return hint.get("speaker", "Meeting Audio") + + return "Meeting Audio" + async def _processTranscript( self, sessionId: str, @@ -475,6 +511,13 @@ class TeamsbotService: if not text: return + # Speaker hints are lightweight caption-derived signals used only to + # attribute audio-stream STT to likely speakers. They are not persisted. + if source in ("caption", "speakerHint"): + self._registerSpeakerHint(speaker, text) + if source == "speakerHint": + return + # Filter out the bot's own speech entirely — captions of the bot's # own voice come back as garbled text (e.g. German TTS → English caption) # which pollutes the context buffer and confuses AI analysis.