From 43e310a41df0f5897c5a46fe5a290208decc4af2 Mon Sep 17 00:00:00 2001
From: patrick-motsch
Date: Thu, 26 Feb 2026 09:05:21 +0100
Subject: [PATCH] feat: map audio STT to recent caption speaker hints
Made-with: Cursor
---
.../features/teamsbot/routeFeatureTeamsbot.py | 2 +
modules/features/teamsbot/service.py | 45 ++++++++++++++++++-
2 files changed, 46 insertions(+), 1 deletion(-)
diff --git a/modules/features/teamsbot/routeFeatureTeamsbot.py b/modules/features/teamsbot/routeFeatureTeamsbot.py
index f426ae1b..a764eb64 100644
--- a/modules/features/teamsbot/routeFeatureTeamsbot.py
+++ b/modules/features/teamsbot/routeFeatureTeamsbot.py
@@ -1073,6 +1073,7 @@ async def postTranscript(
speaker = transcript.get("speaker", "Unknown")
text = transcript.get("text", "")
isFinal = transcript.get("isFinal", True)
+ source = transcript.get("source", "caption")
if not text.strip():
return {"success": True, "message": "Empty transcript ignored"}
@@ -1110,6 +1111,7 @@ async def postTranscript(
interface=interface,
voiceInterface=voiceInterface,
websocket=None, # No WebSocket in HTTP mode
+ source=source,
)
logger.info(f"HTTP transcript received: session={sessionId}, speaker={speaker}, text={text[:50]}...")
diff --git a/modules/features/teamsbot/service.py b/modules/features/teamsbot/service.py
index 221f3a05..1dda67db 100644
--- a/modules/features/teamsbot/service.py
+++ b/modules/features/teamsbot/service.py
@@ -87,6 +87,7 @@ class TeamsbotService:
self._lastTranscriptSpeaker: Optional[str] = None
self._lastTranscriptText: Optional[str] = None
self._lastTranscriptId: Optional[str] = None
+ self._recentSpeakerHints: List[Dict[str, Any]] = []
# =========================================================================
# Session Lifecycle
@@ -267,6 +268,7 @@ class TeamsbotService:
if msgType == "transcript":
transcript = message.get("transcript", {})
+ source = transcript.get("source", "caption")
logger.info(f"[WS] Transcript: speaker={transcript.get('speaker')}, text={transcript.get('text', '')[:60]}...")
await self._processTranscript(
sessionId=sessionId,
@@ -276,6 +278,7 @@ class TeamsbotService:
interface=interface,
voiceInterface=voiceInterface,
websocket=websocket,
+ source=source,
)
elif msgType == "chatMessage":
@@ -437,10 +440,11 @@ class TeamsbotService:
if sttResult and sttResult.get("success") and sttResult.get("text"):
text = sttResult["text"].strip()
if text:
+ resolvedSpeaker = self._resolveSpeakerForAudioCapture()
logger.info(f"[AudioChunk] STT result: {text[:80]}...")
await self._processTranscript(
sessionId=sessionId,
- speaker="Meeting Audio",
+ speaker=resolvedSpeaker,
text=text,
isFinal=True,
interface=interface,
@@ -451,6 +455,38 @@ class TeamsbotService:
except Exception as e:
logger.error(f"[AudioChunk] STT error for session {sessionId}: {type(e).__name__}: {e}")
+ def _registerSpeakerHint(self, speaker: str, text: str):
+ """Record a speaker/text pair as a timestamped hint for later audio-mode speaker attribution; empty speakers and speakers flagged by _isBotSpeaker are skipped."""
+ if not speaker:  # None or empty speaker: nothing to attribute
+ return
+ normalizedSpeaker = speaker.strip()
+ if not normalizedSpeaker or self._isBotSpeaker(normalizedSpeaker):  # whitespace-only name, or flagged by _isBotSpeaker
+ return
+
+ self._recentSpeakerHints.append({
+ "speaker": normalizedSpeaker,
+ "text": (text or "").strip(),  # a None text is stored as ""
+ "timestamp": time.time(),  # seconds since the epoch; _resolveSpeakerForAudioCapture measures hint age from this
+ })
+
+ # Bound the buffer: once it exceeds 20 entries, keep only the 20 most recent hints
+ if len(self._recentSpeakerHints) > 20:
+ self._recentSpeakerHints = self._recentSpeakerHints[-20:]  # rebinds the attribute to a new, shorter list
+
+ def _resolveSpeakerForAudioCapture(self) -> str:
+ """Return the speaker of the newest stored hint that is at most 15 seconds old, or "Meeting Audio" when there is none."""
+ if not self._recentSpeakerHints:
+ return "Meeting Audio"
+
+ nowTs = time.time()
+ # Walk hints newest-first and trust only those at most 15 seconds old, to reduce wrong attribution
+ for hint in reversed(self._recentSpeakerHints):
+ hintAge = nowTs - hint.get("timestamp", 0)  # a missing timestamp counts as 0, so such a hint is treated as too old
+ if hintAge <= 15:
+ return hint.get("speaker", "Meeting Audio")
+
+ return "Meeting Audio"  # no hint was recent enough
+
async def _processTranscript(
self,
sessionId: str,
@@ -475,6 +511,13 @@ class TeamsbotService:
if not text:
return
+ # Speaker hints are lightweight caption-derived signals used only to
+ # attribute audio-stream STT to likely speakers. They are not persisted.
+ if source in ("caption", "speakerHint"):
+ self._registerSpeakerHint(speaker, text)
+ if source == "speakerHint":
+ return
+
# Filter out the bot's own speech entirely — captions of the bot's
# own voice come back as garbled text (e.g. German TTS → English caption)
# which pollutes the context buffer and confuses AI analysis.