feat: map audio STT to recent caption speaker hints

Made-with: Cursor
This commit is contained in:
patrick-motsch 2026-02-26 09:05:21 +01:00
parent 5684a4d769
commit 43e310a41d
2 changed files with 46 additions and 1 deletion

View file

@ -1073,6 +1073,7 @@ async def postTranscript(
speaker = transcript.get("speaker", "Unknown")
text = transcript.get("text", "")
isFinal = transcript.get("isFinal", True)
source = transcript.get("source", "caption")
if not text.strip():
return {"success": True, "message": "Empty transcript ignored"}
@ -1110,6 +1111,7 @@ async def postTranscript(
interface=interface,
voiceInterface=voiceInterface,
websocket=None, # No WebSocket in HTTP mode
source=source,
)
logger.info(f"HTTP transcript received: session={sessionId}, speaker={speaker}, text={text[:50]}...")

View file

@ -87,6 +87,7 @@ class TeamsbotService:
self._lastTranscriptSpeaker: Optional[str] = None
self._lastTranscriptText: Optional[str] = None
self._lastTranscriptId: Optional[str] = None
self._recentSpeakerHints: List[Dict[str, Any]] = []
# =========================================================================
# Session Lifecycle
@ -267,6 +268,7 @@ class TeamsbotService:
if msgType == "transcript":
transcript = message.get("transcript", {})
source = transcript.get("source", "caption")
logger.info(f"[WS] Transcript: speaker={transcript.get('speaker')}, text={transcript.get('text', '')[:60]}...")
await self._processTranscript(
sessionId=sessionId,
@ -276,6 +278,7 @@ class TeamsbotService:
interface=interface,
voiceInterface=voiceInterface,
websocket=websocket,
source=source,
)
elif msgType == "chatMessage":
@ -437,10 +440,11 @@ class TeamsbotService:
if sttResult and sttResult.get("success") and sttResult.get("text"):
text = sttResult["text"].strip()
if text:
resolvedSpeaker = self._resolveSpeakerForAudioCapture()
logger.info(f"[AudioChunk] STT result: {text[:80]}...")
await self._processTranscript(
sessionId=sessionId,
speaker="Meeting Audio",
speaker=resolvedSpeaker,
text=text,
isFinal=True,
interface=interface,
@ -451,6 +455,38 @@ class TeamsbotService:
except Exception as e:
logger.error(f"[AudioChunk] STT error for session {sessionId}: {type(e).__name__}: {e}")
def _registerSpeakerHint(self, speaker: str, text: str):
"""Store recent speaker hints from captions for audio-mode speaker attribution."""
if not speaker:
return
normalizedSpeaker = speaker.strip()
if not normalizedSpeaker or self._isBotSpeaker(normalizedSpeaker):
return
self._recentSpeakerHints.append({
"speaker": normalizedSpeaker,
"text": (text or "").strip(),
"timestamp": time.time(),
})
# Keep only the latest 20 hints
if len(self._recentSpeakerHints) > 20:
self._recentSpeakerHints = self._recentSpeakerHints[-20:]
def _resolveSpeakerForAudioCapture(self) -> str:
"""Best-effort speaker name for audio chunks using recent caption hints."""
if not self._recentSpeakerHints:
return "Meeting Audio"
nowTs = time.time()
# Prefer very recent hints to reduce wrong attribution
for hint in reversed(self._recentSpeakerHints):
hintAge = nowTs - hint.get("timestamp", 0)
if hintAge <= 15:
return hint.get("speaker", "Meeting Audio")
return "Meeting Audio"
async def _processTranscript(
self,
sessionId: str,
@ -475,6 +511,13 @@ class TeamsbotService:
if not text:
return
# Speaker hints are lightweight caption-derived signals used only to
# attribute audio-stream STT to likely speakers. They are not persisted.
if source in ("caption", "speakerHint"):
self._registerSpeakerHint(speaker, text)
if source == "speakerHint":
return
# Filter out the bot's own speech entirely — captions of the bot's
# own voice come back as garbled text (e.g. German TTS → English caption)
# which pollutes the context buffer and confuses AI analysis.