feat: map audio STT to recent caption speaker hints
Made-with: Cursor
This commit is contained in:
parent
5684a4d769
commit
43e310a41d
2 changed files with 46 additions and 1 deletion
|
|
@ -1073,6 +1073,7 @@ async def postTranscript(
|
|||
speaker = transcript.get("speaker", "Unknown")
|
||||
text = transcript.get("text", "")
|
||||
isFinal = transcript.get("isFinal", True)
|
||||
source = transcript.get("source", "caption")
|
||||
|
||||
if not text.strip():
|
||||
return {"success": True, "message": "Empty transcript ignored"}
|
||||
|
|
@ -1110,6 +1111,7 @@ async def postTranscript(
|
|||
interface=interface,
|
||||
voiceInterface=voiceInterface,
|
||||
websocket=None, # No WebSocket in HTTP mode
|
||||
source=source,
|
||||
)
|
||||
|
||||
logger.info(f"HTTP transcript received: session={sessionId}, speaker={speaker}, text={text[:50]}...")
|
||||
|
|
|
|||
|
|
@ -87,6 +87,7 @@ class TeamsbotService:
|
|||
self._lastTranscriptSpeaker: Optional[str] = None
|
||||
self._lastTranscriptText: Optional[str] = None
|
||||
self._lastTranscriptId: Optional[str] = None
|
||||
self._recentSpeakerHints: List[Dict[str, Any]] = []
|
||||
|
||||
# =========================================================================
|
||||
# Session Lifecycle
|
||||
|
|
@ -267,6 +268,7 @@ class TeamsbotService:
|
|||
|
||||
if msgType == "transcript":
|
||||
transcript = message.get("transcript", {})
|
||||
source = transcript.get("source", "caption")
|
||||
logger.info(f"[WS] Transcript: speaker={transcript.get('speaker')}, text={transcript.get('text', '')[:60]}...")
|
||||
await self._processTranscript(
|
||||
sessionId=sessionId,
|
||||
|
|
@ -276,6 +278,7 @@ class TeamsbotService:
|
|||
interface=interface,
|
||||
voiceInterface=voiceInterface,
|
||||
websocket=websocket,
|
||||
source=source,
|
||||
)
|
||||
|
||||
elif msgType == "chatMessage":
|
||||
|
|
@ -437,10 +440,11 @@ class TeamsbotService:
|
|||
if sttResult and sttResult.get("success") and sttResult.get("text"):
|
||||
text = sttResult["text"].strip()
|
||||
if text:
|
||||
resolvedSpeaker = self._resolveSpeakerForAudioCapture()
|
||||
logger.info(f"[AudioChunk] STT result: {text[:80]}...")
|
||||
await self._processTranscript(
|
||||
sessionId=sessionId,
|
||||
- speaker="Meeting Audio",
|
||||
speaker=resolvedSpeaker,
|
||||
text=text,
|
||||
isFinal=True,
|
||||
interface=interface,
|
||||
|
|
@ -451,6 +455,38 @@ class TeamsbotService:
|
|||
except Exception as e:
|
||||
logger.error(f"[AudioChunk] STT error for session {sessionId}: {type(e).__name__}: {e}")
|
||||
|
||||
def _registerSpeakerHint(self, speaker: str, text: str):
|
||||
"""Store recent speaker hints from captions for audio-mode speaker attribution."""
|
||||
if not speaker:
|
||||
return
|
||||
normalizedSpeaker = speaker.strip()
|
||||
if not normalizedSpeaker or self._isBotSpeaker(normalizedSpeaker):
|
||||
return
|
||||
|
||||
self._recentSpeakerHints.append({
|
||||
"speaker": normalizedSpeaker,
|
||||
"text": (text or "").strip(),
|
||||
"timestamp": time.time(),
|
||||
})
|
||||
|
||||
# Keep only the latest 20 hints
|
||||
if len(self._recentSpeakerHints) > 20:
|
||||
self._recentSpeakerHints = self._recentSpeakerHints[-20:]
|
||||
|
||||
def _resolveSpeakerForAudioCapture(self) -> str:
|
||||
"""Best-effort speaker name for audio chunks using recent caption hints."""
|
||||
if not self._recentSpeakerHints:
|
||||
return "Meeting Audio"
|
||||
|
||||
nowTs = time.time()
|
||||
# Prefer very recent hints to reduce wrong attribution
|
||||
for hint in reversed(self._recentSpeakerHints):
|
||||
hintAge = nowTs - hint.get("timestamp", 0)
|
||||
if hintAge <= 15:
|
||||
return hint.get("speaker", "Meeting Audio")
|
||||
|
||||
return "Meeting Audio"
|
||||
|
||||
async def _processTranscript(
|
||||
self,
|
||||
sessionId: str,
|
||||
|
|
@ -475,6 +511,13 @@ class TeamsbotService:
|
|||
if not text:
|
||||
return
|
||||
|
||||
# Speaker hints are lightweight caption-derived signals used only to
|
||||
# attribute audio-stream STT to likely speakers. They are not persisted.
|
||||
if source in ("caption", "speakerHint"):
|
||||
self._registerSpeakerHint(speaker, text)
|
||||
if source == "speakerHint":
|
||||
return
|
||||
|
||||
# Filter out the bot's own speech entirely — captions of the bot's
|
||||
# own voice come back as garbled text (e.g. German TTS → English caption)
|
||||
# which pollutes the context buffer and confuses AI analysis.
|
||||
|
|
|
|||
Loading…
Reference in a new issue