feat: map audio STT to recent caption speaker hints
Made-with: Cursor
This commit is contained in:
parent
5684a4d769
commit
43e310a41d
2 changed files with 46 additions and 1 deletions
|
|
@ -1073,6 +1073,7 @@ async def postTranscript(
|
||||||
speaker = transcript.get("speaker", "Unknown")
|
speaker = transcript.get("speaker", "Unknown")
|
||||||
text = transcript.get("text", "")
|
text = transcript.get("text", "")
|
||||||
isFinal = transcript.get("isFinal", True)
|
isFinal = transcript.get("isFinal", True)
|
||||||
|
source = transcript.get("source", "caption")
|
||||||
|
|
||||||
if not text.strip():
|
if not text.strip():
|
||||||
return {"success": True, "message": "Empty transcript ignored"}
|
return {"success": True, "message": "Empty transcript ignored"}
|
||||||
|
|
@ -1110,6 +1111,7 @@ async def postTranscript(
|
||||||
interface=interface,
|
interface=interface,
|
||||||
voiceInterface=voiceInterface,
|
voiceInterface=voiceInterface,
|
||||||
websocket=None, # No WebSocket in HTTP mode
|
websocket=None, # No WebSocket in HTTP mode
|
||||||
|
source=source,
|
||||||
)
|
)
|
||||||
|
|
||||||
logger.info(f"HTTP transcript received: session={sessionId}, speaker={speaker}, text={text[:50]}...")
|
logger.info(f"HTTP transcript received: session={sessionId}, speaker={speaker}, text={text[:50]}...")
|
||||||
|
|
|
||||||
|
|
@ -87,6 +87,7 @@ class TeamsbotService:
|
||||||
self._lastTranscriptSpeaker: Optional[str] = None
|
self._lastTranscriptSpeaker: Optional[str] = None
|
||||||
self._lastTranscriptText: Optional[str] = None
|
self._lastTranscriptText: Optional[str] = None
|
||||||
self._lastTranscriptId: Optional[str] = None
|
self._lastTranscriptId: Optional[str] = None
|
||||||
|
self._recentSpeakerHints: List[Dict[str, Any]] = []
|
||||||
|
|
||||||
# =========================================================================
|
# =========================================================================
|
||||||
# Session Lifecycle
|
# Session Lifecycle
|
||||||
|
|
@ -267,6 +268,7 @@ class TeamsbotService:
|
||||||
|
|
||||||
if msgType == "transcript":
|
if msgType == "transcript":
|
||||||
transcript = message.get("transcript", {})
|
transcript = message.get("transcript", {})
|
||||||
|
source = transcript.get("source", "caption")
|
||||||
logger.info(f"[WS] Transcript: speaker={transcript.get('speaker')}, text={transcript.get('text', '')[:60]}...")
|
logger.info(f"[WS] Transcript: speaker={transcript.get('speaker')}, text={transcript.get('text', '')[:60]}...")
|
||||||
await self._processTranscript(
|
await self._processTranscript(
|
||||||
sessionId=sessionId,
|
sessionId=sessionId,
|
||||||
|
|
@ -276,6 +278,7 @@ class TeamsbotService:
|
||||||
interface=interface,
|
interface=interface,
|
||||||
voiceInterface=voiceInterface,
|
voiceInterface=voiceInterface,
|
||||||
websocket=websocket,
|
websocket=websocket,
|
||||||
|
source=source,
|
||||||
)
|
)
|
||||||
|
|
||||||
elif msgType == "chatMessage":
|
elif msgType == "chatMessage":
|
||||||
|
|
@ -437,10 +440,11 @@ class TeamsbotService:
|
||||||
if sttResult and sttResult.get("success") and sttResult.get("text"):
|
if sttResult and sttResult.get("success") and sttResult.get("text"):
|
||||||
text = sttResult["text"].strip()
|
text = sttResult["text"].strip()
|
||||||
if text:
|
if text:
|
||||||
|
resolvedSpeaker = self._resolveSpeakerForAudioCapture()
|
||||||
logger.info(f"[AudioChunk] STT result: {text[:80]}...")
|
logger.info(f"[AudioChunk] STT result: {text[:80]}...")
|
||||||
await self._processTranscript(
|
await self._processTranscript(
|
||||||
sessionId=sessionId,
|
sessionId=sessionId,
|
||||||
speaker="Meeting Audio",
|
speaker=resolvedSpeaker,
|
||||||
text=text,
|
text=text,
|
||||||
isFinal=True,
|
isFinal=True,
|
||||||
interface=interface,
|
interface=interface,
|
||||||
|
|
@ -451,6 +455,38 @@ class TeamsbotService:
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"[AudioChunk] STT error for session {sessionId}: {type(e).__name__}: {e}")
|
logger.error(f"[AudioChunk] STT error for session {sessionId}: {type(e).__name__}: {e}")
|
||||||
|
|
||||||
|
def _registerSpeakerHint(self, speaker: str, text: str, maxHints: int = 20):
    """Record a caption-derived speaker hint for later audio attribution.

    Appends a ``{"speaker", "text", "timestamp"}`` dict to
    ``self._recentSpeakerHints`` and trims that list so it never holds
    more than ``maxHints`` entries (oldest entries are dropped first).

    Args:
        speaker: Display name taken from the caption. Missing, blank or
            non-string values are ignored, as are names for which
            ``self._isBotSpeaker`` returns a truthy value.
        text: Caption text that accompanied the speaker name. ``None`` or
            a non-string value is stored as an empty string.
        maxHints: Upper bound on the number of retained hints.
    """
    # Payloads originate from JSON, so the speaker field is not guaranteed
    # to be a string; calling .strip() on anything else would raise.
    if not isinstance(speaker, str):
        return

    speakerName = speaker.strip()
    if not speakerName:
        return
    # NOTE(review): _isBotSpeaker is defined elsewhere; presumably it matches
    # the bot's own display name so the bot is never suggested as a speaker.
    if self._isBotSpeaker(speakerName):
        return

    hintText = text.strip() if isinstance(text, str) else ""
    self._recentSpeakerHints.append({
        "speaker": speakerName,
        "text": hintText,
        "timestamp": time.time(),
    })

    # Trim in place (oldest first) instead of rebinding a fresh list.
    overflow = len(self._recentSpeakerHints) - max(maxHints, 0)
    if overflow > 0:
        del self._recentSpeakerHints[:overflow]
|
||||||
|
|
||||||
|
def _resolveSpeakerForAudioCapture(self, maxHintAgeSeconds: float = 15.0) -> str:
    """Return a best-effort speaker name for an audio-derived transcript.

    Scans ``self._recentSpeakerHints`` from newest to oldest and returns
    the speaker of the first hint whose age (current ``time.time()`` minus
    the hint's ``"timestamp"``) is at most ``maxHintAgeSeconds``.

    Args:
        maxHintAgeSeconds: Maximum age, in seconds, a hint may have and
            still be used. A short window reduces wrong attribution when
            the active speaker has changed since the last caption.

    Returns:
        The hinted speaker name, or ``"Meeting Audio"`` when there are no
        hints or none is recent enough.
    """
    fallbackSpeaker = "Meeting Audio"
    hints = self._recentSpeakerHints
    if not hints:
        return fallbackSpeaker

    currentTs = time.time()
    # Newest hints sit at the end of the list, so walk it backwards.
    for entry in reversed(hints):
        # A hint without a timestamp is treated as infinitely old.
        ageSeconds = currentTs - entry.get("timestamp", 0)
        if ageSeconds <= maxHintAgeSeconds:
            return entry.get("speaker", fallbackSpeaker)

    return fallbackSpeaker
|
||||||
|
|
||||||
async def _processTranscript(
|
async def _processTranscript(
|
||||||
self,
|
self,
|
||||||
sessionId: str,
|
sessionId: str,
|
||||||
|
|
@ -475,6 +511,13 @@ class TeamsbotService:
|
||||||
if not text:
|
if not text:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# Speaker hints are lightweight caption-derived signals used only to
|
||||||
|
# attribute audio-stream STT to likely speakers. They are not persisted.
|
||||||
|
if source in ("caption", "speakerHint"):
|
||||||
|
self._registerSpeakerHint(speaker, text)
|
||||||
|
if source == "speakerHint":
|
||||||
|
return
|
||||||
|
|
||||||
# Filter out the bot's own speech entirely — captions of the bot's
|
# Filter out the bot's own speech entirely — captions of the bot's
|
||||||
# own voice come back as garbled text (e.g. German TTS → English caption)
|
# own voice come back as garbled text (e.g. German TTS → English caption)
|
||||||
# which pollutes the context buffer and confuses AI analysis.
|
# which pollutes the context buffer and confuses AI analysis.
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue