Teamsbot: add STT phrase hints, multi-language recognition, follow-up window, and known-speakers collection

Made-with: Cursor
This commit is contained in:
patrick-motsch 2026-02-28 00:49:10 +01:00
parent b9c3ad38fb
commit 1dd354794b
3 changed files with 52 additions and 11 deletions

View file

@ -9,7 +9,7 @@ import json
import html import html
import asyncio import asyncio
import logging import logging
from typing import Dict, Optional, Any from typing import Dict, Optional, Any, List
from google.cloud import speech from google.cloud import speech
from google.cloud import translate_v2 as translate from google.cloud import translate_v2 as translate
from google.cloud import texttospeech from google.cloud import texttospeech
@ -60,7 +60,9 @@ class ConnectorGoogleSpeech:
async def speechToText(self, audioContent: bytes, language: str = "de-DE", async def speechToText(self, audioContent: bytes, language: str = "de-DE",
sampleRate: int = None, channels: int = None, sampleRate: int = None, channels: int = None,
skipFallbacks: bool = False) -> Dict: skipFallbacks: bool = False,
phraseHints: Optional[list] = None,
alternativeLanguages: Optional[list] = None) -> Dict:
""" """
Convert speech to text using Google Cloud Speech-to-Text API. Convert speech to text using Google Cloud Speech-to-Text API.
@ -149,12 +151,21 @@ class ConnectorGoogleSpeech:
"audio_channel_count": channels, "audio_channel_count": channels,
"language_code": language, "language_code": language,
"enable_automatic_punctuation": True, "enable_automatic_punctuation": True,
"model": "latest_long", # Try latest_long model for better recognition "model": "latest_long",
"enable_word_time_offsets": True, # Enable word-level timing "enable_word_time_offsets": True,
"enable_word_confidence": True, # Enable word-level confidence "enable_word_confidence": True,
"max_alternatives": 3, # Try more alternatives "max_alternatives": 3,
"use_enhanced": True # Use enhanced model if available "use_enhanced": True,
} }
if phraseHints:
configParams["speech_contexts"] = [speech.SpeechContext(
phrases=phraseHints,
boost=15.0,
)]
if alternativeLanguages:
configParams["alternative_language_codes"] = alternativeLanguages
# Only add sample_rate_hertz if needed (not for WEBM_OPUS) # Only add sample_rate_hertz if needed (not for WEBM_OPUS)
if useSampleRate: if useSampleRate:

View file

@ -94,9 +94,11 @@ class TeamsbotService:
# Speaker attribution: simple last-caption-speaker model # Speaker attribution: simple last-caption-speaker model
self._lastCaptionSpeaker: Optional[str] = None self._lastCaptionSpeaker: Optional[str] = None
self._unattributedTranscriptIds: List[str] = [] self._unattributedTranscriptIds: List[str] = []
self._knownSpeakers: set = set()
# Debounced name trigger: wait for speaker to finish before AI analysis # Debounced name trigger: wait for speaker to finish before AI analysis
self._pendingNameTrigger: Optional[Dict[str, Any]] = None self._pendingNameTrigger: Optional[Dict[str, Any]] = None
self._followUpWindowEnd: float = 0.0
# ========================================================================= # =========================================================================
# Session Lifecycle # Session Lifecycle
@ -469,12 +471,18 @@ class TeamsbotService:
# Treat sampleRate=0 as unknown (triggers auto-detection) # Treat sampleRate=0 as unknown (triggers auto-detection)
effectiveSampleRate = sampleRate if sampleRate and sampleRate > 0 else None effectiveSampleRate = sampleRate if sampleRate and sampleRate > 0 else None
phraseHints = list(self._knownSpeakers)
if self.config.botName:
phraseHints.append(self.config.botName)
sttResult = await voiceInterface.speechToText( sttResult = await voiceInterface.speechToText(
audioContent=audioBytes, audioContent=audioBytes,
language=self.config.language or "de-DE", language=self.config.language or "de-DE",
sampleRate=effectiveSampleRate, sampleRate=effectiveSampleRate,
channels=1, channels=1,
skipFallbacks=True, skipFallbacks=True,
phraseHints=phraseHints if phraseHints else None,
alternativeLanguages=["en-US"],
) )
if sttResult and sttResult.get("success") and sttResult.get("text"): if sttResult and sttResult.get("success") and sttResult.get("text"):
@ -512,6 +520,7 @@ class TeamsbotService:
prevSpeaker = self._lastCaptionSpeaker prevSpeaker = self._lastCaptionSpeaker
self._lastCaptionSpeaker = normalizedSpeaker self._lastCaptionSpeaker = normalizedSpeaker
self._knownSpeakers.add(normalizedSpeaker)
if prevSpeaker is None and self._unattributedTranscriptIds: if prevSpeaker is None and self._unattributedTranscriptIds:
from . import interfaceFeatureTeamsbot as interfaceDb from . import interfaceFeatureTeamsbot as interfaceDb
@ -719,6 +728,20 @@ class TeamsbotService:
asyncio.create_task(self._checkPendingNameTrigger()) asyncio.create_task(self._checkPendingNameTrigger())
return return
# Follow-up window: after a bot response, trigger AI for any human speech
# without requiring the bot name — the AI decides via shouldRespond
if (
source == "audioCapture"
and not self._isBotSpeaker(speaker)
and time.time() < self._followUpWindowEnd
and not self._pendingNameTrigger
):
isNew = self._setPendingNameTrigger(sessionId, interface, voiceInterface, websocket, createdTranscript)
if isNew:
logger.info(f"Session {sessionId}: Follow-up window trigger (no name needed)")
asyncio.create_task(self._checkPendingNameTrigger())
return
# Periodic trigger (only when no debounce pending) # Periodic trigger (only when no debounce pending)
if not self._pendingNameTrigger: if not self._pendingNameTrigger:
shouldTrigger = self._shouldTriggerAnalysis(text) shouldTrigger = self._shouldTriggerAnalysis(text)
@ -1149,7 +1172,8 @@ class TeamsbotService:
self._lastTranscriptText = speechResult.responseText self._lastTranscriptText = speechResult.responseText
self._lastTranscriptId = botTranscript.get("id") self._lastTranscriptId = botTranscript.get("id")
logger.info(f"Bot responded in session {sessionId}: intent={speechResult.detectedIntent}") self._followUpWindowEnd = time.time() + 15.0
logger.info(f"Bot responded in session {sessionId}: intent={speechResult.detectedIntent}, follow-up window open for 15s")
# Step 5: Execute AI-issued commands (if any) # Step 5: Execute AI-issued commands (if any)
if speechResult.commands: if speechResult.commands:

View file

@ -67,7 +67,9 @@ class VoiceObjects:
async def speechToText(self, audioContent: bytes, language: str = "de-DE", async def speechToText(self, audioContent: bytes, language: str = "de-DE",
sampleRate: int = None, channels: int = None, sampleRate: int = None, channels: int = None,
skipFallbacks: bool = False) -> Dict[str, Any]: skipFallbacks: bool = False,
phraseHints: list = None,
alternativeLanguages: list = None) -> Dict[str, Any]:
""" """
Convert speech to text using Google Cloud Speech-to-Text API. Convert speech to text using Google Cloud Speech-to-Text API.
@ -77,12 +79,14 @@ class VoiceObjects:
sampleRate: Audio sample rate (auto-detected if None) sampleRate: Audio sample rate (auto-detected if None)
channels: Number of audio channels (auto-detected if None) channels: Number of audio channels (auto-detected if None)
skipFallbacks: If True, skip fallback attempts (use when audio format is known) skipFallbacks: If True, skip fallback attempts (use when audio format is known)
phraseHints: Optional list of phrases to boost recognition (names, terms)
alternativeLanguages: Optional list of additional language codes for multi-language
Returns: Returns:
Dict containing transcribed text, confidence, and metadata Dict containing transcribed text, confidence, and metadata
""" """
try: try:
logger.info(f"🎤 Speech-to-text request: {len(audioContent)} bytes, language: {language}") logger.info(f"Speech-to-text request: {len(audioContent)} bytes, language: {language}")
connector = self._getGoogleSpeechConnector() connector = self._getGoogleSpeechConnector()
result = await connector.speechToText( result = await connector.speechToText(
@ -90,7 +94,9 @@ class VoiceObjects:
language=language, language=language,
sampleRate=sampleRate, sampleRate=sampleRate,
channels=channels, channels=channels,
skipFallbacks=skipFallbacks skipFallbacks=skipFallbacks,
phraseHints=phraseHints,
alternativeLanguages=alternativeLanguages,
) )
if result["success"]: if result["success"]: