From 1dd354794b4d03bfc3e4a2668ddc4ec4a8d44a7c Mon Sep 17 00:00:00 2001 From: patrick-motsch Date: Sat, 28 Feb 2026 00:49:10 +0100 Subject: [PATCH] Teamsbot: STT phrase hints, multi-lang, follow-up window, known speakers collection Made-with: Cursor --- modules/connectors/connectorVoiceGoogle.py | 25 ++++++++++++++------ modules/features/teamsbot/service.py | 26 ++++++++++++++++++++- modules/interfaces/interfaceVoiceObjects.py | 12 +++++++--- 3 files changed, 52 insertions(+), 11 deletions(-) diff --git a/modules/connectors/connectorVoiceGoogle.py b/modules/connectors/connectorVoiceGoogle.py index 10720efc..9fad87b9 100644 --- a/modules/connectors/connectorVoiceGoogle.py +++ b/modules/connectors/connectorVoiceGoogle.py @@ -9,7 +9,7 @@ import json import html import asyncio import logging -from typing import Dict, Optional, Any +from typing import Dict, Optional, Any, List from google.cloud import speech from google.cloud import translate_v2 as translate from google.cloud import texttospeech @@ -60,7 +60,9 @@ class ConnectorGoogleSpeech: async def speechToText(self, audioContent: bytes, language: str = "de-DE", sampleRate: int = None, channels: int = None, - skipFallbacks: bool = False) -> Dict: + skipFallbacks: bool = False, + phraseHints: Optional[list] = None, + alternativeLanguages: Optional[list] = None) -> Dict: """ Convert speech to text using Google Cloud Speech-to-Text API. @@ -149,12 +151,21 @@ class ConnectorGoogleSpeech: "audio_channel_count": channels, "language_code": language, "enable_automatic_punctuation": True, - "model": "latest_long", # Try latest_long model for better recognition - "enable_word_time_offsets": True, # Enable word-level timing - "enable_word_confidence": True, # Enable word-level confidence - "max_alternatives": 3, # Try more alternatives - "use_enhanced": True # Use enhanced model if available + "model": "latest_long", + "enable_word_time_offsets": True, + "enable_word_confidence": True, + "max_alternatives": 3, + "use_enhanced": True, } + + if phraseHints: + configParams["speech_contexts"] = [speech.SpeechContext( + phrases=phraseHints, + boost=15.0, + )] + + if alternativeLanguages: + configParams["alternative_language_codes"] = alternativeLanguages # Only add sample_rate_hertz if needed (not for WEBM_OPUS) if useSampleRate: diff --git a/modules/features/teamsbot/service.py b/modules/features/teamsbot/service.py index 503cf347..a127dff7 100644 --- a/modules/features/teamsbot/service.py +++ b/modules/features/teamsbot/service.py @@ -94,9 +94,11 @@ class TeamsbotService: # Speaker attribution: simple last-caption-speaker model self._lastCaptionSpeaker: Optional[str] = None self._unattributedTranscriptIds: List[str] = [] + self._knownSpeakers: set = set() # Debounced name trigger: wait for speaker to finish before AI analysis self._pendingNameTrigger: Optional[Dict[str, Any]] = None + self._followUpWindowEnd: float = 0.0 # ========================================================================= # Session Lifecycle @@ -469,12 +471,18 @@ class TeamsbotService: # Treat sampleRate=0 as unknown (triggers auto-detection) effectiveSampleRate = sampleRate if sampleRate and sampleRate > 0 else None + phraseHints = list(self._knownSpeakers) + if self.config.botName: + phraseHints.append(self.config.botName) + sttResult = await voiceInterface.speechToText( audioContent=audioBytes, language=self.config.language or "de-DE", sampleRate=effectiveSampleRate, channels=1, skipFallbacks=True, + phraseHints=phraseHints if phraseHints else None, + alternativeLanguages=["en-US"], ) if sttResult and sttResult.get("success") and sttResult.get("text"): @@ -512,6 +520,7 @@ class TeamsbotService: prevSpeaker = self._lastCaptionSpeaker self._lastCaptionSpeaker = normalizedSpeaker + self._knownSpeakers.add(normalizedSpeaker) if prevSpeaker is None and self._unattributedTranscriptIds: from . import interfaceFeatureTeamsbot as interfaceDb @@ -719,6 +728,20 @@ class TeamsbotService: asyncio.create_task(self._checkPendingNameTrigger()) return + # Follow-up window: after a bot response, trigger AI for any human speech + # without requiring the bot name — the AI decides via shouldRespond + if ( + source == "audioCapture" + and not self._isBotSpeaker(speaker) + and time.time() < self._followUpWindowEnd + and not self._pendingNameTrigger + ): + isNew = self._setPendingNameTrigger(sessionId, interface, voiceInterface, websocket, createdTranscript) + if isNew: + logger.info(f"Session {sessionId}: Follow-up window trigger (no name needed)") + asyncio.create_task(self._checkPendingNameTrigger()) + return + # Periodic trigger (only when no debounce pending) if not self._pendingNameTrigger: shouldTrigger = self._shouldTriggerAnalysis(text) @@ -1149,7 +1172,8 @@ class TeamsbotService: self._lastTranscriptText = speechResult.responseText self._lastTranscriptId = botTranscript.get("id") - logger.info(f"Bot responded in session {sessionId}: intent={speechResult.detectedIntent}") + self._followUpWindowEnd = time.time() + 15.0 + logger.info(f"Bot responded in session {sessionId}: intent={speechResult.detectedIntent}, follow-up window open for 15s") # Step 5: Execute AI-issued commands (if any) if speechResult.commands: diff --git a/modules/interfaces/interfaceVoiceObjects.py b/modules/interfaces/interfaceVoiceObjects.py index 6efe51cf..5c84c047 100644 --- a/modules/interfaces/interfaceVoiceObjects.py +++ b/modules/interfaces/interfaceVoiceObjects.py @@ -67,7 +67,9 @@ class VoiceObjects: async def speechToText(self, audioContent: bytes, language: str = "de-DE", sampleRate: int = None, channels: int = None, - skipFallbacks: bool = False) -> Dict[str, Any]: + skipFallbacks: bool = False, + phraseHints: list = None, + alternativeLanguages: list = None) -> Dict[str, Any]: """ Convert speech to text using Google Cloud Speech-to-Text API. @@ -77,12 +79,14 @@ class VoiceObjects: sampleRate: Audio sample rate (auto-detected if None) channels: Number of audio channels (auto-detected if None) skipFallbacks: If True, skip fallback attempts (use when audio format is known) + phraseHints: Optional list of phrases to boost recognition (names, terms) + alternativeLanguages: Optional list of additional language codes for multi-language Returns: Dict containing transcribed text, confidence, and metadata """ try: - logger.info(f"🎤 Speech-to-text request: {len(audioContent)} bytes, language: {language}") + logger.info(f"Speech-to-text request: {len(audioContent)} bytes, language: {language}") connector = self._getGoogleSpeechConnector() result = await connector.speechToText( @@ -90,7 +94,9 @@ class VoiceObjects: language=language, sampleRate=sampleRate, channels=channels, - skipFallbacks=skipFallbacks + skipFallbacks=skipFallbacks, + phraseHints=phraseHints, + alternativeLanguages=alternativeLanguages, ) if result["success"]: