From 1dd354794b4d03bfc3e4a2668ddc4ec4a8d44a7c Mon Sep 17 00:00:00 2001
From: patrick-motsch
Date: Sat, 28 Feb 2026 00:49:10 +0100
Subject: [PATCH] Teamsbot: STT phrase hints, multi-lang, follow-up window,
known speakers collection
Made-with: Cursor
---
modules/connectors/connectorVoiceGoogle.py | 25 ++++++++++++++------
modules/features/teamsbot/service.py | 26 ++++++++++++++++++++-
modules/interfaces/interfaceVoiceObjects.py | 12 +++++++---
3 files changed, 52 insertions(+), 11 deletions(-)
diff --git a/modules/connectors/connectorVoiceGoogle.py b/modules/connectors/connectorVoiceGoogle.py
index 10720efc..9fad87b9 100644
--- a/modules/connectors/connectorVoiceGoogle.py
+++ b/modules/connectors/connectorVoiceGoogle.py
@@ -9,7 +9,7 @@ import json
import html
import asyncio
import logging
-from typing import Dict, Optional, Any
+from typing import Dict, Optional, Any, List
from google.cloud import speech
from google.cloud import translate_v2 as translate
from google.cloud import texttospeech
@@ -60,7 +60,9 @@ class ConnectorGoogleSpeech:
async def speechToText(self, audioContent: bytes, language: str = "de-DE",
sampleRate: int = None, channels: int = None,
- skipFallbacks: bool = False) -> Dict:
+ skipFallbacks: bool = False,
+                           phraseHints: Optional[List[str]] = None,
+                           alternativeLanguages: Optional[List[str]] = None) -> Dict:
"""
Convert speech to text using Google Cloud Speech-to-Text API.
@@ -149,12 +151,21 @@ class ConnectorGoogleSpeech:
"audio_channel_count": channels,
"language_code": language,
"enable_automatic_punctuation": True,
- "model": "latest_long", # Try latest_long model for better recognition
- "enable_word_time_offsets": True, # Enable word-level timing
- "enable_word_confidence": True, # Enable word-level confidence
- "max_alternatives": 3, # Try more alternatives
- "use_enhanced": True # Use enhanced model if available
+ "model": "latest_long",
+ "enable_word_time_offsets": True,
+ "enable_word_confidence": True,
+ "max_alternatives": 3,
+ "use_enhanced": True,
}
+
+ if phraseHints:
+ configParams["speech_contexts"] = [speech.SpeechContext(
+ phrases=phraseHints,
+ boost=15.0,
+ )]
+
+ if alternativeLanguages:
+ configParams["alternative_language_codes"] = alternativeLanguages
# Only add sample_rate_hertz if needed (not for WEBM_OPUS)
if useSampleRate:
diff --git a/modules/features/teamsbot/service.py b/modules/features/teamsbot/service.py
index 503cf347..a127dff7 100644
--- a/modules/features/teamsbot/service.py
+++ b/modules/features/teamsbot/service.py
@@ -94,9 +94,11 @@ class TeamsbotService:
# Speaker attribution: simple last-caption-speaker model
self._lastCaptionSpeaker: Optional[str] = None
self._unattributedTranscriptIds: List[str] = []
+ self._knownSpeakers: set = set()
# Debounced name trigger: wait for speaker to finish before AI analysis
self._pendingNameTrigger: Optional[Dict[str, Any]] = None
+ self._followUpWindowEnd: float = 0.0
# =========================================================================
# Session Lifecycle
@@ -469,12 +471,18 @@ class TeamsbotService:
# Treat sampleRate=0 as unknown (triggers auto-detection)
effectiveSampleRate = sampleRate if sampleRate and sampleRate > 0 else None
+ phraseHints = list(self._knownSpeakers)
+ if self.config.botName:
+ phraseHints.append(self.config.botName)
+
sttResult = await voiceInterface.speechToText(
audioContent=audioBytes,
language=self.config.language or "de-DE",
sampleRate=effectiveSampleRate,
channels=1,
skipFallbacks=True,
+ phraseHints=phraseHints if phraseHints else None,
+ alternativeLanguages=["en-US"],
)
if sttResult and sttResult.get("success") and sttResult.get("text"):
@@ -512,6 +520,7 @@ class TeamsbotService:
prevSpeaker = self._lastCaptionSpeaker
self._lastCaptionSpeaker = normalizedSpeaker
+ self._knownSpeakers.add(normalizedSpeaker)
if prevSpeaker is None and self._unattributedTranscriptIds:
from . import interfaceFeatureTeamsbot as interfaceDb
@@ -719,6 +728,20 @@ class TeamsbotService:
asyncio.create_task(self._checkPendingNameTrigger())
return
+ # Follow-up window: after a bot response, trigger AI for any human speech
+ # without requiring the bot name — the AI decides via shouldRespond
+ if (
+ source == "audioCapture"
+ and not self._isBotSpeaker(speaker)
+ and time.time() < self._followUpWindowEnd
+ and not self._pendingNameTrigger
+ ):
+ isNew = self._setPendingNameTrigger(sessionId, interface, voiceInterface, websocket, createdTranscript)
+ if isNew:
+ logger.info(f"Session {sessionId}: Follow-up window trigger (no name needed)")
+ asyncio.create_task(self._checkPendingNameTrigger())
+ return
+
# Periodic trigger (only when no debounce pending)
if not self._pendingNameTrigger:
shouldTrigger = self._shouldTriggerAnalysis(text)
@@ -1149,7 +1172,8 @@ class TeamsbotService:
self._lastTranscriptText = speechResult.responseText
self._lastTranscriptId = botTranscript.get("id")
- logger.info(f"Bot responded in session {sessionId}: intent={speechResult.detectedIntent}")
+ self._followUpWindowEnd = time.time() + 15.0
+ logger.info(f"Bot responded in session {sessionId}: intent={speechResult.detectedIntent}, follow-up window open for 15s")
# Step 5: Execute AI-issued commands (if any)
if speechResult.commands:
diff --git a/modules/interfaces/interfaceVoiceObjects.py b/modules/interfaces/interfaceVoiceObjects.py
index 6efe51cf..5c84c047 100644
--- a/modules/interfaces/interfaceVoiceObjects.py
+++ b/modules/interfaces/interfaceVoiceObjects.py
@@ -67,7 +67,9 @@ class VoiceObjects:
async def speechToText(self, audioContent: bytes, language: str = "de-DE",
sampleRate: int = None, channels: int = None,
- skipFallbacks: bool = False) -> Dict[str, Any]:
+ skipFallbacks: bool = False,
+ phraseHints: list = None,
+ alternativeLanguages: list = None) -> Dict[str, Any]:
"""
Convert speech to text using Google Cloud Speech-to-Text API.
@@ -77,12 +79,14 @@ class VoiceObjects:
sampleRate: Audio sample rate (auto-detected if None)
channels: Number of audio channels (auto-detected if None)
skipFallbacks: If True, skip fallback attempts (use when audio format is known)
+ phraseHints: Optional list of phrases to boost recognition (names, terms)
+ alternativeLanguages: Optional list of additional language codes for multi-language
Returns:
Dict containing transcribed text, confidence, and metadata
"""
try:
- logger.info(f"🎤 Speech-to-text request: {len(audioContent)} bytes, language: {language}")
+ logger.info(f"Speech-to-text request: {len(audioContent)} bytes, language: {language}")
connector = self._getGoogleSpeechConnector()
result = await connector.speechToText(
@@ -90,7 +94,9 @@ class VoiceObjects:
language=language,
sampleRate=sampleRate,
channels=channels,
- skipFallbacks=skipFallbacks
+ skipFallbacks=skipFallbacks,
+ phraseHints=phraseHints,
+ alternativeLanguages=alternativeLanguages,
)
if result["success"]: