Teamsbot: STT phrase hints, multi-lang, follow-up window, known speakers collection

Made-with: Cursor
2026-02-28 00:49:10 +01:00 · 2026-02-28 00:49:10 +01:00 · 1dd354794b
commit 1dd354794b
parent b9c3ad38fb
3 changed files with 52 additions and 11 deletions
--- a/modules/connectors/connectorVoiceGoogle.py
+++ b/modules/connectors/connectorVoiceGoogle.py
@ -9,7 +9,7 @@ import json
 import html
 import asyncio
 import logging
-from typing import Dict, Optional, Any
+from typing import Dict, Optional, Any, List
 from google.cloud import speech
 from google.cloud import translate_v2 as translate
 from google.cloud import texttospeech
@ -60,7 +60,9 @@ class ConnectorGoogleSpeech:
    
    async def speechToText(self, audioContent: bytes, language: str = "de-DE", 
                           sampleRate: int = None, channels: int = None,
-                           skipFallbacks: bool = False) -> Dict:
+                           skipFallbacks: bool = False,
+                           phraseHints: Optional[list] = None,
+                           alternativeLanguages: Optional[list] = None) -> Dict:
        """
        Convert speech to text using Google Cloud Speech-to-Text API.
        
@ -149,13 +151,22 @@ class ConnectorGoogleSpeech:
                "audio_channel_count": channels,
                "language_code": language,
                "enable_automatic_punctuation": True,
-                "model": "latest_long",  # Try latest_long model for better recognition
-                "enable_word_time_offsets": True,  # Enable word-level timing
-                "enable_word_confidence": True,    # Enable word-level confidence
-                "max_alternatives": 3,             # Try more alternatives
-                "use_enhanced": True               # Use enhanced model if available
+                "model": "latest_long",
+                "enable_word_time_offsets": True,
+                "enable_word_confidence": True,
+                "max_alternatives": 3,
+                "use_enhanced": True,
            }

+            if phraseHints:
+                configParams["speech_contexts"] = [speech.SpeechContext(
+                    phrases=phraseHints,
+                    boost=15.0,
+                )]
+
+            if alternativeLanguages:
+                configParams["alternative_language_codes"] = alternativeLanguages
+            
            # Only add sample_rate_hertz if needed (not for WEBM_OPUS)
            if useSampleRate:
                configParams["sample_rate_hertz"] = sampleRate
--- a/modules/features/teamsbot/service.py
+++ b/modules/features/teamsbot/service.py
@ -94,9 +94,11 @@ class TeamsbotService:
        # Speaker attribution: simple last-caption-speaker model
        self._lastCaptionSpeaker: Optional[str] = None
        self._unattributedTranscriptIds: List[str] = []
+        self._knownSpeakers: set = set()

        # Debounced name trigger: wait for speaker to finish before AI analysis
        self._pendingNameTrigger: Optional[Dict[str, Any]] = None
+        self._followUpWindowEnd: float = 0.0

    # =========================================================================
    # Session Lifecycle
@ -469,12 +471,18 @@ class TeamsbotService:
            # Treat sampleRate=0 as unknown (triggers auto-detection)
            effectiveSampleRate = sampleRate if sampleRate and sampleRate > 0 else None

+            phraseHints = list(self._knownSpeakers)
+            if self.config.botName:
+                phraseHints.append(self.config.botName)
+
            sttResult = await voiceInterface.speechToText(
                audioContent=audioBytes,
                language=self.config.language or "de-DE",
                sampleRate=effectiveSampleRate,
                channels=1,
                skipFallbacks=True,
+                phraseHints=phraseHints if phraseHints else None,
+                alternativeLanguages=["en-US"],
            )

            if sttResult and sttResult.get("success") and sttResult.get("text"):
@ -512,6 +520,7 @@ class TeamsbotService:

        prevSpeaker = self._lastCaptionSpeaker
        self._lastCaptionSpeaker = normalizedSpeaker
+        self._knownSpeakers.add(normalizedSpeaker)

        if prevSpeaker is None and self._unattributedTranscriptIds:
            from . import interfaceFeatureTeamsbot as interfaceDb
@ -719,6 +728,20 @@ class TeamsbotService:
                asyncio.create_task(self._checkPendingNameTrigger())
            return

+        # Follow-up window: after a bot response, trigger AI for any human speech
+        # without requiring the bot name — the AI decides via shouldRespond
+        if (
+            source == "audioCapture"
+            and not self._isBotSpeaker(speaker)
+            and time.time() < self._followUpWindowEnd
+            and not self._pendingNameTrigger
+        ):
+            isNew = self._setPendingNameTrigger(sessionId, interface, voiceInterface, websocket, createdTranscript)
+            if isNew:
+                logger.info(f"Session {sessionId}: Follow-up window trigger (no name needed)")
+                asyncio.create_task(self._checkPendingNameTrigger())
+            return
+
        # Periodic trigger (only when no debounce pending)
        if not self._pendingNameTrigger:
            shouldTrigger = self._shouldTriggerAnalysis(text)
@ -1149,7 +1172,8 @@ class TeamsbotService:
                self._lastTranscriptText = speechResult.responseText
                self._lastTranscriptId = botTranscript.get("id")

-                logger.info(f"Bot responded in session {sessionId}: intent={speechResult.detectedIntent}")
+                self._followUpWindowEnd = time.time() + 15.0
+                logger.info(f"Bot responded in session {sessionId}: intent={speechResult.detectedIntent}, follow-up window open for 15s")

            # Step 5: Execute AI-issued commands (if any)
            if speechResult.commands:
--- a/modules/interfaces/interfaceVoiceObjects.py
+++ b/modules/interfaces/interfaceVoiceObjects.py
@ -67,7 +67,9 @@ class VoiceObjects:
    
    async def speechToText(self, audioContent: bytes, language: str = "de-DE", 
                          sampleRate: int = None, channels: int = None,
-                          skipFallbacks: bool = False) -> Dict[str, Any]:
+                          skipFallbacks: bool = False,
+                          phraseHints: list = None,
+                          alternativeLanguages: list = None) -> Dict[str, Any]:
        """
        Convert speech to text using Google Cloud Speech-to-Text API.
        
@ -77,12 +79,14 @@ class VoiceObjects:
            sampleRate: Audio sample rate (auto-detected if None)
            channels: Number of audio channels (auto-detected if None)
            skipFallbacks: If True, skip fallback attempts (use when audio format is known)
+            phraseHints: Optional list of phrases to boost recognition (names, terms)
+            alternativeLanguages: Optional list of additional language codes (e.g. "en-US") for multi-language recognition
            
        Returns:
            Dict containing transcribed text, confidence, and metadata
        """
        try:
-            logger.info(f"🎤 Speech-to-text request: {len(audioContent)} bytes, language: {language}")
+            logger.info(f"Speech-to-text request: {len(audioContent)} bytes, language: {language}")
            
            connector = self._getGoogleSpeechConnector()
            result = await connector.speechToText(
@ -90,7 +94,9 @@ class VoiceObjects:
                language=language,
                sampleRate=sampleRate,
                channels=channels,
-                skipFallbacks=skipFallbacks
+                skipFallbacks=skipFallbacks,
+                phraseHints=phraseHints,
+                alternativeLanguages=alternativeLanguages,
            )
            
            if result["success"]: