fix: skip STT fallbacks for teamsbot, run audio processing in background

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
patrick-motsch 2026-02-13 18:17:47 +01:00
parent 77151df0f4
commit ae4dc9fa48
3 changed files with 43 additions and 13 deletions

View file

@@ -58,7 +58,8 @@ class ConnectorGoogleSpeech:
             raise

     async def speechToText(self, audioContent: bytes, language: str = "de-DE",
-                           sampleRate: int = None, channels: int = None) -> Dict:
+                           sampleRate: int = None, channels: int = None,
+                           skipFallbacks: bool = False) -> Dict:
         """
         Convert speech to text using Google Cloud Speech-to-Text API.
@@ -234,6 +235,15 @@ class ConnectorGoogleSpeech:
                     "error": f"Google Cloud error: {response.error}"
                 }

+            # Skip fallbacks when format is known (e.g. teamsbot with explicit LINEAR16 16kHz)
+            if skipFallbacks:
+                return {
+                    "success": False,
+                    "text": "",
+                    "confidence": 0.0,
+                    "error": "No recognition results (silence or unclear audio)"
+                }
+
             # Try multiple fallback approaches
             fallback_configs = []

View file

@@ -198,12 +198,15 @@ class TeamsbotService:
         audioBuffer = bytearray()
         bufferDurationMs = 0
-        targetBufferMs = 1500  # Buffer 1.5 seconds of audio before STT
+        targetBufferMs = 3000  # Buffer 3 seconds of audio before STT
         # PCM16 at 16kHz mono = 32000 bytes/second
         bytesPerSecond = 32000
         bytesPerMs = bytesPerSecond / 1000

+        # Track background STT/AI tasks so they don't block the WebSocket loop
+        backgroundTasks: list[asyncio.Task] = []
+
         logger.info(f"Audio processing started for session {sessionId}")

         try:
@@ -235,18 +238,26 @@ class TeamsbotService:
                     audioBuffer.extend(audioChunk)
                     bufferDurationMs = len(audioBuffer) / bytesPerMs

-                    # Process when buffer has enough audio
+                    # Process when buffer has enough audio - run in background to not block WebSocket
                     if bufferDurationMs >= targetBufferMs:
-                        await self._processAudioBuffer(
-                            bytes(audioBuffer),
-                            sessionId,
-                            interface,
-                            voiceInterface,
-                            websocket,
-                        )
+                        chunkBytes = bytes(audioBuffer)
                         audioBuffer.clear()
                         bufferDurationMs = 0
+                        task = asyncio.create_task(
+                            self._processAudioBuffer(
+                                chunkBytes,
+                                sessionId,
+                                interface,
+                                voiceInterface,
+                                websocket,
+                            )
+                        )
+                        backgroundTasks.append(task)
+                        # Clean up completed tasks
+                        backgroundTasks = [t for t in backgroundTasks if not t.done()]
         except Exception as e:
             if "disconnect" not in str(e).lower():
                 logger.error(f"Audio stream error for session {sessionId}: {e}")
@@ -261,6 +272,10 @@ class TeamsbotService:
                 websocket,
             )

+        # Wait for any remaining background tasks
+        if backgroundTasks:
+            await asyncio.gather(*backgroundTasks, return_exceptions=True)
+
         logger.info(f"Audio processing ended for session {sessionId}")

     async def _processAudioBuffer(
async def _processAudioBuffer( async def _processAudioBuffer(
@@ -274,12 +289,14 @@ class TeamsbotService:
         """Process a buffered audio chunk through the STT -> AI -> TTS pipeline."""
         # Step 1: STT -- convert audio to text
+        # skipFallbacks=True because we know the exact format (LINEAR16, 16kHz, mono from Teams)
         try:
             sttResult = await voiceInterface.speechToText(
                 audioContent=audioBytes,
                 language=self.config.language,
                 sampleRate=16000,
-                channels=1
+                channels=1,
+                skipFallbacks=True
             )
         except Exception as e:
             logger.warning(f"STT failed for session {sessionId}: {e}")

View file

@@ -66,7 +66,8 @@ class VoiceObjects:
     # Speech-to-Text Operations
     async def speechToText(self, audioContent: bytes, language: str = "de-DE",
-                           sampleRate: int = None, channels: int = None) -> Dict[str, Any]:
+                           sampleRate: int = None, channels: int = None,
+                           skipFallbacks: bool = False) -> Dict[str, Any]:
         """
         Convert speech to text using Google Cloud Speech-to-Text API.
@@ -75,6 +76,7 @@ class VoiceObjects:
             language: Language code (e.g., 'de-DE', 'en-US')
             sampleRate: Audio sample rate (auto-detected if None)
             channels: Number of audio channels (auto-detected if None)
+            skipFallbacks: If True, skip fallback attempts (use when audio format is known)

         Returns:
             Dict containing transcribed text, confidence, and metadata
@@ -87,7 +89,8 @@ class VoiceObjects:
                 audioContent=audioContent,
                 language=language,
                 sampleRate=sampleRate,
-                channels=channels
+                channels=channels,
+                skipFallbacks=skipFallbacks
             )
             if result["success"]: