From ae4dc9fa48e4ff06590fc6a0bea581ce04ba42a2 Mon Sep 17 00:00:00 2001
From: patrick-motsch <p.motsch@valueon.ch>
Date: Fri, 13 Feb 2026 18:17:47 +0100
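Subject: [PATCH] fix: skip STT fallbacks for teamsbot, run audio processing in
 background

Add a skipFallbacks flag to ConnectorGoogleSpeech.speechToText and
VoiceObjects.speechToText. When it is set, the "No recognition results"
error is returned immediately instead of retrying with the fallback
configs. The teamsbot service sets it because it always passes
sampleRate=16000 and channels=1.

The teamsbot audio loop now hands each full buffer to an asyncio task
instead of awaiting it inline, and waits for outstanding tasks before
logging the end of audio processing. The buffer target is also raised
from 1500 ms to 3000 ms.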

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 modules/connectors/connectorVoiceGoogle.py  | 12 ++++++-
 modules/features/teamsbot/service.py        | 37 +++++++++++++++------
 modules/interfaces/interfaceVoiceObjects.py |  7 ++--
 3 files changed, 43 insertions(+), 13 deletions(-)

diff --git a/modules/connectors/connectorVoiceGoogle.py b/modules/connectors/connectorVoiceGoogle.py
index c32b7fa4..85a19bf7 100644
--- a/modules/connectors/connectorVoiceGoogle.py
+++ b/modules/connectors/connectorVoiceGoogle.py
@@ -58,7 +58,8 @@ class ConnectorGoogleSpeech:
             raise
     
     async def speechToText(self, audioContent: bytes, language: str = "de-DE", 
-                           sampleRate: int = None, channels: int = None) -> Dict:
+                           sampleRate: int = None, channels: int = None,
+                           skipFallbacks: bool = False) -> Dict:
         """
         Convert speech to text using Google Cloud Speech-to-Text API.
         
@@ -234,6 +235,15 @@ class ConnectorGoogleSpeech:
                         "error": f"Google Cloud error: {response.error}"
                     }
                 
+                # Skip fallbacks when format is known (e.g. teamsbot with explicit LINEAR16 16kHz)
+                if skipFallbacks:
+                    return {
+                        "success": False,
+                        "text": "",
+                        "confidence": 0.0,
+                        "error": "No recognition results (silence or unclear audio)"
+                    }
+
                 # Try multiple fallback approaches
                 fallback_configs = []
                 
diff --git a/modules/features/teamsbot/service.py b/modules/features/teamsbot/service.py
index a7a266ec..ea10fddd 100644
--- a/modules/features/teamsbot/service.py
+++ b/modules/features/teamsbot/service.py
@@ -198,12 +198,15 @@ class TeamsbotService:
 
         audioBuffer = bytearray()
         bufferDurationMs = 0
-        targetBufferMs = 1500  # Buffer 1.5 seconds of audio before STT
+        targetBufferMs = 3000  # Buffer 3 seconds of audio before STT
 
         # PCM16 at 16kHz mono = 32000 bytes/second
         bytesPerSecond = 32000
         bytesPerMs = bytesPerSecond / 1000
 
+        # Track background STT/AI tasks so they don't block the WebSocket loop
+        backgroundTasks: list[asyncio.Task] = []
+
         logger.info(f"Audio processing started for session {sessionId}")
 
         try:
@@ -235,18 +238,26 @@ class TeamsbotService:
                 audioBuffer.extend(audioChunk)
                 bufferDurationMs = len(audioBuffer) / bytesPerMs
 
-                # Process when buffer has enough audio
+                # Process when buffer has enough audio - run in background to not block WebSocket
                 if bufferDurationMs >= targetBufferMs:
-                    await self._processAudioBuffer(
-                        bytes(audioBuffer),
-                        sessionId,
-                        interface,
-                        voiceInterface,
-                        websocket,
-                    )
+                    chunkBytes = bytes(audioBuffer)
                     audioBuffer.clear()
                     bufferDurationMs = 0
 
+                    task = asyncio.create_task(
+                        self._processAudioBuffer(
+                            chunkBytes,
+                            sessionId,
+                            interface,
+                            voiceInterface,
+                            websocket,
+                        )
+                    )
+                    backgroundTasks.append(task)
+
+                    # Drop finished tasks so the list only tracks work still in flight
+                    backgroundTasks = [t for t in backgroundTasks if not t.done()]
+
         except Exception as e:
             if "disconnect" not in str(e).lower():
                 logger.error(f"Audio stream error for session {sessionId}: {e}")
@@ -261,6 +272,10 @@ class TeamsbotService:
                 websocket,
             )
 
+        # Wait for in-flight background tasks; return_exceptions=True collects failures instead of raising
+        if backgroundTasks:
+            await asyncio.gather(*backgroundTasks, return_exceptions=True)
+
         logger.info(f"Audio processing ended for session {sessionId}")
 
     async def _processAudioBuffer(
@@ -274,12 +289,14 @@ class TeamsbotService:
         """Process a buffered audio chunk through the STT -> AI -> TTS pipeline."""
         
         # Step 1: STT -- convert audio to text
+        # skipFallbacks=True because we know the exact format (LINEAR16, 16kHz, mono from Teams)
         try:
             sttResult = await voiceInterface.speechToText(
                 audioContent=audioBytes,
                 language=self.config.language,
                 sampleRate=16000,
-                channels=1
+                channels=1,
+                skipFallbacks=True
             )
         except Exception as e:
             logger.warning(f"STT failed for session {sessionId}: {e}")
diff --git a/modules/interfaces/interfaceVoiceObjects.py b/modules/interfaces/interfaceVoiceObjects.py
index 0c28f81d..cccebce4 100644
--- a/modules/interfaces/interfaceVoiceObjects.py
+++ b/modules/interfaces/interfaceVoiceObjects.py
@@ -66,7 +66,8 @@ class VoiceObjects:
     # Speech-to-Text Operations
     
     async def speechToText(self, audioContent: bytes, language: str = "de-DE", 
-                          sampleRate: int = None, channels: int = None) -> Dict[str, Any]:
+                          sampleRate: int = None, channels: int = None,
+                          skipFallbacks: bool = False) -> Dict[str, Any]:
         """
         Convert speech to text using Google Cloud Speech-to-Text API.
         
@@ -75,6 +76,7 @@ class VoiceObjects:
             language: Language code (e.g., 'de-DE', 'en-US')
             sampleRate: Audio sample rate (auto-detected if None)
             channels: Number of audio channels (auto-detected if None)
+            skipFallbacks: If True, skip fallback attempts (use when audio format is known)
             
         Returns:
             Dict containing transcribed text, confidence, and metadata
@@ -87,7 +89,8 @@ class VoiceObjects:
                 audioContent=audioContent,
                 language=language,
                 sampleRate=sampleRate,
-                channels=channels
+                channels=channels,
+                skipFallbacks=skipFallbacks
             )
             
             if result["success"]: