From ae4dc9fa48e4ff06590fc6a0bea581ce04ba42a2 Mon Sep 17 00:00:00 2001
From: patrick-motsch
Date: Fri, 13 Feb 2026 18:17:47 +0100
Subject: [PATCH] fix: skip STT fallbacks for teamsbot, run audio processing in
background
Co-authored-by: Cursor
---
modules/connectors/connectorVoiceGoogle.py | 12 ++++++-
modules/features/teamsbot/service.py | 37 +++++++++++++++------
modules/interfaces/interfaceVoiceObjects.py | 7 ++--
3 files changed, 43 insertions(+), 13 deletions(-)
diff --git a/modules/connectors/connectorVoiceGoogle.py b/modules/connectors/connectorVoiceGoogle.py
index c32b7fa4..85a19bf7 100644
--- a/modules/connectors/connectorVoiceGoogle.py
+++ b/modules/connectors/connectorVoiceGoogle.py
@@ -58,7 +58,8 @@ class ConnectorGoogleSpeech:
raise
async def speechToText(self, audioContent: bytes, language: str = "de-DE",
- sampleRate: int = None, channels: int = None) -> Dict:
+ sampleRate: int = None, channels: int = None,
+ skipFallbacks: bool = False) -> Dict:
"""
Convert speech to text using Google Cloud Speech-to-Text API.
@@ -234,6 +235,15 @@ class ConnectorGoogleSpeech:
"error": f"Google Cloud error: {response.error}"
}
+ # Skip fallbacks when format is known (e.g. teamsbot with explicit LINEAR16 16kHz)
+ if skipFallbacks:
+ return {
+ "success": False,
+ "text": "",
+ "confidence": 0.0,
+ "error": "No recognition results (silence or unclear audio)"
+ }
+
# Try multiple fallback approaches
fallback_configs = []
diff --git a/modules/features/teamsbot/service.py b/modules/features/teamsbot/service.py
index a7a266ec..ea10fddd 100644
--- a/modules/features/teamsbot/service.py
+++ b/modules/features/teamsbot/service.py
@@ -198,12 +198,15 @@ class TeamsbotService:
audioBuffer = bytearray()
bufferDurationMs = 0
- targetBufferMs = 1500 # Buffer 1.5 seconds of audio before STT
+ targetBufferMs = 3000 # Buffer 3 seconds of audio before STT
# PCM16 at 16kHz mono = 32000 bytes/second
bytesPerSecond = 32000
bytesPerMs = bytesPerSecond / 1000
+ # Track background STT/AI tasks so they don't block the WebSocket loop
+ backgroundTasks: list[asyncio.Task] = []
+
logger.info(f"Audio processing started for session {sessionId}")
try:
@@ -235,18 +238,26 @@ class TeamsbotService:
audioBuffer.extend(audioChunk)
bufferDurationMs = len(audioBuffer) / bytesPerMs
- # Process when buffer has enough audio
+ # Once enough audio is buffered, hand it to a background task so the WebSocket receive loop is not blocked
if bufferDurationMs >= targetBufferMs:
- await self._processAudioBuffer(
- bytes(audioBuffer),
- sessionId,
- interface,
- voiceInterface,
- websocket,
- )
+ chunkBytes = bytes(audioBuffer)
audioBuffer.clear()
bufferDurationMs = 0
+ task = asyncio.create_task(
+ self._processAudioBuffer(
+ chunkBytes,
+ sessionId,
+ interface,
+ voiceInterface,
+ websocket,
+ )
+ )
+ backgroundTasks.append(task)
+
+ # Drop finished tasks from the tracking list (their results/exceptions are not inspected here)
+ backgroundTasks = [t for t in backgroundTasks if not t.done()]
+
except Exception as e:
if "disconnect" not in str(e).lower():
logger.error(f"Audio stream error for session {sessionId}: {e}")
@@ -261,6 +272,10 @@ class TeamsbotService:
websocket,
)
+ # Wait for any remaining background tasks
+ if backgroundTasks:
+ await asyncio.gather(*backgroundTasks, return_exceptions=True)
+
logger.info(f"Audio processing ended for session {sessionId}")
async def _processAudioBuffer(
@@ -274,12 +289,14 @@ class TeamsbotService:
"""Process a buffered audio chunk through the STT -> AI -> TTS pipeline."""
# Step 1: STT -- convert audio to text
+ # skipFallbacks=True because we know the exact format (LINEAR16, 16kHz, mono from Teams)
try:
sttResult = await voiceInterface.speechToText(
audioContent=audioBytes,
language=self.config.language,
sampleRate=16000,
- channels=1
+ channels=1,
+ skipFallbacks=True
)
except Exception as e:
logger.warning(f"STT failed for session {sessionId}: {e}")
diff --git a/modules/interfaces/interfaceVoiceObjects.py b/modules/interfaces/interfaceVoiceObjects.py
index 0c28f81d..cccebce4 100644
--- a/modules/interfaces/interfaceVoiceObjects.py
+++ b/modules/interfaces/interfaceVoiceObjects.py
@@ -66,7 +66,8 @@ class VoiceObjects:
# Speech-to-Text Operations
async def speechToText(self, audioContent: bytes, language: str = "de-DE",
- sampleRate: int = None, channels: int = None) -> Dict[str, Any]:
+ sampleRate: int = None, channels: int = None,
+ skipFallbacks: bool = False) -> Dict[str, Any]:
"""
Convert speech to text using Google Cloud Speech-to-Text API.
@@ -75,6 +76,7 @@ class VoiceObjects:
language: Language code (e.g., 'de-DE', 'en-US')
sampleRate: Audio sample rate (auto-detected if None)
channels: Number of audio channels (auto-detected if None)
+ skipFallbacks: If True, do not retry with fallback configurations when recognition yields no result (use when the audio format is known)
Returns:
Dict containing transcribed text, confidence, and metadata
@@ -87,7 +89,8 @@ class VoiceObjects:
audioContent=audioContent,
language=language,
sampleRate=sampleRate,
- channels=channels
+ channels=channels,
+ skipFallbacks=skipFallbacks
)
if result["success"]: