fix: skip STT fallbacks for teamsbot, run audio processing in background

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
patrick-motsch 2026-02-13 18:17:47 +01:00
parent 77151df0f4
commit ae4dc9fa48
3 changed files with 43 additions and 13 deletions

View file

@@ -58,7 +58,8 @@ class ConnectorGoogleSpeech:
             raise

     async def speechToText(self, audioContent: bytes, language: str = "de-DE",
-                           sampleRate: int = None, channels: int = None) -> Dict:
+                           sampleRate: int = None, channels: int = None,
+                           skipFallbacks: bool = False) -> Dict:
         """
         Convert speech to text using Google Cloud Speech-to-Text API.
@@ -234,6 +235,15 @@ class ConnectorGoogleSpeech:
                     "error": f"Google Cloud error: {response.error}"
                 }

+            # Skip fallbacks when format is known (e.g. teamsbot with explicit LINEAR16 16kHz)
+            if skipFallbacks:
+                return {
+                    "success": False,
+                    "text": "",
+                    "confidence": 0.0,
+                    "error": "No recognition results (silence or unclear audio)"
+                }
+
             # Try multiple fallback approaches
             fallback_configs = []

View file

@@ -198,12 +198,15 @@ class TeamsbotService:
         audioBuffer = bytearray()
         bufferDurationMs = 0
-        targetBufferMs = 1500  # Buffer 1.5 seconds of audio before STT
+        targetBufferMs = 3000  # Buffer 3 seconds of audio before STT
         # PCM16 at 16kHz mono = 32000 bytes/second
         bytesPerSecond = 32000
         bytesPerMs = bytesPerSecond / 1000

+        # Track background STT/AI tasks so they don't block the WebSocket loop
+        backgroundTasks: list[asyncio.Task] = []
+
         logger.info(f"Audio processing started for session {sessionId}")

         try:
@@ -235,18 +238,26 @@ class TeamsbotService:
                     audioBuffer.extend(audioChunk)
                     bufferDurationMs = len(audioBuffer) / bytesPerMs

-                    # Process when buffer has enough audio
+                    # Process when buffer has enough audio - run in background to not block WebSocket
                     if bufferDurationMs >= targetBufferMs:
-                        await self._processAudioBuffer(
-                            bytes(audioBuffer),
-                            sessionId,
-                            interface,
-                            voiceInterface,
-                            websocket,
-                        )
+                        chunkBytes = bytes(audioBuffer)
                         audioBuffer.clear()
                         bufferDurationMs = 0
+                        task = asyncio.create_task(
+                            self._processAudioBuffer(
+                                chunkBytes,
+                                sessionId,
+                                interface,
+                                voiceInterface,
+                                websocket,
+                            )
+                        )
+                        backgroundTasks.append(task)
+                        # Clean up completed tasks
+                        backgroundTasks = [t for t in backgroundTasks if not t.done()]
         except Exception as e:
             if "disconnect" not in str(e).lower():
                 logger.error(f"Audio stream error for session {sessionId}: {e}")
@@ -261,6 +272,10 @@ class TeamsbotService:
                 websocket,
             )

+        # Wait for any remaining background tasks
+        if backgroundTasks:
+            await asyncio.gather(*backgroundTasks, return_exceptions=True)
+
         logger.info(f"Audio processing ended for session {sessionId}")

     async def _processAudioBuffer(
async def _processAudioBuffer( async def _processAudioBuffer(
@@ -274,12 +289,14 @@ class TeamsbotService:
         """Process a buffered audio chunk through the STT -> AI -> TTS pipeline."""
         # Step 1: STT -- convert audio to text
+        # skipFallbacks=True because we know the exact format (LINEAR16, 16kHz, mono from Teams)
         try:
             sttResult = await voiceInterface.speechToText(
                 audioContent=audioBytes,
                 language=self.config.language,
                 sampleRate=16000,
-                channels=1
+                channels=1,
+                skipFallbacks=True
             )
         except Exception as e:
             logger.warning(f"STT failed for session {sessionId}: {e}")

View file

@@ -66,7 +66,8 @@ class VoiceObjects:
     # Speech-to-Text Operations
     async def speechToText(self, audioContent: bytes, language: str = "de-DE",
-                           sampleRate: int = None, channels: int = None) -> Dict[str, Any]:
+                           sampleRate: int = None, channels: int = None,
+                           skipFallbacks: bool = False) -> Dict[str, Any]:
         """
         Convert speech to text using Google Cloud Speech-to-Text API.
@@ -75,6 +76,7 @@ class VoiceObjects:
             language: Language code (e.g., 'de-DE', 'en-US')
             sampleRate: Audio sample rate (auto-detected if None)
             channels: Number of audio channels (auto-detected if None)
+            skipFallbacks: If True, skip fallback attempts (use when audio format is known)

         Returns:
             Dict containing transcribed text, confidence, and metadata
@@ -87,7 +89,8 @@ class VoiceObjects:
                 audioContent=audioContent,
                 language=language,
                 sampleRate=sampleRate,
-                channels=channels
+                channels=channels,
+                skipFallbacks=skipFallbacks
             )
             if result["success"]: