fix: skip STT fallbacks for teamsbot, run audio processing in background

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
patrick-motsch 2026-02-13 18:17:47 +01:00
parent 77151df0f4
commit ae4dc9fa48
3 changed files with 43 additions and 13 deletions

View file

@@ -58,7 +58,8 @@ class ConnectorGoogleSpeech:
raise
async def speechToText(self, audioContent: bytes, language: str = "de-DE",
sampleRate: int = None, channels: int = None) -> Dict:
sampleRate: int = None, channels: int = None,
skipFallbacks: bool = False) -> Dict:
"""
Convert speech to text using Google Cloud Speech-to-Text API.
@@ -234,6 +235,15 @@ class ConnectorGoogleSpeech:
"error": f"Google Cloud error: {response.error}"
}
# Skip fallbacks when format is known (e.g. teamsbot with explicit LINEAR16 16kHz)
if skipFallbacks:
return {
"success": False,
"text": "",
"confidence": 0.0,
"error": "No recognition results (silence or unclear audio)"
}
# Try multiple fallback approaches
fallback_configs = []

View file

@@ -198,12 +198,15 @@ class TeamsbotService:
audioBuffer = bytearray()
bufferDurationMs = 0
targetBufferMs = 1500 # Buffer 1.5 seconds of audio before STT
targetBufferMs = 3000 # Buffer 3 seconds of audio before STT
# PCM16 at 16kHz mono = 32000 bytes/second
bytesPerSecond = 32000
bytesPerMs = bytesPerSecond / 1000
# Track background STT/AI tasks so they don't block the WebSocket loop
backgroundTasks: list[asyncio.Task] = []
logger.info(f"Audio processing started for session {sessionId}")
try:
@@ -235,17 +238,25 @@ class TeamsbotService:
audioBuffer.extend(audioChunk)
bufferDurationMs = len(audioBuffer) / bytesPerMs
# Process when buffer has enough audio
# Process when buffer has enough audio - run in background to not block WebSocket
if bufferDurationMs >= targetBufferMs:
await self._processAudioBuffer(
bytes(audioBuffer),
chunkBytes = bytes(audioBuffer)
audioBuffer.clear()
bufferDurationMs = 0
task = asyncio.create_task(
self._processAudioBuffer(
chunkBytes,
sessionId,
interface,
voiceInterface,
websocket,
)
audioBuffer.clear()
bufferDurationMs = 0
)
backgroundTasks.append(task)
# Clean up completed tasks
backgroundTasks = [t for t in backgroundTasks if not t.done()]
except Exception as e:
if "disconnect" not in str(e).lower():
@@ -261,6 +272,10 @@ class TeamsbotService:
websocket,
)
# Wait for any remaining background tasks
if backgroundTasks:
await asyncio.gather(*backgroundTasks, return_exceptions=True)
logger.info(f"Audio processing ended for session {sessionId}")
async def _processAudioBuffer(
@@ -274,12 +289,14 @@ class TeamsbotService:
"""Process a buffered audio chunk through the STT -> AI -> TTS pipeline."""
# Step 1: STT -- convert audio to text
# skipFallbacks=True because we know the exact format (LINEAR16, 16kHz, mono from Teams)
try:
sttResult = await voiceInterface.speechToText(
audioContent=audioBytes,
language=self.config.language,
sampleRate=16000,
channels=1
channels=1,
skipFallbacks=True
)
except Exception as e:
logger.warning(f"STT failed for session {sessionId}: {e}")

View file

@@ -66,7 +66,8 @@ class VoiceObjects:
# Speech-to-Text Operations
async def speechToText(self, audioContent: bytes, language: str = "de-DE",
sampleRate: int = None, channels: int = None) -> Dict[str, Any]:
sampleRate: int = None, channels: int = None,
skipFallbacks: bool = False) -> Dict[str, Any]:
"""
Convert speech to text using Google Cloud Speech-to-Text API.
@@ -75,6 +76,7 @@ class VoiceObjects:
language: Language code (e.g., 'de-DE', 'en-US')
sampleRate: Audio sample rate (auto-detected if None)
channels: Number of audio channels (auto-detected if None)
skipFallbacks: If True, skip fallback attempts (use when audio format is known)
Returns:
Dict containing transcribed text, confidence, and metadata
@@ -87,7 +89,8 @@ class VoiceObjects:
audioContent=audioContent,
language=language,
sampleRate=sampleRate,
channels=channels
channels=channels,
skipFallbacks=skipFallbacks
)
if result["success"]: