From ad254aafb1f4e0907641e08028a44b6f6ce0bcb2 Mon Sep 17 00:00:00 2001
From: patrick-motsch
Date: Tue, 17 Feb 2026 18:43:42 +0100
Subject: [PATCH] feat: handle audioChunk messages from bot, STT via Google
Cloud Speech
Co-authored-by: Cursor
---
modules/features/teamsbot/service.py | 65 ++++++++++++++++++++++++++++
1 file changed, 65 insertions(+)
diff --git a/modules/features/teamsbot/service.py b/modules/features/teamsbot/service.py
index 0cd17bde..eddec193 100644
--- a/modules/features/teamsbot/service.py
+++ b/modules/features/teamsbot/service.py
@@ -274,6 +274,20 @@ class TeamsbotService:
logger.info(f"[WS-DEBUG] Status received: status={status}, message={errorMessage}")
await self._handleBotStatus(sessionId, status, errorMessage, interface)
+ elif msgType == "audioChunk":
+ audioData = message.get("audio", {})
+ audioBase64 = audioData.get("data", "")
+ sampleRate = audioData.get("sampleRate", 16000)
+ if audioBase64:
+ await self._processAudioChunk(
+ sessionId=sessionId,
+ audioBase64=audioBase64,
+ sampleRate=sampleRate,
+ interface=interface,
+ voiceInterface=voiceInterface,
+ websocket=websocket,
+ )
+
elif msgType == "ping":
await websocket.send_text(json.dumps({"type": "pong"}))
@@ -321,6 +335,57 @@ class TeamsbotService:
             if dbStatus == TeamsbotSessionStatus.ENDED.value:
                 asyncio.create_task(self._generateMeetingSummary(sessionId))
+    async def _processAudioChunk(
+        self,
+        sessionId: str,
+        audioBase64: str,
+        sampleRate: int,
+        interface,
+        voiceInterface,
+        websocket: WebSocket,
+    ):
+        """Process an audio chunk from WebRTC capture — run STT and feed into transcript pipeline."""
+        import base64
+        try:
+            audioBytes = base64.b64decode(audioBase64)
+            # Skip chunks too short to contain intelligible speech.
+            if len(audioBytes) < 1000:
+                return
+
+            # Use the existing Google Cloud Speech connector for STT
+            speechConnector = voiceInterface.getSpeechConnector() if voiceInterface else None
+            if not speechConnector or not hasattr(speechConnector, 'speech_client'):
+                logger.warning(f"[AudioChunk] No speech client available for session {sessionId}")
+                return
+
+            from google.cloud import speech
+            config = speech.RecognitionConfig(
+                encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
+                sample_rate_hertz=sampleRate,
+                language_code=self.config.language or "de-DE",
+                enable_automatic_punctuation=True,
+            )
+            audio = speech.RecognitionAudio(content=audioBytes)
+            # recognize() is a blocking gRPC round-trip — run it in a worker thread
+            # so the asyncio event loop (and other WebSocket sessions) is not stalled.
+            response = await asyncio.to_thread(speechConnector.speech_client.recognize, config=config, audio=audio)
+            for result in response.results:
+                if result.alternatives:
+                    text = result.alternatives[0].transcript.strip()
+                    if text:
+                        logger.info(f"[AudioChunk] STT result: {text[:80]}...")
+                        await self._processTranscript(
+                            sessionId=sessionId,
+                            speaker="Meeting Audio",
+                            text=text,
+                            isFinal=True,
+                            interface=interface,
+                            voiceInterface=voiceInterface,
+                            websocket=websocket,
+                            source="audioCapture",
+                        )
+        except Exception as e:
+            logger.error(f"[AudioChunk] STT error for session {sessionId}: {type(e).__name__}: {e}")
+
     async def _processTranscript(
         self,
         sessionId: str,