From ad254aafb1f4e0907641e08028a44b6f6ce0bcb2 Mon Sep 17 00:00:00 2001
From: patrick-motsch
Date: Tue, 17 Feb 2026 18:43:42 +0100
Subject: [PATCH] feat: handle audioChunk messages from bot, STT via Google Cloud Speech

Co-authored-by: Cursor
---
 modules/features/teamsbot/service.py | 65 ++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)

diff --git a/modules/features/teamsbot/service.py b/modules/features/teamsbot/service.py
index 0cd17bde..eddec193 100644
--- a/modules/features/teamsbot/service.py
+++ b/modules/features/teamsbot/service.py
@@ -274,6 +274,20 @@ class TeamsbotService:
                 logger.info(f"[WS-DEBUG] Status received: status={status}, message={errorMessage}")
                 await self._handleBotStatus(sessionId, status, errorMessage, interface)
 
+            elif msgType == "audioChunk":
+                audioData = message.get("audio", {})
+                audioBase64 = audioData.get("data", "")
+                sampleRate = audioData.get("sampleRate", 16000)
+                if audioBase64:
+                    await self._processAudioChunk(
+                        sessionId=sessionId,
+                        audioBase64=audioBase64,
+                        sampleRate=sampleRate,
+                        interface=interface,
+                        voiceInterface=voiceInterface,
+                        websocket=websocket,
+                    )
+
             elif msgType == "ping":
                 await websocket.send_text(json.dumps({"type": "pong"}))
 
@@ -321,6 +335,57 @@ class TeamsbotService:
         if dbStatus == TeamsbotSessionStatus.ENDED.value:
             asyncio.create_task(self._generateMeetingSummary(sessionId))
 
+    async def _processAudioChunk(
+        self,
+        sessionId: str,
+        audioBase64: str,
+        sampleRate: int,
+        interface,
+        voiceInterface,
+        websocket: WebSocket,
+    ):
+        """Process an audio chunk from WebRTC capture — run STT and feed into transcript pipeline."""
+        import base64
+        try:
+            audioBytes = base64.b64decode(audioBase64)
+            if len(audioBytes) < 1000:
+                return
+
+            # Use the existing Google Cloud Speech connector for STT
+            speechConnector = voiceInterface.getSpeechConnector() if voiceInterface else None
+            if not speechConnector or not hasattr(speechConnector, 'speech_client'):
+                logger.warning(f"[AudioChunk] No speech client available for session {sessionId}")
+                return
+
+            from google.cloud import speech
+            config = speech.RecognitionConfig(
+                encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
+                sample_rate_hertz=sampleRate,
+                language_code=self.config.language or "de-DE",
+                enable_automatic_punctuation=True,
+            )
+            audio = speech.RecognitionAudio(content=audioBytes)
+
+            response = speechConnector.speech_client.recognize(config=config, audio=audio)
+
+            for result in response.results:
+                if result.alternatives:
+                    text = result.alternatives[0].transcript.strip()
+                    if text:
+                        logger.info(f"[AudioChunk] STT result: {text[:80]}...")
+                        await self._processTranscript(
+                            sessionId=sessionId,
+                            speaker="Meeting Audio",
+                            text=text,
+                            isFinal=True,
+                            interface=interface,
+                            voiceInterface=voiceInterface,
+                            websocket=websocket,
+                        )
+        except Exception as e:
+            logger.error(f"[AudioChunk] STT error for session {sessionId}: {type(e).__name__}: {e}")
+
     async def _processTranscript(
         self,
         sessionId: str,