From ad254aafb1f4e0907641e08028a44b6f6ce0bcb2 Mon Sep 17 00:00:00 2001
From: patrick-motsch
Date: Tue, 17 Feb 2026 18:43:42 +0100
Subject: [PATCH] feat: handle audioChunk messages from bot, STT via Google Cloud Speech

Co-authored-by: Cursor
---
 modules/features/teamsbot/service.py | 65 ++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)

diff --git a/modules/features/teamsbot/service.py b/modules/features/teamsbot/service.py
index 0cd17bde..eddec193 100644
--- a/modules/features/teamsbot/service.py
+++ b/modules/features/teamsbot/service.py
@@ -274,6 +274,20 @@ class TeamsbotService:
                 logger.info(f"[WS-DEBUG] Status received: status={status}, message={errorMessage}")
                 await self._handleBotStatus(sessionId, status, errorMessage, interface)
 
+            elif msgType == "audioChunk":
+                audioData = message.get("audio", {})
+                audioBase64 = audioData.get("data", "")
+                sampleRate = audioData.get("sampleRate", 16000)
+                if audioBase64:
+                    await self._processAudioChunk(
+                        sessionId=sessionId,
+                        audioBase64=audioBase64,
+                        sampleRate=sampleRate,
+                        interface=interface,
+                        voiceInterface=voiceInterface,
+                        websocket=websocket,
+                    )
+
             elif msgType == "ping":
                 await websocket.send_text(json.dumps({"type": "pong"}))
 
@@ -321,6 +335,57 @@ class TeamsbotService:
         if dbStatus == TeamsbotSessionStatus.ENDED.value:
             asyncio.create_task(self._generateMeetingSummary(sessionId))
 
+    async def _processAudioChunk(
+        self,
+        sessionId: str,
+        audioBase64: str,
+        sampleRate: int,
+        interface,
+        voiceInterface,
+        websocket: WebSocket,
+    ):
+        """Process an audio chunk from WebRTC capture — run STT and feed into transcript pipeline."""
+        import base64
+        try:
+            audioBytes = base64.b64decode(audioBase64)
+            if len(audioBytes) < 1000:
+                return
+
+            # Use the existing Google Cloud Speech connector for STT
+            speechConnector = voiceInterface.getSpeechConnector() if voiceInterface else None
+            if not speechConnector or not hasattr(speechConnector, 'speech_client'):
+                logger.warning(f"[AudioChunk] No speech client available for session {sessionId}")
+                return
+
+            from google.cloud import speech
+            config = speech.RecognitionConfig(
+                encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
+                sample_rate_hertz=sampleRate,
+                language_code=self.config.language or "de-DE",
+                enable_automatic_punctuation=True,
+            )
+            audio = speech.RecognitionAudio(content=audioBytes)
+
+            response = speechConnector.speech_client.recognize(config=config, audio=audio)
+
+            for result in response.results:
+                if result.alternatives:
+                    text = result.alternatives[0].transcript.strip()
+                    if text:
+                        logger.info(f"[AudioChunk] STT result: {text[:80]}...")
+                        await self._processTranscript(
+                            sessionId=sessionId,
+                            speaker="Meeting Audio",
+                            text=text,
+                            isFinal=True,
+                            interface=interface,
+                            voiceInterface=voiceInterface,
+                            websocket=websocket,
+                        )
+        except Exception as e:
+            logger.error(f"[AudioChunk] STT error for session {sessionId}: {type(e).__name__}: {e}")
+
     async def _processTranscript(
         self,
         sessionId: str,