feat: handle audioChunk messages from bot, STT via Google Cloud Speech

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-17 18:43:42 +01:00 · 2026-02-17 18:43:42 +01:00 · ad254aafb1
commit ad254aafb1
parent de573fd834
1 changed files with 65 additions and 0 deletions
--- a/modules/features/teamsbot/service.py
+++ b/modules/features/teamsbot/service.py
@ -274,6 +274,20 @@ class TeamsbotService:
                    logger.info(f"[WS-DEBUG] Status received: status={status}, message={errorMessage}")
                    await self._handleBotStatus(sessionId, status, errorMessage, interface)

+                elif msgType == "audioChunk":
+                    audioData = message.get("audio", {})
+                    audioBase64 = audioData.get("data", "")
+                    sampleRate = audioData.get("sampleRate", 16000)
+                    if audioBase64:
+                        await self._processAudioChunk(
+                            sessionId=sessionId,
+                            audioBase64=audioBase64,
+                            sampleRate=sampleRate,
+                            interface=interface,
+                            voiceInterface=voiceInterface,
+                            websocket=websocket,
+                        )
+
                elif msgType == "ping":
                    await websocket.send_text(json.dumps({"type": "pong"}))

@ -321,6 +335,57 @@ class TeamsbotService:
        if dbStatus == TeamsbotSessionStatus.ENDED.value:
            asyncio.create_task(self._generateMeetingSummary(sessionId))

+    async def _processAudioChunk(
+        self,
+        sessionId: str,
+        audioBase64: str,
+        sampleRate: int,
+        interface,
+        voiceInterface,
+        websocket: WebSocket,
+    ):
+        """Run Google Cloud STT on a base64 audio chunk and feed each result to _processTranscript.
+
+        Chunks under 1000 decoded bytes are skipped; any Exception is logged and swallowed.
+        """
+        import base64
+        try:
+            audioBytes = base64.b64decode(audioBase64)
+            if len(audioBytes) < 1000:
+                return
+            speechConnector = voiceInterface.getSpeechConnector() if voiceInterface else None
+            if not speechConnector or not hasattr(speechConnector, 'speech_client'):
+                logger.warning(f"[AudioChunk] No speech client available for session {sessionId}")
+                return
+            from google.cloud import speech
+            config = speech.RecognitionConfig(
+                encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
+                sample_rate_hertz=sampleRate,
+                language_code=self.config.language or "de-DE",
+                enable_automatic_punctuation=True,
+            )
+            audio = speech.RecognitionAudio(content=audioBytes)
+            # recognize() is a blocking network call; run it in the default executor so the event loop stays free.
+            response = await asyncio.get_running_loop().run_in_executor(
+                None, lambda: speechConnector.speech_client.recognize(config=config, audio=audio)
+            )
+            for result in response.results:
+                text = result.alternatives[0].transcript.strip() if result.alternatives else ""
+                if text:
+                    logger.info(f"[AudioChunk] STT result: {text[:80]}...")
+                    await self._processTranscript(
+                        sessionId=sessionId,
+                        speaker="Meeting Audio",
+                        text=text,
+                        isFinal=True,
+                        interface=interface,
+                        voiceInterface=voiceInterface,
+                        websocket=websocket,
+                        source="audioCapture",
+                    )
+        except Exception as e:
+            logger.error(f"[AudioChunk] STT error for session {sessionId}: {type(e).__name__}: {e}")
+
    async def _processTranscript(
        self,
        sessionId: str,