feat: handle audioChunk messages from bot, STT via Google Cloud Speech
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
parent
de573fd834
commit
ad254aafb1
1 changed files with 65 additions and 0 deletions
|
|
@ -274,6 +274,20 @@ class TeamsbotService:
|
||||||
logger.info(f"[WS-DEBUG] Status received: status={status}, message={errorMessage}")
|
logger.info(f"[WS-DEBUG] Status received: status={status}, message={errorMessage}")
|
||||||
await self._handleBotStatus(sessionId, status, errorMessage, interface)
|
await self._handleBotStatus(sessionId, status, errorMessage, interface)
|
||||||
|
|
||||||
|
elif msgType == "audioChunk":
|
||||||
|
audioData = message.get("audio", {})
|
||||||
|
audioBase64 = audioData.get("data", "")
|
||||||
|
sampleRate = audioData.get("sampleRate", 16000)
|
||||||
|
if audioBase64:
|
||||||
|
await self._processAudioChunk(
|
||||||
|
sessionId=sessionId,
|
||||||
|
audioBase64=audioBase64,
|
||||||
|
sampleRate=sampleRate,
|
||||||
|
interface=interface,
|
||||||
|
voiceInterface=voiceInterface,
|
||||||
|
websocket=websocket,
|
||||||
|
)
|
||||||
|
|
||||||
elif msgType == "ping":
|
elif msgType == "ping":
|
||||||
await websocket.send_text(json.dumps({"type": "pong"}))
|
await websocket.send_text(json.dumps({"type": "pong"}))
|
||||||
|
|
||||||
|
|
@ -321,6 +335,57 @@ class TeamsbotService:
|
||||||
if dbStatus == TeamsbotSessionStatus.ENDED.value:
|
if dbStatus == TeamsbotSessionStatus.ENDED.value:
|
||||||
asyncio.create_task(self._generateMeetingSummary(sessionId))
|
asyncio.create_task(self._generateMeetingSummary(sessionId))
|
||||||
|
|
||||||
|
async def _processAudioChunk(
    self,
    sessionId: str,
    audioBase64: str,
    sampleRate: int,
    interface,
    voiceInterface,
    websocket: WebSocket,
) -> None:
    """Process an audio chunk from WebRTC capture — run STT and feed into the transcript pipeline.

    Decodes the base64 PCM payload, runs a one-shot Google Cloud Speech
    recognition pass on it, and forwards each non-empty transcript to
    ``_processTranscript`` marked as final.

    Args:
        sessionId: Teamsbot session the chunk belongs to.
        audioBase64: Base64-encoded LINEAR16 PCM audio from the bot.
        sampleRate: Sample rate (Hz) of the PCM data; defaults to 16000 upstream.
        interface: Session interface passed through to the transcript pipeline.
        voiceInterface: Provides the speech connector (``getSpeechConnector``);
            may be None, in which case the chunk is dropped with a warning.
        websocket: Client WebSocket, passed through to the transcript pipeline.

    Never raises: all errors are caught and logged so a bad chunk cannot
    tear down the WebSocket receive loop.
    """
    import base64

    try:
        audioBytes = base64.b64decode(audioBase64)
        # Skip near-empty chunks: below ~1000 bytes (~31 ms at 16 kHz/16-bit
        # mono) an STT round-trip is pure overhead with no usable speech.
        if len(audioBytes) < 1000:
            return

        # Reuse the existing Google Cloud Speech connector for STT.
        speechConnector = voiceInterface.getSpeechConnector() if voiceInterface else None
        if not speechConnector or not hasattr(speechConnector, 'speech_client'):
            logger.warning(f"[AudioChunk] No speech client available for session {sessionId}")
            return

        from google.cloud import speech
        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=sampleRate,
            language_code=self.config.language or "de-DE",
            enable_automatic_punctuation=True,
        )
        audio = speech.RecognitionAudio(content=audioBytes)

        # BUGFIX: recognize() is a blocking synchronous RPC. Calling it
        # directly in this coroutine stalled the entire event loop (all
        # sessions, pings, and WebSocket traffic) for the duration of each
        # STT request. Run it in a worker thread instead.
        response = await asyncio.to_thread(
            speechConnector.speech_client.recognize, config=config, audio=audio
        )

        for result in response.results:
            if result.alternatives:
                # Alternatives are ordered by confidence; take the best one.
                text = result.alternatives[0].transcript.strip()
                if text:
                    logger.info(f"[AudioChunk] STT result: {text[:80]}...")
                    await self._processTranscript(
                        sessionId=sessionId,
                        speaker="Meeting Audio",
                        text=text,
                        isFinal=True,
                        interface=interface,
                        voiceInterface=voiceInterface,
                        websocket=websocket,
                        source="audioCapture",
                    )
    except Exception as e:
        # Boundary handler: log and swallow so one bad chunk does not
        # break the message-receive loop that called us.
        logger.error(f"[AudioChunk] STT error for session {sessionId}: {type(e).__name__}: {e}")
|
||||||
|
|
||||||
async def _processTranscript(
|
async def _processTranscript(
|
||||||
self,
|
self,
|
||||||
sessionId: str,
|
sessionId: str,
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue