feat: handle audioChunk messages from bot, STT via Google Cloud Speech

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
patrick-motsch 2026-02-17 18:43:42 +01:00
parent de573fd834
commit ad254aafb1

View file

@ -274,6 +274,20 @@ class TeamsbotService:
logger.info(f"[WS-DEBUG] Status received: status={status}, message={errorMessage}")
await self._handleBotStatus(sessionId, status, errorMessage, interface)
elif msgType == "audioChunk":
audioData = message.get("audio", {})
audioBase64 = audioData.get("data", "")
sampleRate = audioData.get("sampleRate", 16000)
if audioBase64:
await self._processAudioChunk(
sessionId=sessionId,
audioBase64=audioBase64,
sampleRate=sampleRate,
interface=interface,
voiceInterface=voiceInterface,
websocket=websocket,
)
elif msgType == "ping":
await websocket.send_text(json.dumps({"type": "pong"}))
@ -321,6 +335,57 @@ class TeamsbotService:
if dbStatus == TeamsbotSessionStatus.ENDED.value:
asyncio.create_task(self._generateMeetingSummary(sessionId))
async def _processAudioChunk(
    self,
    sessionId: str,
    audioBase64: str,
    sampleRate: int,
    interface,
    voiceInterface,
    websocket: WebSocket,
):
    """Transcribe one captured audio chunk and feed the text into the transcript pipeline.

    The chunk is base64-decoded, submitted to Google Cloud Speech as LINEAR16
    audio, and the top alternative of every result that contains non-empty
    text is forwarded to ``_processTranscript`` as a final transcript with
    speaker ``"Meeting Audio"`` and source ``"audioCapture"``.

    Args:
        sessionId: Session the chunk belongs to; used for logging and passed on
            to ``_processTranscript``.
        audioBase64: Base64-encoded audio payload.
        sampleRate: Sample rate in Hz, passed to the recognizer as
            ``sample_rate_hertz``.
        interface: Passed through unchanged to ``_processTranscript``.
        voiceInterface: Provider of the speech connector via
            ``getSpeechConnector()``; may be falsy, in which case the chunk is
            skipped with a warning.
        websocket: Passed through unchanged to ``_processTranscript``.

    Returns:
        None. Any exception raised while decoding, recognizing or forwarding
        is logged and swallowed so a bad chunk does not propagate to the caller.
    """
    import asyncio
    import base64
    import functools

    try:
        audioBytes = base64.b64decode(audioBase64)
        # Skip payloads below 1000 bytes: too little audio to be worth an STT call.
        if len(audioBytes) < 1000:
            return

        # Use the existing Google Cloud Speech connector for STT.
        speechConnector = voiceInterface.getSpeechConnector() if voiceInterface else None
        # getattr with a default also covers a connector whose speech_client is
        # present but unset (None), which would otherwise fail later on .recognize.
        speechClient = getattr(speechConnector, "speech_client", None) if speechConnector else None
        if speechClient is None:
            logger.warning(f"[AudioChunk] No speech client available for session {sessionId}")
            return

        from google.cloud import speech

        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=sampleRate,
            language_code=self.config.language or "de-DE",
            enable_automatic_punctuation=True,
        )
        audio = speech.RecognitionAudio(content=audioBytes)

        # recognize() is a synchronous network call; running it directly here
        # would block the event loop (and every other session) for the whole
        # round-trip. Off-load it to the default executor and await the result.
        # NOTE(review): assumes speech_client may be invoked from a worker thread.
        loop = asyncio.get_running_loop()
        response = await loop.run_in_executor(
            None,
            functools.partial(speechClient.recognize, config=config, audio=audio),
        )

        for result in response.results:
            if not result.alternatives:
                continue
            # Only the first (top-ranked) alternative is used.
            text = result.alternatives[0].transcript.strip()
            if not text:
                continue
            logger.info(f"[AudioChunk] STT result: {text[:80]}...")
            await self._processTranscript(
                sessionId=sessionId,
                speaker="Meeting Audio",
                text=text,
                isFinal=True,
                interface=interface,
                voiceInterface=voiceInterface,
                websocket=websocket,
                source="audioCapture",
            )
    except Exception as e:
        # Best-effort boundary: a failing chunk must not tear down the WebSocket loop.
        logger.error(f"[AudioChunk] STT error for session {sessionId}: {type(e).__name__}: {e}")
async def _processTranscript(
self,
sessionId: str,