feat: handle audioChunk messages from bot, STT via Google Cloud Speech
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
parent
de573fd834
commit
ad254aafb1
1 changed files with 65 additions and 0 deletions
|
|
@ -274,6 +274,20 @@ class TeamsbotService:
|
||||||
logger.info(f"[WS-DEBUG] Status received: status={status}, message={errorMessage}")
|
logger.info(f"[WS-DEBUG] Status received: status={status}, message={errorMessage}")
|
||||||
await self._handleBotStatus(sessionId, status, errorMessage, interface)
|
await self._handleBotStatus(sessionId, status, errorMessage, interface)
|
||||||
|
|
||||||
|
elif msgType == "audioChunk":
|
||||||
|
audioData = message.get("audio", {})
|
||||||
|
audioBase64 = audioData.get("data", "")
|
||||||
|
sampleRate = audioData.get("sampleRate", 16000)
|
||||||
|
if audioBase64:
|
||||||
|
await self._processAudioChunk(
|
||||||
|
sessionId=sessionId,
|
||||||
|
audioBase64=audioBase64,
|
||||||
|
sampleRate=sampleRate,
|
||||||
|
interface=interface,
|
||||||
|
voiceInterface=voiceInterface,
|
||||||
|
websocket=websocket,
|
||||||
|
)
|
||||||
|
|
||||||
elif msgType == "ping":
|
elif msgType == "ping":
|
||||||
await websocket.send_text(json.dumps({"type": "pong"}))
|
await websocket.send_text(json.dumps({"type": "pong"}))
|
||||||
|
|
||||||
|
|
@ -321,6 +335,57 @@ class TeamsbotService:
|
||||||
if dbStatus == TeamsbotSessionStatus.ENDED.value:
|
if dbStatus == TeamsbotSessionStatus.ENDED.value:
|
||||||
asyncio.create_task(self._generateMeetingSummary(sessionId))
|
asyncio.create_task(self._generateMeetingSummary(sessionId))
|
||||||
|
|
||||||
|
async def _processAudioChunk(
    self,
    sessionId: str,
    audioBase64: str,
    sampleRate: int,
    interface,
    voiceInterface,
    websocket: WebSocket,
) -> None:
    """Process an audio chunk from WebRTC capture — run STT and feed into the transcript pipeline.

    Decodes the base64 PCM payload, runs a one-shot Google Cloud Speech
    recognition pass on it, and forwards each non-empty transcript to
    ``_processTranscript`` marked as final.

    Args:
        sessionId: Teamsbot session the chunk belongs to.
        audioBase64: Base64-encoded LINEAR16 PCM audio from the bot.
        sampleRate: Sample rate (Hz) of the PCM data; defaults to 16000 upstream.
        interface: Session interface passed through to the transcript pipeline.
        voiceInterface: Provides the speech connector (``getSpeechConnector``);
            may be None, in which case the chunk is dropped with a warning.
        websocket: Client WebSocket, passed through to the transcript pipeline.

    Never raises: all errors are caught and logged so a bad chunk cannot
    tear down the WebSocket receive loop.
    """
    import base64

    try:
        audioBytes = base64.b64decode(audioBase64)
        # Skip near-empty chunks: below ~1000 bytes (~31 ms at 16 kHz/16-bit
        # mono) an STT round-trip is pure overhead with no usable speech.
        if len(audioBytes) < 1000:
            return

        # Reuse the existing Google Cloud Speech connector for STT.
        speechConnector = voiceInterface.getSpeechConnector() if voiceInterface else None
        if not speechConnector or not hasattr(speechConnector, 'speech_client'):
            logger.warning(f"[AudioChunk] No speech client available for session {sessionId}")
            return

        from google.cloud import speech
        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=sampleRate,
            language_code=self.config.language or "de-DE",
            enable_automatic_punctuation=True,
        )
        audio = speech.RecognitionAudio(content=audioBytes)

        # BUGFIX: recognize() is a blocking synchronous RPC. Calling it
        # directly in this coroutine stalled the entire event loop (all
        # sessions, pings, and WebSocket traffic) for the duration of each
        # STT request. Run it in a worker thread instead.
        response = await asyncio.to_thread(
            speechConnector.speech_client.recognize, config=config, audio=audio
        )

        for result in response.results:
            if result.alternatives:
                # Alternatives are ordered by confidence; take the best one.
                text = result.alternatives[0].transcript.strip()
                if text:
                    logger.info(f"[AudioChunk] STT result: {text[:80]}...")
                    await self._processTranscript(
                        sessionId=sessionId,
                        speaker="Meeting Audio",
                        text=text,
                        isFinal=True,
                        interface=interface,
                        voiceInterface=voiceInterface,
                        websocket=websocket,
                        source="audioCapture",
                    )
    except Exception as e:
        # Boundary handler: log and swallow so one bad chunk does not
        # break the message-receive loop that called us.
        logger.error(f"[AudioChunk] STT error for session {sessionId}: {type(e).__name__}: {e}")
|
||||||
|
|
||||||
async def _processTranscript(
|
async def _processTranscript(
|
||||||
self,
|
self,
|
||||||
sessionId: str,
|
sessionId: str,
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue