feat: handle audioChunk messages from bot, STT via Google Cloud Speech
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
parent
de573fd834
commit
ad254aafb1
1 changed file with 65 additions and 0 deletions
|
|
@ -274,6 +274,20 @@ class TeamsbotService:
|
|||
logger.info(f"[WS-DEBUG] Status received: status={status}, message={errorMessage}")
|
||||
await self._handleBotStatus(sessionId, status, errorMessage, interface)
|
||||
|
||||
elif msgType == "audioChunk":
|
||||
audioData = message.get("audio", {})
|
||||
audioBase64 = audioData.get("data", "")
|
||||
sampleRate = audioData.get("sampleRate", 16000)
|
||||
if audioBase64:
|
||||
await self._processAudioChunk(
|
||||
sessionId=sessionId,
|
||||
audioBase64=audioBase64,
|
||||
sampleRate=sampleRate,
|
||||
interface=interface,
|
||||
voiceInterface=voiceInterface,
|
||||
websocket=websocket,
|
||||
)
|
||||
|
||||
elif msgType == "ping":
|
||||
await websocket.send_text(json.dumps({"type": "pong"}))
|
||||
|
||||
|
|
@ -321,6 +335,57 @@ class TeamsbotService:
|
|||
if dbStatus == TeamsbotSessionStatus.ENDED.value:
|
||||
asyncio.create_task(self._generateMeetingSummary(sessionId))
|
||||
|
||||
async def _processAudioChunk(
    self,
    sessionId: str,
    audioBase64: str,
    sampleRate: int,
    interface,
    voiceInterface,
    websocket: WebSocket,
):
    """Process an audio chunk from WebRTC capture — run STT and feed into transcript pipeline.

    Args:
        sessionId: Identifier of the bot session this audio belongs to.
        audioBase64: Base64-encoded LINEAR16 (PCM) audio payload.
        sampleRate: Sample rate in Hz of the audio (caller defaults to 16000).
        interface: Project interface object, forwarded to _processTranscript.
        voiceInterface: Object exposing getSpeechConnector(); may be None.
        websocket: Active WebSocket connection, forwarded to _processTranscript.

    Never raises: all failures are logged and swallowed so a bad chunk cannot
    tear down the WebSocket session.
    """
    import asyncio
    import base64
    try:
        audioBytes = base64.b64decode(audioBase64)
        # Chunks under ~1 KB are too short to contain usable speech — drop them.
        if len(audioBytes) < 1000:
            return

        # Use the existing Google Cloud Speech connector for STT
        speechConnector = voiceInterface.getSpeechConnector() if voiceInterface else None
        if not speechConnector or not hasattr(speechConnector, 'speech_client'):
            logger.warning(f"[AudioChunk] No speech client available for session {sessionId}")
            return

        from google.cloud import speech
        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=sampleRate,
            language_code=self.config.language or "de-DE",
            enable_automatic_punctuation=True,
        )
        audio = speech.RecognitionAudio(content=audioBytes)

        # BUG FIX: recognize() is a blocking synchronous RPC. Calling it
        # directly inside this coroutine stalled the event loop (and every
        # other WebSocket session) for the whole STT round-trip. Run it in a
        # worker thread instead.
        response = await asyncio.to_thread(
            speechConnector.speech_client.recognize, config=config, audio=audio
        )

        for result in response.results:
            if result.alternatives:
                # Take the top-ranked alternative only.
                text = result.alternatives[0].transcript.strip()
                if text:
                    logger.info(f"[AudioChunk] STT result: {text[:80]}...")
                    await self._processTranscript(
                        sessionId=sessionId,
                        speaker="Meeting Audio",
                        text=text,
                        isFinal=True,
                        interface=interface,
                        voiceInterface=voiceInterface,
                        websocket=websocket,
                        source="audioCapture",
                    )
    except Exception as e:
        # Best-effort boundary: log and continue; the session must survive STT errors.
        logger.error(f"[AudioChunk] STT error for session {sessionId}: {type(e).__name__}: {e}")
async def _processTranscript(
|
||||
self,
|
||||
sessionId: str,
|
||||
|
|
|
|||
Loading…
Reference in a new issue