/**
 * useVoiceStream — single hook for mic capture + STT streaming.
 *
 * Starts MediaRecorder, opens a WebSocket to the generic STT endpoint,
 * sends audio chunks, and receives interim/final transcripts from
 * Google Streaming Recognition on the backend.
 *
 * No client-side VAD, no segmentation, no recorder restarts.
 * Google handles silence detection and endpointing natively.
 */
import { useCallback, useEffect, useRef, useState } from 'react';
import api from '../api';

export type VoiceStreamStatus = 'idle' | 'connecting' | 'listening' | 'error';

export interface VoiceStreamCallbacks {
  /** Fired for every interim (non-final) transcript update. */
  onInterim?: (text: string) => void;
  /** Fired once per finalized utterance. */
  onFinal?: (text: string) => void;
  onStatusChange?: (status: VoiceStreamStatus) => void;
  onError?: (error: unknown) => void;
}

export interface VoiceStreamApi {
  status: VoiceStreamStatus;
  /** Current interim transcript; cleared when a final transcript arrives. */
  interimText: string;
  /** Resolves once setup has been kicked off; rejects (and sets status 'error') on setup failure. */
  start: (language?: string) => Promise<void>;
  stop: () => void;
}

/** MediaRecorder timeslice: one audio chunk is emitted every 250 ms. */
const _RECORDING_CHUNK_MS = 250;
/** Max automatic reconnects after a server-sent 'reconnect_required'. */
const _MAX_RECONNECT_ATTEMPTS = 3;

export function useVoiceStream(callbacks: VoiceStreamCallbacks): VoiceStreamApi {
  const [status, setStatus] = useState<VoiceStreamStatus>('idle');
  const [interimText, setInterimText] = useState('');

  // Keep the latest callbacks visible to long-lived WS handlers without
  // re-registering them on every render.
  const cbRef = useRef(callbacks);
  cbRef.current = callbacks;

  const wsRef = useRef<WebSocket | null>(null);
  const recorderRef = useRef<MediaRecorder | null>(null);
  const streamRef = useRef<MediaStream | null>(null);
  const languageRef = useRef('de-DE');
  // True while an intentional teardown is in progress, so WS close/error
  // handlers don't report spurious errors or trigger reconnects.
  const stoppingRef = useRef(false);
  const reconnectAttemptsRef = useRef(0);

  const _setStatus = useCallback((next: VoiceStreamStatus) => {
    setStatus(next);
    cbRef.current.onStatusChange?.(next);
  }, []);

  /** First supported container, preferring webm/opus; throws if none work. */
  const _pickMimeType = useCallback((): string => {
    for (const mime of ['audio/webm;codecs=opus', 'audio/webm', 'audio/mp4']) {
      try {
        if (MediaRecorder.isTypeSupported(mime)) return mime;
      } catch {
        /* skip */
      }
    }
    throw new Error('No supported audio MIME type for MediaRecorder');
  }, []);

  /** Politely close the socket (best-effort 'close' frame first). */
  const _closeWs = useCallback(() => {
    const ws = wsRef.current;
    if (!ws) return;
    wsRef.current = null;
    try {
      if (ws.readyState === WebSocket.OPEN) {
        ws.send(JSON.stringify({ type: 'close' }));
      }
      ws.close();
    } catch {
      /* ignore */
    }
  }, []);

  const _stopRecorder = useCallback(() => {
    const recorder = recorderRef.current;
    if (recorder && recorder.state !== 'inactive') {
      try {
        recorder.stop();
      } catch {
        /* ignore */
      }
    }
    recorderRef.current = null;
  }, []);

  /** Stop all mic tracks and drop the stream so the OS mic indicator clears. */
  const _releaseDevices = useCallback(() => {
    if (streamRef.current) {
      streamRef.current.getTracks().forEach(t => t.stop());
      streamRef.current = null;
    }
  }, []);

  const stop = useCallback(() => {
    stoppingRef.current = true;
    _stopRecorder();
    _closeWs();
    _releaseDevices();
    setInterimText('');
    _setStatus('idle');
    stoppingRef.current = false;
  }, [_stopRecorder, _closeWs, _releaseDevices, _setStatus]);

  const start = useCallback(async (language?: string): Promise<void> => {
    if (status === 'listening' || status === 'connecting') return;
    stoppingRef.current = false;
    reconnectAttemptsRef.current = 0;
    languageRef.current = language || 'de-DE';
    _setStatus('connecting');

    try {
      // Reuse an existing stream (e.g. across reconnects) to avoid
      // re-prompting for mic permission.
      if (!streamRef.current) {
        streamRef.current = await navigator.mediaDevices.getUserMedia({
          audio: {
            echoCancellation: true,
            noiseSuppression: true,
            autoGainControl: true,
            channelCount: 1,
          },
        });
      }

      // Short-lived token authenticates the WS upgrade (no cookie on WS).
      const tokenResp = await api.post('/voice-google/stt/token');
      const wsToken: string = tokenResp.data.wsToken;

      const baseURL = api.defaults.baseURL || window.location.origin;
      const wsBase = baseURL.replace(/^http/i, 'ws');
      const wsUrl = `${wsBase}/voice-google/stt/stream?wsToken=${encodeURIComponent(wsToken)}`;

      const ws = new WebSocket(wsUrl);
      wsRef.current = ws;

      ws.onopen = () => {
        if (stoppingRef.current) {
          ws.close();
          return;
        }
        ws.send(JSON.stringify({ type: 'open', language: languageRef.current }));

        const mimeType = _pickMimeType();
        const recorder = new MediaRecorder(streamRef.current!, { mimeType });
        recorderRef.current = recorder;

        recorder.ondataavailable = (event: BlobEvent) => {
          if (!event.data || event.data.size === 0) return;
          if (ws.readyState !== WebSocket.OPEN) return;
          // Blob -> base64 via data URL; strip the "data:...;base64," prefix.
          const reader = new FileReader();
          reader.onloadend = () => {
            // Socket may have closed while the async read was in flight.
            if (ws.readyState !== WebSocket.OPEN) return;
            const dataUrl = reader.result as string;
            const b64 = dataUrl.split(',')[1];
            if (b64) ws.send(JSON.stringify({ type: 'audio', chunk: b64 }));
          };
          reader.readAsDataURL(event.data);
        };

        recorder.start(_RECORDING_CHUNK_MS);
        _setStatus('listening');
      };

      ws.onmessage = (event) => {
        try {
          const msg = JSON.parse(event.data);
          if (msg.type === 'interim' && msg.text) {
            setInterimText(msg.text);
            cbRef.current.onInterim?.(msg.text);
          } else if (msg.type === 'final' && msg.text) {
            setInterimText('');
            cbRef.current.onFinal?.(msg.text);
          } else if (msg.type === 'error') {
            cbRef.current.onError?.(new Error(msg.message || msg.code || 'STT error'));
          } else if (msg.type === 'reconnect_required') {
            if (reconnectAttemptsRef.current < _MAX_RECONNECT_ATTEMPTS && !stoppingRef.current) {
              reconnectAttemptsRef.current++;
              // FIX: stop the active recorder before reconnecting, otherwise
              // the old recorder keeps running and firing ondataavailable
              // into a dead socket while start() creates a second recorder.
              _stopRecorder();
              _closeWs();
              start(languageRef.current).catch(() => {});
            }
          }
        } catch {
          /* ignore parse errors */
        }
      };

      ws.onerror = () => {
        // Ignore errors from sockets already superseded by stop()/reconnect.
        if (!stoppingRef.current && wsRef.current === ws) {
          cbRef.current.onError?.(new Error('WebSocket connection error'));
          _setStatus('error');
        }
      };

      ws.onclose = () => {
        // Only react if this is still the live socket; during a reconnect the
        // old socket's close event must not flip status back to 'idle'.
        if (!stoppingRef.current && wsRef.current === ws) {
          wsRef.current = null;
          // Server closed the stream — don't leave the recorder running.
          _stopRecorder();
          _setStatus('idle');
        }
      };
    } catch (err) {
      cbRef.current.onError?.(err);
      _setStatus('error');
      _releaseDevices();
      throw err;
    }
  }, [status, _setStatus, _pickMimeType, _stopRecorder, _closeWs, _releaseDevices]);

  // Full teardown on unmount: recorder, socket, and mic tracks.
  useEffect(() => {
    return () => {
      stoppingRef.current = true;
      _stopRecorder();
      _closeWs();
      _releaseDevices();
    };
  }, [_stopRecorder, _closeWs, _releaseDevices]);

  return { status, interimText, start, stop };
}