// NOTE(review): stray pasted file metadata ("198 lines, 6.4 KiB, TypeScript") removed.
/**
 * useVoiceStream — single hook for mic capture + STT streaming.
 *
 * Starts MediaRecorder, opens a WebSocket to the generic STT endpoint,
 * sends audio chunks, and receives interim/final transcripts from
 * Google Streaming Recognition on the backend.
 *
 * No client-side VAD, no segmentation, no recorder restarts.
 * Google handles silence detection and endpointing natively.
 */
import { useCallback, useEffect, useRef, useState } from 'react';
import api from '../api';

export type VoiceStreamStatus = 'idle' | 'connecting' | 'listening' | 'error';
|
|
|
|
export interface VoiceStreamCallbacks {
|
|
onInterim?: (text: string) => void;
|
|
onFinal?: (text: string) => void;
|
|
onStatusChange?: (status: VoiceStreamStatus) => void;
|
|
onError?: (error: unknown) => void;
|
|
}
|
|
|
|
export interface VoiceStreamApi {
|
|
status: VoiceStreamStatus;
|
|
interimText: string;
|
|
start: (language?: string) => Promise<void>;
|
|
stop: () => void;
|
|
}
|
|
|
|
const _RECORDING_CHUNK_MS = 250;
|
|
const _MAX_RECONNECT_ATTEMPTS = 3;
|
|
|
|
export function useVoiceStream(callbacks: VoiceStreamCallbacks): VoiceStreamApi {
|
|
const [status, setStatus] = useState<VoiceStreamStatus>('idle');
|
|
const [interimText, setInterimText] = useState('');
|
|
|
|
const cbRef = useRef(callbacks);
|
|
cbRef.current = callbacks;
|
|
|
|
const wsRef = useRef<WebSocket | null>(null);
|
|
const recorderRef = useRef<MediaRecorder | null>(null);
|
|
const streamRef = useRef<MediaStream | null>(null);
|
|
const languageRef = useRef('de-DE');
|
|
const stoppingRef = useRef(false);
|
|
const reconnectAttemptsRef = useRef(0);
|
|
|
|
const _setStatus = useCallback((next: VoiceStreamStatus) => {
|
|
setStatus(next);
|
|
cbRef.current.onStatusChange?.(next);
|
|
}, []);
|
|
|
|
const _pickMimeType = useCallback((): string => {
|
|
for (const mime of ['audio/webm;codecs=opus', 'audio/webm', 'audio/mp4']) {
|
|
try { if (MediaRecorder.isTypeSupported(mime)) return mime; } catch { /* skip */ }
|
|
}
|
|
throw new Error('No supported audio MIME type for MediaRecorder');
|
|
}, []);
|
|
|
|
const _closeWs = useCallback(() => {
|
|
const ws = wsRef.current;
|
|
if (!ws) return;
|
|
wsRef.current = null;
|
|
try {
|
|
if (ws.readyState === WebSocket.OPEN) {
|
|
ws.send(JSON.stringify({ type: 'close' }));
|
|
}
|
|
ws.close();
|
|
} catch { /* ignore */ }
|
|
}, []);
|
|
|
|
const _stopRecorder = useCallback(() => {
|
|
const recorder = recorderRef.current;
|
|
if (recorder && recorder.state !== 'inactive') {
|
|
try { recorder.stop(); } catch { /* ignore */ }
|
|
}
|
|
recorderRef.current = null;
|
|
}, []);
|
|
|
|
const _releaseDevices = useCallback(() => {
|
|
if (streamRef.current) {
|
|
streamRef.current.getTracks().forEach(t => t.stop());
|
|
streamRef.current = null;
|
|
}
|
|
}, []);
|
|
|
|
const stop = useCallback(() => {
|
|
stoppingRef.current = true;
|
|
_stopRecorder();
|
|
_closeWs();
|
|
_releaseDevices();
|
|
setInterimText('');
|
|
_setStatus('idle');
|
|
stoppingRef.current = false;
|
|
}, [_stopRecorder, _closeWs, _releaseDevices, _setStatus]);
|
|
|
|
const start = useCallback(async (language?: string) => {
|
|
if (status === 'listening' || status === 'connecting') return;
|
|
stoppingRef.current = false;
|
|
reconnectAttemptsRef.current = 0;
|
|
languageRef.current = language || 'de-DE';
|
|
_setStatus('connecting');
|
|
|
|
try {
|
|
if (!streamRef.current) {
|
|
streamRef.current = await navigator.mediaDevices.getUserMedia({
|
|
audio: { echoCancellation: true, noiseSuppression: true, autoGainControl: true, channelCount: 1 },
|
|
});
|
|
}
|
|
|
|
const tokenResp = await api.post('/voice-google/stt/token');
|
|
const wsToken: string = tokenResp.data.wsToken;
|
|
|
|
const baseURL = api.defaults.baseURL || window.location.origin;
|
|
const wsBase = baseURL.replace(/^http/i, 'ws');
|
|
const wsUrl = `${wsBase}/voice-google/stt/stream?wsToken=${encodeURIComponent(wsToken)}`;
|
|
|
|
const ws = new WebSocket(wsUrl);
|
|
wsRef.current = ws;
|
|
|
|
ws.onopen = () => {
|
|
if (stoppingRef.current) { ws.close(); return; }
|
|
ws.send(JSON.stringify({ type: 'open', language: languageRef.current }));
|
|
|
|
const mimeType = _pickMimeType();
|
|
const recorder = new MediaRecorder(streamRef.current!, { mimeType });
|
|
recorderRef.current = recorder;
|
|
|
|
recorder.ondataavailable = (event: BlobEvent) => {
|
|
if (!event.data || event.data.size === 0) return;
|
|
if (ws.readyState !== WebSocket.OPEN) return;
|
|
const reader = new FileReader();
|
|
reader.onloadend = () => {
|
|
if (ws.readyState !== WebSocket.OPEN) return;
|
|
const dataUrl = reader.result as string;
|
|
const b64 = dataUrl.split(',')[1];
|
|
if (b64) ws.send(JSON.stringify({ type: 'audio', chunk: b64 }));
|
|
};
|
|
reader.readAsDataURL(event.data);
|
|
};
|
|
|
|
recorder.start(_RECORDING_CHUNK_MS);
|
|
_setStatus('listening');
|
|
};
|
|
|
|
ws.onmessage = (event) => {
|
|
try {
|
|
const msg = JSON.parse(event.data);
|
|
if (msg.type === 'interim' && msg.text) {
|
|
setInterimText(msg.text);
|
|
cbRef.current.onInterim?.(msg.text);
|
|
} else if (msg.type === 'final' && msg.text) {
|
|
setInterimText('');
|
|
cbRef.current.onFinal?.(msg.text);
|
|
} else if (msg.type === 'error') {
|
|
cbRef.current.onError?.(new Error(msg.message || msg.code || 'STT error'));
|
|
} else if (msg.type === 'reconnect_required') {
|
|
if (reconnectAttemptsRef.current < _MAX_RECONNECT_ATTEMPTS && !stoppingRef.current) {
|
|
reconnectAttemptsRef.current++;
|
|
_closeWs();
|
|
start(languageRef.current).catch(() => {});
|
|
}
|
|
}
|
|
} catch { /* ignore parse errors */ }
|
|
};
|
|
|
|
ws.onerror = () => {
|
|
if (!stoppingRef.current) {
|
|
cbRef.current.onError?.(new Error('WebSocket connection error'));
|
|
_setStatus('error');
|
|
}
|
|
};
|
|
|
|
ws.onclose = () => {
|
|
if (!stoppingRef.current) {
|
|
_setStatus('idle');
|
|
}
|
|
};
|
|
|
|
} catch (err) {
|
|
cbRef.current.onError?.(err);
|
|
_setStatus('error');
|
|
_releaseDevices();
|
|
throw err;
|
|
}
|
|
}, [status, _setStatus, _pickMimeType, _closeWs, _releaseDevices]);
|
|
|
|
useEffect(() => {
|
|
return () => {
|
|
stoppingRef.current = true;
|
|
_stopRecorder();
|
|
_closeWs();
|
|
_releaseDevices();
|
|
};
|
|
}, [_stopRecorder, _closeWs, _releaseDevices]);
|
|
|
|
return { status, interimText, start, stop };
|
|
}
|