// frontend_nyla/src/hooks/useSpeechAudioCapture.ts

/**
 * useVoiceStream — single hook for mic capture + STT streaming.
 *
 * Starts MediaRecorder, opens a WebSocket to the generic STT endpoint,
 * sends audio chunks, and receives interim/final transcripts from
 * Google Streaming Recognition on the backend.
 *
 * No client-side VAD, no segmentation, no recorder restarts.
 * Google handles silence detection and endpointing natively.
 */
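
/*
 * Illustrative usage (a sketch, not part of this module — the component
 * helpers `setDraft` and `submitUtterance` below are hypothetical):
 *
 *   const { status, interimText, start, stop } = useVoiceStream({
 *     onInterim: text => setDraft(text),          // live partial transcript
 *     onFinal: text => submitUtterance(text),     // committed transcript
 *     onStatusChange: s => console.debug('STT status:', s),
 *     onError: err => console.error('STT error:', err),
 *   });
 *
 *   // e.g. a mic button that calls start('de-DE') when idle and stop() otherwise.
 */
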
import { useCallback, useEffect, useRef, useState } from 'react';
import api from '../api';

export type VoiceStreamStatus = 'idle' | 'connecting' | 'listening' | 'error';

export interface VoiceStreamCallbacks {
  onInterim?: (text: string) => void;
  onFinal?: (text: string) => void;
  onStatusChange?: (status: VoiceStreamStatus) => void;
  onError?: (error: unknown) => void;
}

export interface VoiceStreamApi {
  status: VoiceStreamStatus;
  interimText: string;
  start: (language?: string) => Promise<void>;
  stop: () => void;
}
const _RECORDING_CHUNK_MS = 250;
const _MAX_RECONNECT_ATTEMPTS = 3;

export function useVoiceStream(callbacks: VoiceStreamCallbacks): VoiceStreamApi {
  const [status, setStatus] = useState<VoiceStreamStatus>('idle');
  const [interimText, setInterimText] = useState('');

  const cbRef = useRef(callbacks);
  cbRef.current = callbacks;

  const wsRef = useRef<WebSocket | null>(null);
  const recorderRef = useRef<MediaRecorder | null>(null);
  const streamRef = useRef<MediaStream | null>(null);
  const languageRef = useRef('de-DE');
  const stoppingRef = useRef(false);
  const reconnectAttemptsRef = useRef(0);
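
  // Single place to update the status state and notify the consumer.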
  const _setStatus = useCallback((next: VoiceStreamStatus) => {
    setStatus(next);
    cbRef.current.onStatusChange?.(next);
  }, []);
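
  // Prefer Opus-in-WebM, fall back to plain WebM, then MP4 (e.g. Safari).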
  const _pickMimeType = useCallback((): string => {
    for (const mime of ['audio/webm;codecs=opus', 'audio/webm', 'audio/mp4']) {
      try { if (MediaRecorder.isTypeSupported(mime)) return mime; } catch { /* skip */ }
    }
    throw new Error('No supported audio MIME type for MediaRecorder');
  }, []);
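
  // Send a 'close' control message if the socket is still open, then close it.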
  const _closeWs = useCallback(() => {
    const ws = wsRef.current;
    if (!ws) return;
    wsRef.current = null;
    try {
      if (ws.readyState === WebSocket.OPEN) {
        ws.send(JSON.stringify({ type: 'close' }));
      }
      ws.close();
    } catch { /* ignore */ }
  }, []);

  const _stopRecorder = useCallback(() => {
    const recorder = recorderRef.current;
    if (recorder && recorder.state !== 'inactive') {
      try { recorder.stop(); } catch { /* ignore */ }
    }
    recorderRef.current = null;
  }, []);

  const _releaseDevices = useCallback(() => {
    if (streamRef.current) {
      streamRef.current.getTracks().forEach(t => t.stop());
      streamRef.current = null;
    }
  }, []);
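
  // User-initiated teardown: stop recording, close the socket, release the
  // microphone, and reset the UI state.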
  const stop = useCallback(() => {
    stoppingRef.current = true;
    _stopRecorder();
    _closeWs();
    _releaseDevices();
    setInterimText('');
    _setStatus('idle');
    stoppingRef.current = false;
  }, [_stopRecorder, _closeWs, _releaseDevices, _setStatus]);
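
  // Start a capture session: acquire the microphone, fetch a WebSocket token,
  // open the STT stream, and begin recording once the socket is open.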
  const start = useCallback(async (language?: string) => {
    if (status === 'listening' || status === 'connecting') return;
    stoppingRef.current = false;
    reconnectAttemptsRef.current = 0;
    languageRef.current = language || 'de-DE';
    _setStatus('connecting');
    try {
      if (!streamRef.current) {
        streamRef.current = await navigator.mediaDevices.getUserMedia({
          audio: { echoCancellation: true, noiseSuppression: true, autoGainControl: true, channelCount: 1 },
        });
      }
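
      // Exchange a REST call for a short-lived wsToken, then open the streaming socket with it.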
      const tokenResp = await api.post('/voice-google/stt/token');
      const wsToken: string = tokenResp.data.wsToken;
      const baseURL = api.defaults.baseURL || window.location.origin;
      const wsBase = baseURL.replace(/^http/i, 'ws');
      const wsUrl = `${wsBase}/voice-google/stt/stream?wsToken=${encodeURIComponent(wsToken)}`;
      const ws = new WebSocket(wsUrl);
      wsRef.current = ws;
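
      // Once connected: announce the recognition language, then start MediaRecorder
      // and forward each chunk as a base64-encoded 'audio' JSON message.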
      ws.onopen = () => {
        if (stoppingRef.current) { ws.close(); return; }
        ws.send(JSON.stringify({ type: 'open', language: languageRef.current }));
        const mimeType = _pickMimeType();
        const recorder = new MediaRecorder(streamRef.current!, { mimeType });
        recorderRef.current = recorder;
        recorder.ondataavailable = (event: BlobEvent) => {
          if (!event.data || event.data.size === 0) return;
          if (ws.readyState !== WebSocket.OPEN) return;
          const reader = new FileReader();
          reader.onloadend = () => {
            if (ws.readyState !== WebSocket.OPEN) return;
            const dataUrl = reader.result as string;
            const b64 = dataUrl.split(',')[1];
            if (b64) ws.send(JSON.stringify({ type: 'audio', chunk: b64 }));
          };
          reader.readAsDataURL(event.data);
        };
        recorder.start(_RECORDING_CHUNK_MS);
        _setStatus('listening');
      };
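
      // Backend protocol: 'interim' and 'final' carry transcript text, 'error'
      // reports a recognition failure, and 'reconnect_required' asks the client
      // to tear the stream down and establish a fresh one.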
      ws.onmessage = (event) => {
        try {
          const msg = JSON.parse(event.data);
          if (msg.type === 'interim' && msg.text) {
            setInterimText(msg.text);
            cbRef.current.onInterim?.(msg.text);
          } else if (msg.type === 'final' && msg.text) {
            setInterimText('');
            cbRef.current.onFinal?.(msg.text);
          } else if (msg.type === 'error') {
            cbRef.current.onError?.(new Error(msg.message || msg.code || 'STT error'));
          } else if (msg.type === 'reconnect_required') {
            if (reconnectAttemptsRef.current < _MAX_RECONNECT_ATTEMPTS && !stoppingRef.current) {
              reconnectAttemptsRef.current++;
              // Stop the current recorder before restarting so it does not keep
              // capturing into a socket that is about to be closed.
              _stopRecorder();
              _closeWs();
              start(languageRef.current).catch(() => {});
            }
          }
        } catch { /* ignore parse errors */ }
      };
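
      // Transport errors and closes are only surfaced when they were not
      // triggered by a deliberate stop().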
      ws.onerror = () => {
        if (!stoppingRef.current) {
          cbRef.current.onError?.(new Error('WebSocket connection error'));
          _setStatus('error');
        }
      };

      ws.onclose = () => {
        if (!stoppingRef.current) {
          _setStatus('idle');
        }
      };
    } catch (err) {
      cbRef.current.onError?.(err);
      _setStatus('error');
      _releaseDevices();
      throw err;
    }
  }, [status, _setStatus, _pickMimeType, _stopRecorder, _closeWs, _releaseDevices]);
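
  // Release recorder, socket, and microphone if the component unmounts
  // while a session is still active.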
  useEffect(() => {
    return () => {
      stoppingRef.current = true;
      _stopRecorder();
      _closeWs();
      _releaseDevices();
    };
  }, [_stopRecorder, _closeWs, _releaseDevices]);

  return { status, interimText, start, stop };
}