ui-nyla/src/hooks/useSpeechAudioCapture.ts
ValueOn AG 7eb305f910
Some checks failed
Deploy Nyla Frontend to Integration / deploy (push) Failing after 56s
cp adapted to 2026 poweron
2026-06-09 09:53:38 +02:00

233 lines
7.8 KiB
TypeScript

// Copyright (c) 2026 PowerOn AG
// All rights reserved.
/**
* useVoiceStream — single hook for mic capture + STT streaming.
*
* Starts MediaRecorder, opens a WebSocket to the generic STT endpoint,
* sends audio chunks, and receives interim/final transcripts from
* Google Streaming Recognition on the backend.
*
* No client-side VAD, no segmentation, no recorder restarts.
* Google handles silence detection and endpointing natively.
*/
import { useCallback, useEffect, useRef, useState } from 'react';
import api from '../api';
/**
 * Lifecycle state exposed by `useVoiceStream`.
 *
 * - `idle`       — initial state; also set by `stop()` and when the socket closes.
 * - `connecting` — set as soon as `start()` begins acquiring the mic / opening the socket.
 * - `listening`  — set once the socket is open and the recorder has been started.
 * - `error`      — set when `start()` fails or the WebSocket reports an error.
 */
export type VoiceStreamStatus = 'idle' | 'connecting' | 'listening' | 'error';
/**
 * Optional event handlers supplied by the consumer of `useVoiceStream`.
 * The hook always invokes the handlers from the most recent render.
 */
export interface VoiceStreamCallbacks {
/** Called with the text of every non-empty `interim` message received from the server. */
onInterim?: (text: string) => void;
/** Called with the text of every non-empty `final` message received from the server. */
onFinal?: (text: string) => void;
/** Called each time the hook sets its status (see `VoiceStreamStatus`). */
onStatusChange?: (status: VoiceStreamStatus) => void;
/**
 * Called for server `error` messages, WebSocket errors, and anything thrown while
 * starting. The value is whatever was thrown, hence `unknown`.
 */
onError?: (error: unknown) => void;
}
/**
 * Options for the initial `open` message on the generic STT WebSocket (Google streaming).
 * The values are forwarded unchanged in the `open` payload; how they are interpreted is
 * up to the backend.
 */
export interface SttStreamOpenOptions {
/** Recognition model name; the hook sends `'latest_long'` when omitted. */
model?: string;
/** Sent as `lightweight` in the `open` payload; `false` when omitted. */
lightweight?: boolean;
/**
 * Sent as `singleUtterance` in the `open` payload; `false` when omitted.
 * When the server answers with `end_of_single_utterance`, the hook re-sends the
 * `open` payload on the same socket.
 */
singleUtterance?: boolean;
}
/** Value returned by `useVoiceStream`. */
export interface VoiceStreamApi {
/** Current lifecycle state as React state (triggers re-renders). */
status: VoiceStreamStatus;
/** Latest interim transcript; reset to `''` on a final transcript and on `stop()`. */
interimText: string;
/**
 * Acquires the microphone and opens the STT stream.
 * Does nothing when already `connecting` or `listening`.
 * `language` falls back to `'de-DE'` when omitted or empty.
 * The returned promise rejects with the original error when start-up fails.
 */
start: (language?: string, sttOpenOptions?: SttStreamOpenOptions) => Promise<void>;
/** Stops recording, closes the socket, releases the microphone and returns to `idle`. */
stop: () => void;
}
/** Timeslice in milliseconds passed to `MediaRecorder.start()`, i.e. the interval between audio chunks. */
const _RECORDING_CHUNK_MS = 250;
/** Ceiling for the reconnect counter: a `reconnect_required` message is only acted on while the counter is below it. */
const _MAX_RECONNECT_ATTEMPTS = 3;
/** Returns the first candidate MIME type MediaRecorder supports; throws when none is supported. */
function _pickRecorderMimeType(): string {
  for (const mime of ['audio/webm;codecs=opus', 'audio/webm', 'audio/mp4']) {
    try {
      if (MediaRecorder.isTypeSupported(mime)) return mime;
    } catch {
      /* a throwing probe is treated as "not supported" — try the next candidate */
    }
  }
  throw new Error('No supported audio MIME type for MediaRecorder');
}

/** Builds the absolute ws(s):// URL of the STT stream endpoint from the API client's base URL. */
function _buildSttStreamUrl(wsToken: string): string {
  const baseURL: string = api.defaults.baseURL || window.location.origin;
  // Resolve against the page so a relative base URL (e.g. "/api") still yields an
  // absolute URL; drop trailing slashes so the joined path has no "//".
  const httpBase = new URL(baseURL, window.location.href).href.replace(/\/+$/, '');
  const wsBase = httpBase.replace(/^http/i, 'ws'); // http -> ws, https -> wss
  return `${wsBase}/voice-google/stt/stream?wsToken=${encodeURIComponent(wsToken)}`;
}

/**
 * Microphone capture + streaming speech-to-text over a WebSocket.
 *
 * `start()` acquires the microphone, fetches a short-lived token, opens the STT socket,
 * sends an `open` message and then streams base64-encoded MediaRecorder chunks as
 * `audio` messages. Transcripts arrive as `interim` / `final` messages.
 *
 * Lifetime handling:
 * - Every `start()`, `stop()` and unmount bumps a session number. Async continuations
 *   compare against it, so a `stop()` issued while `start()` is still awaiting the mic
 *   or the token reliably cancels that start.
 * - Socket handlers only act while their socket is the one stored in `wsRef`, so events
 *   from a replaced or stopped socket cannot change the state of a newer one.
 * - On `reconnect_required` the recorder and socket are replaced while the microphone
 *   stream is kept. At most `_MAX_RECONNECT_ATTEMPTS` consecutive reconnects are made
 *   without a transcript arriving in between; after that the hook fails with an error.
 *
 * @param callbacks Optional event handlers; the latest instance is always used.
 * @returns Current status, latest interim text, and the `start` / `stop` controls.
 */
export function useVoiceStream(callbacks: VoiceStreamCallbacks): VoiceStreamApi {
  const [status, setStatus] = useState<VoiceStreamStatus>('idle');
  const [interimText, setInterimText] = useState('');

  // Keep the newest callbacks reachable without re-creating start/stop on every render.
  const cbRef = useRef(callbacks);
  cbRef.current = callbacks;

  const wsRef = useRef<WebSocket | null>(null);
  const recorderRef = useRef<MediaRecorder | null>(null);
  const streamRef = useRef<MediaStream | null>(null);
  const languageRef = useRef('de-DE');
  const sttOpenOptsRef = useRef<SttStreamOpenOptions | undefined>(undefined);
  const reconnectAttemptsRef = useRef(0);
  // Synchronous mirror of `status`, readable inside callbacks without stale closures.
  const statusRef = useRef<VoiceStreamStatus>('idle');
  // Bumped by start(), stop() and unmount; anything holding an older number is stale.
  const sessionRef = useRef(0);

  const _setStatus = useCallback((next: VoiceStreamStatus) => {
    statusRef.current = next;
    setStatus(next);
    cbRef.current.onStatusChange?.(next);
  }, []);

  /** Detaches and closes the current socket, sending a `close` message first when possible. */
  const _closeWs = useCallback(() => {
    const ws = wsRef.current;
    if (!ws) return;
    // Clear the ref first: this is what marks the socket's handlers as stale.
    wsRef.current = null;
    try {
      if (ws.readyState === WebSocket.OPEN) {
        ws.send(JSON.stringify({ type: 'close' }));
      }
      ws.close();
    } catch {
      /* best effort — the socket is being discarded anyway */
    }
  }, []);

  const _stopRecorder = useCallback(() => {
    const recorder = recorderRef.current;
    recorderRef.current = null;
    if (recorder && recorder.state !== 'inactive') {
      try {
        recorder.stop();
      } catch {
        /* best effort — the recorder is being discarded anyway */
      }
    }
  }, []);

  const _releaseDevices = useCallback(() => {
    const stream = streamRef.current;
    streamRef.current = null;
    stream?.getTracks().forEach(track => track.stop());
  }, []);

  /** Stops the recorder, closes the socket and releases the microphone. */
  const _teardown = useCallback(() => {
    _stopRecorder();
    _closeWs();
    _releaseDevices();
  }, [_stopRecorder, _closeWs, _releaseDevices]);

  /** Reports `err`, switches to `error` and frees every resource of the session. */
  const _fail = useCallback((err: unknown) => {
    cbRef.current.onError?.(err);
    _setStatus('error');
    _teardown();
  }, [_setStatus, _teardown]);

  const _buildOpenPayload = useCallback(() => {
    const opts = sttOpenOptsRef.current;
    return {
      type: 'open' as const,
      language: languageRef.current,
      model: opts?.model ?? 'latest_long',
      lightweight: opts?.lightweight ?? false,
      singleUtterance: opts?.singleUtterance ?? false,
    };
  }, []);

  /**
   * Fetches a token and opens a socket for `session`, wiring up recorder and handlers.
   * Expects `streamRef` to hold a live microphone stream. Rejects when the token request
   * or the WebSocket constructor fails; resolves without doing anything when the session
   * was superseded while the token request was in flight.
   */
  const _connect = useCallback(async (session: number): Promise<void> => {
    const tokenResp = await api.post('/voice-google/stt/token');
    if (session !== sessionRef.current) return;
    const wsToken: string = tokenResp.data.wsToken;

    const ws = new WebSocket(_buildSttStreamUrl(wsToken));
    wsRef.current = ws;
    const isCurrent = (): boolean => wsRef.current === ws;

    ws.onopen = () => {
      if (!isCurrent()) {
        ws.close();
        return;
      }
      try {
        const stream = streamRef.current;
        if (!stream) throw new Error('Microphone stream is not available');
        // Resolve the MIME type before announcing the stream so an unsupported
        // browser fails without having sent `open`.
        const mimeType = _pickRecorderMimeType();
        ws.send(JSON.stringify(_buildOpenPayload()));

        const recorder = new MediaRecorder(stream, { mimeType });
        recorderRef.current = recorder;
        recorder.ondataavailable = (event: BlobEvent) => {
          if (!event.data || event.data.size === 0) return;
          if (ws.readyState !== WebSocket.OPEN) return;
          const reader = new FileReader();
          reader.onloadend = () => {
            // The socket may have closed while the blob was being read.
            if (ws.readyState !== WebSocket.OPEN) return;
            if (typeof reader.result !== 'string') return;
            // Data URL is "data:<mime>;base64,<payload>" — only the payload is sent.
            const b64 = reader.result.split(',')[1];
            if (b64) ws.send(JSON.stringify({ type: 'audio', chunk: b64 }));
          };
          reader.readAsDataURL(event.data);
        };
        recorder.start(_RECORDING_CHUNK_MS);
        _setStatus('listening');
      } catch (err) {
        // Without this the status would stay on 'connecting' forever.
        _fail(err);
      }
    };

    ws.onmessage = (event: MessageEvent) => {
      if (typeof event.data !== 'string') return;
      let parsed: unknown;
      try {
        parsed = JSON.parse(event.data);
      } catch {
        return; // not JSON — ignore
      }
      if (parsed === null || typeof parsed !== 'object') return;
      const msg = parsed as { type?: string; text?: string; message?: string; code?: string };

      // A socket that was stopped or replaced can still deliver buffered messages while
      // it closes. Late finals and errors are forwarded; everything else requires the
      // socket to be the current one.
      const live = isCurrent();

      if (msg.type === 'interim' && msg.text) {
        if (!live) return; // must not resurrect interim text after stop()
        reconnectAttemptsRef.current = 0; // transcripts flowing => connection is healthy
        setInterimText(msg.text);
        cbRef.current.onInterim?.(msg.text);
      } else if (msg.type === 'final' && msg.text) {
        if (live) {
          reconnectAttemptsRef.current = 0;
          setInterimText('');
        }
        cbRef.current.onFinal?.(msg.text);
      } else if (msg.type === 'error') {
        cbRef.current.onError?.(new Error(msg.message || msg.code || 'STT error'));
      } else if (msg.type === 'end_of_single_utterance') {
        // Re-arm recognition on the same socket for the next utterance.
        if (live && ws.readyState === WebSocket.OPEN) {
          ws.send(JSON.stringify(_buildOpenPayload()));
        }
      } else if (msg.type === 'reconnect_required') {
        if (!live) return;
        if (reconnectAttemptsRef.current >= _MAX_RECONNECT_ATTEMPTS) {
          _fail(new Error('STT reconnect limit reached'));
          return;
        }
        reconnectAttemptsRef.current += 1;
        // Replace recorder and socket but keep the microphone stream. The new socket's
        // onopen creates a fresh recorder bound to it.
        _stopRecorder();
        _closeWs();
        _setStatus('connecting');
        _connect(session).catch((err: unknown) => {
          if (session === sessionRef.current) _fail(err);
        });
      }
    };

    ws.onerror = () => {
      if (!isCurrent()) return;
      cbRef.current.onError?.(new Error('WebSocket connection error'));
      _setStatus('error');
    };

    ws.onclose = () => {
      if (!isCurrent()) return;
      // Unexpected close: free the recorder and microphone so nothing keeps capturing.
      wsRef.current = null;
      _stopRecorder();
      _releaseDevices();
      _setStatus('idle');
    };
  }, [_setStatus, _closeWs, _stopRecorder, _releaseDevices, _fail, _buildOpenPayload]);

  const start = useCallback(async (language?: string, sttOpenOptions?: SttStreamOpenOptions): Promise<void> => {
    if (statusRef.current === 'listening' || statusRef.current === 'connecting') return;

    const session = ++sessionRef.current;
    reconnectAttemptsRef.current = 0;
    languageRef.current = language || 'de-DE';
    sttOpenOptsRef.current = sttOpenOptions;

    // Drop anything left over from a session that ended in 'error'.
    _stopRecorder();
    _closeWs();
    _setStatus('connecting');

    try {
      const existing = streamRef.current;
      const hasLiveTrack = existing?.getTracks().some(track => track.readyState === 'live') ?? false;
      if (!existing || !hasLiveTrack) {
        _releaseDevices();
        const stream = await navigator.mediaDevices.getUserMedia({
          audio: { echoCancellation: true, noiseSuppression: true, autoGainControl: true, channelCount: 1 },
        });
        if (session !== sessionRef.current) {
          // stop()/unmount happened while the permission prompt was pending.
          stream.getTracks().forEach(track => track.stop());
          return;
        }
        streamRef.current = stream;
      }
      await _connect(session);
    } catch (err) {
      // A superseded start must not disturb whatever session replaced it.
      if (session !== sessionRef.current) return;
      _fail(err);
      throw err;
    }
  }, [_setStatus, _stopRecorder, _closeWs, _releaseDevices, _connect, _fail]);

  const stop = useCallback(() => {
    sessionRef.current += 1; // cancels any start()/reconnect still in flight
    _teardown();
    setInterimText('');
    _setStatus('idle');
  }, [_teardown, _setStatus]);

  // Free everything on unmount. Bumping the session also silences pending continuations.
  useEffect(() => {
    return () => {
      sessionRef.current += 1;
      _teardown();
    };
  }, [_teardown]);

  return { status, interimText, start, stop };
}