/**
 * Voice Controller - imperative state machine for CommCoach voice interaction.
 *
 * States: idle | listening | botSpeaking | interrupted | muted
 *
 * Key principle: SpeechRecognition is created once and lives until deactivate().
 * When botSpeaking, we ignore onresult events instead of stopping recognition.
 */
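/*
 * Usage sketch (illustrative only; `sendToCoach`, `ttsUrl`, and the JSX below
 * are assumptions, not part of this module). It shows the intended wiring:
 * TTS playback events feed the controller so it can ignore the bot's own
 * voice while audio is playing.
 *
 *   const vc = useVoiceController(text => sendToCoach(text));
 *
 *   <audio src={ttsUrl} autoPlay
 *          onPlay={vc.ttsPlaying} onPause={vc.ttsPaused} onEnded={vc.ttsEnded} />
 *   <button onClick={vc.state === 'idle' ? vc.activate : vc.deactivate}>mic</button>
 *   <p>{vc.liveTranscript}</p>
 */
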
import { useState, useRef, useCallback, useEffect } from 'react';

export type VoiceState = 'idle' | 'listening' | 'botSpeaking' | 'interrupted' | 'muted';

/** Pause length after which the buffered transcript is finalized and sent. */
const SILENCE_TIMEOUT_MS = 1500;
/** Utterances with fewer words than this are dropped as noise. */
const MIN_WORDS_TO_SEND = 4;
/** Grace period before restarting recognition after the browser ends it. */
const REC_AUTORESTART_DELAY_MS = 300;

export interface VoiceControllerApi {
  state: VoiceState;
  liveTranscript: string;
  activate: () => void;
  deactivate: () => void;
  ttsPlaying: () => void;
  ttsPaused: () => void;
  ttsEnded: () => void;
  toggleMute: () => void;
}

export function useVoiceController(onMessage: (text: string) => void): VoiceControllerApi {
  const [state, setState] = useState<VoiceState>('idle');
  const [liveTranscript, setLiveTranscript] = useState('');
  // Mirror of `state`, readable synchronously inside recognition callbacks.
  const stateRef = useRef<VoiceState>('idle');
  const streamRef = useRef<MediaStream | null>(null);
  const recognitionRef = useRef<SpeechRecognition | null>(null);
  const transcriptPartsRef = useRef<string[]>([]);
  // First index in the recognition results list not yet finalized.
  const processedIndexRef = useRef(0);
  const silenceTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
  // Keep the latest onMessage without re-creating recognition handlers.
  const onMessageRef = useRef(onMessage);
  onMessageRef.current = onMessage;

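  // Timestamped debug entry, forwarded to an optional window-level __dlog hook.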
  const _dlog = useCallback((tag: string, info?: string) => {
    const t = new Date();
    const ts = `${t.getMinutes()}:${String(t.getSeconds()).padStart(2, '0')}.${String(t.getMilliseconds()).padStart(3, '0')}`;
    const entry = `[${ts}] ${tag}${info ? ' ' + info : ''}`;
    (window as any).__dlog?.(entry);
  }, []);

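  // Single transition point: keeps stateRef (read inside event handlers) in
  // sync with the rendered state, and logs every transition.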
  const _setState = useCallback((next: VoiceState) => {
    const prev = stateRef.current;
    if (prev === next) return;
    _dlog('VOICE', `${prev} -> ${next}`);
    stateRef.current = next;
    setState(next);
  }, [_dlog]);

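  // Flush the buffered transcript to the consumer; utterances shorter than
  // MIN_WORDS_TO_SEND words are dropped.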
  const _finalizeTranscript = useCallback(() => {
    const full = transcriptPartsRef.current.join(' ').trim();
    const wordCount = full.split(/\s+/).filter(Boolean).length;
    _dlog('SEND', `words=${wordCount} "${full.substring(0, 60)}"`);
    if (full && wordCount >= MIN_WORDS_TO_SEND) onMessageRef.current(full);
    transcriptPartsRef.current = [];
    setLiveTranscript('');
  }, [_dlog]);

  const _resetSilenceTimer = useCallback(() => {
    if (silenceTimerRef.current) clearTimeout(silenceTimerRef.current);
    silenceTimerRef.current = setTimeout(() => {
      _finalizeTranscript();
    }, SILENCE_TIMEOUT_MS);
  }, [_finalizeTranscript]);

  const _startRecognition = useCallback(() => {
    const rec = recognitionRef.current;
    if (!rec) return;
    try {
      rec.start();
      // A successful start() begins a fresh session whose results list starts
      // over at index 0, so reset the cursor to avoid skipping results.
      processedIndexRef.current = 0;
      _dlog('REC-START', 'ok');
    } catch {
      // start() throws if recognition is already running; safe to ignore.
      _dlog('REC-START', 'failed');
    }
  }, [_dlog]);

  const _stopRecognition = useCallback(() => {
    const rec = recognitionRef.current;
    if (!rec) return;
    try {
      rec.stop();
    } catch {
      /* ignore */
    }
  }, []);

  const _createRecognition = useCallback(() => {
    const SpeechRecognitionApi = (window as any).SpeechRecognition || (window as any).webkitSpeechRecognition;
    if (!SpeechRecognitionApi) {
      console.warn('SpeechRecognition is not supported in this browser');
      return;
    }

    const recognition = new SpeechRecognitionApi();
    recognition.continuous = true;
    recognition.interimResults = true;
    recognition.lang = 'de-DE';

    recognition.onspeechstart = () => {
      if (stateRef.current === 'botSpeaking') return;
      transcriptPartsRef.current = [];
      setLiveTranscript('');
      _resetSilenceTimer();
    };

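    // The results list is append-only within a session and entries flip from
    // interim to final in place, so processedIndexRef marks the first entry
    // that has not yet been finalized.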
    recognition.onresult = (event: SpeechRecognitionEvent) => {
      const ignore = stateRef.current === 'botSpeaking';
      const interimParts: string[] = [];
      for (let i = processedIndexRef.current; i < event.results.length; i++) {
        const r = event.results[i];
        if (r.isFinal) {
          const text = r[0].transcript.trim();
          if (text && !ignore) transcriptPartsRef.current.push(text);
          processedIndexRef.current = i + 1;
        } else {
          if (ignore) continue;
          const text = r[0].transcript.trim();
          if (text) interimParts.push(text);
        }
      }
      if (ignore) return;
      const currentInterim = interimParts.join(' ');
      const preview = [...transcriptPartsRef.current, currentInterim].join(' ').trim();
      setLiveTranscript(preview);
      if (preview) _resetSilenceTimer();
    };

    recognition.onspeechend = () => {
      if (silenceTimerRef.current) clearTimeout(silenceTimerRef.current);
      if (stateRef.current === 'botSpeaking') {
        // Discard anything heard while the bot was speaking.
        transcriptPartsRef.current = [];
        setLiveTranscript('');
        return;
      }
      _finalizeTranscript();
    };

    recognition.onend = () => {
      _dlog('REC-END', `state=${stateRef.current}`);
      // Ignore events from a stale instance replaced by deactivate()/activate().
      if (recognitionRef.current !== recognition) return;
      const cur = stateRef.current;
      if (cur === 'botSpeaking' || cur === 'muted' || cur === 'idle') return;
      processedIndexRef.current = 0;
      // Browsers end continuous recognition on their own; restart after a
      // short delay as long as we should still be listening.
      setTimeout(() => {
        if (recognitionRef.current !== recognition) return;
        if (stateRef.current !== 'listening' && stateRef.current !== 'interrupted') return;
        try {
          recognition.start();
          _dlog('REC-AUTOSTART', 'ok');
        } catch {
          _dlog('REC-AUTOSTART', 'failed');
        }
      }, REC_AUTORESTART_DELAY_MS);
    };

    recognition.onerror = (event: any) => {
      _dlog('REC-ERR', event.error);
      // 'no-speech' and 'aborted' are routine during normal operation.
      if (event.error === 'no-speech' || event.error === 'aborted') return;
      console.warn('SpeechRecognition error:', event.error);
    };

    recognitionRef.current = recognition;
    _startRecognition();
  }, [_dlog, _resetSilenceTimer, _finalizeTranscript, _startRecognition]);

  const activate = useCallback(async () => {
    if (stateRef.current !== 'idle') return;
    _setState('listening');

    try {
      if (!streamRef.current) {
        const stream = await navigator.mediaDevices.getUserMedia({
          audio: { echoCancellation: true, noiseSuppression: true },
        });
        streamRef.current = stream;
      }
      _createRecognition();
    } catch (err) {
      console.warn('Mic access failed:', err);
      _setState('idle');
    }
  }, [_setState, _createRecognition]);

  const deactivate = useCallback(() => {
    _setState('idle');
    if (silenceTimerRef.current) {
      clearTimeout(silenceTimerRef.current);
      silenceTimerRef.current = null;
    }
    if (recognitionRef.current) {
      try { recognitionRef.current.stop(); } catch { /* ignore */ }
      recognitionRef.current = null;
    }
    if (streamRef.current) {
      streamRef.current.getTracks().forEach(t => t.stop());
      streamRef.current = null;
    }
    transcriptPartsRef.current = [];
    processedIndexRef.current = 0;
    setLiveTranscript('');
  }, [_setState]);

  const ttsPlaying = useCallback(() => {
    const cur = stateRef.current;
    if (cur === 'muted') return;
    _setState('botSpeaking');
  }, [_setState]);

  const ttsPaused = useCallback(() => {
    const cur = stateRef.current;
    if (cur === 'botSpeaking') _setState('interrupted');
  }, [_setState]);

  const ttsEnded = useCallback(() => {
    const cur = stateRef.current;
    if (cur === 'botSpeaking' || cur === 'interrupted') {
      _setState('listening');
      // onend bails out while botSpeaking, so recognition may have quietly
      // ended during playback; restarting is a swallowed no-op if it is live.
      _startRecognition();
    }
  }, [_setState, _startRecognition]);

  const toggleMute = useCallback(() => {
    const cur = stateRef.current;
    if (cur === 'muted') {
      _setState('listening');
      _startRecognition();
    } else if (cur === 'listening' || cur === 'interrupted') {
      _setState('muted');
      _stopRecognition();
    } else if (cur === 'botSpeaking') {
      // Stop recognition here too: onresult only ignores results while
      // botSpeaking, so leaving it running would capture speech once muted.
      _setState('muted');
      _stopRecognition();
    }
  }, [_setState, _startRecognition, _stopRecognition]);

  // Safety net: release recognition and the mic if the component unmounts
  // without an explicit deactivate().
  useEffect(() => {
    return () => {
      if (silenceTimerRef.current) clearTimeout(silenceTimerRef.current);
      if (recognitionRef.current) {
        try { recognitionRef.current.stop(); } catch { /* ignore */ }
        recognitionRef.current = null;
      }
      if (streamRef.current) {
        streamRef.current.getTracks().forEach(t => t.stop());
        streamRef.current = null;
      }
    };
  }, []);

  return {
    state,
    liveTranscript,
    activate,
    deactivate,
    ttsPlaying,
    ttsPaused,
    ttsEnded,
    toggleMute,
  };
}