/**
 * Voice Controller - imperative state machine for CommCoach voice interaction.
 *
 * States: idle | listening | botSpeaking | interrupted | muted
 *
 * Key principle: SpeechRecognition is created once and lives until deactivate().
 * When botSpeaking, we ignore onresult events instead of stopping recognition.
 */
import { useState, useRef, useCallback, useEffect } from 'react';

export type VoiceState = 'idle' | 'listening' | 'botSpeaking' | 'interrupted' | 'muted';

const SILENCE_TIMEOUT_MS = 1500;
const MIN_WORDS_TO_SEND = 4;
const REC_AUTORESTART_DELAY_MS = 300;

export interface VoiceControllerApi {
  state: VoiceState;
  liveTranscript: string;
  activate: () => void;
  deactivate: () => void;
  ttsPlaying: () => void;
  ttsPaused: () => void;
  ttsEnded: () => void;
  toggleMute: () => void;
}

export function useVoiceController(onMessage: (text: string) => void): VoiceControllerApi {
  const [state, setState] = useState<VoiceState>('idle');
  const [liveTranscript, setLiveTranscript] = useState('');

  const stateRef = useRef<VoiceState>('idle');
  const streamRef = useRef<MediaStream | null>(null);
  // SpeechRecognition instance; typed as any since the Web Speech API is still prefixed.
  const recognitionRef = useRef<any>(null);
  const transcriptPartsRef = useRef<string[]>([]);
  const processedIndexRef = useRef(0);
  const silenceTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
  const onMessageRef = useRef(onMessage);
  onMessageRef.current = onMessage;

  const _dlog = useCallback((tag: string, info?: string) => {
    const t = new Date();
    const ts = `${t.getMinutes()}:${String(t.getSeconds()).padStart(2, '0')}.${String(t.getMilliseconds()).padStart(3, '0')}`;
    const entry = `[${ts}] ${tag}${info ? ' ' + info : ''}`;
    (window as any).__dlog?.(entry);
  }, []);

  const _setState = useCallback((next: VoiceState) => {
    const prev = stateRef.current;
    if (prev === next) return;
    _dlog('VOICE', `${prev} -> ${next}`);
    stateRef.current = next;
    setState(next);
  }, [_dlog]);

  const _finalizeTranscript = useCallback(() => {
    const full = transcriptPartsRef.current.join(' ').trim();
    _dlog('SEND', `words=${full.split(/\s+/).filter(Boolean).length} "${full.substring(0, 60)}"`);
    if (full) {
      const wordCount = full.split(/\s+/).filter(Boolean).length;
      if (wordCount >= MIN_WORDS_TO_SEND) onMessageRef.current(full);
    }
    transcriptPartsRef.current = [];
    setLiveTranscript('');
  }, [_dlog]);

  const _resetSilenceTimer = useCallback(() => {
    if (silenceTimerRef.current) clearTimeout(silenceTimerRef.current);
    silenceTimerRef.current = setTimeout(() => {
      _finalizeTranscript();
    }, SILENCE_TIMEOUT_MS);
  }, [_finalizeTranscript]);

  const _startRecognition = useCallback(() => {
    const rec = recognitionRef.current;
    if (!rec) return;
    try {
      rec.start();
      _dlog('REC-START', 'ok');
    } catch {
      _dlog('REC-START', 'failed');
    }
  }, [_dlog]);

  const _stopRecognition = useCallback(() => {
    const rec = recognitionRef.current;
    if (!rec) return;
    try { rec.stop(); } catch { /* ignore */ }
  }, []);

  const _createRecognition = useCallback(() => {
    const SpeechRecognitionApi =
      (window as any).SpeechRecognition || (window as any).webkitSpeechRecognition;
    if (!SpeechRecognitionApi) return;

    const recognition = new SpeechRecognitionApi();
    recognition.continuous = true;
    recognition.interimResults = true;
    recognition.lang = 'de-DE';

    recognition.onspeechstart = () => {
      if (stateRef.current === 'botSpeaking') return;
      transcriptPartsRef.current = [];
      setLiveTranscript('');
      _resetSilenceTimer();
    };

    recognition.onresult = (event: SpeechRecognitionEvent) => {
      const ignore = stateRef.current === 'botSpeaking';
      const interimParts: string[] = [];
      // Walk only results we haven't consumed yet; final results advance the index.
      for (let i = processedIndexRef.current; i < event.results.length; i++) {
        const r = event.results[i];
        if (r.isFinal) {
          const text = r[0].transcript.trim();
          if (text && !ignore) transcriptPartsRef.current.push(text);
          processedIndexRef.current = i + 1;
        } else {
          if (ignore) continue;
          const text = r[0].transcript.trim();
          if (text) interimParts.push(text);
        }
      }
      if (ignore) return;
      const currentInterim = interimParts.join(' ');
      const preview = [...transcriptPartsRef.current, currentInterim].join(' ').trim();
      setLiveTranscript(preview);
      if (preview) _resetSilenceTimer();
    };

    recognition.onspeechend = () => {
      if (silenceTimerRef.current) clearTimeout(silenceTimerRef.current);
      if (stateRef.current === 'botSpeaking') {
        // Discard anything captured while the bot was talking (echo guard).
        transcriptPartsRef.current = [];
        setLiveTranscript('');
        return;
      }
      _finalizeTranscript();
    };

    recognition.onend = () => {
      _dlog('REC-END', `state=${stateRef.current}`);
      if (recognitionRef.current !== recognition) return;
      const cur = stateRef.current;
      if (cur === 'botSpeaking' || cur === 'muted' || cur === 'idle') return;
      processedIndexRef.current = 0;
      // Auto-restart after a short delay, but only if we are still supposed to listen.
      setTimeout(() => {
        if (recognitionRef.current !== recognition) return;
        if (stateRef.current !== 'listening' && stateRef.current !== 'interrupted') return;
        try {
          recognition.start();
          _dlog('REC-AUTOSTART', 'ok');
        } catch {
          _dlog('REC-AUTOSTART', 'failed');
        }
      }, REC_AUTORESTART_DELAY_MS);
    };

    recognition.onerror = (event: any) => {
      _dlog('REC-ERR', event.error);
      if (event.error === 'no-speech' || event.error === 'aborted') return;
      console.warn('SpeechRecognition error:', event.error);
    };

    recognitionRef.current = recognition;
    _startRecognition();
  }, [_dlog, _resetSilenceTimer, _finalizeTranscript, _startRecognition]);

  const activate = useCallback(async () => {
    if (stateRef.current !== 'idle') return;
    _setState('listening');
    try {
      if (!streamRef.current) {
        const stream = await navigator.mediaDevices.getUserMedia({
          audio: { echoCancellation: true, noiseSuppression: true },
        });
        streamRef.current = stream;
      }
      _createRecognition();
    } catch (err) {
      console.warn('Mic access failed:', err);
      _setState('idle');
    }
  }, [_setState, _createRecognition]);

  const deactivate = useCallback(() => {
    _setState('idle');
    if (silenceTimerRef.current) {
      clearTimeout(silenceTimerRef.current);
      silenceTimerRef.current = null;
    }
    if (recognitionRef.current) {
      try { recognitionRef.current.stop(); } catch { /* ignore */ }
      recognitionRef.current = null;
    }
    if (streamRef.current) {
      streamRef.current.getTracks().forEach(t => t.stop());
      streamRef.current = null;
    }
    transcriptPartsRef.current = [];
    processedIndexRef.current = 0;
    setLiveTranscript('');
  }, [_setState]);

  const ttsPlaying = useCallback(() => {
    const cur = stateRef.current;
    if (cur === 'muted') return;
    _setState('botSpeaking');
  }, [_setState]);

  const ttsPaused = useCallback(() => {
    const cur = stateRef.current;
    if (cur === 'botSpeaking') _setState('interrupted');
  }, [_setState]);

  const ttsEnded = useCallback(() => {
    const cur = stateRef.current;
    if (cur === 'botSpeaking' || cur === 'interrupted') _setState('listening');
  }, [_setState]);

  const toggleMute = useCallback(() => {
    const cur = stateRef.current;
    if (cur === 'muted') {
      _setState('listening');
      _startRecognition();
    } else if (cur === 'listening' || cur === 'interrupted') {
      _setState('muted');
      _stopRecognition();
    } else if (cur === 'botSpeaking') {
      _setState('muted');
    }
  }, [_setState, _startRecognition, _stopRecognition]);

  // Cleanup on unmount: clear the silence timer, stop recognition, release the mic.
  useEffect(() => {
    return () => {
      if (silenceTimerRef.current) clearTimeout(silenceTimerRef.current);
      if (recognitionRef.current) {
        try { recognitionRef.current.stop(); } catch { /* ignore */ }
        recognitionRef.current = null;
      }
      if (streamRef.current) {
        streamRef.current.getTracks().forEach(t => t.stop());
        streamRef.current = null;
      }
    };
  }, []);

  return {
    state,
    liveTranscript,
    activate,
    deactivate,
    ttsPlaying,
    ttsPaused,
    ttsEnded,
    toggleMute,
  };
}
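
/*
 * Usage sketch (illustrative only, not part of this module): a hypothetical
 * host component wires the controller to a chat send function and to an
 * <audio> element playing the bot's TTS, so the botSpeaking gating described
 * in the header comment works. `ChatVoicePanel`, `sendToBot`, and `ttsUrl`
 * are assumed names.
 *
 *   function ChatVoicePanel({ sendToBot, ttsUrl }: {
 *     sendToBot: (text: string) => void;
 *     ttsUrl: string | null;
 *   }) {
 *     const voice = useVoiceController(sendToBot);
 *     return (
 *       <div>
 *         {ttsUrl && (
 *           <audio
 *             src={ttsUrl}
 *             autoPlay
 *             onPlay={voice.ttsPlaying}   // -> botSpeaking: results ignored
 *             onPause={voice.ttsPaused}   // -> interrupted: user barged in
 *             onEnded={voice.ttsEnded}    // -> listening again
 *           />
 *         )}
 *         <button onClick={voice.state === 'idle' ? voice.activate : voice.deactivate}>
 *           {voice.state === 'idle' ? 'Start voice' : 'Stop voice'}
 *         </button>
 *         <button onClick={voice.toggleMute}>
 *           {voice.state === 'muted' ? 'Unmute' : 'Mute'}
 *         </button>
 *         <p>{voice.liveTranscript}</p>
 *       </div>
 *     );
 *   }
 */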