frontend_nyla/src/pages/views/commcoach/useVoiceController.ts

/**
* Voice Controller - imperative state machine for CommCoach voice interaction.
*
* States: idle | listening | botSpeaking | interrupted | muted
*
* Key principle: SpeechRecognition is created once and lives until deactivate().
* When botSpeaking, we ignore onresult events instead of stopping recognition.
*/
import { useState, useRef, useCallback, useEffect } from 'react';
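/**
 * idle        – controller inactive, microphone released.
 * listening   – mic open; speech is transcribed and finalized after silence.
 * botSpeaking – TTS is playing; recognition keeps running but results are ignored.
 * interrupted – TTS paused mid-playback; user speech is processed again.
 * muted       – user muted the mic; recognition is stopped until unmuted.
 */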
export type VoiceState = 'idle' | 'listening' | 'botSpeaking' | 'interrupted' | 'muted';
/** Silence duration after which the pending transcript is finalized and sent. */
const SILENCE_TIMEOUT_MS = 1500;
/** Utterances shorter than this many words are discarded as noise. */
const MIN_WORDS_TO_SEND = 4;
/** Delay before auto-restarting recognition after the browser ends a session. */
const REC_AUTORESTART_DELAY_MS = 300;
export interface VoiceControllerApi {
  state: VoiceState;
  /** Live preview of the current utterance (finalized parts + interim results). */
  liveTranscript: string;
  /** Request mic access and start recognition; resolves once setup completes. */
  activate: () => Promise<void>;
  /** Stop recognition, release the mic, and reset all transcript state. */
  deactivate: () => void;
  /** Host notifications about TTS playback state. */
  ttsPlaying: () => void;
  ttsPaused: () => void;
  ttsEnded: () => void;
  toggleMute: () => void;
}
export function useVoiceController(onMessage: (text: string) => void): VoiceControllerApi {
  const [state, setState] = useState<VoiceState>('idle');
  const [liveTranscript, setLiveTranscript] = useState('');
  // Mirror of `state` so long-lived event handlers always read the latest value.
  const stateRef = useRef<VoiceState>('idle');
  const streamRef = useRef<MediaStream | null>(null);
  const recognitionRef = useRef<SpeechRecognition | null>(null);
  // Finalized transcript segments of the current utterance.
  const transcriptPartsRef = useRef<string[]>([]);
  // Index of the next unconsumed entry in event.results.
  const processedIndexRef = useRef(0);
  const silenceTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
  // Keep the latest onMessage callback without re-wiring recognition handlers.
  const onMessageRef = useRef(onMessage);
  onMessageRef.current = onMessage;
  // Timestamped debug logging; forwards to a global hook when one is installed.
  const _dlog = useCallback((tag: string, info?: string) => {
    const t = new Date();
    const ts = `${t.getMinutes()}:${String(t.getSeconds()).padStart(2, '0')}.${String(t.getMilliseconds()).padStart(3, '0')}`;
    const entry = `[${ts}] ${tag}${info ? ' ' + info : ''}`;
    (window as any).__dlog?.(entry);
  }, []);
  const _setState = useCallback((next: VoiceState) => {
    const prev = stateRef.current;
    if (prev === next) return;
    _dlog('VOICE', `${prev} -> ${next}`);
    stateRef.current = next;
    setState(next);
  }, [_dlog]);
  const _finalizeTranscript = useCallback(() => {
    const full = transcriptPartsRef.current.join(' ').trim();
    const wordCount = full.split(/\s+/).filter(Boolean).length;
    _dlog('SEND', `words=${wordCount} "${full.substring(0, 60)}"`);
    // Very short utterances are usually noise or truncated speech; drop them.
    if (full && wordCount >= MIN_WORDS_TO_SEND) onMessageRef.current(full);
    transcriptPartsRef.current = [];
    setLiveTranscript('');
  }, [_dlog]);
  const _resetSilenceTimer = useCallback(() => {
    if (silenceTimerRef.current) clearTimeout(silenceTimerRef.current);
    silenceTimerRef.current = setTimeout(() => {
      _finalizeTranscript();
    }, SILENCE_TIMEOUT_MS);
  }, [_finalizeTranscript]);
  const _startRecognition = useCallback(() => {
    const rec = recognitionRef.current;
    if (!rec) return;
    try {
      // start() throws if recognition is already running; that is fine here.
      rec.start();
      _dlog('REC-START', 'ok');
    } catch {
      _dlog('REC-START', 'failed');
    }
  }, [_dlog]);
  const _stopRecognition = useCallback(() => {
    const rec = recognitionRef.current;
    if (!rec) return;
    try {
      rec.stop();
    } catch {
      /* ignore */
    }
  }, []);
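  // Recognition lifecycle: a single SpeechRecognition instance is created per
  // activation. Browsers can end a continuous session on their own (Chrome in
  // particular does so after silence or network errors), so the onend handler
  // below restarts the same instance while we remain in a listening-like state.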
  const _createRecognition = useCallback(() => {
    const SpeechRecognitionApi = (window as any).SpeechRecognition || (window as any).webkitSpeechRecognition;
    if (!SpeechRecognitionApi) return;
    const recognition = new SpeechRecognitionApi();
    recognition.continuous = true;
    recognition.interimResults = true;
    recognition.lang = 'de-DE';
    recognition.onspeechstart = () => {
      if (stateRef.current === 'botSpeaking') return;
      // A new utterance begins: discard stale parts and arm the silence timer.
      transcriptPartsRef.current = [];
      setLiveTranscript('');
      _resetSilenceTimer();
    };
    recognition.onresult = (event: SpeechRecognitionEvent) => {
      // While the bot speaks we still iterate over final results (to keep
      // processedIndexRef in sync) but never store or display them.
      const ignore = stateRef.current === 'botSpeaking';
      const interimParts: string[] = [];
      for (let i = processedIndexRef.current; i < event.results.length; i++) {
        const r = event.results[i];
        if (r.isFinal) {
          const text = r[0].transcript.trim();
          if (text && !ignore) transcriptPartsRef.current.push(text);
          processedIndexRef.current = i + 1;
        } else {
          if (ignore) continue;
          const text = r[0].transcript.trim();
          if (text) interimParts.push(text);
        }
      }
      if (ignore) return;
      const currentInterim = interimParts.join(' ');
      const preview = [...transcriptPartsRef.current, currentInterim].join(' ').trim();
      setLiveTranscript(preview);
      if (preview) _resetSilenceTimer();
    };
    recognition.onspeechend = () => {
      if (silenceTimerRef.current) clearTimeout(silenceTimerRef.current);
      // Speech that overlapped bot playback is discarded, not sent.
      if (stateRef.current === 'botSpeaking') {
        transcriptPartsRef.current = [];
        setLiveTranscript('');
        return;
      }
      _finalizeTranscript();
    };
    recognition.onend = () => {
      _dlog('REC-END', `state=${stateRef.current}`);
      // A newer instance may have replaced this one (e.g. after deactivate()).
      if (recognitionRef.current !== recognition) return;
      // Any future results belong to a fresh session, so reset the cursor
      // before deciding whether to restart.
      processedIndexRef.current = 0;
      const cur = stateRef.current;
      if (cur === 'botSpeaking' || cur === 'muted' || cur === 'idle') return;
      setTimeout(() => {
        if (recognitionRef.current !== recognition) return;
        if (stateRef.current !== 'listening' && stateRef.current !== 'interrupted') return;
        try {
          recognition.start();
          _dlog('REC-AUTOSTART', 'ok');
        } catch {
          _dlog('REC-AUTOSTART', 'failed');
        }
      }, REC_AUTORESTART_DELAY_MS);
    };
    recognition.onerror = (event: any) => {
      _dlog('REC-ERR', event.error);
      // 'no-speech' and 'aborted' are routine; onend handles any restart.
      if (event.error === 'no-speech' || event.error === 'aborted') return;
      console.warn('SpeechRecognition error:', event.error);
    };
    recognitionRef.current = recognition;
    _startRecognition();
  }, [_dlog, _resetSilenceTimer, _finalizeTranscript, _startRecognition]);
  const activate = useCallback(async () => {
    if (stateRef.current !== 'idle') return;
    // Enter 'listening' optimistically; reverted below if mic access fails.
    _setState('listening');
    try {
      if (!streamRef.current) {
        // SpeechRecognition does its own capture; the stream is acquired so the
        // permission prompt happens here and the mic stays held for the session.
        const stream = await navigator.mediaDevices.getUserMedia({
          audio: { echoCancellation: true, noiseSuppression: true },
        });
        streamRef.current = stream;
      }
      _createRecognition();
    } catch (err) {
      console.warn('Mic access failed:', err);
      _setState('idle');
    }
  }, [_setState, _createRecognition]);
  const deactivate = useCallback(() => {
    _setState('idle');
    if (silenceTimerRef.current) {
      clearTimeout(silenceTimerRef.current);
      silenceTimerRef.current = null;
    }
    if (recognitionRef.current) {
      try { recognitionRef.current.stop(); } catch { /* ignore */ }
      recognitionRef.current = null;
    }
    if (streamRef.current) {
      streamRef.current.getTracks().forEach(t => t.stop());
      streamRef.current = null;
    }
    transcriptPartsRef.current = [];
    processedIndexRef.current = 0;
    setLiveTranscript('');
  }, [_setState]);
  const ttsPlaying = useCallback(() => {
    if (stateRef.current === 'muted') return;
    // Cancel any pending silence timer so a half-captured utterance cannot be
    // finalized and sent while the bot is speaking (e.g. when playback resumes
    // after an interruption before the timer fires).
    if (silenceTimerRef.current) {
      clearTimeout(silenceTimerRef.current);
      silenceTimerRef.current = null;
    }
    _setState('botSpeaking');
  }, [_setState]);
  const ttsPaused = useCallback(() => {
    if (stateRef.current === 'botSpeaking') _setState('interrupted');
  }, [_setState]);
  const ttsEnded = useCallback(() => {
    const cur = stateRef.current;
    if (cur === 'botSpeaking' || cur === 'interrupted') {
      _setState('listening');
      // The browser may have ended recognition during playback (onend does not
      // restart while botSpeaking); restarting is a harmless no-op otherwise.
      _startRecognition();
    }
  }, [_setState, _startRecognition]);
  const toggleMute = useCallback(() => {
    const cur = stateRef.current;
    if (cur === 'muted') {
      _setState('listening');
      _startRecognition();
    } else if (cur === 'listening' || cur === 'interrupted') {
      _setState('muted');
      _stopRecognition();
    } else if (cur === 'botSpeaking') {
      _setState('muted');
      // Recognition is still running during botSpeaking; stop it here too so
      // its handlers do not keep transcribing user speech while muted.
      _stopRecognition();
    }
  }, [_setState, _startRecognition, _stopRecognition]);
  // Safety net: release the timer, recognition, and mic if the component
  // unmounts without an explicit deactivate().
  useEffect(() => {
    return () => {
      if (silenceTimerRef.current) clearTimeout(silenceTimerRef.current);
      if (recognitionRef.current) {
        try { recognitionRef.current.stop(); } catch { /* ignore */ }
        recognitionRef.current = null;
      }
      if (streamRef.current) {
        streamRef.current.getTracks().forEach(t => t.stop());
        streamRef.current = null;
      }
    };
  }, []);
  return {
    state,
    liveTranscript,
    activate,
    deactivate,
    ttsPlaying,
    ttsPaused,
    ttsEnded,
    toggleMute,
  };
}
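/*
 * Usage sketch (hypothetical host component, not part of this module). The
 * component name, the `useTts` hook, its event names, and the `/chat` endpoint
 * are illustrative assumptions; only the VoiceControllerApi surface is real.
 *
 *   function CoachView() {
 *     const sendMessage = (text: string) => api.post('/chat', { text }); // assumed transport
 *     const voice = useVoiceController(sendMessage);
 *
 *     // Wire TTS playback events into the controller's state machine.
 *     useTts({
 *       onPlay: voice.ttsPlaying,
 *       onPause: voice.ttsPaused,
 *       onEnd: voice.ttsEnded,
 *     });
 *
 *     return (
 *       <div>
 *         <button onClick={() => (voice.state === 'idle' ? voice.activate() : voice.deactivate())}>
 *           {voice.state === 'idle' ? 'Start' : 'Stop'}
 *         </button>
 *         <button onClick={voice.toggleMute}>Mute</button>
 *         <p>{voice.liveTranscript}</p>
 *       </div>
 *     );
 *   }
 */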