commcoach: voice state machine refactor + document handling + abort + status events

TASK 1: muted as orthogonal flag (separate from VoiceState)
TASK 2: AbortController in sendMessage (abort previous AI calls)
TASK 3: cancelPendingSpeech for text input
TASK 4: silenceTimer reduced to 1s

Made-with: Cursor
parent fe40097d33
commit 31bf734def
4 changed files with 98 additions and 55 deletions
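TASK 2 is the familiar abort-the-previous-request pattern: keep a ref to the last AbortController, abort it before starting a new streaming call, and pass the fresh signal into fetch. A minimal standalone sketch of that pattern follows (illustrative only, not the committed code; sendAborting, url and currentController are made-up names, the real wiring is in the diff below):

// Sketch of the TASK 2 pattern: each new send aborts the previous in-flight call.
let currentController: AbortController | null = null;

async function sendAborting(url: string, content: string): Promise<void> {
  // Abort whatever is still streaming from the last send.
  currentController?.abort();
  const ac = new AbortController();
  currentController = ac;

  try {
    const response = await fetch(url, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ content }),
      signal: ac.signal, // fetch rejects with an AbortError once abort() is called
    });
    if (!response.ok) throw new Error(`HTTP ${response.status}`);
    // ... consume the SSE/stream body here ...
  } catch (err) {
    // Aborted requests are expected when the user sends again; swallow them.
    if (err instanceof DOMException && err.name === 'AbortError') return;
    throw err;
  }
}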
@@ -306,6 +306,7 @@ export async function sendMessageStreamApi(
   onEvent: (event: SSEEvent) => void,
   onError?: (error: Error) => void,
   onComplete?: () => void,
+  signal?: AbortSignal,
 ): Promise<void> {
   try {
     const baseURL = api.defaults.baseURL || '';
@@ -322,6 +323,7 @@ export async function sendMessageStreamApi(
       headers,
       body: JSON.stringify({ content }),
       credentials: 'include',
+      signal,
     });

     if (!response.ok) {

@@ -91,6 +91,7 @@ export function useCommcoach(): CommcoachHookReturn {

   const isMountedRef = useRef(true);
   const currentAudioRef = useRef<HTMLAudioElement | null>(null);
+  const abortControllerRef = useRef<AbortController | null>(null);
   const onTtsEventRef = useRef<((event: TtsEvent) => void) | null>(null);
   const onDocumentCreatedRef = useRef<((doc: any) => void) | null>(null);

@@ -337,6 +338,11 @@ export function useCommcoach(): CommcoachHookReturn {
   const sendMessage = useCallback(async (content: string) => {
     const normalizedContent = content.trim();
     if (!normalizedContent || !instanceId || !session) return;
+
+    abortControllerRef.current?.abort();
+    const ac = new AbortController();
+    abortControllerRef.current = ac;
+
     if (currentAudioRef.current) {
       currentAudioRef.current.pause();
       currentAudioRef.current = null;
@@ -364,7 +370,7 @@ export function useCommcoach(): CommcoachHookReturn {
         session.id,
         normalizedContent,
         (event: SSEEvent) => {
-          if (!isMountedRef.current) return;
+          if (!isMountedRef.current || ac.signal.aborted) return;
           const eventType = event.type;
           const eventData = event.data;

@@ -404,6 +410,7 @@ export function useCommcoach(): CommcoachHookReturn {
           }
         },
         (err) => {
+          if (err.name === 'AbortError') return;
           if (isMountedRef.current) {
             setError(err.message);
             setIsStreaming(false);
@@ -417,8 +424,10 @@ export function useCommcoach(): CommcoachHookReturn {
             setStreamingMessage(null);
           }
         },
+        ac.signal,
       );
     } catch (err: any) {
+      if (err.name === 'AbortError') return;
       if (isMountedRef.current) {
         setError(err.message);
         setIsStreaming(false);

@@ -120,8 +120,9 @@ export const CommcoachDossierView: React.FC = () => {

   const handleSend = useCallback(async () => {
     if (!coach.inputValue.trim() || coach.isStreaming) return;
+    voice.cancelPendingSpeech();
     await coach.sendMessage(coach.inputValue);
-  }, [coach]);
+  }, [coach, voice]);

   const handleKeyDown = useCallback((e: React.KeyboardEvent) => {
     if (e.key === 'Enter' && !e.shiftKey) { e.preventDefault(); handleSend(); }
@@ -340,11 +341,11 @@
             <button className={styles.btnSmall} onClick={handleResumeTts}>Weitersprechen</button>
           )}
           <button
-            className={`${styles.btnSmall} ${voice.state === 'muted' ? styles.mutedActive : ''}`}
+            className={`${styles.btnSmall} ${voice.muted ? styles.mutedActive : ''}`}
             onClick={voice.toggleMute}
-            title={voice.state === 'muted' ? 'Stummschaltung aufheben' : 'Stummschalten'}
+            title={voice.muted ? 'Stummschaltung aufheben' : 'Stummschalten'}
           >
-            {voice.state === 'muted' ? '\u{1F507} Stumm' : '\u{1F3A4} Ton an'}
+            {voice.muted ? '\u{1F507} Stumm' : '\u{1F3A4} Ton an'}
           </button>
           <button className={styles.btnSmall} onClick={coach.completeSession} disabled={!!coach.actionLoading}>
             {coach.actionLoading === 'completing' ? 'Wird abgeschlossen...' : 'Abschliessen'}
@@ -391,7 +392,7 @@ export const CommcoachDossierView: React.FC = () => {
         <div className={styles.inputArea}>
           <div className={styles.voiceStatus}>
             <span className={`${styles.voiceIndicator} ${voice.state === 'listening' ? styles.voiceActive : ''}`}>
-              {voice.state === 'muted'
+              {voice.muted
                 ? 'Stumm – Mikrofon aus'
                 : voice.state === 'botSpeaking'
                 ? (coach.streamingStatus || 'Coach spricht...')

@@ -1,22 +1,24 @@
 /**
  * Voice Controller - imperative state machine for CommCoach voice interaction.
  *
- * States: idle | listening | botSpeaking | interrupted | muted
+ * States: idle | listening | botSpeaking | interrupted
+ * Muted: orthogonal boolean flag (independent of main state)
  *
  * Key principle: SpeechRecognition is created once and lives until deactivate().
- * When botSpeaking, we ignore onresult events instead of stopping recognition.
+ * Recognition is STOPPED during botSpeaking or when muted=true.
+ * Recognition is STARTED when entering listening/interrupted AND muted=false.
+ * Each start() creates a fresh results session (processedIndex resets to 0).
  */

 import { useState, useRef, useCallback, useEffect } from 'react';

-export type VoiceState = 'idle' | 'listening' | 'botSpeaking' | 'interrupted' | 'muted';
+export type VoiceState = 'idle' | 'listening' | 'botSpeaking' | 'interrupted';

-const SILENCE_TIMEOUT_MS = 1500;
-const MIN_WORDS_TO_SEND = 4;
+const SILENCE_TIMEOUT_MS = 1000;
 const REC_AUTORESTART_DELAY_MS = 300;

 export interface VoiceControllerApi {
   state: VoiceState;
+  muted: boolean;
   liveTranscript: string;
   activate: () => void;
   deactivate: () => void;
@@ -24,12 +26,15 @@ export interface VoiceControllerApi {
   ttsPaused: () => void;
   ttsEnded: () => void;
   toggleMute: () => void;
+  cancelPendingSpeech: () => void;
 }

 export function useVoiceController(onMessage: (text: string) => void): VoiceControllerApi {
   const [state, setState] = useState<VoiceState>('idle');
+  const [muted, setMuted] = useState(false);
   const [liveTranscript, setLiveTranscript] = useState('');
   const stateRef = useRef<VoiceState>('idle');
+  const mutedRef = useRef(false);
   const streamRef = useRef<MediaStream | null>(null);
   const recognitionRef = useRef<SpeechRecognition | null>(null);
   const transcriptPartsRef = useRef<string[]>([]);
@@ -53,25 +58,36 @@ export function useVoiceController(onMessage: (text: string) => void): VoiceCont
     setState(next);
   }, [_dlog]);

+  const _setMuted = useCallback((next: boolean) => {
+    mutedRef.current = next;
+    setMuted(next);
+    _dlog('MUTED', String(next));
+  }, [_dlog]);
+
+  const _cancelSilenceTimer = useCallback(() => {
+    if (silenceTimerRef.current) {
+      clearTimeout(silenceTimerRef.current);
+      silenceTimerRef.current = null;
+    }
+  }, []);
+
   const _finalizeTranscript = useCallback(() => {
     const full = transcriptPartsRef.current.join(' ').trim();
-    _dlog('SEND', `words=${full.split(/\s+/).filter(Boolean).length} "${full.substring(0, 60)}"`);
-    if (full) {
-      const wordCount = full.split(/\s+/).filter(Boolean).length;
-      if (wordCount >= MIN_WORDS_TO_SEND) onMessageRef.current(full);
-    }
+    _dlog('SEND', `"${full.substring(0, 80)}"`);
+    if (full) onMessageRef.current(full);
     transcriptPartsRef.current = [];
     setLiveTranscript('');
   }, [_dlog]);

   const _resetSilenceTimer = useCallback(() => {
-    if (silenceTimerRef.current) clearTimeout(silenceTimerRef.current);
+    _cancelSilenceTimer();
     silenceTimerRef.current = setTimeout(() => {
       _finalizeTranscript();
     }, SILENCE_TIMEOUT_MS);
-  }, [_finalizeTranscript]);
+  }, [_cancelSilenceTimer, _finalizeTranscript]);

   const _startRecognition = useCallback(() => {
+    if (mutedRef.current) return;
     const rec = recognitionRef.current;
     if (!rec) return;
     try {
@@ -102,28 +118,24 @@ export function useVoiceController(onMessage: (text: string) => void): VoiceCont
     recognition.lang = 'de-DE';

     recognition.onspeechstart = () => {
-      if (stateRef.current === 'botSpeaking') return;
-      transcriptPartsRef.current = [];
-      setLiveTranscript('');
+      if (stateRef.current !== 'listening' && stateRef.current !== 'interrupted') return;
       _resetSilenceTimer();
     };

     recognition.onresult = (event: SpeechRecognitionEvent) => {
-      const ignore = stateRef.current === 'botSpeaking';
+      if (stateRef.current !== 'listening' && stateRef.current !== 'interrupted') return;
       const interimParts: string[] = [];
       for (let i = processedIndexRef.current; i < event.results.length; i++) {
         const r = event.results[i];
         if (r.isFinal) {
           const text = r[0].transcript.trim();
-          if (text && !ignore) transcriptPartsRef.current.push(text);
+          if (text) transcriptPartsRef.current.push(text);
           processedIndexRef.current = i + 1;
         } else {
-          if (ignore) continue;
           const text = r[0].transcript.trim();
           if (text) interimParts.push(text);
         }
       }
-      if (ignore) return;
       const currentInterim = interimParts.join(' ');
       const preview = [...transcriptPartsRef.current, currentInterim].join(' ').trim();
       setLiveTranscript(preview);
@@ -131,24 +143,20 @@ export function useVoiceController(onMessage: (text: string) => void): VoiceCont
     };

     recognition.onspeechend = () => {
-      if (silenceTimerRef.current) clearTimeout(silenceTimerRef.current);
-      if (stateRef.current === 'botSpeaking') {
-        transcriptPartsRef.current = [];
-        setLiveTranscript('');
-        return;
-      }
-      _finalizeTranscript();
+      if (stateRef.current !== 'listening' && stateRef.current !== 'interrupted') return;
+      _resetSilenceTimer();
     };

     recognition.onend = () => {
-      _dlog('REC-END', `state=${stateRef.current}`);
+      _dlog('REC-END', `state=${stateRef.current} muted=${mutedRef.current}`);
       if (recognitionRef.current !== recognition) return;
       const cur = stateRef.current;
-      if (cur === 'botSpeaking' || cur === 'muted' || cur === 'idle') return;
+      if (cur === 'botSpeaking' || cur === 'idle' || mutedRef.current) return;
       processedIndexRef.current = 0;
       setTimeout(() => {
         if (recognitionRef.current !== recognition) return;
         if (stateRef.current !== 'listening' && stateRef.current !== 'interrupted') return;
+        if (mutedRef.current) return;
         try {
           recognition.start();
           _dlog('REC-AUTOSTART', 'ok');
@@ -166,11 +174,14 @@ export function useVoiceController(onMessage: (text: string) => void): VoiceCont

     recognitionRef.current = recognition;
+    _startRecognition();
-  }, [_dlog, _resetSilenceTimer, _finalizeTranscript, _startRecognition]);
+  }, [_dlog, _resetSilenceTimer, _startRecognition]);

   const activate = useCallback(async () => {
     if (stateRef.current !== 'idle') return;
     _setState('listening');
+    transcriptPartsRef.current = [];
+    processedIndexRef.current = 0;
     setLiveTranscript('');

     try {
       if (!streamRef.current) {
@@ -187,11 +198,8 @@ export function useVoiceController(onMessage: (text: string) => void): VoiceCont
   }, [_setState, _createRecognition]);

   const deactivate = useCallback(() => {
+    _cancelSilenceTimer();
     _setState('idle');
-    if (silenceTimerRef.current) {
-      clearTimeout(silenceTimerRef.current);
-      silenceTimerRef.current = null;
-    }
     if (recognitionRef.current) {
       try { recognitionRef.current.stop(); } catch { /* ignore */ }
       recognitionRef.current = null;
@@ -203,36 +211,57 @@ export function useVoiceController(onMessage: (text: string) => void): VoiceCont
     transcriptPartsRef.current = [];
     processedIndexRef.current = 0;
     setLiveTranscript('');
-  }, [_setState]);
+  }, [_setState, _cancelSilenceTimer]);

   const ttsPlaying = useCallback(() => {
     const cur = stateRef.current;
-    if (cur === 'muted') return;
+    if (cur === 'idle') return;
+    _cancelSilenceTimer();
     _finalizeTranscript();
+    _stopRecognition();
     _setState('botSpeaking');
-  }, [_setState]);
+  }, [_setState, _cancelSilenceTimer, _finalizeTranscript, _stopRecognition]);

   const ttsPaused = useCallback(() => {
     const cur = stateRef.current;
-    if (cur === 'botSpeaking') _setState('interrupted');
-  }, [_setState]);
+    if (cur !== 'botSpeaking') return;
+    transcriptPartsRef.current = [];
+    processedIndexRef.current = 0;
+    setLiveTranscript('');
+    _setState('interrupted');
+    _startRecognition();
+  }, [_setState, _startRecognition]);

   const ttsEnded = useCallback(() => {
     const cur = stateRef.current;
-    if (cur === 'botSpeaking' || cur === 'interrupted') _setState('listening');
-  }, [_setState]);
+    if (cur !== 'botSpeaking' && cur !== 'interrupted') return;
+    transcriptPartsRef.current = [];
+    processedIndexRef.current = 0;
+    setLiveTranscript('');
+    _setState('listening');
+    _startRecognition();
+  }, [_setState, _startRecognition]);

   const toggleMute = useCallback(() => {
     const cur = stateRef.current;
-    if (cur === 'muted') {
-      _setState('listening');
-      _startRecognition();
-    } else if (cur === 'listening' || cur === 'interrupted') {
-      _setState('muted');
-    } else if (cur === 'botSpeaking') {
-      _setState('muted');
+    if (cur === 'idle') return;
+    if (mutedRef.current) {
+      _setMuted(false);
+      if (cur === 'listening' || cur === 'interrupted') {
+        _startRecognition();
+      }
+    } else {
+      _setMuted(true);
+      _stopRecognition();
     }
-  }, [_setState, _startRecognition, _stopRecognition]);
+  }, [_setMuted, _startRecognition, _stopRecognition]);

+  const cancelPendingSpeech = useCallback(() => {
+    _cancelSilenceTimer();
+    transcriptPartsRef.current = [];
+    setLiveTranscript('');
+    _dlog('CANCEL-SPEECH', 'pending speech cleared for text input');
+  }, [_cancelSilenceTimer, _dlog]);
+
   useEffect(() => {
     return () => {
@@ -250,6 +279,7 @@ export function useVoiceController(onMessage: (text: string) => void): VoiceCont

   return {
     state,
+    muted,
     liveTranscript,
     activate,
     deactivate,
@@ -257,5 +287,6 @@ export function useVoiceController(onMessage: (text: string) => void): VoiceCont
     ttsPaused,
     ttsEnded,
     toggleMute,
+    cancelPendingSpeech,
   };
 }
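With TASK 1, muted is no longer a VoiceState value; consumers read the flag alongside the main state. A rough usage sketch against the VoiceControllerApi shape above (illustrative consumer code, not part of the commit; describeVoice is a made-up helper):

// The orthogonal muted flag drives the mic label; state drives the rest.
function describeVoice(v: { state: VoiceState; muted: boolean }): string {
  if (v.muted) return 'Muted - microphone off'; // flag checked first, independent of state
  switch (v.state) {
    case 'botSpeaking':
      return 'Coach is speaking...';
    case 'listening':
    case 'interrupted':
      return 'Listening...';
    default:
      return 'Idle';
  }
}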