commcoach: voice state machine refactor + document handling + abort + status events

TASK 1: muted as orthogonal flag (separate from VoiceState)
TASK 2: AbortController in sendMessage (abort previous AI calls)
TASK 3: cancelPendingSpeech for text input
TASK 4: silenceTimer reduced to 1s

Made-with: Cursor
This commit is contained in:
ValueOn AG 2026-03-07 01:02:15 +01:00
parent fe40097d33
commit 31bf734def
4 changed files with 98 additions and 55 deletions

View file

@ -306,6 +306,7 @@ export async function sendMessageStreamApi(
onEvent: (event: SSEEvent) => void, onEvent: (event: SSEEvent) => void,
onError?: (error: Error) => void, onError?: (error: Error) => void,
onComplete?: () => void, onComplete?: () => void,
signal?: AbortSignal,
): Promise<void> { ): Promise<void> {
try { try {
const baseURL = api.defaults.baseURL || ''; const baseURL = api.defaults.baseURL || '';
@ -322,6 +323,7 @@ export async function sendMessageStreamApi(
headers, headers,
body: JSON.stringify({ content }), body: JSON.stringify({ content }),
credentials: 'include', credentials: 'include',
signal,
}); });
if (!response.ok) { if (!response.ok) {

View file

@ -91,6 +91,7 @@ export function useCommcoach(): CommcoachHookReturn {
const isMountedRef = useRef(true); const isMountedRef = useRef(true);
const currentAudioRef = useRef<HTMLAudioElement | null>(null); const currentAudioRef = useRef<HTMLAudioElement | null>(null);
const abortControllerRef = useRef<AbortController | null>(null);
const onTtsEventRef = useRef<((event: TtsEvent) => void) | null>(null); const onTtsEventRef = useRef<((event: TtsEvent) => void) | null>(null);
const onDocumentCreatedRef = useRef<((doc: any) => void) | null>(null); const onDocumentCreatedRef = useRef<((doc: any) => void) | null>(null);
@ -337,6 +338,11 @@ export function useCommcoach(): CommcoachHookReturn {
const sendMessage = useCallback(async (content: string) => { const sendMessage = useCallback(async (content: string) => {
const normalizedContent = content.trim(); const normalizedContent = content.trim();
if (!normalizedContent || !instanceId || !session) return; if (!normalizedContent || !instanceId || !session) return;
abortControllerRef.current?.abort();
const ac = new AbortController();
abortControllerRef.current = ac;
if (currentAudioRef.current) { if (currentAudioRef.current) {
currentAudioRef.current.pause(); currentAudioRef.current.pause();
currentAudioRef.current = null; currentAudioRef.current = null;
@ -364,7 +370,7 @@ export function useCommcoach(): CommcoachHookReturn {
session.id, session.id,
normalizedContent, normalizedContent,
(event: SSEEvent) => { (event: SSEEvent) => {
if (!isMountedRef.current) return; if (!isMountedRef.current || ac.signal.aborted) return;
const eventType = event.type; const eventType = event.type;
const eventData = event.data; const eventData = event.data;
@ -404,6 +410,7 @@ export function useCommcoach(): CommcoachHookReturn {
} }
}, },
(err) => { (err) => {
if (err.name === 'AbortError') return;
if (isMountedRef.current) { if (isMountedRef.current) {
setError(err.message); setError(err.message);
setIsStreaming(false); setIsStreaming(false);
@ -417,8 +424,10 @@ export function useCommcoach(): CommcoachHookReturn {
setStreamingMessage(null); setStreamingMessage(null);
} }
}, },
ac.signal,
); );
} catch (err: any) { } catch (err: any) {
if (err.name === 'AbortError') return;
if (isMountedRef.current) { if (isMountedRef.current) {
setError(err.message); setError(err.message);
setIsStreaming(false); setIsStreaming(false);

View file

@ -120,8 +120,9 @@ export const CommcoachDossierView: React.FC = () => {
const handleSend = useCallback(async () => { const handleSend = useCallback(async () => {
if (!coach.inputValue.trim() || coach.isStreaming) return; if (!coach.inputValue.trim() || coach.isStreaming) return;
voice.cancelPendingSpeech();
await coach.sendMessage(coach.inputValue); await coach.sendMessage(coach.inputValue);
}, [coach]); }, [coach, voice]);
const handleKeyDown = useCallback((e: React.KeyboardEvent) => { const handleKeyDown = useCallback((e: React.KeyboardEvent) => {
if (e.key === 'Enter' && !e.shiftKey) { e.preventDefault(); handleSend(); } if (e.key === 'Enter' && !e.shiftKey) { e.preventDefault(); handleSend(); }
@ -340,11 +341,11 @@ export const CommcoachDossierView: React.FC = () => {
<button className={styles.btnSmall} onClick={handleResumeTts}>Weitersprechen</button> <button className={styles.btnSmall} onClick={handleResumeTts}>Weitersprechen</button>
)} )}
<button <button
className={`${styles.btnSmall} ${voice.state === 'muted' ? styles.mutedActive : ''}`} className={`${styles.btnSmall} ${voice.muted ? styles.mutedActive : ''}`}
onClick={voice.toggleMute} onClick={voice.toggleMute}
title={voice.state === 'muted' ? 'Stummschaltung aufheben' : 'Stummschalten'} title={voice.muted ? 'Stummschaltung aufheben' : 'Stummschalten'}
> >
{voice.state === 'muted' ? '\u{1F507} Stumm' : '\u{1F3A4} Ton an'} {voice.muted ? '\u{1F507} Stumm' : '\u{1F3A4} Ton an'}
</button> </button>
<button className={styles.btnSmall} onClick={coach.completeSession} disabled={!!coach.actionLoading}> <button className={styles.btnSmall} onClick={coach.completeSession} disabled={!!coach.actionLoading}>
{coach.actionLoading === 'completing' ? 'Wird abgeschlossen...' : 'Abschliessen'} {coach.actionLoading === 'completing' ? 'Wird abgeschlossen...' : 'Abschliessen'}
@ -391,7 +392,7 @@ export const CommcoachDossierView: React.FC = () => {
<div className={styles.inputArea}> <div className={styles.inputArea}>
<div className={styles.voiceStatus}> <div className={styles.voiceStatus}>
<span className={`${styles.voiceIndicator} ${voice.state === 'listening' ? styles.voiceActive : ''}`}> <span className={`${styles.voiceIndicator} ${voice.state === 'listening' ? styles.voiceActive : ''}`}>
{voice.state === 'muted' {voice.muted
? 'Stumm – Mikrofon aus' ? 'Stumm – Mikrofon aus'
: voice.state === 'botSpeaking' : voice.state === 'botSpeaking'
? (coach.streamingStatus || 'Coach spricht...') ? (coach.streamingStatus || 'Coach spricht...')

View file

@ -1,22 +1,24 @@
/** /**
* Voice Controller - imperative state machine for CommCoach voice interaction. * Voice Controller - imperative state machine for CommCoach voice interaction.
* *
* States: idle | listening | botSpeaking | interrupted | muted * States: idle | listening | botSpeaking | interrupted
* Muted: orthogonal boolean flag (independent of main state)
* *
* Key principle: SpeechRecognition is created once and lives until deactivate(). * Recognition is STOPPED during botSpeaking or when muted=true.
* When botSpeaking, we ignore onresult events instead of stopping recognition. * Recognition is STARTED when entering listening/interrupted AND muted=false.
* Each start() creates a fresh results session (processedIndex resets to 0).
*/ */
import { useState, useRef, useCallback, useEffect } from 'react'; import { useState, useRef, useCallback, useEffect } from 'react';
export type VoiceState = 'idle' | 'listening' | 'botSpeaking' | 'interrupted' | 'muted'; export type VoiceState = 'idle' | 'listening' | 'botSpeaking' | 'interrupted';
const SILENCE_TIMEOUT_MS = 1500; const SILENCE_TIMEOUT_MS = 1000;
const MIN_WORDS_TO_SEND = 4;
const REC_AUTORESTART_DELAY_MS = 300; const REC_AUTORESTART_DELAY_MS = 300;
export interface VoiceControllerApi { export interface VoiceControllerApi {
state: VoiceState; state: VoiceState;
muted: boolean;
liveTranscript: string; liveTranscript: string;
activate: () => void; activate: () => void;
deactivate: () => void; deactivate: () => void;
@ -24,12 +26,15 @@ export interface VoiceControllerApi {
ttsPaused: () => void; ttsPaused: () => void;
ttsEnded: () => void; ttsEnded: () => void;
toggleMute: () => void; toggleMute: () => void;
cancelPendingSpeech: () => void;
} }
export function useVoiceController(onMessage: (text: string) => void): VoiceControllerApi { export function useVoiceController(onMessage: (text: string) => void): VoiceControllerApi {
const [state, setState] = useState<VoiceState>('idle'); const [state, setState] = useState<VoiceState>('idle');
const [muted, setMuted] = useState(false);
const [liveTranscript, setLiveTranscript] = useState(''); const [liveTranscript, setLiveTranscript] = useState('');
const stateRef = useRef<VoiceState>('idle'); const stateRef = useRef<VoiceState>('idle');
const mutedRef = useRef(false);
const streamRef = useRef<MediaStream | null>(null); const streamRef = useRef<MediaStream | null>(null);
const recognitionRef = useRef<SpeechRecognition | null>(null); const recognitionRef = useRef<SpeechRecognition | null>(null);
const transcriptPartsRef = useRef<string[]>([]); const transcriptPartsRef = useRef<string[]>([]);
@ -53,25 +58,36 @@ export function useVoiceController(onMessage: (text: string) => void): VoiceCont
setState(next); setState(next);
}, [_dlog]); }, [_dlog]);
const _setMuted = useCallback((next: boolean) => {
mutedRef.current = next;
setMuted(next);
_dlog('MUTED', String(next));
}, [_dlog]);
const _cancelSilenceTimer = useCallback(() => {
if (silenceTimerRef.current) {
clearTimeout(silenceTimerRef.current);
silenceTimerRef.current = null;
}
}, []);
const _finalizeTranscript = useCallback(() => { const _finalizeTranscript = useCallback(() => {
const full = transcriptPartsRef.current.join(' ').trim(); const full = transcriptPartsRef.current.join(' ').trim();
_dlog('SEND', `words=${full.split(/\s+/).filter(Boolean).length} "${full.substring(0, 60)}"`); _dlog('SEND', `"${full.substring(0, 80)}"`);
if (full) { if (full) onMessageRef.current(full);
const wordCount = full.split(/\s+/).filter(Boolean).length;
if (wordCount >= MIN_WORDS_TO_SEND) onMessageRef.current(full);
}
transcriptPartsRef.current = []; transcriptPartsRef.current = [];
setLiveTranscript(''); setLiveTranscript('');
}, [_dlog]); }, [_dlog]);
const _resetSilenceTimer = useCallback(() => { const _resetSilenceTimer = useCallback(() => {
if (silenceTimerRef.current) clearTimeout(silenceTimerRef.current); _cancelSilenceTimer();
silenceTimerRef.current = setTimeout(() => { silenceTimerRef.current = setTimeout(() => {
_finalizeTranscript(); _finalizeTranscript();
}, SILENCE_TIMEOUT_MS); }, SILENCE_TIMEOUT_MS);
}, [_finalizeTranscript]); }, [_cancelSilenceTimer, _finalizeTranscript]);
const _startRecognition = useCallback(() => { const _startRecognition = useCallback(() => {
if (mutedRef.current) return;
const rec = recognitionRef.current; const rec = recognitionRef.current;
if (!rec) return; if (!rec) return;
try { try {
@ -102,28 +118,24 @@ export function useVoiceController(onMessage: (text: string) => void): VoiceCont
recognition.lang = 'de-DE'; recognition.lang = 'de-DE';
recognition.onspeechstart = () => { recognition.onspeechstart = () => {
if (stateRef.current === 'botSpeaking') return; if (stateRef.current !== 'listening' && stateRef.current !== 'interrupted') return;
transcriptPartsRef.current = [];
setLiveTranscript('');
_resetSilenceTimer(); _resetSilenceTimer();
}; };
recognition.onresult = (event: SpeechRecognitionEvent) => { recognition.onresult = (event: SpeechRecognitionEvent) => {
const ignore = stateRef.current === 'botSpeaking'; if (stateRef.current !== 'listening' && stateRef.current !== 'interrupted') return;
const interimParts: string[] = []; const interimParts: string[] = [];
for (let i = processedIndexRef.current; i < event.results.length; i++) { for (let i = processedIndexRef.current; i < event.results.length; i++) {
const r = event.results[i]; const r = event.results[i];
if (r.isFinal) { if (r.isFinal) {
const text = r[0].transcript.trim(); const text = r[0].transcript.trim();
if (text && !ignore) transcriptPartsRef.current.push(text); if (text) transcriptPartsRef.current.push(text);
processedIndexRef.current = i + 1; processedIndexRef.current = i + 1;
} else { } else {
if (ignore) continue;
const text = r[0].transcript.trim(); const text = r[0].transcript.trim();
if (text) interimParts.push(text); if (text) interimParts.push(text);
} }
} }
if (ignore) return;
const currentInterim = interimParts.join(' '); const currentInterim = interimParts.join(' ');
const preview = [...transcriptPartsRef.current, currentInterim].join(' ').trim(); const preview = [...transcriptPartsRef.current, currentInterim].join(' ').trim();
setLiveTranscript(preview); setLiveTranscript(preview);
@ -131,24 +143,20 @@ export function useVoiceController(onMessage: (text: string) => void): VoiceCont
}; };
recognition.onspeechend = () => { recognition.onspeechend = () => {
if (silenceTimerRef.current) clearTimeout(silenceTimerRef.current); if (stateRef.current !== 'listening' && stateRef.current !== 'interrupted') return;
if (stateRef.current === 'botSpeaking') { _resetSilenceTimer();
transcriptPartsRef.current = [];
setLiveTranscript('');
return;
}
_finalizeTranscript();
}; };
recognition.onend = () => { recognition.onend = () => {
_dlog('REC-END', `state=${stateRef.current}`); _dlog('REC-END', `state=${stateRef.current} muted=${mutedRef.current}`);
if (recognitionRef.current !== recognition) return; if (recognitionRef.current !== recognition) return;
const cur = stateRef.current; const cur = stateRef.current;
if (cur === 'botSpeaking' || cur === 'muted' || cur === 'idle') return; if (cur === 'botSpeaking' || cur === 'idle' || mutedRef.current) return;
processedIndexRef.current = 0; processedIndexRef.current = 0;
setTimeout(() => { setTimeout(() => {
if (recognitionRef.current !== recognition) return; if (recognitionRef.current !== recognition) return;
if (stateRef.current !== 'listening' && stateRef.current !== 'interrupted') return; if (stateRef.current !== 'listening' && stateRef.current !== 'interrupted') return;
if (mutedRef.current) return;
try { try {
recognition.start(); recognition.start();
_dlog('REC-AUTOSTART', 'ok'); _dlog('REC-AUTOSTART', 'ok');
@ -166,11 +174,14 @@ export function useVoiceController(onMessage: (text: string) => void): VoiceCont
recognitionRef.current = recognition; recognitionRef.current = recognition;
_startRecognition(); _startRecognition();
}, [_dlog, _resetSilenceTimer, _finalizeTranscript, _startRecognition]); }, [_dlog, _resetSilenceTimer, _startRecognition]);
const activate = useCallback(async () => { const activate = useCallback(async () => {
if (stateRef.current !== 'idle') return; if (stateRef.current !== 'idle') return;
_setState('listening'); _setState('listening');
transcriptPartsRef.current = [];
processedIndexRef.current = 0;
setLiveTranscript('');
try { try {
if (!streamRef.current) { if (!streamRef.current) {
@ -187,11 +198,8 @@ export function useVoiceController(onMessage: (text: string) => void): VoiceCont
}, [_setState, _createRecognition]); }, [_setState, _createRecognition]);
const deactivate = useCallback(() => { const deactivate = useCallback(() => {
_cancelSilenceTimer();
_setState('idle'); _setState('idle');
if (silenceTimerRef.current) {
clearTimeout(silenceTimerRef.current);
silenceTimerRef.current = null;
}
if (recognitionRef.current) { if (recognitionRef.current) {
try { recognitionRef.current.stop(); } catch { /* ignore */ } try { recognitionRef.current.stop(); } catch { /* ignore */ }
recognitionRef.current = null; recognitionRef.current = null;
@ -203,36 +211,57 @@ export function useVoiceController(onMessage: (text: string) => void): VoiceCont
transcriptPartsRef.current = []; transcriptPartsRef.current = [];
processedIndexRef.current = 0; processedIndexRef.current = 0;
setLiveTranscript(''); setLiveTranscript('');
}, [_setState]); }, [_setState, _cancelSilenceTimer]);
const ttsPlaying = useCallback(() => { const ttsPlaying = useCallback(() => {
const cur = stateRef.current; const cur = stateRef.current;
if (cur === 'muted') return; if (cur === 'idle') return;
_cancelSilenceTimer();
_finalizeTranscript();
_stopRecognition();
_setState('botSpeaking'); _setState('botSpeaking');
}, [_setState]); }, [_setState, _cancelSilenceTimer, _finalizeTranscript, _stopRecognition]);
const ttsPaused = useCallback(() => { const ttsPaused = useCallback(() => {
const cur = stateRef.current; const cur = stateRef.current;
if (cur === 'botSpeaking') _setState('interrupted'); if (cur !== 'botSpeaking') return;
}, [_setState]); transcriptPartsRef.current = [];
processedIndexRef.current = 0;
setLiveTranscript('');
_setState('interrupted');
_startRecognition();
}, [_setState, _startRecognition]);
const ttsEnded = useCallback(() => { const ttsEnded = useCallback(() => {
const cur = stateRef.current; const cur = stateRef.current;
if (cur === 'botSpeaking' || cur === 'interrupted') _setState('listening'); if (cur !== 'botSpeaking' && cur !== 'interrupted') return;
}, [_setState]); transcriptPartsRef.current = [];
processedIndexRef.current = 0;
setLiveTranscript('');
_setState('listening');
_startRecognition();
}, [_setState, _startRecognition]);
const toggleMute = useCallback(() => { const toggleMute = useCallback(() => {
const cur = stateRef.current; const cur = stateRef.current;
if (cur === 'muted') { if (cur === 'idle') return;
_setState('listening'); if (mutedRef.current) {
_setMuted(false);
if (cur === 'listening' || cur === 'interrupted') {
_startRecognition(); _startRecognition();
} else if (cur === 'listening' || cur === 'interrupted') {
_setState('muted');
_stopRecognition();
} else if (cur === 'botSpeaking') {
_setState('muted');
} }
}, [_setState, _startRecognition, _stopRecognition]); } else {
_setMuted(true);
_stopRecognition();
}
}, [_setMuted, _startRecognition, _stopRecognition]);
const cancelPendingSpeech = useCallback(() => {
_cancelSilenceTimer();
transcriptPartsRef.current = [];
setLiveTranscript('');
_dlog('CANCEL-SPEECH', 'pending speech cleared for text input');
}, [_cancelSilenceTimer, _dlog]);
useEffect(() => { useEffect(() => {
return () => { return () => {
@ -250,6 +279,7 @@ export function useVoiceController(onMessage: (text: string) => void): VoiceCont
return { return {
state, state,
muted,
liveTranscript, liveTranscript,
activate, activate,
deactivate, deactivate,
@ -257,5 +287,6 @@ export function useVoiceController(onMessage: (text: string) => void): VoiceCont
ttsPaused, ttsPaused,
ttsEnded, ttsEnded,
toggleMute, toggleMute,
cancelPendingSpeech,
}; };
} }