commcoach: voice state machine refactor + document handling + abort + status events

TASK 1: muted as orthogonal flag (separate from VoiceState)
TASK 2: AbortController in sendMessage (abort previous AI calls)
TASK 3: cancelPendingSpeech for text input
TASK 4: silence timeout (SILENCE_TIMEOUT_MS) reduced from 1500 ms to 1000 ms

Made-with: Cursor
This commit is contained in:
ValueOn AG 2026-03-07 01:02:15 +01:00
parent fe40097d33
commit 31bf734def
4 changed files with 98 additions and 55 deletions

View file

@ -306,6 +306,7 @@ export async function sendMessageStreamApi(
onEvent: (event: SSEEvent) => void,
onError?: (error: Error) => void,
onComplete?: () => void,
signal?: AbortSignal,
): Promise<void> {
try {
const baseURL = api.defaults.baseURL || '';
@ -322,6 +323,7 @@ export async function sendMessageStreamApi(
headers,
body: JSON.stringify({ content }),
credentials: 'include',
signal,
});
if (!response.ok) {

View file

@ -91,6 +91,7 @@ export function useCommcoach(): CommcoachHookReturn {
const isMountedRef = useRef(true);
const currentAudioRef = useRef<HTMLAudioElement | null>(null);
const abortControllerRef = useRef<AbortController | null>(null);
const onTtsEventRef = useRef<((event: TtsEvent) => void) | null>(null);
const onDocumentCreatedRef = useRef<((doc: any) => void) | null>(null);
@ -337,6 +338,11 @@ export function useCommcoach(): CommcoachHookReturn {
const sendMessage = useCallback(async (content: string) => {
const normalizedContent = content.trim();
if (!normalizedContent || !instanceId || !session) return;
abortControllerRef.current?.abort();
const ac = new AbortController();
abortControllerRef.current = ac;
if (currentAudioRef.current) {
currentAudioRef.current.pause();
currentAudioRef.current = null;
@ -364,7 +370,7 @@ export function useCommcoach(): CommcoachHookReturn {
session.id,
normalizedContent,
(event: SSEEvent) => {
if (!isMountedRef.current) return;
if (!isMountedRef.current || ac.signal.aborted) return;
const eventType = event.type;
const eventData = event.data;
@ -404,6 +410,7 @@ export function useCommcoach(): CommcoachHookReturn {
}
},
(err) => {
if (err.name === 'AbortError') return;
if (isMountedRef.current) {
setError(err.message);
setIsStreaming(false);
@ -417,8 +424,10 @@ export function useCommcoach(): CommcoachHookReturn {
setStreamingMessage(null);
}
},
ac.signal,
);
} catch (err: any) {
if (err.name === 'AbortError') return;
if (isMountedRef.current) {
setError(err.message);
setIsStreaming(false);

View file

@ -120,8 +120,9 @@ export const CommcoachDossierView: React.FC = () => {
const handleSend = useCallback(async () => {
if (!coach.inputValue.trim() || coach.isStreaming) return;
voice.cancelPendingSpeech();
await coach.sendMessage(coach.inputValue);
}, [coach]);
}, [coach, voice]);
const handleKeyDown = useCallback((e: React.KeyboardEvent) => {
if (e.key === 'Enter' && !e.shiftKey) { e.preventDefault(); handleSend(); }
@ -340,11 +341,11 @@ export const CommcoachDossierView: React.FC = () => {
<button className={styles.btnSmall} onClick={handleResumeTts}>Weitersprechen</button>
)}
<button
className={`${styles.btnSmall} ${voice.state === 'muted' ? styles.mutedActive : ''}`}
className={`${styles.btnSmall} ${voice.muted ? styles.mutedActive : ''}`}
onClick={voice.toggleMute}
title={voice.state === 'muted' ? 'Stummschaltung aufheben' : 'Stummschalten'}
title={voice.muted ? 'Stummschaltung aufheben' : 'Stummschalten'}
>
{voice.state === 'muted' ? '\u{1F507} Stumm' : '\u{1F3A4} Ton an'}
{voice.muted ? '\u{1F507} Stumm' : '\u{1F3A4} Ton an'}
</button>
<button className={styles.btnSmall} onClick={coach.completeSession} disabled={!!coach.actionLoading}>
{coach.actionLoading === 'completing' ? 'Wird abgeschlossen...' : 'Abschliessen'}
@ -391,7 +392,7 @@ export const CommcoachDossierView: React.FC = () => {
<div className={styles.inputArea}>
<div className={styles.voiceStatus}>
<span className={`${styles.voiceIndicator} ${voice.state === 'listening' ? styles.voiceActive : ''}`}>
{voice.state === 'muted'
{voice.muted
? 'Stumm Mikrofon aus'
: voice.state === 'botSpeaking'
? (coach.streamingStatus || 'Coach spricht...')

View file

@ -1,22 +1,24 @@
/**
* Voice Controller - imperative state machine for CommCoach voice interaction.
*
* States: idle | listening | botSpeaking | interrupted | muted
* States: idle | listening | botSpeaking | interrupted
* Muted: orthogonal boolean flag (independent of main state)
*
* Key principle: SpeechRecognition is created once and lives until deactivate().
* When botSpeaking, we ignore onresult events instead of stopping recognition.
* Recognition is STOPPED during botSpeaking or when muted=true.
* Recognition is STARTED when entering listening/interrupted AND muted=false.
* Each start() creates a fresh results session (processedIndex resets to 0).
*/
import { useState, useRef, useCallback, useEffect } from 'react';
export type VoiceState = 'idle' | 'listening' | 'botSpeaking' | 'interrupted' | 'muted';
export type VoiceState = 'idle' | 'listening' | 'botSpeaking' | 'interrupted';
const SILENCE_TIMEOUT_MS = 1500;
const MIN_WORDS_TO_SEND = 4;
const SILENCE_TIMEOUT_MS = 1000;
const REC_AUTORESTART_DELAY_MS = 300;
export interface VoiceControllerApi {
state: VoiceState;
muted: boolean;
liveTranscript: string;
activate: () => void;
deactivate: () => void;
@ -24,12 +26,15 @@ export interface VoiceControllerApi {
ttsPaused: () => void;
ttsEnded: () => void;
toggleMute: () => void;
cancelPendingSpeech: () => void;
}
export function useVoiceController(onMessage: (text: string) => void): VoiceControllerApi {
const [state, setState] = useState<VoiceState>('idle');
const [muted, setMuted] = useState(false);
const [liveTranscript, setLiveTranscript] = useState('');
const stateRef = useRef<VoiceState>('idle');
const mutedRef = useRef(false);
const streamRef = useRef<MediaStream | null>(null);
const recognitionRef = useRef<SpeechRecognition | null>(null);
const transcriptPartsRef = useRef<string[]>([]);
@ -53,25 +58,36 @@ export function useVoiceController(onMessage: (text: string) => void): VoiceCont
setState(next);
}, [_dlog]);
// Updates the muted flag in two places: the ref (read synchronously by the
// recognition callbacks) and the React state (exposed to consumers), then logs it.
const _setMuted = useCallback((isMuted: boolean) => {
  mutedRef.current = isMuted;
  setMuted(isMuted);
  _dlog('MUTED', String(isMuted));
}, [_dlog]);
// Clears the pending silence timer, if one is scheduled, and resets the ref
// so later checks see that no timer is active.
const _cancelSilenceTimer = useCallback(() => {
  const pendingTimer = silenceTimerRef.current;
  if (!pendingTimer) return;
  clearTimeout(pendingTimer);
  silenceTimerRef.current = null;
}, []);
const _finalizeTranscript = useCallback(() => {
const full = transcriptPartsRef.current.join(' ').trim();
_dlog('SEND', `words=${full.split(/\s+/).filter(Boolean).length} "${full.substring(0, 60)}"`);
if (full) {
const wordCount = full.split(/\s+/).filter(Boolean).length;
if (wordCount >= MIN_WORDS_TO_SEND) onMessageRef.current(full);
}
_dlog('SEND', `"${full.substring(0, 80)}"`);
if (full) onMessageRef.current(full);
transcriptPartsRef.current = [];
setLiveTranscript('');
}, [_dlog]);
const _resetSilenceTimer = useCallback(() => {
if (silenceTimerRef.current) clearTimeout(silenceTimerRef.current);
_cancelSilenceTimer();
silenceTimerRef.current = setTimeout(() => {
_finalizeTranscript();
}, SILENCE_TIMEOUT_MS);
}, [_finalizeTranscript]);
}, [_cancelSilenceTimer, _finalizeTranscript]);
const _startRecognition = useCallback(() => {
if (mutedRef.current) return;
const rec = recognitionRef.current;
if (!rec) return;
try {
@ -102,28 +118,24 @@ export function useVoiceController(onMessage: (text: string) => void): VoiceCont
recognition.lang = 'de-DE';
recognition.onspeechstart = () => {
if (stateRef.current === 'botSpeaking') return;
transcriptPartsRef.current = [];
setLiveTranscript('');
if (stateRef.current !== 'listening' && stateRef.current !== 'interrupted') return;
_resetSilenceTimer();
};
recognition.onresult = (event: SpeechRecognitionEvent) => {
const ignore = stateRef.current === 'botSpeaking';
if (stateRef.current !== 'listening' && stateRef.current !== 'interrupted') return;
const interimParts: string[] = [];
for (let i = processedIndexRef.current; i < event.results.length; i++) {
const r = event.results[i];
if (r.isFinal) {
const text = r[0].transcript.trim();
if (text && !ignore) transcriptPartsRef.current.push(text);
if (text) transcriptPartsRef.current.push(text);
processedIndexRef.current = i + 1;
} else {
if (ignore) continue;
const text = r[0].transcript.trim();
if (text) interimParts.push(text);
}
}
if (ignore) return;
const currentInterim = interimParts.join(' ');
const preview = [...transcriptPartsRef.current, currentInterim].join(' ').trim();
setLiveTranscript(preview);
@ -131,24 +143,20 @@ export function useVoiceController(onMessage: (text: string) => void): VoiceCont
};
recognition.onspeechend = () => {
if (silenceTimerRef.current) clearTimeout(silenceTimerRef.current);
if (stateRef.current === 'botSpeaking') {
transcriptPartsRef.current = [];
setLiveTranscript('');
return;
}
_finalizeTranscript();
if (stateRef.current !== 'listening' && stateRef.current !== 'interrupted') return;
_resetSilenceTimer();
};
recognition.onend = () => {
_dlog('REC-END', `state=${stateRef.current}`);
_dlog('REC-END', `state=${stateRef.current} muted=${mutedRef.current}`);
if (recognitionRef.current !== recognition) return;
const cur = stateRef.current;
if (cur === 'botSpeaking' || cur === 'muted' || cur === 'idle') return;
if (cur === 'botSpeaking' || cur === 'idle' || mutedRef.current) return;
processedIndexRef.current = 0;
setTimeout(() => {
if (recognitionRef.current !== recognition) return;
if (stateRef.current !== 'listening' && stateRef.current !== 'interrupted') return;
if (mutedRef.current) return;
try {
recognition.start();
_dlog('REC-AUTOSTART', 'ok');
@ -166,11 +174,14 @@ export function useVoiceController(onMessage: (text: string) => void): VoiceCont
recognitionRef.current = recognition;
_startRecognition();
}, [_dlog, _resetSilenceTimer, _finalizeTranscript, _startRecognition]);
}, [_dlog, _resetSilenceTimer, _startRecognition]);
const activate = useCallback(async () => {
if (stateRef.current !== 'idle') return;
_setState('listening');
transcriptPartsRef.current = [];
processedIndexRef.current = 0;
setLiveTranscript('');
try {
if (!streamRef.current) {
@ -187,11 +198,8 @@ export function useVoiceController(onMessage: (text: string) => void): VoiceCont
}, [_setState, _createRecognition]);
const deactivate = useCallback(() => {
_cancelSilenceTimer();
_setState('idle');
if (silenceTimerRef.current) {
clearTimeout(silenceTimerRef.current);
silenceTimerRef.current = null;
}
if (recognitionRef.current) {
try { recognitionRef.current.stop(); } catch { /* ignore */ }
recognitionRef.current = null;
@ -203,36 +211,57 @@ export function useVoiceController(onMessage: (text: string) => void): VoiceCont
transcriptPartsRef.current = [];
processedIndexRef.current = 0;
setLiveTranscript('');
}, [_setState]);
}, [_setState, _cancelSilenceTimer]);
const ttsPlaying = useCallback(() => {
const cur = stateRef.current;
if (cur === 'muted') return;
if (cur === 'idle') return;
_cancelSilenceTimer();
_finalizeTranscript();
_stopRecognition();
_setState('botSpeaking');
}, [_setState]);
}, [_setState, _cancelSilenceTimer, _finalizeTranscript, _stopRecognition]);
const ttsPaused = useCallback(() => {
const cur = stateRef.current;
if (cur === 'botSpeaking') _setState('interrupted');
}, [_setState]);
if (cur !== 'botSpeaking') return;
transcriptPartsRef.current = [];
processedIndexRef.current = 0;
setLiveTranscript('');
_setState('interrupted');
_startRecognition();
}, [_setState, _startRecognition]);
const ttsEnded = useCallback(() => {
const cur = stateRef.current;
if (cur === 'botSpeaking' || cur === 'interrupted') _setState('listening');
}, [_setState]);
if (cur !== 'botSpeaking' && cur !== 'interrupted') return;
transcriptPartsRef.current = [];
processedIndexRef.current = 0;
setLiveTranscript('');
_setState('listening');
_startRecognition();
}, [_setState, _startRecognition]);
const toggleMute = useCallback(() => {
const cur = stateRef.current;
if (cur === 'muted') {
_setState('listening');
if (cur === 'idle') return;
if (mutedRef.current) {
_setMuted(false);
if (cur === 'listening' || cur === 'interrupted') {
_startRecognition();
} else if (cur === 'listening' || cur === 'interrupted') {
_setState('muted');
_stopRecognition();
} else if (cur === 'botSpeaking') {
_setState('muted');
}
}, [_setState, _startRecognition, _stopRecognition]);
} else {
_setMuted(true);
_stopRecognition();
}
}, [_setMuted, _startRecognition, _stopRecognition]);
// Discards speech captured so far without sending it: cancels the silence
// timer (which would otherwise finalize the transcript), clears the live
// preview and drops the accumulated transcript parts.
const cancelPendingSpeech = useCallback(() => {
  _cancelSilenceTimer();
  setLiveTranscript('');
  transcriptPartsRef.current = [];
  _dlog('CANCEL-SPEECH', 'pending speech cleared for text input');
}, [_cancelSilenceTimer, _dlog]);
useEffect(() => {
return () => {
@ -250,6 +279,7 @@ export function useVoiceController(onMessage: (text: string) => void): VoiceCont
return {
state,
muted,
liveTranscript,
activate,
deactivate,
@ -257,5 +287,6 @@ export function useVoiceController(onMessage: (text: string) => void): VoiceCont
ttsPaused,
ttsEnded,
toggleMute,
cancelPendingSpeech,
};
}