From 31bf734defee55c7aa41a708405c1312f3f91129 Mon Sep 17 00:00:00 2001
From: ValueOn AG <p.motsch@valueon.ch>
Date: Sat, 7 Mar 2026 01:02:15 +0100
Subject: [PATCH] commcoach: voice state machine refactor + document handling +
 abort + status events

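TASK 1: make `muted` an orthogonal boolean flag (no longer a VoiceState
        value)
TASK 2: AbortController in sendMessage (sending a new message aborts the
        previous AI call)
TASK 3: cancelPendingSpeech for text input (discards the pending voice
        transcript and silence timer before the typed message is sent)
TASK 4: silence timeout (SILENCE_TIMEOUT_MS) reduced from 1.5s to 1s

Also: the MIN_WORDS_TO_SEND threshold is removed (any non-empty
transcript is sent), and speech recognition is now stopped while the bot
is speaking instead of being kept running with its results ignored.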

Made-with: Cursor
---
 src/api/commcoachApi.ts                       |   2 +
 src/hooks/useCommcoach.ts                     |  11 +-
 .../views/commcoach/CommcoachDossierView.tsx  |  11 +-
 .../views/commcoach/useVoiceController.ts     | 129 +++++++++++-------
 4 files changed, 98 insertions(+), 55 deletions(-)
diff --git a/src/api/commcoachApi.ts b/src/api/commcoachApi.ts
index 9a29be1..ef9b0be 100644
--- a/src/api/commcoachApi.ts
+++ b/src/api/commcoachApi.ts
@@ -306,6 +306,7 @@ export async function sendMessageStreamApi(
   onEvent: (event: SSEEvent) => void,
   onError?: (error: Error) => void,
   onComplete?: () => void,
+  signal?: AbortSignal,
 ): Promise<void> {
   try {
     const baseURL = api.defaults.baseURL || '';
@@ -322,6 +323,7 @@ export async function sendMessageStreamApi(
       headers,
       body: JSON.stringify({ content }),
       credentials: 'include',
+      signal,
     });
 
     if (!response.ok) {
diff --git a/src/hooks/useCommcoach.ts b/src/hooks/useCommcoach.ts
index d416280..b5f3bf0 100644
--- a/src/hooks/useCommcoach.ts
+++ b/src/hooks/useCommcoach.ts
@@ -91,6 +91,7 @@ export function useCommcoach(): CommcoachHookReturn {
 
   const isMountedRef = useRef(true);
   const currentAudioRef = useRef<HTMLAudioElement | null>(null);
+  const abortControllerRef = useRef<AbortController | null>(null);
   const onTtsEventRef = useRef<((event: TtsEvent) => void) | null>(null);
   const onDocumentCreatedRef = useRef<((doc: any) => void) | null>(null);
 
@@ -337,6 +338,11 @@ export function useCommcoach(): CommcoachHookReturn {
   const sendMessage = useCallback(async (content: string) => {
     const normalizedContent = content.trim();
     if (!normalizedContent || !instanceId || !session) return;
+
+    abortControllerRef.current?.abort();
+    const ac = new AbortController();
+    abortControllerRef.current = ac;
+
     if (currentAudioRef.current) {
       currentAudioRef.current.pause();
       currentAudioRef.current = null;
@@ -364,7 +370,7 @@ export function useCommcoach(): CommcoachHookReturn {
         session.id,
         normalizedContent,
         (event: SSEEvent) => {
-          if (!isMountedRef.current) return;
+          if (!isMountedRef.current || ac.signal.aborted) return;
           const eventType = event.type;
           const eventData = event.data;
 
@@ -404,6 +410,7 @@ export function useCommcoach(): CommcoachHookReturn {
           }
         },
         (err) => {
+          if (err.name === 'AbortError') return;
           if (isMountedRef.current) {
             setError(err.message);
             setIsStreaming(false);
@@ -417,8 +424,10 @@ export function useCommcoach(): CommcoachHookReturn {
             setStreamingMessage(null);
           }
         },
+        ac.signal,
       );
     } catch (err: any) {
+      if (err.name === 'AbortError') return;
       if (isMountedRef.current) {
         setError(err.message);
         setIsStreaming(false);
diff --git a/src/pages/views/commcoach/CommcoachDossierView.tsx b/src/pages/views/commcoach/CommcoachDossierView.tsx
index 41eac36..4eb9318 100644
--- a/src/pages/views/commcoach/CommcoachDossierView.tsx
+++ b/src/pages/views/commcoach/CommcoachDossierView.tsx
@@ -120,8 +120,9 @@ export const CommcoachDossierView: React.FC = () => {
 
   const handleSend = useCallback(async () => {
     if (!coach.inputValue.trim() || coach.isStreaming) return;
+    voice.cancelPendingSpeech();
     await coach.sendMessage(coach.inputValue);
-  }, [coach]);
+  }, [coach, voice]);
 
   const handleKeyDown = useCallback((e: React.KeyboardEvent) => {
     if (e.key === 'Enter' && !e.shiftKey) { e.preventDefault(); handleSend(); }
@@ -340,11 +341,11 @@ export const CommcoachDossierView: React.FC = () => {
                       <button className={styles.btnSmall} onClick={handleResumeTts}>Weitersprechen</button>
                     )}
                     <button
-                      className={`${styles.btnSmall} ${voice.state === 'muted' ? styles.mutedActive : ''}`}
+                      className={`${styles.btnSmall} ${voice.muted ? styles.mutedActive : ''}`}
                       onClick={voice.toggleMute}
-                      title={voice.state === 'muted' ? 'Stummschaltung aufheben' : 'Stummschalten'}
+                      title={voice.muted ? 'Stummschaltung aufheben' : 'Stummschalten'}
                     >
-                      {voice.state === 'muted' ? '\u{1F507} Stumm' : '\u{1F3A4} Ton an'}
+                      {voice.muted ? '\u{1F507} Stumm' : '\u{1F3A4} Ton an'}
                     </button>
                     <button className={styles.btnSmall} onClick={coach.completeSession} disabled={!!coach.actionLoading}>
                       {coach.actionLoading === 'completing' ? 'Wird abgeschlossen...' : 'Abschliessen'}
@@ -391,7 +392,7 @@ export const CommcoachDossierView: React.FC = () => {
                 <div className={styles.inputArea}>
                   <div className={styles.voiceStatus}>
                     <span className={`${styles.voiceIndicator} ${voice.state === 'listening' ? styles.voiceActive : ''}`}>
-                      {voice.state === 'muted'
+                      {voice.muted
                         ? 'Stumm – Mikrofon aus'
                         : voice.state === 'botSpeaking'
                           ? (coach.streamingStatus || 'Coach spricht...')
diff --git a/src/pages/views/commcoach/useVoiceController.ts b/src/pages/views/commcoach/useVoiceController.ts
index e8cb151..71d8c4b 100644
--- a/src/pages/views/commcoach/useVoiceController.ts
+++ b/src/pages/views/commcoach/useVoiceController.ts
@@ -1,22 +1,24 @@
 /**
  * Voice Controller - imperative state machine for CommCoach voice interaction.
  *
- * States: idle | listening | botSpeaking | interrupted | muted
+ * States: idle | listening | botSpeaking | interrupted
+ * Muted: orthogonal boolean flag (independent of main state)
  *
- * Key principle: SpeechRecognition is created once and lives until deactivate().
- * When botSpeaking, we ignore onresult events instead of stopping recognition.
+ * Recognition is STOPPED during botSpeaking or when muted=true.
+ * Recognition is STARTED when entering listening/interrupted AND muted=false.
+ * Each start() creates a fresh results session (processedIndex resets to 0).
  */
 
 import { useState, useRef, useCallback, useEffect } from 'react';
 
-export type VoiceState = 'idle' | 'listening' | 'botSpeaking' | 'interrupted' | 'muted';
+export type VoiceState = 'idle' | 'listening' | 'botSpeaking' | 'interrupted';
 
-const SILENCE_TIMEOUT_MS = 1500;
-const MIN_WORDS_TO_SEND = 4;
+const SILENCE_TIMEOUT_MS = 1000;
 const REC_AUTORESTART_DELAY_MS = 300;
 
 export interface VoiceControllerApi {
   state: VoiceState;
+  muted: boolean;
   liveTranscript: string;
   activate: () => void;
   deactivate: () => void;
@@ -24,12 +26,15 @@ export interface VoiceControllerApi {
   ttsPaused: () => void;
   ttsEnded: () => void;
   toggleMute: () => void;
+  cancelPendingSpeech: () => void;
 }
 
 export function useVoiceController(onMessage: (text: string) => void): VoiceControllerApi {
   const [state, setState] = useState<VoiceState>('idle');
+  const [muted, setMuted] = useState(false);
   const [liveTranscript, setLiveTranscript] = useState('');
   const stateRef = useRef<VoiceState>('idle');
+  const mutedRef = useRef(false);
   const streamRef = useRef<MediaStream | null>(null);
   const recognitionRef = useRef<SpeechRecognition | null>(null);
   const transcriptPartsRef = useRef<string[]>([]);
@@ -53,25 +58,36 @@ export function useVoiceController(onMessage: (text: string) => void): VoiceCont
     setState(next);
   }, [_dlog]);
 
+  const _setMuted = useCallback((next: boolean) => {
+    mutedRef.current = next;
+    setMuted(next);
+    _dlog('MUTED', String(next));
+  }, [_dlog]);
+
+  const _cancelSilenceTimer = useCallback(() => {
+    if (silenceTimerRef.current) {
+      clearTimeout(silenceTimerRef.current);
+      silenceTimerRef.current = null;
+    }
+  }, []);
+
   const _finalizeTranscript = useCallback(() => {
     const full = transcriptPartsRef.current.join(' ').trim();
-    _dlog('SEND', `words=${full.split(/\s+/).filter(Boolean).length} "${full.substring(0, 60)}"`);
-    if (full) {
-      const wordCount = full.split(/\s+/).filter(Boolean).length;
-      if (wordCount >= MIN_WORDS_TO_SEND) onMessageRef.current(full);
-    }
+    _dlog('SEND', `"${full.substring(0, 80)}"`);
+    if (full) onMessageRef.current(full);
     transcriptPartsRef.current = [];
     setLiveTranscript('');
   }, [_dlog]);
 
   const _resetSilenceTimer = useCallback(() => {
-    if (silenceTimerRef.current) clearTimeout(silenceTimerRef.current);
+    _cancelSilenceTimer();
     silenceTimerRef.current = setTimeout(() => {
       _finalizeTranscript();
     }, SILENCE_TIMEOUT_MS);
-  }, [_finalizeTranscript]);
+  }, [_cancelSilenceTimer, _finalizeTranscript]);
 
   const _startRecognition = useCallback(() => {
+    if (mutedRef.current) return;
     const rec = recognitionRef.current;
     if (!rec) return;
     try {
@@ -102,28 +118,24 @@ export function useVoiceController(onMessage: (text: string) => void): VoiceCont
     recognition.lang = 'de-DE';
 
     recognition.onspeechstart = () => {
-      if (stateRef.current === 'botSpeaking') return;
-      transcriptPartsRef.current = [];
-      setLiveTranscript('');
+      if (stateRef.current !== 'listening' && stateRef.current !== 'interrupted') return;
       _resetSilenceTimer();
     };
 
     recognition.onresult = (event: SpeechRecognitionEvent) => {
-      const ignore = stateRef.current === 'botSpeaking';
+      if (stateRef.current !== 'listening' && stateRef.current !== 'interrupted') return;
       const interimParts: string[] = [];
       for (let i = processedIndexRef.current; i < event.results.length; i++) {
         const r = event.results[i];
         if (r.isFinal) {
           const text = r[0].transcript.trim();
-          if (text && !ignore) transcriptPartsRef.current.push(text);
+          if (text) transcriptPartsRef.current.push(text);
           processedIndexRef.current = i + 1;
         } else {
-          if (ignore) continue;
           const text = r[0].transcript.trim();
           if (text) interimParts.push(text);
         }
       }
-      if (ignore) return;
       const currentInterim = interimParts.join(' ');
       const preview = [...transcriptPartsRef.current, currentInterim].join(' ').trim();
       setLiveTranscript(preview);
@@ -131,24 +143,20 @@ export function useVoiceController(onMessage: (text: string) => void): VoiceCont
     };
 
     recognition.onspeechend = () => {
-      if (silenceTimerRef.current) clearTimeout(silenceTimerRef.current);
-      if (stateRef.current === 'botSpeaking') {
-        transcriptPartsRef.current = [];
-        setLiveTranscript('');
-        return;
-      }
-      _finalizeTranscript();
+      if (stateRef.current !== 'listening' && stateRef.current !== 'interrupted') return;
+      _resetSilenceTimer();
     };
 
     recognition.onend = () => {
-      _dlog('REC-END', `state=${stateRef.current}`);
+      _dlog('REC-END', `state=${stateRef.current} muted=${mutedRef.current}`);
       if (recognitionRef.current !== recognition) return;
       const cur = stateRef.current;
-      if (cur === 'botSpeaking' || cur === 'muted' || cur === 'idle') return;
+      if (cur === 'botSpeaking' || cur === 'idle' || mutedRef.current) return;
       processedIndexRef.current = 0;
       setTimeout(() => {
         if (recognitionRef.current !== recognition) return;
         if (stateRef.current !== 'listening' && stateRef.current !== 'interrupted') return;
+        if (mutedRef.current) return;
         try {
           recognition.start();
           _dlog('REC-AUTOSTART', 'ok');
@@ -166,11 +174,14 @@ export function useVoiceController(onMessage: (text: string) => void): VoiceCont
 
     recognitionRef.current = recognition;
     _startRecognition();
-  }, [_dlog, _resetSilenceTimer, _finalizeTranscript, _startRecognition]);
+  }, [_dlog, _resetSilenceTimer, _startRecognition]);
 
   const activate = useCallback(async () => {
     if (stateRef.current !== 'idle') return;
     _setState('listening');
+    transcriptPartsRef.current = [];
+    processedIndexRef.current = 0;
+    setLiveTranscript('');
 
     try {
       if (!streamRef.current) {
@@ -187,11 +198,8 @@ export function useVoiceController(onMessage: (text: string) => void): VoiceCont
   }, [_setState, _createRecognition]);
 
   const deactivate = useCallback(() => {
+    _cancelSilenceTimer();
     _setState('idle');
-    if (silenceTimerRef.current) {
-      clearTimeout(silenceTimerRef.current);
-      silenceTimerRef.current = null;
-    }
     if (recognitionRef.current) {
       try { recognitionRef.current.stop(); } catch { /* ignore */ }
       recognitionRef.current = null;
@@ -203,36 +211,57 @@ export function useVoiceController(onMessage: (text: string) => void): VoiceCont
     transcriptPartsRef.current = [];
     processedIndexRef.current = 0;
     setLiveTranscript('');
-  }, [_setState]);
+  }, [_setState, _cancelSilenceTimer]);
 
   const ttsPlaying = useCallback(() => {
     const cur = stateRef.current;
-    if (cur === 'muted') return;
+    if (cur === 'idle') return;
+    _cancelSilenceTimer();
+    _finalizeTranscript();
+    _stopRecognition();
     _setState('botSpeaking');
-  }, [_setState]);
+  }, [_setState, _cancelSilenceTimer, _finalizeTranscript, _stopRecognition]);
 
   const ttsPaused = useCallback(() => {
     const cur = stateRef.current;
-    if (cur === 'botSpeaking') _setState('interrupted');
-  }, [_setState]);
+    if (cur !== 'botSpeaking') return;
+    transcriptPartsRef.current = [];
+    processedIndexRef.current = 0;
+    setLiveTranscript('');
+    _setState('interrupted');
+    _startRecognition();
+  }, [_setState, _startRecognition]);
 
   const ttsEnded = useCallback(() => {
     const cur = stateRef.current;
-    if (cur === 'botSpeaking' || cur === 'interrupted') _setState('listening');
-  }, [_setState]);
+    if (cur !== 'botSpeaking' && cur !== 'interrupted') return;
+    transcriptPartsRef.current = [];
+    processedIndexRef.current = 0;
+    setLiveTranscript('');
+    _setState('listening');
+    _startRecognition();
+  }, [_setState, _startRecognition]);
 
   const toggleMute = useCallback(() => {
     const cur = stateRef.current;
-    if (cur === 'muted') {
-      _setState('listening');
-      _startRecognition();
-    } else if (cur === 'listening' || cur === 'interrupted') {
-      _setState('muted');
+    if (cur === 'idle') return;
+    if (mutedRef.current) {
+      _setMuted(false);
+      if (cur === 'listening' || cur === 'interrupted') {
+        _startRecognition();
+      }
+    } else {
+      _setMuted(true);
       _stopRecognition();
-    } else if (cur === 'botSpeaking') {
-      _setState('muted');
     }
-  }, [_setState, _startRecognition, _stopRecognition]);
+  }, [_setMuted, _startRecognition, _stopRecognition]);
+
+  const cancelPendingSpeech = useCallback(() => {
+    _cancelSilenceTimer();
+    transcriptPartsRef.current = [];
+    setLiveTranscript('');
+    _dlog('CANCEL-SPEECH', 'pending speech cleared for text input');
+  }, [_cancelSilenceTimer, _dlog]);
 
   useEffect(() => {
     return () => {
@@ -250,6 +279,7 @@ export function useVoiceController(onMessage: (text: string) => void): VoiceCont
 
   return {
     state,
+    muted,
     liveTranscript,
     activate,
     deactivate,
@@ -257,5 +287,6 @@ export function useVoiceController(onMessage: (text: string) => void): VoiceCont
     ttsPaused,
     ttsEnded,
     toggleMute,
+    cancelPendingSpeech,
   };
 }