fix: voice transcript cascading - processedIndex nur bei Recognition-Neustart resetten
Made-with: Cursor
This commit is contained in:
parent
93016a7e4c
commit
48215f165c
4 changed files with 331 additions and 241 deletions
|
|
@ -501,8 +501,9 @@ export function useCommcoach(): CommcoachHookReturn {
|
||||||
try {
|
try {
|
||||||
const completed = await completeSessionApi(request, instanceId, session.id);
|
const completed = await completeSessionApi(request, instanceId, session.id);
|
||||||
if (isMountedRef.current) {
|
if (isMountedRef.current) {
|
||||||
|
setMessages([]);
|
||||||
setSession(completed);
|
setSession(completed);
|
||||||
if (selectedContextId) await selectContext(selectedContextId);
|
if (selectedContextId) await selectContext(selectedContextId, { skipSessionResume: true });
|
||||||
}
|
}
|
||||||
} catch (err: any) {
|
} catch (err: any) {
|
||||||
if (isMountedRef.current) setError(err.message || 'Fehler beim Abschliessen');
|
if (isMountedRef.current) setError(err.message || 'Fehler beim Abschliessen');
|
||||||
|
|
|
||||||
|
|
@ -20,20 +20,10 @@ import AutoScroll from '../../../components/UiComponents/AutoScroll/AutoScroll';
|
||||||
import ReactMarkdown from 'react-markdown';
|
import ReactMarkdown from 'react-markdown';
|
||||||
import remarkGfm from 'remark-gfm';
|
import remarkGfm from 'remark-gfm';
|
||||||
import styles from './CommcoachDossierView.module.css';
|
import styles from './CommcoachDossierView.module.css';
|
||||||
|
import { useVoiceController } from './useVoiceController';
|
||||||
|
|
||||||
type TabKey = 'coaching' | 'tasks' | 'sessions' | 'scores' | 'documents';
|
type TabKey = 'coaching' | 'tasks' | 'sessions' | 'scores' | 'documents';
|
||||||
|
|
||||||
/**
|
|
||||||
* Voice State Machine
|
|
||||||
*
|
|
||||||
* idle – no session active, everything off
|
|
||||||
* listening – mic on, recognition active, TTS off
|
|
||||||
* botSpeaking – TTS playing, mic/recognition suspended
|
|
||||||
* interrupted – TTS paused (resumable), mic on, recognition active
|
|
||||||
* muted – mic off, TTS continues if playing
|
|
||||||
*/
|
|
||||||
type VoiceState = 'idle' | 'listening' | 'botSpeaking' | 'interrupted' | 'muted';
|
|
||||||
|
|
||||||
export const CommcoachDossierView: React.FC = () => {
|
export const CommcoachDossierView: React.FC = () => {
|
||||||
const coach = useCommcoach();
|
const coach = useCommcoach();
|
||||||
const { request } = useApiRequest();
|
const { request } = useApiRequest();
|
||||||
|
|
@ -53,16 +43,10 @@ export const CommcoachDossierView: React.FC = () => {
|
||||||
const [selectedPersonaId, setSelectedPersonaId] = useState<string | undefined>(undefined);
|
const [selectedPersonaId, setSelectedPersonaId] = useState<string | undefined>(undefined);
|
||||||
|
|
||||||
const inputRef = useRef<HTMLTextAreaElement>(null);
|
const inputRef = useRef<HTMLTextAreaElement>(null);
|
||||||
const streamRef = useRef<MediaStream | null>(null);
|
const sendMessageRef = useRef(coach.sendMessage);
|
||||||
const speechRecognitionRef = useRef<SpeechRecognition | null>(null);
|
sendMessageRef.current = coach.sendMessage;
|
||||||
const transcriptPartsRef = useRef<string[]>([]);
|
|
||||||
const processedResultIndexRef = useRef(0);
|
|
||||||
const silenceTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
|
|
||||||
const [liveTranscript, setLiveTranscript] = useState('');
|
|
||||||
|
|
||||||
// Voice State Machine
|
const voice = useVoiceController((text) => sendMessageRef.current(text));
|
||||||
const [voiceState, setVoiceState] = useState<VoiceState>('idle');
|
|
||||||
const voiceStateRef = useRef<VoiceState>('idle');
|
|
||||||
|
|
||||||
// #region agent log
|
// #region agent log
|
||||||
const debugLogsRef = useRef<string[]>([]);
|
const debugLogsRef = useRef<string[]>([]);
|
||||||
|
|
@ -78,31 +62,15 @@ export const CommcoachDossierView: React.FC = () => {
|
||||||
useEffect(() => { (window as any).__dlog = _dlog; return () => { delete (window as any).__dlog; }; }, [_dlog]);
|
useEffect(() => { (window as any).__dlog = _dlog; return () => { delete (window as any).__dlog; }; }, [_dlog]);
|
||||||
// #endregion
|
// #endregion
|
||||||
|
|
||||||
const _transitionVoice = useCallback((next: VoiceState) => {
|
|
||||||
const prev = voiceStateRef.current;
|
|
||||||
if (prev === next) return;
|
|
||||||
_dlog('VOICE', `${prev} -> ${next}`);
|
|
||||||
voiceStateRef.current = next;
|
|
||||||
setVoiceState(next);
|
|
||||||
}, [_dlog]);
|
|
||||||
|
|
||||||
// Subscribe to TTS events from the hook
|
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
coach.onTtsEventRef.current = (event: TtsEvent) => {
|
coach.onTtsEventRef.current = (event: TtsEvent) => {
|
||||||
const cur = voiceStateRef.current;
|
if (event === 'playing') voice.ttsPlaying();
|
||||||
if (event === 'playing') {
|
else if (event === 'ended') voice.ttsEnded();
|
||||||
if (cur !== 'muted') _transitionVoice('botSpeaking');
|
else if (event === 'paused') voice.ttsPaused();
|
||||||
} else if (event === 'ended') {
|
else if (event === 'error') voice.ttsEnded();
|
||||||
if (cur === 'botSpeaking') _transitionVoice('listening');
|
|
||||||
if (cur === 'interrupted') _transitionVoice('listening');
|
|
||||||
} else if (event === 'paused') {
|
|
||||||
if (cur === 'botSpeaking') _transitionVoice('interrupted');
|
|
||||||
} else if (event === 'error') {
|
|
||||||
if (cur === 'botSpeaking') _transitionVoice('listening');
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
return () => { coach.onTtsEventRef.current = null; };
|
return () => { coach.onTtsEventRef.current = null; };
|
||||||
}, [coach.onTtsEventRef, _transitionVoice]);
|
}, [coach.onTtsEventRef, voice.ttsPlaying, voice.ttsEnded, voice.ttsPaused]);
|
||||||
|
|
||||||
// Auto-select first context
|
// Auto-select first context
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
|
|
@ -139,190 +107,16 @@ export const CommcoachDossierView: React.FC = () => {
|
||||||
.catch(() => {});
|
.catch(() => {});
|
||||||
}, [instanceId, request]);
|
}, [instanceId, request]);
|
||||||
|
|
||||||
// Transition to idle when session ends or tab changes away
|
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
if (activeTab !== 'coaching' || !coach.session) {
|
if (activeTab !== 'coaching' || !coach.session) {
|
||||||
_transitionVoice('idle');
|
voice.deactivate();
|
||||||
} else if (voiceStateRef.current === 'idle') {
|
} else if (voice.state === 'idle') {
|
||||||
_transitionVoice('listening');
|
voice.activate();
|
||||||
}
|
}
|
||||||
}, [activeTab, coach.session?.id, _transitionVoice]);
|
}, [activeTab, coach.session?.id, voice]);
|
||||||
|
|
||||||
// Hardware control: start/stop recognition + mic based on voiceState
|
const handleStopTts = useCallback(() => coach.stopTts(), [coach]);
|
||||||
useEffect(() => {
|
const handleResumeTts = useCallback(() => coach.resumeTts(), [coach]);
|
||||||
const micShouldBeOn = voiceState === 'listening' || voiceState === 'interrupted';
|
|
||||||
const micShouldBeOff = voiceState === 'idle' || voiceState === 'botSpeaking' || voiceState === 'muted';
|
|
||||||
|
|
||||||
if (micShouldBeOff) {
|
|
||||||
if (speechRecognitionRef.current) {
|
|
||||||
try { speechRecognitionRef.current.stop(); } catch { /* ignore */ }
|
|
||||||
}
|
|
||||||
if (voiceState === 'idle' && streamRef.current) {
|
|
||||||
streamRef.current.getTracks().forEach(t => t.stop());
|
|
||||||
streamRef.current = null;
|
|
||||||
speechRecognitionRef.current = null;
|
|
||||||
}
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!micShouldBeOn) return;
|
|
||||||
|
|
||||||
const SpeechRecognitionApi = (window as any).SpeechRecognition || (window as any).webkitSpeechRecognition;
|
|
||||||
if (!SpeechRecognitionApi) return;
|
|
||||||
|
|
||||||
if (speechRecognitionRef.current) {
|
|
||||||
try {
|
|
||||||
speechRecognitionRef.current.start();
|
|
||||||
_dlog('REC-RESTART', 'reused existing');
|
|
||||||
} catch {
|
|
||||||
_dlog('REC-RESTART', 'existing failed, recreating');
|
|
||||||
speechRecognitionRef.current = null;
|
|
||||||
}
|
|
||||||
if (speechRecognitionRef.current) return;
|
|
||||||
}
|
|
||||||
|
|
||||||
let cancelled = false;
|
|
||||||
const SILENCE_TIMEOUT_MS = 1500;
|
|
||||||
const MIN_WORDS_TO_INTERRUPT = 4;
|
|
||||||
|
|
||||||
const init = async () => {
|
|
||||||
try {
|
|
||||||
if (!streamRef.current) {
|
|
||||||
const stream = await navigator.mediaDevices.getUserMedia({
|
|
||||||
audio: { echoCancellation: true, noiseSuppression: true },
|
|
||||||
});
|
|
||||||
if (cancelled) { stream.getTracks().forEach(t => t.stop()); return; }
|
|
||||||
streamRef.current = stream;
|
|
||||||
}
|
|
||||||
|
|
||||||
const recognition = new SpeechRecognitionApi();
|
|
||||||
recognition.continuous = true;
|
|
||||||
recognition.interimResults = true;
|
|
||||||
recognition.lang = 'de-DE';
|
|
||||||
|
|
||||||
const _sendAndClearTranscript = () => {
|
|
||||||
const fullTranscript = transcriptPartsRef.current.join(' ').trim();
|
|
||||||
_dlog('SEND', `words=${fullTranscript.split(/\s+/).filter(Boolean).length} "${fullTranscript.substring(0,60)}"`);
|
|
||||||
if (fullTranscript) {
|
|
||||||
const wordCount = fullTranscript.split(/\s+/).filter(Boolean).length;
|
|
||||||
if (wordCount >= MIN_WORDS_TO_INTERRUPT) coach.sendMessage(fullTranscript);
|
|
||||||
}
|
|
||||||
transcriptPartsRef.current = [];
|
|
||||||
processedResultIndexRef.current = 0;
|
|
||||||
setLiveTranscript('');
|
|
||||||
};
|
|
||||||
|
|
||||||
const _resetSilenceTimer = () => {
|
|
||||||
if (silenceTimerRef.current) clearTimeout(silenceTimerRef.current);
|
|
||||||
silenceTimerRef.current = setTimeout(() => {
|
|
||||||
if (cancelled) return;
|
|
||||||
_sendAndClearTranscript();
|
|
||||||
}, SILENCE_TIMEOUT_MS);
|
|
||||||
};
|
|
||||||
|
|
||||||
recognition.onspeechstart = () => {
|
|
||||||
if (cancelled || voiceStateRef.current === 'botSpeaking') return;
|
|
||||||
transcriptPartsRef.current = [];
|
|
||||||
processedResultIndexRef.current = 0;
|
|
||||||
setLiveTranscript('');
|
|
||||||
_resetSilenceTimer();
|
|
||||||
};
|
|
||||||
|
|
||||||
recognition.onresult = (event: SpeechRecognitionEvent) => {
|
|
||||||
if (cancelled) return;
|
|
||||||
const isBotSpeaking = voiceStateRef.current === 'botSpeaking';
|
|
||||||
const interimParts: string[] = [];
|
|
||||||
for (let i = processedResultIndexRef.current; i < event.results.length; i++) {
|
|
||||||
const r = event.results[i];
|
|
||||||
if (r.isFinal) {
|
|
||||||
const text = r[0].transcript.trim();
|
|
||||||
if (text && !isBotSpeaking) transcriptPartsRef.current.push(text);
|
|
||||||
processedResultIndexRef.current = i + 1;
|
|
||||||
} else {
|
|
||||||
if (isBotSpeaking) continue;
|
|
||||||
const text = r[0].transcript.trim();
|
|
||||||
if (text) interimParts.push(text);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (isBotSpeaking) return;
|
|
||||||
const currentInterim = interimParts.join(' ');
|
|
||||||
const preview = [...transcriptPartsRef.current, currentInterim].join(' ').trim();
|
|
||||||
setLiveTranscript(preview);
|
|
||||||
if (preview) _resetSilenceTimer();
|
|
||||||
};
|
|
||||||
|
|
||||||
recognition.onspeechend = () => {
|
|
||||||
if (cancelled) return;
|
|
||||||
if (silenceTimerRef.current) clearTimeout(silenceTimerRef.current);
|
|
||||||
if (voiceStateRef.current === 'botSpeaking') {
|
|
||||||
transcriptPartsRef.current = [];
|
|
||||||
processedResultIndexRef.current = 0;
|
|
||||||
setLiveTranscript('');
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
_sendAndClearTranscript();
|
|
||||||
};
|
|
||||||
|
|
||||||
recognition.onend = () => {
|
|
||||||
_dlog('REC-END', `state=${voiceStateRef.current}`);
|
|
||||||
if (cancelled) return;
|
|
||||||
if (voiceStateRef.current === 'botSpeaking' || voiceStateRef.current === 'muted' || voiceStateRef.current === 'idle') return;
|
|
||||||
if (speechRecognitionRef.current === recognition) {
|
|
||||||
try { recognition.start(); } catch { speechRecognitionRef.current = null; }
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
recognition.onerror = (event: any) => {
|
|
||||||
_dlog('REC-ERR', event.error);
|
|
||||||
if (event.error === 'no-speech' || event.error === 'aborted') return;
|
|
||||||
console.warn('SpeechRecognition error:', event.error);
|
|
||||||
};
|
|
||||||
|
|
||||||
speechRecognitionRef.current = recognition;
|
|
||||||
recognition.start();
|
|
||||||
} catch (err) {
|
|
||||||
console.warn('Mic access failed:', err);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
init();
|
|
||||||
return () => { cancelled = true; };
|
|
||||||
}, [voiceState, _dlog, coach]);
|
|
||||||
|
|
||||||
// Cleanup on unmount
|
|
||||||
useEffect(() => {
|
|
||||||
return () => {
|
|
||||||
if (silenceTimerRef.current) clearTimeout(silenceTimerRef.current);
|
|
||||||
if (speechRecognitionRef.current) {
|
|
||||||
try { speechRecognitionRef.current.stop(); } catch { /* ignore */ }
|
|
||||||
speechRecognitionRef.current = null;
|
|
||||||
}
|
|
||||||
if (streamRef.current) {
|
|
||||||
streamRef.current.getTracks().forEach(t => t.stop());
|
|
||||||
streamRef.current = null;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
}, []);
|
|
||||||
|
|
||||||
// Voice actions
|
|
||||||
const handleStopTts = useCallback(() => {
|
|
||||||
coach.stopTts();
|
|
||||||
}, [coach]);
|
|
||||||
|
|
||||||
const handleResumeTts = useCallback(() => {
|
|
||||||
coach.resumeTts();
|
|
||||||
}, [coach]);
|
|
||||||
|
|
||||||
const handleToggleMute = useCallback(() => {
|
|
||||||
const cur = voiceStateRef.current;
|
|
||||||
if (cur === 'muted') {
|
|
||||||
_transitionVoice('listening');
|
|
||||||
} else if (cur === 'listening' || cur === 'interrupted') {
|
|
||||||
_transitionVoice('muted');
|
|
||||||
} else if (cur === 'botSpeaking') {
|
|
||||||
_transitionVoice('muted');
|
|
||||||
}
|
|
||||||
}, [_transitionVoice]);
|
|
||||||
|
|
||||||
const handleSend = useCallback(async () => {
|
const handleSend = useCallback(async () => {
|
||||||
if (!coach.inputValue.trim() || coach.isStreaming) return;
|
if (!coach.inputValue.trim() || coach.isStreaming) return;
|
||||||
|
|
@ -539,18 +333,18 @@ export const CommcoachDossierView: React.FC = () => {
|
||||||
<div className={styles.sessionHeader}>
|
<div className={styles.sessionHeader}>
|
||||||
<span className={styles.sessionLabel}>Session aktiv</span>
|
<span className={styles.sessionLabel}>Session aktiv</span>
|
||||||
<div className={styles.sessionActions}>
|
<div className={styles.sessionActions}>
|
||||||
{voiceState === 'botSpeaking' && (
|
{voice.state === 'botSpeaking' && (
|
||||||
<button className={styles.btnSmallDanger} onClick={handleStopTts}>Stop</button>
|
<button className={styles.btnSmallDanger} onClick={handleStopTts}>Stop</button>
|
||||||
)}
|
)}
|
||||||
{voiceState === 'interrupted' && coach.hasAudioToResume() && (
|
{voice.state === 'interrupted' && coach.hasAudioToResume() && (
|
||||||
<button className={styles.btnSmall} onClick={handleResumeTts}>Weitersprechen</button>
|
<button className={styles.btnSmall} onClick={handleResumeTts}>Weitersprechen</button>
|
||||||
)}
|
)}
|
||||||
<button
|
<button
|
||||||
className={`${styles.btnSmall} ${voiceState === 'muted' ? styles.mutedActive : ''}`}
|
className={`${styles.btnSmall} ${voice.state === 'muted' ? styles.mutedActive : ''}`}
|
||||||
onClick={handleToggleMute}
|
onClick={voice.toggleMute}
|
||||||
title={voiceState === 'muted' ? 'Stummschaltung aufheben' : 'Stummschalten'}
|
title={voice.state === 'muted' ? 'Stummschaltung aufheben' : 'Stummschalten'}
|
||||||
>
|
>
|
||||||
{voiceState === 'muted' ? '\u{1F507} Stumm' : '\u{1F3A4} Ton an'}
|
{voice.state === 'muted' ? '\u{1F507} Stumm' : '\u{1F3A4} Ton an'}
|
||||||
</button>
|
</button>
|
||||||
<button className={styles.btnSmall} onClick={coach.completeSession} disabled={!!coach.actionLoading}>
|
<button className={styles.btnSmall} onClick={coach.completeSession} disabled={!!coach.actionLoading}>
|
||||||
{coach.actionLoading === 'completing' ? 'Wird abgeschlossen...' : 'Abschliessen'}
|
{coach.actionLoading === 'completing' ? 'Wird abgeschlossen...' : 'Abschliessen'}
|
||||||
|
|
@ -562,7 +356,7 @@ export const CommcoachDossierView: React.FC = () => {
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
{/* Messages */}
|
{/* Messages */}
|
||||||
<AutoScroll scrollDependency={coach.messages.length + (coach.isStreaming ? 1 : 0) + liveTranscript.length}>
|
<AutoScroll scrollDependency={coach.messages.length + (coach.isStreaming ? 1 : 0) + voice.liveTranscript.length}>
|
||||||
<div className={styles.messages}>
|
<div className={styles.messages}>
|
||||||
{coach.messages.map(msg => (
|
{coach.messages.map(msg => (
|
||||||
<div key={msg.id} className={`${styles.message} ${msg.role === 'user' ? styles.messageUser : styles.messageAssistant}`}>
|
<div key={msg.id} className={`${styles.message} ${msg.role === 'user' ? styles.messageUser : styles.messageAssistant}`}>
|
||||||
|
|
@ -574,9 +368,9 @@ export const CommcoachDossierView: React.FC = () => {
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
))}
|
))}
|
||||||
{liveTranscript && (
|
{voice.liveTranscript && (
|
||||||
<div className={`${styles.message} ${styles.messageUser}`}>
|
<div className={`${styles.message} ${styles.messageUser}`}>
|
||||||
<div className={`${styles.messageBubble} ${styles.messageLive}`}>{liveTranscript}</div>
|
<div className={`${styles.messageBubble} ${styles.messageLive}`}>{voice.liveTranscript}</div>
|
||||||
</div>
|
</div>
|
||||||
)}
|
)}
|
||||||
{coach.isStreaming && (
|
{coach.isStreaming && (
|
||||||
|
|
@ -596,17 +390,17 @@ export const CommcoachDossierView: React.FC = () => {
|
||||||
{/* Input Area */}
|
{/* Input Area */}
|
||||||
<div className={styles.inputArea}>
|
<div className={styles.inputArea}>
|
||||||
<div className={styles.voiceStatus}>
|
<div className={styles.voiceStatus}>
|
||||||
<span className={`${styles.voiceIndicator} ${voiceState === 'listening' ? styles.voiceActive : ''}`}>
|
<span className={`${styles.voiceIndicator} ${voice.state === 'listening' ? styles.voiceActive : ''}`}>
|
||||||
{voiceState === 'muted'
|
{voice.state === 'muted'
|
||||||
? 'Stumm – Mikrofon aus'
|
? 'Stumm – Mikrofon aus'
|
||||||
: voiceState === 'botSpeaking'
|
: voice.state === 'botSpeaking'
|
||||||
? (coach.streamingStatus || 'Coach spricht...')
|
? (coach.streamingStatus || 'Coach spricht...')
|
||||||
: coach.isStreaming
|
: coach.isStreaming
|
||||||
? (coach.streamingStatus || 'Coach denkt nach...')
|
? (coach.streamingStatus || 'Coach denkt nach...')
|
||||||
: voiceState === 'interrupted'
|
: voice.state === 'interrupted'
|
||||||
? 'Unterbrochen – Mikrofon an'
|
? 'Unterbrochen – Mikrofon an'
|
||||||
: voiceState === 'listening'
|
: voice.state === 'listening'
|
||||||
? (liveTranscript ? 'Spricht...' : 'Mikrofon an – bitte sprechen')
|
? (voice.liveTranscript ? 'Spricht...' : 'Mikrofon an – bitte sprechen')
|
||||||
: 'Mikrofon wird gestartet...'}
|
: 'Mikrofon wird gestartet...'}
|
||||||
</span>
|
</span>
|
||||||
</div>
|
</div>
|
||||||
|
|
|
||||||
261
src/pages/views/commcoach/useVoiceController.ts
Normal file
261
src/pages/views/commcoach/useVoiceController.ts
Normal file
|
|
@ -0,0 +1,261 @@
|
||||||
|
/**
|
||||||
|
* Voice Controller - imperative state machine for CommCoach voice interaction.
|
||||||
|
*
|
||||||
|
* States: idle | listening | botSpeaking | interrupted | muted
|
||||||
|
*
|
||||||
|
* Key principle: SpeechRecognition is created once and lives until deactivate().
|
||||||
|
* When botSpeaking, we ignore onresult events instead of stopping recognition.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { useState, useRef, useCallback, useEffect } from 'react';
|
||||||
|
|
||||||
|
/**
 * States of the voice interaction state machine.
 *
 * - `idle`        – no voice session active
 * - `listening`   – recognition results are captured and turned into messages
 * - `botSpeaking` – TTS is playing; recognition results are ignored
 * - `interrupted` – TTS was paused; recognition results are captured
 * - `muted`       – the user has muted the microphone
 */
export type VoiceState =
  | 'idle'
  | 'listening'
  | 'botSpeaking'
  | 'interrupted'
  | 'muted';

/** Pause (ms) without new recognition results after which the transcript is finalized. */
const SILENCE_TIMEOUT_MS = 1500;

/** Transcripts with fewer words than this are discarded instead of being sent. */
const MIN_WORDS_TO_SEND = 4;

/** Delay (ms) before an ended recognition session is started again automatically. */
const REC_AUTORESTART_DELAY_MS = 300;

/** Public surface returned by `useVoiceController`. */
export interface VoiceControllerApi {
  /** Current state of the voice state machine (React state, triggers re-render). */
  state: VoiceState;

  /** Preview of what has been recognized so far for the current utterance. */
  liveTranscript: string;

  /** Leaves `idle`: acquires the microphone and starts speech recognition. */
  activate: () => void;

  /** Returns to `idle` and releases recognition, microphone and pending transcript. */
  deactivate: () => void;

  /** Notify the controller that TTS playback started. */
  ttsPlaying: () => void;

  /** Notify the controller that TTS playback was paused. */
  ttsPaused: () => void;

  /** Notify the controller that TTS playback finished (also used for TTS errors). */
  ttsEnded: () => void;

  /** Toggles between `muted` and an active state. */
  toggleMute: () => void;
}
|
||||||
|
|
||||||
|
/**
 * React hook that drives the CommCoach voice interaction as an imperative
 * state machine (`idle | listening | botSpeaking | interrupted | muted`).
 *
 * A single `SpeechRecognition` instance is created in `activate()` and lives
 * until `deactivate()` / unmount. While the bot is speaking, results are
 * ignored rather than recognition being stopped.
 *
 * @param onMessage Called with the finalized transcript once the user stops
 *   speaking (silence timeout or `speechend`) and the transcript has at least
 *   `MIN_WORDS_TO_SEND` words. The latest callback is always used; it does not
 *   need to be memoized by the caller.
 * @returns The current state, the live transcript preview and the imperative
 *   controls of the state machine.
 */
export function useVoiceController(onMessage: (text: string) => void): VoiceControllerApi {
  const [state, setState] = useState<VoiceState>('idle');
  const [liveTranscript, setLiveTranscript] = useState('');
  // Mirror of `state` that event handlers can read without stale closures.
  const stateRef = useRef<VoiceState>('idle');
  const streamRef = useRef<MediaStream | null>(null);
  const recognitionRef = useRef<SpeechRecognition | null>(null);
  // Final results collected for the utterance currently being spoken.
  const transcriptPartsRef = useRef<string[]>([]);
  // Index into `event.results` of the first result not yet consumed. Only
  // valid for the running recognition session; every new session starts at 0.
  const processedIndexRef = useRef(0);
  const silenceTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
  // Bumped by activate/deactivate/unmount so that an in-flight activation can
  // detect that it has been superseded while awaiting the microphone.
  const activationTokenRef = useRef(0);
  // Latched after a failed activation so a caller that re-invokes activate()
  // whenever the state is 'idle' does not retry in an endless loop. Cleared by
  // deactivate().
  const activationFailedRef = useRef(false);
  const onMessageRef = useRef(onMessage);
  onMessageRef.current = onMessage;

  /** Forwards a timestamped entry to the optional global debug logger. */
  const _dlog = useCallback((tag: string, info?: string) => {
    const t = new Date();
    const ts = `${t.getMinutes()}:${String(t.getSeconds()).padStart(2, '0')}.${String(t.getMilliseconds()).padStart(3, '0')}`;
    const entry = `[${ts}] ${tag}${info ? ' ' + info : ''}`;
    (window as any).__dlog?.(entry);
  }, []);

  /** Updates ref and React state together; no-op when the state is unchanged. */
  const _setState = useCallback((next: VoiceState) => {
    const prev = stateRef.current;
    if (prev === next) return;
    _dlog('VOICE', `${prev} -> ${next}`);
    stateRef.current = next;
    setState(next);
  }, [_dlog]);

  /** Sends the collected transcript (if long enough) and clears it. */
  const _finalizeTranscript = useCallback(() => {
    const full = transcriptPartsRef.current.join(' ').trim();
    const wordCount = full.split(/\s+/).filter(Boolean).length;
    _dlog('SEND', `words=${wordCount} "${full.substring(0, 60)}"`);
    if (wordCount >= MIN_WORDS_TO_SEND) onMessageRef.current(full);
    transcriptPartsRef.current = [];
    setLiveTranscript('');
  }, [_dlog]);

  /** (Re)arms the timer that finalizes the transcript after a pause. */
  const _resetSilenceTimer = useCallback(() => {
    if (silenceTimerRef.current) clearTimeout(silenceTimerRef.current);
    silenceTimerRef.current = setTimeout(() => {
      silenceTimerRef.current = null;
      _finalizeTranscript();
    }, SILENCE_TIMEOUT_MS);
  }, [_finalizeTranscript]);

  /**
   * Starts the current recognition instance unless the machine is idle/muted.
   * Safe to call while recognition is already running: `start()` then throws
   * and the running session is left untouched.
   */
  const _startRecognition = useCallback(() => {
    const rec = recognitionRef.current;
    if (!rec) return;
    const cur = stateRef.current;
    if (cur === 'idle' || cur === 'muted') return;
    try {
      rec.start();
      _dlog('REC-START', 'ok');
    } catch {
      _dlog('REC-START', 'skipped (already running or not startable)');
    }
  }, [_dlog]);

  /** Stops recognition gracefully; audio captured so far is still finalized. */
  const _stopRecognition = useCallback(() => {
    const rec = recognitionRef.current;
    if (!rec) return;
    try {
      rec.stop();
    } catch {
      /* ignore */
    }
  }, []);

  /** Stops recognition and discards audio that has not produced a result yet. */
  const _abortRecognition = useCallback(() => {
    const rec = recognitionRef.current;
    if (!rec) return;
    try {
      rec.abort();
    } catch {
      /* ignore */
    }
  }, []);

  /** Creates the recognition instance, wires its handlers and starts it. */
  const _createRecognition = useCallback(() => {
    const SpeechRecognitionApi = (window as any).SpeechRecognition || (window as any).webkitSpeechRecognition;
    if (!SpeechRecognitionApi) {
      _dlog('REC-CREATE', 'SpeechRecognition API not available');
      return;
    }

    const recognition = new SpeechRecognitionApi();
    recognition.continuous = true;
    recognition.interimResults = true;
    recognition.lang = 'de-DE';

    // Handlers of an instance that was released by deactivate()/unmount must
    // not touch the shared refs or send messages any more.
    const isStale = () => recognitionRef.current !== recognition;

    recognition.onspeechstart = () => {
      if (isStale() || stateRef.current === 'botSpeaking') return;
      transcriptPartsRef.current = [];
      setLiveTranscript('');
      _resetSilenceTimer();
    };

    recognition.onresult = (event: SpeechRecognitionEvent) => {
      if (isStale()) return;
      const ignore = stateRef.current === 'botSpeaking';
      const interimParts: string[] = [];
      for (let i = processedIndexRef.current; i < event.results.length; i++) {
        const r = event.results[i];
        if (r.isFinal) {
          const text = r[0].transcript.trim();
          if (text && !ignore) transcriptPartsRef.current.push(text);
          // Final results are consumed even when ignored so they are not
          // replayed once the bot stops speaking.
          processedIndexRef.current = i + 1;
        } else {
          if (ignore) continue;
          const text = r[0].transcript.trim();
          if (text) interimParts.push(text);
        }
      }
      if (ignore) return;
      const currentInterim = interimParts.join(' ');
      const preview = [...transcriptPartsRef.current, currentInterim].join(' ').trim();
      setLiveTranscript(preview);
      if (preview) _resetSilenceTimer();
    };

    recognition.onspeechend = () => {
      if (isStale()) return;
      if (silenceTimerRef.current) {
        clearTimeout(silenceTimerRef.current);
        silenceTimerRef.current = null;
      }
      if (stateRef.current === 'botSpeaking') {
        transcriptPartsRef.current = [];
        setLiveTranscript('');
        return;
      }
      _finalizeTranscript();
    };

    recognition.onend = () => {
      _dlog('REC-END', `state=${stateRef.current}`);
      if (isStale()) return;
      // The session is over, so its result list is gone. Reset the index in
      // every state: the next session (auto-restart, unmute or end of TTS)
      // delivers results starting at 0 again.
      processedIndexRef.current = 0;
      const cur = stateRef.current;
      if (cur !== 'listening' && cur !== 'interrupted') return;
      setTimeout(() => {
        if (isStale()) return;
        if (stateRef.current !== 'listening' && stateRef.current !== 'interrupted') return;
        try {
          recognition.start();
          _dlog('REC-AUTOSTART', 'ok');
        } catch {
          _dlog('REC-AUTOSTART', 'failed');
        }
      }, REC_AUTORESTART_DELAY_MS);
    };

    recognition.onerror = (event: any) => {
      _dlog('REC-ERR', event.error);
      if (event.error === 'no-speech' || event.error === 'aborted') return;
      console.warn('SpeechRecognition error:', event.error);
    };

    recognitionRef.current = recognition;
    processedIndexRef.current = 0;
    _startRecognition();
  }, [_dlog, _resetSilenceTimer, _finalizeTranscript, _startRecognition]);

  /**
   * Leaves `idle`: requests the microphone and starts recognition.
   * No-op when not idle, or when the previous attempt failed and
   * `deactivate()` has not been called since.
   */
  const activate = useCallback(async () => {
    if (stateRef.current !== 'idle' || activationFailedRef.current) return;
    const token = ++activationTokenRef.current;
    _setState('listening');

    try {
      if (!streamRef.current) {
        const stream = await navigator.mediaDevices.getUserMedia({
          audio: { echoCancellation: true, noiseSuppression: true },
        });
        // deactivate()/unmount (or a newer activate) happened while the
        // permission prompt was open: release the stream instead of leaking it.
        if (token !== activationTokenRef.current) {
          stream.getTracks().forEach(t => t.stop());
          return;
        }
        streamRef.current = stream;
      }
      _createRecognition();
    } catch (err) {
      console.warn('Mic access failed:', err);
      if (token !== activationTokenRef.current) return;
      activationFailedRef.current = true;
      _setState('idle');
    }
  }, [_setState, _createRecognition]);

  /** Returns to `idle` and releases recognition, microphone and transcript. */
  const deactivate = useCallback(() => {
    activationTokenRef.current++;
    activationFailedRef.current = false;
    _setState('idle');
    if (silenceTimerRef.current) {
      clearTimeout(silenceTimerRef.current);
      silenceTimerRef.current = null;
    }
    if (recognitionRef.current) {
      try { recognitionRef.current.stop(); } catch { /* ignore */ }
      recognitionRef.current = null;
    }
    if (streamRef.current) {
      streamRef.current.getTracks().forEach(t => t.stop());
      streamRef.current = null;
    }
    transcriptPartsRef.current = [];
    processedIndexRef.current = 0;
    setLiveTranscript('');
  }, [_setState]);

  /** TTS started: ignore recognition results unless the user is muted. */
  const ttsPlaying = useCallback(() => {
    const cur = stateRef.current;
    if (cur === 'muted') return;
    _setState('botSpeaking');
  }, [_setState]);

  /** TTS paused: capture the user's speech again. */
  const ttsPaused = useCallback(() => {
    if (stateRef.current !== 'botSpeaking') return;
    _setState('interrupted');
    // Recognition may have ended while the bot was speaking (onend does not
    // restart in that state); make sure it is running again.
    _startRecognition();
  }, [_setState, _startRecognition]);

  /** TTS finished (or failed): go back to listening. */
  const ttsEnded = useCallback(() => {
    const cur = stateRef.current;
    if (cur !== 'botSpeaking' && cur !== 'interrupted') return;
    _setState('listening');
    // See ttsPaused: revive recognition if it ended during playback.
    _startRecognition();
  }, [_setState, _startRecognition]);

  /** Toggles between `muted` and `listening`. */
  const toggleMute = useCallback(() => {
    const cur = stateRef.current;
    if (cur === 'muted') {
      _setState('listening');
      _startRecognition();
    } else if (cur === 'listening' || cur === 'interrupted') {
      _setState('muted');
      _stopRecognition();
    } else if (cur === 'botSpeaking') {
      _setState('muted');
      // Abort instead of stop: audio captured while the bot was speaking must
      // not be flushed as a final result and sent as a user message.
      _abortRecognition();
    }
  }, [_setState, _startRecognition, _stopRecognition, _abortRecognition]);

  // Release everything on unmount.
  useEffect(() => {
    return () => {
      activationTokenRef.current++;
      if (silenceTimerRef.current) clearTimeout(silenceTimerRef.current);
      if (recognitionRef.current) {
        try { recognitionRef.current.stop(); } catch { /* ignore */ }
        recognitionRef.current = null;
      }
      if (streamRef.current) {
        streamRef.current.getTracks().forEach(t => t.stop());
        streamRef.current = null;
      }
    };
  }, []);

  return {
    state,
    liveTranscript,
    activate,
    deactivate,
    ttsPlaying,
    ttsPaused,
    ttsEnded,
    toggleMute,
  };
}
|
||||||
|
|
@ -40,6 +40,17 @@ export const TeamsbotSessionView: React.FC = () => {
|
||||||
const transcriptEndRef = useRef<HTMLDivElement>(null);
|
const transcriptEndRef = useRef<HTMLDivElement>(null);
|
||||||
const eventSourceRef = useRef<EventSource | null>(null);
|
const eventSourceRef = useRef<EventSource | null>(null);
|
||||||
|
|
||||||
|
const debugLogsRef = useRef<string[]>([]);
|
||||||
|
const [debugVisible, setDebugVisible] = useState(false);
|
||||||
|
const [debugSnapshot, setDebugSnapshot] = useState<string[]>([]);
|
||||||
|
const _dlog = useCallback((tag: string, info?: string) => {
|
||||||
|
const t = new Date();
|
||||||
|
const ts = `${t.getMinutes()}:${String(t.getSeconds()).padStart(2, '0')}.${String(t.getMilliseconds()).padStart(3, '0')}`;
|
||||||
|
const entry = `[${ts}] ${tag}${info ? ' ' + info : ''}`;
|
||||||
|
debugLogsRef.current.push(entry);
|
||||||
|
if (debugLogsRef.current.length > 120) debugLogsRef.current.shift();
|
||||||
|
}, []);
|
||||||
|
|
||||||
// Load session data - if no sessionId given, load the most recent session
|
// Load session data - if no sessionId given, load the most recent session
|
||||||
const _loadSession = useCallback(async () => {
|
const _loadSession = useCallback(async () => {
|
||||||
if (!instanceId) return;
|
if (!instanceId) return;
|
||||||
|
|
@ -98,19 +109,26 @@ export const TeamsbotSessionView: React.FC = () => {
|
||||||
const eventSource = teamsbotApi.createSessionStream(instanceId, sessionId);
|
const eventSource = teamsbotApi.createSessionStream(instanceId, sessionId);
|
||||||
eventSourceRef.current = eventSource;
|
eventSourceRef.current = eventSource;
|
||||||
setIsLive(true);
|
setIsLive(true);
|
||||||
|
_dlog('SSE', 'connected');
|
||||||
|
|
||||||
eventSource.onmessage = (event) => {
|
eventSource.onmessage = (event) => {
|
||||||
try {
|
try {
|
||||||
const sseEvent: TeamsbotSSEEvent = JSON.parse(event.data);
|
const sseEvent: TeamsbotSSEEvent = JSON.parse(event.data);
|
||||||
|
const evType = sseEvent.type || 'unknown';
|
||||||
|
|
||||||
switch (sseEvent.type) {
|
_dlog('SSE', evType + (sseEvent.data ? ` ${JSON.stringify(sseEvent.data).substring(0, 80)}` : ''));
|
||||||
|
|
||||||
|
switch (evType) {
|
||||||
case 'sessionState':
|
case 'sessionState':
|
||||||
if (sseEvent.data) setSession(prev => prev ? { ...prev, ...sseEvent.data } : sseEvent.data);
|
if (sseEvent.data) setSession(prev => prev ? { ...prev, ...sseEvent.data } : sseEvent.data);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case 'transcript':
|
case 'transcript': {
|
||||||
setTranscripts(prev => [...prev, sseEvent.data as TeamsbotTranscript]);
|
const t = sseEvent.data as TeamsbotTranscript;
|
||||||
|
_dlog('TRANSCRIPT', `[${t?.speaker || '?'}] ${(t?.text || '').substring(0, 50)}...`);
|
||||||
|
setTranscripts(prev => [...prev, t]);
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
case 'botResponse':
|
case 'botResponse':
|
||||||
setBotResponses(prev => [...prev, sseEvent.data as TeamsbotBotResponse]);
|
setBotResponses(prev => [...prev, sseEvent.data as TeamsbotBotResponse]);
|
||||||
|
|
@ -155,6 +173,7 @@ export const TeamsbotSessionView: React.FC = () => {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
|
_dlog('SSE-ERR', String(err));
|
||||||
console.error('SSE parse error:', err);
|
console.error('SSE parse error:', err);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
@ -169,7 +188,7 @@ export const TeamsbotSessionView: React.FC = () => {
|
||||||
sseSessionRef.current = null;
|
sseSessionRef.current = null;
|
||||||
setIsLive(false);
|
setIsLive(false);
|
||||||
};
|
};
|
||||||
}, [instanceId, sessionId, sessionStatus]);
|
}, [instanceId, sessionId, sessionStatus, _dlog]);
|
||||||
|
|
||||||
// Polling fallback: refresh session data every 5s when SSE is not connected
|
// Polling fallback: refresh session data every 5s when SSE is not connected
|
||||||
const pollRef = useRef<ReturnType<typeof setInterval> | null>(null);
|
const pollRef = useRef<ReturnType<typeof setInterval> | null>(null);
|
||||||
|
|
@ -362,6 +381,21 @@ export const TeamsbotSessionView: React.FC = () => {
|
||||||
)}
|
)}
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
{/* Debug Log (SSE/Transcript/Chat) */}
|
||||||
|
<div style={{ position: 'fixed', bottom: 0, right: 0, zIndex: 9999 }}>
|
||||||
|
<button
|
||||||
|
onClick={() => { setDebugSnapshot([...debugLogsRef.current]); setDebugVisible(v => !v); }}
|
||||||
|
style={{ background: '#333', color: '#0f0', border: 'none', padding: '4px 8px', fontSize: '10px', borderRadius: '4px 0 0 0' }}
|
||||||
|
>
|
||||||
|
DBG ({debugLogsRef.current.length})
|
||||||
|
</button>
|
||||||
|
{debugVisible && (
|
||||||
|
<div style={{ background: 'rgba(0,0,0,0.9)', color: '#0f0', fontSize: '9px', maxHeight: '40vh', overflow: 'auto', padding: '4px', fontFamily: 'monospace', whiteSpace: 'pre-wrap', width: '100vw' }}>
|
||||||
|
{debugSnapshot.map((l, i) => <div key={i}>{l}</div>)}
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
|
||||||
{/* Debug Screenshots (SysAdmin only) */}
|
{/* Debug Screenshots (SysAdmin only) */}
|
||||||
{_isSysAdmin && (
|
{_isSysAdmin && (
|
||||||
<div className={styles.summaryCard}>
|
<div className={styles.summaryCard}>
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue