From 681744292d541ff5143442389417e177afda8aa3 Mon Sep 17 00:00:00 2001 From: ValueOn AG Date: Fri, 27 Feb 2026 16:40:08 +0100 Subject: [PATCH] AudioCapture: add extended diagnostics for silent audio investigation Made-with: Cursor --- src/bot/audioCaptureProcedure.ts | 77 ++++++++++++++++++++++++++++---- 1 file changed, 68 insertions(+), 9 deletions(-) diff --git a/src/bot/audioCaptureProcedure.ts b/src/bot/audioCaptureProcedure.ts index 819b6b5..95248b9 100644 --- a/src/bot/audioCaptureProcedure.ts +++ b/src/bot/audioCaptureProcedure.ts @@ -73,6 +73,9 @@ export class AudioCaptureProcedure { try { const pcs = (window as any).__audioCapturePeerConnections as RTCPeerConnection[]; pcs.push(pc); + // #region agent log + console.log(`[AudioCapture][DIAG] New RTCPeerConnection created (total: ${pcs.length}), config:`, JSON.stringify(args[0] || {}).substring(0, 200)); + // #endregion } catch { // ignore } @@ -86,34 +89,68 @@ export class AudioCaptureProcedure { return; } + // #region agent log + console.log( + `[AudioCapture][DIAG] Track received: id=${trackId}, enabled=${event.track.enabled}, muted=${event.track.muted}, readyState=${event.track.readyState}, label=${event.track.label}` + ); + event.track.addEventListener('mute', () => { + console.log(`[AudioCapture][DIAG] Track MUTED: id=${trackId}`); + }); + event.track.addEventListener('unmute', () => { + console.log(`[AudioCapture][DIAG] Track UNMUTED: id=${trackId}`); + }); + // #endregion + try { const AudioCtx = window.AudioContext || (window as any).webkitAudioContext; - // Use native sample rate (48kHz for WebRTC/Opus) to avoid - // forced resampling which destabilises the Chromium audio stack. const ctx = new AudioCtx(); const nativeRate = ctx.sampleRate; const stream = new MediaStream([event.track]); const source = ctx.createMediaStreamSource(stream); - // ScriptProcessor with larger buffer (8192) reduces callback - // frequency and gives the renderer more breathing room. 
+ // #region agent log + console.log( + `[AudioCapture][DIAG] AudioContext: state=${ctx.state}, sampleRate=${nativeRate}, stream.active=${stream.active}, streamTracks=${stream.getAudioTracks().length}` + ); + ctx.addEventListener('statechange', () => { + console.log(`[AudioCapture][DIAG] AudioContext statechange: ${ctx.state} for track=${trackId}`); + }); + // #endregion + const processor = ctx.createScriptProcessor(8192, 1, 1); let chunkBuffer: Float32Array[] = []; let samplesCollected = 0; let skippedSilentChunks = 0; + let callbackCount = 0; + let totalNonZeroSamples = 0; const minRmsThreshold = 0.0015; - // Collect ~2 seconds of audio at native rate before emitting. - // Larger chunks improve STT stability and reduce fragment transcripts. const samplesPerChunk = nativeRate * 2; const targetRate = 16000; processor.onaudioprocess = (e: AudioProcessingEvent) => { const input = e.inputBuffer.getChannelData(0); + callbackCount++; + + // #region agent log + if (callbackCount <= 3 || callbackCount % 50 === 0) { + let nonZero = 0; + let maxAbs = 0; + for (let i = 0; i < input.length; i++) { + if (input[i] !== 0) nonZero++; + const abs = Math.abs(input[i]); + if (abs > maxAbs) maxAbs = abs; + } + totalNonZeroSamples += nonZero; + console.log( + `[AudioCapture][DIAG] onaudioprocess #${callbackCount}: bufLen=${input.length}, nonZero=${nonZero}/${input.length}, maxAbs=${maxAbs.toFixed(8)}, track.enabled=${event.track.enabled}, track.muted=${event.track.muted}, track.readyState=${event.track.readyState}, ctx.state=${ctx.state}, totalNonZero=${totalNonZeroSamples}` + ); + } + // #endregion + chunkBuffer.push(new Float32Array(input)); samplesCollected += input.length; if (samplesCollected >= samplesPerChunk) { - // Merge buffers into one contiguous array const merged = new Float32Array(samplesCollected); let offset = 0; for (const buf of chunkBuffer) { @@ -121,7 +158,6 @@ export class AudioCaptureProcedure { offset += buf.length; } - // Calculate RMS to detect real audio 
activity let powerSum = 0; for (let i = 0; i < merged.length; i++) { powerSum += merged[i] * merged[i]; @@ -132,7 +168,7 @@ export class AudioCaptureProcedure { skippedSilentChunks++; if (skippedSilentChunks % 10 === 0) { console.log( - `[AudioCapture] silent chunk skipped: track=${trackId}, readyState=${event.track.readyState}, rms=${rms.toFixed(6)}` + `[AudioCapture] silent chunk skipped: track=${trackId}, readyState=${event.track.readyState}, muted=${event.track.muted}, enabled=${event.track.enabled}, rms=${rms.toFixed(6)}, callbacks=${callbackCount}, totalNonZero=${totalNonZeroSamples}` ); } chunkBuffer = []; @@ -239,8 +275,31 @@ export class AudioCaptureProcedure { this._logger.info('[AudioCapture] Starting audio chunk polling...'); + // #region agent log + let pollCount = 0; + // #endregion this._pollInterval = setInterval(async () => { try { + // #region agent log + pollCount++; + if (pollCount % 60 === 1) { + const diagInfo = await this._page.evaluate(() => { + const pcs = (window as any).__audioCapturePeerConnections as RTCPeerConnection[] || []; + const procs = (window as any).__audioCaptureProcessors as Record<string, ScriptProcessorNode> || {}; + const ctxs = (window as any).__audioCaptureContexts as Record<string, AudioContext> || {}; + const procKeys = Object.keys(procs); + const ctxStates = Object.entries(ctxs).map(([k, c]) => `${k}:${c.state}`); + return { + peerConnections: pcs.length, + pcStates: pcs.map((p: RTCPeerConnection) => p.connectionState || 'unknown'), + processors: procKeys.length, + processorTrackIds: procKeys, + audioContextStates: ctxStates, + }; + }); + this._logger.info(`[AudioCapture][DIAG] Periodic: ${JSON.stringify(diagInfo)}`); + } + // #endregion const chunks = await this._page.evaluate(() => { const buf = (window as any).__audioCaptureChunks as CapturedAudioChunk[]; const result = buf.splice(0, buf.length);