From ee2dcd61f184c32562262a2a37c85704f5b1c2f4 Mon Sep 17 00:00:00 2001 From: ValueOn AG Date: Sat, 28 Feb 2026 15:53:31 +0100 Subject: [PATCH] feat: migrate audio capture from ScriptProcessorNode to AudioWorkletNode with fallback Made-with: Cursor --- src/bot/audioCaptureProcedure.ts | 404 +++++++++++++++++++------------ 1 file changed, 255 insertions(+), 149 deletions(-) diff --git a/src/bot/audioCaptureProcedure.ts b/src/bot/audioCaptureProcedure.ts index 8866c39..293978d 100644 --- a/src/bot/audioCaptureProcedure.ts +++ b/src/bot/audioCaptureProcedure.ts @@ -14,13 +14,108 @@ interface CapturedAudioChunk { captureDiagnostics?: AudioChunkDiagnostics; } +const AUDIO_CAPTURE_WORKLET_CODE = ` +class AudioCaptureProcessor extends AudioWorkletProcessor { + constructor(options) { + super(); + const opts = options.processorOptions || {}; + this.nativeRate = opts.nativeRate || 48000; + this.targetRate = opts.targetRate || 16000; + this.maxSamplesPerChunk = this.nativeRate * 8; + this.minRmsThreshold = 0.0003; + this.preRollSamples = Math.ceil(this.nativeRate * 0.5); + this.minFlushSamples = Math.ceil(this.nativeRate * 0.5); + this.silenceFlushSamples = 6 * 8192; /* process() receives 128-frame render quanta, so silence is measured in samples rather than callbacks; 6 x 8192 equals the six 8192-sample callbacks used by the ScriptProcessor fallback (about 1s at 48kHz) */ + this.ratio = this.nativeRate / this.targetRate; + this.chunkBuffer = []; + this.samplesCollected = 0; + this.hasVoicedContent = false; + this.consecutiveSilentSamples = 0; + } + + process(inputs, outputs, parameters) { + const input = inputs[0]?.[0]; + if (!input || input.length === 0) return true; + + let cbPower = 0; + for (let i = 0; i < input.length; i++) { + cbPower += input[i] * input[i]; + } + const cbRms = Math.sqrt(cbPower / Math.max(input.length, 1)); + + if (cbRms >= this.minRmsThreshold) { + this.hasVoicedContent = true; + this.consecutiveSilentSamples = 0; + } else { + this.consecutiveSilentSamples += input.length; + } + + this.chunkBuffer.push(new Float32Array(input)); + this.samplesCollected += input.length; + + const shouldFlush = ( + this.samplesCollected >= this.maxSamplesPerChunk + || 
(this.hasVoicedContent + && this.consecutiveSilentSamples >= this.silenceFlushSamples + && this.samplesCollected > this.minFlushSamples) + ); + + if (shouldFlush) { + const merged = new Float32Array(this.samplesCollected); + let offset = 0; + for (const buf of this.chunkBuffer) { + merged.set(buf, offset); + offset += buf.length; + } + + let powerSum = 0; + for (let i = 0; i < merged.length; i++) { + powerSum += merged[i] * merged[i]; + } + const rms = Math.sqrt(powerSum / Math.max(merged.length, 1)); + + this.hasVoicedContent = false; + this.consecutiveSilentSamples = 0; + + if (rms >= this.minRmsThreshold) { + const outLen = Math.floor(merged.length / this.ratio); + const pcm16 = new Int16Array(outLen); + for (let i = 0; i < outLen; i++) { + const srcIdx = Math.floor(i * this.ratio); + const s = Math.max(-1, Math.min(1, merged[srcIdx])); + pcm16[i] = Math.round(s * 32767); + } + this.port.postMessage({ + type: 'chunk', + data: pcm16.buffer, + rms, + nativeSampleRate: this.nativeRate + }, [pcm16.buffer]); + } else { + const keep = Math.min(this.preRollSamples, merged.length); + const preRoll = merged.slice(merged.length - keep); + this.chunkBuffer = [preRoll]; + this.samplesCollected = keep; + return true; + } + + this.chunkBuffer = []; + this.samplesCollected = 0; + } + return true; + } +} + +registerProcessor('audio-capture-processor', AudioCaptureProcessor); +`; + /** * Captures incoming meeting audio by intercepting WebRTC RTCPeerConnection. * * How it works: * 1. Before page navigation, wraps window.RTCPeerConnection via addInitScript * 2. When Teams establishes WebRTC connections, the wrapper intercepts incoming audio tracks - * 3. Incoming audio tracks are captured via AudioContext + ScriptProcessorNode + * 3. Incoming audio tracks are captured via AudioContext + AudioWorkletNode (or ScriptProcessorNode fallback) * 4. Audio is captured at native 48kHz, downsampled to 16kHz, and converted to PCM16 * 5. 
Audio chunks are buffered and the Node.js side polls for them to send to the Gateway */ @@ -59,7 +154,7 @@ export class AudioCaptureProcedure { this._logger.info('[AudioCapture] Injecting RTCPeerConnection wrapper...'); - await this._page.addInitScript(() => { + await this._page.addInitScript((workletCode: string) => { (window as any).__audioCaptureChunks = [] as any[]; (window as any).__audioCaptureProcessors = {} as Record; (window as any).__audioCaptureContexts = {} as Record; @@ -107,6 +202,7 @@ export class AudioCaptureProcedure { const nativeRate = ctx.sampleRate; const stream = new MediaStream([event.track]); const source = ctx.createMediaStreamSource(stream); + const targetRate = 16000; // #region agent log console.log( @@ -117,157 +213,171 @@ export class AudioCaptureProcedure { }); // #endregion - const processor = ctx.createScriptProcessor(8192, 1, 1); - let chunkBuffer: Float32Array[] = []; - let samplesCollected = 0; - let skippedSilentChunks = 0; - let callbackCount = 0; - let totalNonZeroSamples = 0; - const minRmsThreshold = 0.0003; - const maxSamplesPerChunk = nativeRate * 8; - const targetRate = 16000; - const preRollSamples = Math.ceil(nativeRate * 0.5); - const minFlushSamples = Math.ceil(nativeRate * 0.5); - // Adaptive flush: after ~1s silence following voiced content - const silenceFlushCallbacks = 6; - let hasVoicedContent = false; - let consecutiveSilentCallbacks = 0; + const silentGain = ctx.createGain(); + silentGain.gain.value = 0; - processor.onaudioprocess = (e: AudioProcessingEvent) => { - const input = e.inputBuffer.getChannelData(0); - callbackCount++; - - // #region agent log - let nonZeroThisCallback = 0; - for (let i = 0; i < input.length; i++) { - if (input[i] !== 0) nonZeroThisCallback++; - } - totalNonZeroSamples += nonZeroThisCallback; - - if (callbackCount <= 3 || callbackCount % 50 === 0) { - let maxAbs = 0; - for (let i = 0; i < input.length; i++) { - const abs = Math.abs(input[i]); - if (abs > maxAbs) maxAbs = abs; - } - 
console.log( - `[AudioCapture][DIAG] onaudioprocess #${callbackCount}: bufLen=${input.length}, nonZero=${nonZeroThisCallback}/${input.length}, maxAbs=${maxAbs.toFixed(8)}, track.enabled=${event.track.enabled}, track.muted=${event.track.muted}, track.readyState=${event.track.readyState}, ctx.state=${ctx.state}, totalNonZero=${totalNonZeroSamples}` - ); - } - // #endregion - - // Per-callback voice activity detection - let cbPower = 0; - for (let i = 0; i < input.length; i++) { - cbPower += input[i] * input[i]; - } - const cbRms = Math.sqrt(cbPower / Math.max(input.length, 1)); - - if (cbRms >= minRmsThreshold) { - hasVoicedContent = true; - consecutiveSilentCallbacks = 0; - } else { - consecutiveSilentCallbacks++; - } - - chunkBuffer.push(new Float32Array(input)); - samplesCollected += input.length; - - // Flush: max duration reached OR voiced content followed by ~1s silence - const shouldFlush = ( - samplesCollected >= maxSamplesPerChunk - || (hasVoicedContent - && consecutiveSilentCallbacks >= silenceFlushCallbacks - && samplesCollected > minFlushSamples) - ); - - if (shouldFlush) { - const merged = new Float32Array(samplesCollected); - let offset = 0; - for (const buf of chunkBuffer) { - merged.set(buf, offset); - offset += buf.length; - } - - let powerSum = 0; - for (let i = 0; i < merged.length; i++) { - powerSum += merged[i] * merged[i]; - } - const rms = Math.sqrt(powerSum / Math.max(merged.length, 1)); - - hasVoicedContent = false; - consecutiveSilentCallbacks = 0; - - if (rms < minRmsThreshold) { - skippedSilentChunks++; - if (skippedSilentChunks % 10 === 0) { - console.log( - `[AudioCapture] silent chunk skipped: track=${trackId}, readyState=${event.track.readyState}, muted=${event.track.muted}, enabled=${event.track.enabled}, rms=${rms.toFixed(6)}, callbacks=${callbackCount}, totalNonZero=${totalNonZeroSamples}` - ); - } - const keep = Math.min(preRollSamples, merged.length); - const preRoll = merged.slice(merged.length - keep); - chunkBuffer = [preRoll]; 
- samplesCollected = keep; - return; - } - - // Downsample from nativeRate to 16 kHz - const ratio = nativeRate / targetRate; - const outLen = Math.floor(merged.length / ratio); - const pcm16 = new Int16Array(outLen); - for (let i = 0; i < outLen; i++) { - const srcIdx = Math.floor(i * ratio); - const s = Math.max(-1, Math.min(1, merged[srcIdx])); - pcm16[i] = Math.round(s * 32767); - } - - // Convert to base64 - const bytes = new Uint8Array(pcm16.buffer); - let binary = ''; - for (let i = 0; i < bytes.length; i++) { - binary += String.fromCharCode(bytes[i]); - } - const base64 = btoa(binary); - - const chunks = (window as any).__audioCaptureChunks as any[]; - if (chunks.length < 60) { - chunks.push({ - data: base64, - sampleRate: targetRate, - captureDiagnostics: { - trackId, - readyState: event.track.readyState, - rms: Number(rms.toFixed(6)), - nativeSampleRate: nativeRate, - }, - }); - } - - skippedSilentChunks = 0; - - chunkBuffer = []; - samplesCollected = 0; + const pushChunk = (base64Data: string, rms: number) => { + const chunks = (window as any).__audioCaptureChunks as any[]; + if (chunks.length < 60) { + chunks.push({ + data: base64Data, + sampleRate: targetRate, + captureDiagnostics: { + trackId, + readyState: event.track.readyState, + rms: Number(rms.toFixed(6)), + nativeSampleRate: nativeRate, + }, + }); } }; - source.connect(processor); - // Connect to a silent gain node so the ScriptProcessor fires - // its onaudioprocess callback without routing captured audio - // to the speakers (which would conflict with the TTS AudioContext). - const silentGain = ctx.createGain(); - silentGain.gain.value = 0; - processor.connect(silentGain); - silentGain.connect(ctx.destination); + let workletNode: AudioWorkletNode | null = null; + let scriptProcessor: ScriptProcessorNode | null = null; - // Resume the context explicitly — in authMode Chromium does - // not set --autoplay-policy, so new AudioContexts start suspended. 
- ctx.resume().catch(() => {}); + const useWorklet = async () => { + try { + const blob = new Blob([workletCode], { type: 'application/javascript' }); + const blobUrl = URL.createObjectURL(blob); + await ctx.audioWorklet.addModule(blobUrl); + URL.revokeObjectURL(blobUrl); + + workletNode = new AudioWorkletNode(ctx, 'audio-capture-processor', { + processorOptions: { nativeRate, targetRate }, + }); + + workletNode.port.onmessage = (ev: MessageEvent) => { + if (ev.data?.type !== 'chunk' || !ev.data.data) return; + const pcm16 = new Int16Array(ev.data.data); + const bytes = new Uint8Array(pcm16.buffer); + let binary = ''; + for (let i = 0; i < bytes.length; i++) { + binary += String.fromCharCode(bytes[i]); + } + pushChunk(btoa(binary), ev.data.rms || 0); + }; + + source.connect(workletNode); + workletNode.connect(silentGain); + silentGain.connect(ctx.destination); + + const processorsObj = (window as any).__audioCaptureProcessors as Record; + processorsObj[trackId] = workletNode; + console.log(`[AudioCapture] WebRTC audio track intercepted (AudioWorklet): track=${trackId}, native=${nativeRate}Hz -> 16kHz mono`); + return true; + } catch (err) { + console.warn(`[AudioCapture] AudioWorklet not available, falling back to ScriptProcessor: ${err}`); + return false; + } + }; + + const useScriptProcessor = () => { + const minRmsThreshold = 0.0003; + const maxSamplesPerChunk = nativeRate * 8; + const preRollSamples = Math.ceil(nativeRate * 0.5); + const minFlushSamples = Math.ceil(nativeRate * 0.5); + const silenceFlushCallbacks = 6; + const ratio = nativeRate / targetRate; + + scriptProcessor = ctx.createScriptProcessor(8192, 1, 1); + let chunkBuffer: Float32Array[] = []; + let samplesCollected = 0; + let hasVoicedContent = false; + let consecutiveSilentCallbacks = 0; + + scriptProcessor.onaudioprocess = (e: AudioProcessingEvent) => { + const input = e.inputBuffer.getChannelData(0); + let cbPower = 0; + for (let i = 0; i < input.length; i++) { + cbPower += input[i] * 
input[i]; + } + const cbRms = Math.sqrt(cbPower / Math.max(input.length, 1)); + + if (cbRms >= minRmsThreshold) { + hasVoicedContent = true; + consecutiveSilentCallbacks = 0; + } else { + consecutiveSilentCallbacks++; + } + + chunkBuffer.push(new Float32Array(input)); + samplesCollected += input.length; + + const shouldFlush = ( + samplesCollected >= maxSamplesPerChunk + || (hasVoicedContent + && consecutiveSilentCallbacks >= silenceFlushCallbacks + && samplesCollected > minFlushSamples) + ); + + if (shouldFlush) { + const merged = new Float32Array(samplesCollected); + let offset = 0; + for (const buf of chunkBuffer) { + merged.set(buf, offset); + offset += buf.length; + } + + let powerSum = 0; + for (let i = 0; i < merged.length; i++) { + powerSum += merged[i] * merged[i]; + } + const rms = Math.sqrt(powerSum / Math.max(merged.length, 1)); + + hasVoicedContent = false; + consecutiveSilentCallbacks = 0; + + if (rms >= minRmsThreshold) { + const outLen = Math.floor(merged.length / ratio); + const pcm16 = new Int16Array(outLen); + for (let i = 0; i < outLen; i++) { + const srcIdx = Math.floor(i * ratio); + const s = Math.max(-1, Math.min(1, merged[srcIdx])); + pcm16[i] = Math.round(s * 32767); + } + const bytes = new Uint8Array(pcm16.buffer); + let binary = ''; + for (let i = 0; i < bytes.length; i++) { + binary += String.fromCharCode(bytes[i]); + } + pushChunk(btoa(binary), rms); + } else { + const keep = Math.min(preRollSamples, merged.length); + const preRoll = merged.slice(merged.length - keep); + chunkBuffer = [preRoll]; + samplesCollected = keep; + return; + } + chunkBuffer = []; + samplesCollected = 0; + } + }; + + source.connect(scriptProcessor); + scriptProcessor.connect(silentGain); + silentGain.connect(ctx.destination); + + const processorsObj = (window as any).__audioCaptureProcessors as Record; + processorsObj[trackId] = scriptProcessor; + console.log(`[AudioCapture] WebRTC audio track intercepted (ScriptProcessor fallback): track=${trackId}, 
native=${nativeRate}Hz -> 16kHz mono`); + }; + + (async () => { + const ok = await useWorklet(); + if (!ok) useScriptProcessor(); + + ctx.resume().catch(() => {}); + })(); // Clean up when the track ends (peer leaves, renegotiation, etc.) event.track.addEventListener('ended', () => { try { - processor.disconnect(); + if (workletNode) { + workletNode.disconnect(); + } + if (scriptProcessor) { + scriptProcessor.disconnect(); + } source.disconnect(); silentGain.disconnect(); ctx.close(); @@ -279,12 +389,8 @@ export class AudioCaptureProcedure { console.log(`[AudioCapture] Audio track ended: track=${trackId}, resources cleaned up`); }); - const processorsObj = (window as any).__audioCaptureProcessors as Record; const contextsObj = (window as any).__audioCaptureContexts as Record; - processorsObj[trackId] = processor; contextsObj[trackId] = ctx; - - console.log(`[AudioCapture] WebRTC audio track intercepted: track=${trackId}, native=${nativeRate}Hz -> 16kHz mono`); } catch (err) { console.error('[AudioCapture] Failed to set up audio capture:', err); } @@ -296,7 +402,7 @@ export class AudioCaptureProcedure { // Copy static properties window.RTCPeerConnection.prototype = OrigRTC.prototype; Object.setPrototypeOf(window.RTCPeerConnection, OrigRTC); - }); + }, AUDIO_CAPTURE_WORKLET_CODE); this._injected = true; this._logger.info('[AudioCapture] RTCPeerConnection wrapper injected');