feat: migrate audio capture from ScriptProcessorNode to AudioWorkletNode with fallback

Made-with: Cursor
2026-02-28 15:53:31 +01:00 · 2026-02-28 15:53:31 +01:00 · ee2dcd61f1
commit ee2dcd61f1
parent 25f684eb58
1 changed files with 255 additions and 149 deletions
--- a/src/bot/audioCaptureProcedure.ts
+++ b/src/bot/audioCaptureProcedure.ts
@ -14,13 +14,108 @@ interface CapturedAudioChunk {
  captureDiagnostics?: AudioChunkDiagnostics;
 }

+/**
+ * Source text of the AudioWorklet processor; it is loaded into the page from a
+ * Blob URL and registered as 'audio-capture-processor'.
+ *
+ * The processor buffers the first channel of its first input at the native rate.
+ * A chunk is flushed when 8 s of audio have been collected, or when voiced audio
+ * is followed by roughly 1 s of silence (and more than 0.5 s is buffered).
+ * Non-silent chunks are decimated to the target rate, converted to 16-bit PCM and
+ * posted to the main thread as { type: 'chunk', data, rms, nativeSampleRate }.
+ * Silent chunks are dropped, keeping only the last 0.5 s as pre-roll.
+ */
+const AUDIO_CAPTURE_WORKLET_CODE = `
+class AudioCaptureProcessor extends AudioWorkletProcessor {
+  constructor(options) {
+    super();
+    const opts = (options && options.processorOptions) || {};
+    this.nativeRate = opts.nativeRate || 48000;
+    this.targetRate = opts.targetRate || 16000;
+    this.maxSamplesPerChunk = this.nativeRate * 8;
+    this.minRmsThreshold = 0.0003;
+    this.preRollSamples = Math.ceil(this.nativeRate * 0.5);
+    this.minFlushSamples = Math.ceil(this.nativeRate * 0.5);
+    // process() is invoked once per render quantum (128 frames), not once per
+    // 8192-frame buffer like the ScriptProcessor fallback. Counting callbacks
+    // would flush after only a few milliseconds of silence, so trailing silence
+    // is measured in samples. 6 * 8192 equals the fallback's six silent callbacks.
+    this.silenceFlushSamples = 6 * 8192;
+    this.ratio = this.nativeRate / this.targetRate;
+    this.chunkBuffer = [];
+    this.samplesCollected = 0;
+    this.hasVoicedContent = false;
+    this.consecutiveSilentSamples = 0;
+  }
+
+  process(inputs, outputs, parameters) {
+    const input = inputs[0]?.[0];
+    // Returning true keeps the processor alive while no input is connected.
+    if (!input || input.length === 0) return true;
+
+    // Voice activity detection on the current render quantum.
+    let cbPower = 0;
+    for (let i = 0; i < input.length; i++) {
+      cbPower += input[i] * input[i];
+    }
+    const cbRms = Math.sqrt(cbPower / Math.max(input.length, 1));
+
+    if (cbRms >= this.minRmsThreshold) {
+      this.hasVoicedContent = true;
+      this.consecutiveSilentSamples = 0;
+    } else {
+      this.consecutiveSilentSamples += input.length;
+    }
+
+    // The input array is reused by the engine, so store a copy.
+    this.chunkBuffer.push(new Float32Array(input));
+    this.samplesCollected += input.length;
+
+    const shouldFlush = (
+      this.samplesCollected >= this.maxSamplesPerChunk
+      || (this.hasVoicedContent
+          && this.consecutiveSilentSamples >= this.silenceFlushSamples
+          && this.samplesCollected > this.minFlushSamples)
+    );
+
+    if (shouldFlush) {
+      const merged = new Float32Array(this.samplesCollected);
+      let offset = 0;
+      for (const buf of this.chunkBuffer) {
+        merged.set(buf, offset);
+        offset += buf.length;
+      }
+
+      let powerSum = 0;
+      for (let i = 0; i < merged.length; i++) {
+        powerSum += merged[i] * merged[i];
+      }
+      const rms = Math.sqrt(powerSum / Math.max(merged.length, 1));
+
+      this.hasVoicedContent = false;
+      this.consecutiveSilentSamples = 0;
+
+      if (rms >= this.minRmsThreshold) {
+        // Decimate to the target rate and convert to 16-bit PCM.
+        const outLen = Math.floor(merged.length / this.ratio);
+        const pcm16 = new Int16Array(outLen);
+        for (let i = 0; i < outLen; i++) {
+          const srcIdx = Math.floor(i * this.ratio);
+          const s = Math.max(-1, Math.min(1, merged[srcIdx]));
+          pcm16[i] = Math.round(s * 32767);
+        }
+        // The buffer is transferred, not copied, to the main thread.
+        this.port.postMessage({
+          type: 'chunk',
+          data: pcm16.buffer,
+          rms,
+          nativeSampleRate: this.nativeRate
+        }, [pcm16.buffer]);
+      } else {
+        // Silent chunk: drop it but keep the tail as pre-roll for the next chunk.
+        const keep = Math.min(this.preRollSamples, merged.length);
+        const preRoll = merged.slice(merged.length - keep);
+        this.chunkBuffer = [preRoll];
+        this.samplesCollected = keep;
+        return true;
+      }
+
+      this.chunkBuffer = [];
+      this.samplesCollected = 0;
+    }
+    return true;
+  }
+}
+
+registerProcessor('audio-capture-processor', AudioCaptureProcessor);
+`;
+
 /**
 * Captures incoming meeting audio by intercepting WebRTC RTCPeerConnection.
 *
 * How it works:
 * 1. Before page navigation, wraps window.RTCPeerConnection via addInitScript
 * 2. When Teams establishes WebRTC connections, the wrapper intercepts incoming audio tracks
- * 3. Incoming audio tracks are captured via AudioContext + ScriptProcessorNode
+ * 3. Incoming audio tracks are captured via AudioContext + AudioWorkletNode (or ScriptProcessorNode fallback)
 * 4. Audio is captured at native 48kHz, downsampled to 16kHz, and converted to PCM16
 * 5. Audio chunks are buffered and the Node.js side polls for them to send to the Gateway
 */
@ -59,7 +154,7 @@ export class AudioCaptureProcedure {

    this._logger.info('[AudioCapture] Injecting RTCPeerConnection wrapper...');

-    await this._page.addInitScript(() => {
+    await this._page.addInitScript((workletCode: string) => {
      (window as any).__audioCaptureChunks = [] as any[];
      (window as any).__audioCaptureProcessors = {} as Record<string, any>;
      (window as any).__audioCaptureContexts = {} as Record<string, AudioContext>;
@ -107,6 +202,7 @@ export class AudioCaptureProcedure {
            const nativeRate = ctx.sampleRate;
            const stream = new MediaStream([event.track]);
            const source = ctx.createMediaStreamSource(stream);
+            const targetRate = 16000;

            // #region agent log
            console.log(
@ -117,157 +213,171 @@ export class AudioCaptureProcedure {
            });
            // #endregion

-            const processor = ctx.createScriptProcessor(8192, 1, 1);
-            let chunkBuffer: Float32Array[] = [];
-            let samplesCollected = 0;
-            let skippedSilentChunks = 0;
-            let callbackCount = 0;
-            let totalNonZeroSamples = 0;
-            const minRmsThreshold = 0.0003;
-            const maxSamplesPerChunk = nativeRate * 8;
-            const targetRate = 16000;
-            const preRollSamples = Math.ceil(nativeRate * 0.5);
-            const minFlushSamples = Math.ceil(nativeRate * 0.5);
-            // Adaptive flush: after ~1s silence following voiced content
-            const silenceFlushCallbacks = 6;
-            let hasVoicedContent = false;
-            let consecutiveSilentCallbacks = 0;
+            const silentGain = ctx.createGain();
+            silentGain.gain.value = 0;

-            processor.onaudioprocess = (e: AudioProcessingEvent) => {
-              const input = e.inputBuffer.getChannelData(0);
-              callbackCount++;
-
-              // #region agent log
-              let nonZeroThisCallback = 0;
-              for (let i = 0; i < input.length; i++) {
-                if (input[i] !== 0) nonZeroThisCallback++;
-              }
-              totalNonZeroSamples += nonZeroThisCallback;
-
-              if (callbackCount <= 3 || callbackCount % 50 === 0) {
-                let maxAbs = 0;
-                for (let i = 0; i < input.length; i++) {
-                  const abs = Math.abs(input[i]);
-                  if (abs > maxAbs) maxAbs = abs;
-                }
-                console.log(
-                  `[AudioCapture][DIAG] onaudioprocess #${callbackCount}: bufLen=${input.length}, nonZero=${nonZeroThisCallback}/${input.length}, maxAbs=${maxAbs.toFixed(8)}, track.enabled=${event.track.enabled}, track.muted=${event.track.muted}, track.readyState=${event.track.readyState}, ctx.state=${ctx.state}, totalNonZero=${totalNonZeroSamples}`
-                );
-              }
-              // #endregion
-
-              // Per-callback voice activity detection
-              let cbPower = 0;
-              for (let i = 0; i < input.length; i++) {
-                cbPower += input[i] * input[i];
-              }
-              const cbRms = Math.sqrt(cbPower / Math.max(input.length, 1));
-
-              if (cbRms >= minRmsThreshold) {
-                hasVoicedContent = true;
-                consecutiveSilentCallbacks = 0;
-              } else {
-                consecutiveSilentCallbacks++;
-              }
-
-              chunkBuffer.push(new Float32Array(input));
-              samplesCollected += input.length;
-
-              // Flush: max duration reached OR voiced content followed by ~1s silence
-              const shouldFlush = (
-                samplesCollected >= maxSamplesPerChunk
-                || (hasVoicedContent
-                    && consecutiveSilentCallbacks >= silenceFlushCallbacks
-                    && samplesCollected > minFlushSamples)
-              );
-
-              if (shouldFlush) {
-                const merged = new Float32Array(samplesCollected);
-                let offset = 0;
-                for (const buf of chunkBuffer) {
-                  merged.set(buf, offset);
-                  offset += buf.length;
-                }
-
-                let powerSum = 0;
-                for (let i = 0; i < merged.length; i++) {
-                  powerSum += merged[i] * merged[i];
-                }
-                const rms = Math.sqrt(powerSum / Math.max(merged.length, 1));
-
-                hasVoicedContent = false;
-                consecutiveSilentCallbacks = 0;
-
-                if (rms < minRmsThreshold) {
-                  skippedSilentChunks++;
-                  if (skippedSilentChunks % 10 === 0) {
-                    console.log(
-                      `[AudioCapture] silent chunk skipped: track=${trackId}, readyState=${event.track.readyState}, muted=${event.track.muted}, enabled=${event.track.enabled}, rms=${rms.toFixed(6)}, callbacks=${callbackCount}, totalNonZero=${totalNonZeroSamples}`
-                    );
-                  }
-                  const keep = Math.min(preRollSamples, merged.length);
-                  const preRoll = merged.slice(merged.length - keep);
-                  chunkBuffer = [preRoll];
-                  samplesCollected = keep;
-                  return;
-                }
-
-                // Downsample from nativeRate to 16 kHz
-                const ratio = nativeRate / targetRate;
-                const outLen = Math.floor(merged.length / ratio);
-                const pcm16 = new Int16Array(outLen);
-                for (let i = 0; i < outLen; i++) {
-                  const srcIdx = Math.floor(i * ratio);
-                  const s = Math.max(-1, Math.min(1, merged[srcIdx]));
-                  pcm16[i] = Math.round(s * 32767);
-                }
-
-                // Convert to base64
-                const bytes = new Uint8Array(pcm16.buffer);
-                let binary = '';
-                for (let i = 0; i < bytes.length; i++) {
-                  binary += String.fromCharCode(bytes[i]);
-                }
-                const base64 = btoa(binary);
-
-                const chunks = (window as any).__audioCaptureChunks as any[];
-                if (chunks.length < 60) {
-                  chunks.push({
-                    data: base64,
-                    sampleRate: targetRate,
-                    captureDiagnostics: {
-                      trackId,
-                      readyState: event.track.readyState,
-                      rms: Number(rms.toFixed(6)),
-                      nativeSampleRate: nativeRate,
-                    },
-                  });
-                }
-
-                skippedSilentChunks = 0;
-
-                chunkBuffer = [];
-                samplesCollected = 0;
+            // Queue one base64-encoded PCM16 chunk on window.__audioCaptureChunks.
+            // Chunks are dropped once 60 are already pending.
+            const pushChunk = (base64Data: string, rms: number) => {
+              const pending = (window as any).__audioCaptureChunks as any[];
+              if (pending.length < 60) {
+                const captureDiagnostics = {
+                  trackId,
+                  readyState: event.track.readyState,
+                  rms: Number(rms.toFixed(6)),
+                  nativeSampleRate: nativeRate,
+                };
+                pending.push({ data: base64Data, sampleRate: targetRate, captureDiagnostics });
               }
             };

-            source.connect(processor);
-            // Connect to a silent gain node so the ScriptProcessor fires
-            // its onaudioprocess callback without routing captured audio
-            // to the speakers (which would conflict with the TTS AudioContext).
-            const silentGain = ctx.createGain();
-            silentGain.gain.value = 0;
-            processor.connect(silentGain);
-            silentGain.connect(ctx.destination);
+            let workletNode: AudioWorkletNode | null = null;
+            let scriptProcessor: ScriptProcessorNode | null = null;

-            // Resume the context explicitly — in authMode Chromium does
-            // not set --autoplay-policy, so new AudioContexts start suspended.
-            ctx.resume().catch(() => {});
+            // Try to capture through an AudioWorkletNode.
+            // Resolves true when no fallback is needed: either the worklet path is
+            // wired up, or the track ended while the module was loading and there is
+            // nothing left to capture. Resolves false when the caller should fall
+            // back to ScriptProcessorNode. Never rejects.
+            const useWorklet = async () => {
+              try {
+                const blob = new Blob([workletCode], { type: 'application/javascript' });
+                const blobUrl = URL.createObjectURL(blob);
+                try {
+                  await ctx.audioWorklet.addModule(blobUrl);
+                } finally {
+                  // Release the blob URL even when addModule rejects.
+                  URL.revokeObjectURL(blobUrl);
+                }
+
+                // The 'ended' handler may have run while the module was loading; it
+                // disconnects the graph and closes the context, so do not wire up
+                // (or register) a processor for a track that is already gone.
+                if (event.track.readyState === 'ended' || ctx.state === 'closed') {
+                  console.log(`[AudioCapture] Track ended before AudioWorklet was ready, skipping capture: track=${trackId}`);
+                  return true;
+                }
+
+                workletNode = new AudioWorkletNode(ctx, 'audio-capture-processor', {
+                  processorOptions: { nativeRate, targetRate },
+                });
+
+                // The worklet posts PCM16 chunks as transferred ArrayBuffers;
+                // encode them as base64 for the polling side.
+                workletNode.port.onmessage = (ev: MessageEvent) => {
+                  if (ev.data?.type !== 'chunk' || !ev.data.data) return;
+                  const pcm16 = new Int16Array(ev.data.data);
+                  const bytes = new Uint8Array(pcm16.buffer);
+                  let binary = '';
+                  for (let i = 0; i < bytes.length; i++) {
+                    binary += String.fromCharCode(bytes[i]);
+                  }
+                  pushChunk(btoa(binary), ev.data.rms || 0);
+                };
+
+                source.connect(workletNode);
+                workletNode.connect(silentGain);
+                silentGain.connect(ctx.destination);
+
+                const processorsObj = (window as any).__audioCaptureProcessors as Record<string, any>;
+                processorsObj[trackId] = workletNode;
+                console.log(`[AudioCapture] WebRTC audio track intercepted (AudioWorklet): track=${trackId}, native=${nativeRate}Hz -> 16kHz mono`);
+                return true;
+              } catch (err) {
+                // Detach a partially wired node so the fallback does not capture
+                // the same track twice.
+                if (workletNode) {
+                  try {
+                    workletNode.port.onmessage = null;
+                    workletNode.disconnect();
+                  } catch {
+                    // Best effort: the node may never have been connected.
+                  }
+                  workletNode = null;
+                }
+                console.warn(`[AudioCapture] AudioWorklet not available, falling back to ScriptProcessor: ${err}`);
+                return false;
+              }
+            };
+
+            // ScriptProcessorNode fallback: buffers 8192-frame callbacks, flushes on
+            // 8 s of audio or on voiced audio followed by six silent callbacks, and
+            // pushes non-silent chunks as base64 PCM16 at the target rate.
+            const useScriptProcessor = () => {
+              const ratio = nativeRate / targetRate;
+              const minRmsThreshold = 0.0003;
+              const silenceFlushCallbacks = 6;
+              const maxSamplesPerChunk = nativeRate * 8;
+              const minFlushSamples = Math.ceil(nativeRate * 0.5);
+              const preRollSamples = Math.ceil(nativeRate * 0.5);
+
+              const node = ctx.createScriptProcessor(8192, 1, 1);
+              scriptProcessor = node;
+
+              let pendingBuffers: Float32Array[] = [];
+              let pendingSamples = 0;
+              let sawVoice = false;
+              let silentRun = 0;
+
+              // Root-mean-square level of a sample block (0 for an empty block).
+              const levelOf = (samples: Float32Array): number => {
+                let energy = 0;
+                for (let k = 0; k < samples.length; k++) {
+                  energy += samples[k] * samples[k];
+                }
+                return Math.sqrt(energy / Math.max(samples.length, 1));
+              };
+
+              node.onaudioprocess = (e: AudioProcessingEvent) => {
+                const frame = e.inputBuffer.getChannelData(0);
+
+                // Per-callback voice activity detection.
+                if (levelOf(frame) >= minRmsThreshold) {
+                  sawVoice = true;
+                  silentRun = 0;
+                } else {
+                  silentRun++;
+                }
+
+                pendingBuffers.push(new Float32Array(frame));
+                pendingSamples += frame.length;
+
+                // Flush when the chunk is full, or when speech was followed by
+                // enough silent callbacks and more than the minimum is buffered.
+                const chunkIsFull = pendingSamples >= maxSamplesPerChunk;
+                const speechEnded = sawVoice
+                  && silentRun >= silenceFlushCallbacks
+                  && pendingSamples > minFlushSamples;
+                if (!chunkIsFull && !speechEnded) return;
+
+                const merged = new Float32Array(pendingSamples);
+                let writeAt = 0;
+                pendingBuffers.forEach((part) => {
+                  merged.set(part, writeAt);
+                  writeAt += part.length;
+                });
+                const rms = levelOf(merged);
+
+                sawVoice = false;
+                silentRun = 0;
+
+                if (!(rms >= minRmsThreshold)) {
+                  // Silent chunk: drop it but keep the tail as pre-roll.
+                  const keep = Math.min(preRollSamples, merged.length);
+                  pendingBuffers = [merged.slice(merged.length - keep)];
+                  pendingSamples = keep;
+                  return;
+                }
+
+                // Decimate to the target rate and convert to 16-bit PCM.
+                const outLen = Math.floor(merged.length / ratio);
+                const pcm16 = new Int16Array(outLen);
+                for (let out = 0; out < outLen; out++) {
+                  const sample = merged[Math.floor(out * ratio)];
+                  pcm16[out] = Math.round(Math.max(-1, Math.min(1, sample)) * 32767);
+                }
+
+                const bytes = new Uint8Array(pcm16.buffer);
+                let binary = '';
+                for (let b = 0; b < bytes.length; b++) {
+                  binary += String.fromCharCode(bytes[b]);
+                }
+                pushChunk(btoa(binary), rms);
+
+                pendingBuffers = [];
+                pendingSamples = 0;
+              };
+
+              source.connect(node);
+              node.connect(silentGain);
+              silentGain.connect(ctx.destination);
+
+              ((window as any).__audioCaptureProcessors as Record<string, any>)[trackId] = node;
+              console.log(`[AudioCapture] WebRTC audio track intercepted (ScriptProcessor fallback): track=${trackId}, native=${nativeRate}Hz -> 16kHz mono`);
+            };
+
+            // Set up capture asynchronously: prefer the AudioWorklet path and fall
+            // back to ScriptProcessorNode when it is unavailable.
+            (async () => {
+              const ok = await useWorklet();
+              if (!ok) useScriptProcessor();
+
+              // Resume the context explicitly — in authMode Chromium does
+              // not set --autoplay-policy, so new AudioContexts start suspended.
+              ctx.resume().catch(() => {});
+            })().catch((err) => {
+              // The enclosing try/catch has already exited by the time this async
+              // work runs, so a throwing fallback must be reported here instead of
+              // surfacing as an unhandled promise rejection.
+              console.error('[AudioCapture] Failed to set up audio capture:', err);
+            });

            // Clean up when the track ends (peer leaves, renegotiation, etc.)
            event.track.addEventListener('ended', () => {
              try {
-                processor.disconnect();
+                if (workletNode) {
+                  workletNode.disconnect();
+                }
+                if (scriptProcessor) {
+                  scriptProcessor.disconnect();
+                }
                source.disconnect();
                silentGain.disconnect();
                ctx.close();
@ -279,12 +389,8 @@ export class AudioCaptureProcedure {
              console.log(`[AudioCapture] Audio track ended: track=${trackId}, resources cleaned up`);
            });

-            const processorsObj = (window as any).__audioCaptureProcessors as Record<string, any>;
            const contextsObj = (window as any).__audioCaptureContexts as Record<string, AudioContext>;
-            processorsObj[trackId] = processor;
            contextsObj[trackId] = ctx;
-
-            console.log(`[AudioCapture] WebRTC audio track intercepted: track=${trackId}, native=${nativeRate}Hz -> 16kHz mono`);
          } catch (err) {
            console.error('[AudioCapture] Failed to set up audio capture:', err);
          }
@ -296,7 +402,7 @@ export class AudioCaptureProcedure {
      // Copy static properties
      window.RTCPeerConnection.prototype = OrigRTC.prototype;
      Object.setPrototypeOf(window.RTCPeerConnection, OrigRTC);
-    });
+    }, AUDIO_CAPTURE_WORKLET_CODE);

    this._injected = true;
    this._logger.info('[AudioCapture] RTCPeerConnection wrapper injected');