From ee2dcd61f184c32562262a2a37c85704f5b1c2f4 Mon Sep 17 00:00:00 2001
From: ValueOn AG
Date: Sat, 28 Feb 2026 15:53:31 +0100
Subject: [PATCH] feat: migrate audio capture from ScriptProcessorNode to
AudioWorkletNode with fallback
Made-with: Cursor
---
src/bot/audioCaptureProcedure.ts | 404 +++++++++++++++++++------------
1 file changed, 255 insertions(+), 149 deletions(-)
diff --git a/src/bot/audioCaptureProcedure.ts b/src/bot/audioCaptureProcedure.ts
index 8866c39..293978d 100644
--- a/src/bot/audioCaptureProcedure.ts
+++ b/src/bot/audioCaptureProcedure.ts
@@ -14,13 +14,108 @@ interface CapturedAudioChunk {
captureDiagnostics?: AudioChunkDiagnostics;
}
+const AUDIO_CAPTURE_WORKLET_CODE = `
+// Buffers mono input and posts 16-bit PCM chunks (decimated to targetRate) to the main thread:
+// a chunk is flushed after voiced audio followed by ~1 s of silence, or once 8 s have accumulated.
+class AudioCaptureProcessor extends AudioWorkletProcessor {
+  constructor(options) {
+    super();
+    const opts = options.processorOptions || {};
+    this.nativeRate = opts.nativeRate || 48000;
+    this.targetRate = opts.targetRate || 16000;
+    this.maxSamplesPerChunk = this.nativeRate * 8;
+    this.minRmsThreshold = 0.0003;
+    this.preRollSamples = Math.ceil(this.nativeRate * 0.5);
+    this.minFlushSamples = Math.ceil(this.nativeRate * 0.5);
+    // Silence is tracked in samples, not callbacks: process() is invoked per render quantum
+    // (128 frames by default), so a 6-callback limit would be ~16 ms rather than the
+    // 6 x 8192 frames (~1 s at 48 kHz) that the ScriptProcessor fallback waits.
+    this.silenceFlushSamples = 6 * 8192;
+    this.ratio = this.nativeRate / this.targetRate;
+    this.chunkBuffer = [];
+    this.samplesCollected = 0;
+    this.hasVoicedContent = false;
+    this.consecutiveSilentSamples = 0;
+  }
+
+  process(inputs, outputs, parameters) {
+    const input = inputs[0]?.[0];
+    if (!input || input.length === 0) return true;
+    let cbPower = 0;
+    for (let i = 0; i < input.length; i++) {
+      cbPower += input[i] * input[i];
+    }
+    const cbRms = Math.sqrt(cbPower / Math.max(input.length, 1));
+    if (cbRms >= this.minRmsThreshold) {
+      this.hasVoicedContent = true;
+      this.consecutiveSilentSamples = 0;
+    } else {
+      this.consecutiveSilentSamples += input.length;
+    }
+    this.chunkBuffer.push(new Float32Array(input));
+    this.samplesCollected += input.length;
+
+    // Flush at the maximum chunk length, or once voiced audio is followed by enough silence.
+    const shouldFlush = (
+      this.samplesCollected >= this.maxSamplesPerChunk
+      || (this.hasVoicedContent
+        && this.consecutiveSilentSamples >= this.silenceFlushSamples
+        && this.samplesCollected > this.minFlushSamples)
+    );
+    if (!shouldFlush) return true;
+
+    const merged = new Float32Array(this.samplesCollected);
+    let offset = 0;
+    for (const buf of this.chunkBuffer) {
+      merged.set(buf, offset);
+      offset += buf.length;
+    }
+    let powerSum = 0;
+    for (let i = 0; i < merged.length; i++) {
+      powerSum += merged[i] * merged[i];
+    }
+    const rms = Math.sqrt(powerSum / Math.max(merged.length, 1));
+    this.hasVoicedContent = false;
+    this.consecutiveSilentSamples = 0;
+    if (rms < this.minRmsThreshold) {
+      // Overall-silent chunk: drop it, keeping only the trailing pre-roll (0.5 s) for the next one.
+      const keep = Math.min(this.preRollSamples, merged.length);
+      this.chunkBuffer = [merged.slice(merged.length - keep)];
+      this.samplesCollected = keep;
+      return true;
+    }
+
+    // Nearest-sample decimation to targetRate, clamped to [-1, 1] and scaled to signed 16-bit.
+    const outLen = Math.floor(merged.length / this.ratio);
+    const pcm16 = new Int16Array(outLen);
+    for (let i = 0; i < outLen; i++) {
+      const srcIdx = Math.floor(i * this.ratio);
+      const s = Math.max(-1, Math.min(1, merged[srcIdx]));
+      pcm16[i] = Math.round(s * 32767);
+    }
+    // The buffer is listed as a transferable so it is moved rather than copied.
+    this.port.postMessage({
+      type: 'chunk',
+      data: pcm16.buffer,
+      rms,
+      nativeSampleRate: this.nativeRate
+    }, [pcm16.buffer]);
+    this.chunkBuffer = [];
+    this.samplesCollected = 0;
+    return true;
+  }
+}
+
+registerProcessor('audio-capture-processor', AudioCaptureProcessor);
+`;
+
/**
* Captures incoming meeting audio by intercepting WebRTC RTCPeerConnection.
*
* How it works:
* 1. Before page navigation, wraps window.RTCPeerConnection via addInitScript
* 2. When Teams establishes WebRTC connections, the wrapper intercepts incoming audio tracks
- * 3. Incoming audio tracks are captured via AudioContext + ScriptProcessorNode
+ * 3. Incoming audio tracks are captured via AudioContext + AudioWorkletNode (or ScriptProcessorNode fallback)
* 4. Audio is captured at native 48kHz, downsampled to 16kHz, and converted to PCM16
* 5. Audio chunks are buffered and the Node.js side polls for them to send to the Gateway
*/
@@ -59,7 +154,7 @@ export class AudioCaptureProcedure {
this._logger.info('[AudioCapture] Injecting RTCPeerConnection wrapper...');
- await this._page.addInitScript(() => {
+ await this._page.addInitScript((workletCode: string) => {
(window as any).__audioCaptureChunks = [] as any[];
(window as any).__audioCaptureProcessors = {} as Record;
(window as any).__audioCaptureContexts = {} as Record;
@@ -107,6 +202,7 @@ export class AudioCaptureProcedure {
const nativeRate = ctx.sampleRate;
const stream = new MediaStream([event.track]);
const source = ctx.createMediaStreamSource(stream);
+ const targetRate = 16000;
// #region agent log
console.log(
@@ -117,157 +213,171 @@ export class AudioCaptureProcedure {
});
// #endregion
- const processor = ctx.createScriptProcessor(8192, 1, 1);
- let chunkBuffer: Float32Array[] = [];
- let samplesCollected = 0;
- let skippedSilentChunks = 0;
- let callbackCount = 0;
- let totalNonZeroSamples = 0;
- const minRmsThreshold = 0.0003;
- const maxSamplesPerChunk = nativeRate * 8;
- const targetRate = 16000;
- const preRollSamples = Math.ceil(nativeRate * 0.5);
- const minFlushSamples = Math.ceil(nativeRate * 0.5);
- // Adaptive flush: after ~1s silence following voiced content
- const silenceFlushCallbacks = 6;
- let hasVoicedContent = false;
- let consecutiveSilentCallbacks = 0;
+ const silentGain = ctx.createGain();
+ silentGain.gain.value = 0;
- processor.onaudioprocess = (e: AudioProcessingEvent) => {
- const input = e.inputBuffer.getChannelData(0);
- callbackCount++;
-
- // #region agent log
- let nonZeroThisCallback = 0;
- for (let i = 0; i < input.length; i++) {
- if (input[i] !== 0) nonZeroThisCallback++;
- }
- totalNonZeroSamples += nonZeroThisCallback;
-
- if (callbackCount <= 3 || callbackCount % 50 === 0) {
- let maxAbs = 0;
- for (let i = 0; i < input.length; i++) {
- const abs = Math.abs(input[i]);
- if (abs > maxAbs) maxAbs = abs;
- }
- console.log(
- `[AudioCapture][DIAG] onaudioprocess #${callbackCount}: bufLen=${input.length}, nonZero=${nonZeroThisCallback}/${input.length}, maxAbs=${maxAbs.toFixed(8)}, track.enabled=${event.track.enabled}, track.muted=${event.track.muted}, track.readyState=${event.track.readyState}, ctx.state=${ctx.state}, totalNonZero=${totalNonZeroSamples}`
- );
- }
- // #endregion
-
- // Per-callback voice activity detection
- let cbPower = 0;
- for (let i = 0; i < input.length; i++) {
- cbPower += input[i] * input[i];
- }
- const cbRms = Math.sqrt(cbPower / Math.max(input.length, 1));
-
- if (cbRms >= minRmsThreshold) {
- hasVoicedContent = true;
- consecutiveSilentCallbacks = 0;
- } else {
- consecutiveSilentCallbacks++;
- }
-
- chunkBuffer.push(new Float32Array(input));
- samplesCollected += input.length;
-
- // Flush: max duration reached OR voiced content followed by ~1s silence
- const shouldFlush = (
- samplesCollected >= maxSamplesPerChunk
- || (hasVoicedContent
- && consecutiveSilentCallbacks >= silenceFlushCallbacks
- && samplesCollected > minFlushSamples)
- );
-
- if (shouldFlush) {
- const merged = new Float32Array(samplesCollected);
- let offset = 0;
- for (const buf of chunkBuffer) {
- merged.set(buf, offset);
- offset += buf.length;
- }
-
- let powerSum = 0;
- for (let i = 0; i < merged.length; i++) {
- powerSum += merged[i] * merged[i];
- }
- const rms = Math.sqrt(powerSum / Math.max(merged.length, 1));
-
- hasVoicedContent = false;
- consecutiveSilentCallbacks = 0;
-
- if (rms < minRmsThreshold) {
- skippedSilentChunks++;
- if (skippedSilentChunks % 10 === 0) {
- console.log(
- `[AudioCapture] silent chunk skipped: track=${trackId}, readyState=${event.track.readyState}, muted=${event.track.muted}, enabled=${event.track.enabled}, rms=${rms.toFixed(6)}, callbacks=${callbackCount}, totalNonZero=${totalNonZeroSamples}`
- );
- }
- const keep = Math.min(preRollSamples, merged.length);
- const preRoll = merged.slice(merged.length - keep);
- chunkBuffer = [preRoll];
- samplesCollected = keep;
- return;
- }
-
- // Downsample from nativeRate to 16 kHz
- const ratio = nativeRate / targetRate;
- const outLen = Math.floor(merged.length / ratio);
- const pcm16 = new Int16Array(outLen);
- for (let i = 0; i < outLen; i++) {
- const srcIdx = Math.floor(i * ratio);
- const s = Math.max(-1, Math.min(1, merged[srcIdx]));
- pcm16[i] = Math.round(s * 32767);
- }
-
- // Convert to base64
- const bytes = new Uint8Array(pcm16.buffer);
- let binary = '';
- for (let i = 0; i < bytes.length; i++) {
- binary += String.fromCharCode(bytes[i]);
- }
- const base64 = btoa(binary);
-
- const chunks = (window as any).__audioCaptureChunks as any[];
- if (chunks.length < 60) {
- chunks.push({
- data: base64,
- sampleRate: targetRate,
- captureDiagnostics: {
- trackId,
- readyState: event.track.readyState,
- rms: Number(rms.toFixed(6)),
- nativeSampleRate: nativeRate,
- },
- });
- }
-
- skippedSilentChunks = 0;
-
- chunkBuffer = [];
- samplesCollected = 0;
+ // Queue one base64-encoded PCM16 chunk on window.__audioCaptureChunks, tagged with capture
+ // diagnostics for this track. Chunks are dropped while 60 are already pending.
+ const pushChunk = (base64Data: string, rms: number) => {
+   const pending = (window as any).__audioCaptureChunks as any[];
+   if (pending.length >= 60) return;
+
+   const captureDiagnostics = {
+     trackId,
+     readyState: event.track.readyState,
+     rms: Number(rms.toFixed(6)),
+     nativeSampleRate: nativeRate,
+   };
+   const chunk = { data: base64Data, sampleRate: targetRate, captureDiagnostics };
+   pending.push(chunk);
+ };
- source.connect(processor);
- // Connect to a silent gain node so the ScriptProcessor fires
- // its onaudioprocess callback without routing captured audio
- // to the speakers (which would conflict with the TTS AudioContext).
- const silentGain = ctx.createGain();
- silentGain.gain.value = 0;
- processor.connect(silentGain);
- silentGain.connect(ctx.destination);
+ let workletNode: AudioWorkletNode | null = null;
+ let scriptProcessor: ScriptProcessorNode | null = null;
- // Resume the context explicitly — in authMode Chromium does
- // not set --autoplay-policy, so new AudioContexts start suspended.
- ctx.resume().catch(() => {});
+ // Load the worklet module from a blob URL and route the track through an AudioWorkletNode.
+ // Resolves true on success, false when the caller should fall back to ScriptProcessorNode.
+ const useWorklet = async () => {
+   try {
+     const blobUrl = URL.createObjectURL(new Blob([workletCode], { type: 'application/javascript' }));
+     try {
+       await ctx.audioWorklet.addModule(blobUrl);
+     } finally {
+       // Revoke even when addModule rejects; otherwise a failed load leaks the blob URL.
+       URL.revokeObjectURL(blobUrl);
+     }
+     workletNode = new AudioWorkletNode(ctx, 'audio-capture-processor', {
+       processorOptions: { nativeRate, targetRate },
+     });
+     workletNode.port.onmessage = (ev: MessageEvent) => {
+       if (ev.data?.type !== 'chunk' || !ev.data.data) return;
+       // ev.data.data is the PCM16 ArrayBuffer posted by the worklet; base64-encode its bytes.
+       const bytes = new Uint8Array(ev.data.data);
+       let binary = '';
+       for (let i = 0; i < bytes.length; i++) {
+         binary += String.fromCharCode(bytes[i]);
+       }
+       pushChunk(btoa(binary), ev.data.rms || 0);
+     };
+     source.connect(workletNode);
+     workletNode.connect(silentGain);
+     silentGain.connect(ctx.destination);
+     (window as any).__audioCaptureProcessors[trackId] = workletNode;
+     console.log(`[AudioCapture] WebRTC audio track intercepted (AudioWorklet): track=${trackId}, native=${nativeRate}Hz -> 16kHz mono`);
+     return true;
+   } catch (err) {
+     console.warn(`[AudioCapture] AudioWorklet not available, falling back to ScriptProcessor: ${err}`);
+     return false;
+   }
+ };
+
+ const useScriptProcessor = () => {
+ const minRmsThreshold = 0.0003;
+ const maxSamplesPerChunk = nativeRate * 8;
+ const preRollSamples = Math.ceil(nativeRate * 0.5);
+ const minFlushSamples = Math.ceil(nativeRate * 0.5);
+ const silenceFlushCallbacks = 6;
+ const ratio = nativeRate / targetRate;
+ // ScriptProcessorNode fallback: buffers 8192-frame callbacks and emits PCM16 chunks via pushChunk.
+ scriptProcessor = ctx.createScriptProcessor(8192, 1, 1);
+ let chunkBuffer: Float32Array[] = [];
+ let samplesCollected = 0;
+ let hasVoicedContent = false;
+ let consecutiveSilentCallbacks = 0;
+ // Runs once per 8192-frame buffer; six silent buffers in a row (~1 s at 48 kHz) allow a flush.
+ scriptProcessor.onaudioprocess = (e: AudioProcessingEvent) => {
+ const input = e.inputBuffer.getChannelData(0);
+ let cbPower = 0;
+ for (let i = 0; i < input.length; i++) {
+ cbPower += input[i] * input[i];
+ }
+ const cbRms = Math.sqrt(cbPower / Math.max(input.length, 1));
+ // Per-callback voice activity detection: RMS at or above the threshold counts as voiced.
+ if (cbRms >= minRmsThreshold) {
+ hasVoicedContent = true;
+ consecutiveSilentCallbacks = 0;
+ } else {
+ consecutiveSilentCallbacks++;
+ }
+ // Copy this callback's samples into the pending chunk.
+ chunkBuffer.push(new Float32Array(input));
+ samplesCollected += input.length;
+ // Flush: max duration reached OR voiced content followed by enough silent callbacks.
+ const shouldFlush = (
+ samplesCollected >= maxSamplesPerChunk
+ || (hasVoicedContent
+ && consecutiveSilentCallbacks >= silenceFlushCallbacks
+ && samplesCollected > minFlushSamples)
+ );
+
+ if (shouldFlush) {
+ const merged = new Float32Array(samplesCollected);
+ let offset = 0;
+ for (const buf of chunkBuffer) {
+ merged.set(buf, offset);
+ offset += buf.length;
+ }
+ // RMS of the whole merged chunk decides whether it is sent or discarded as silence.
+ let powerSum = 0;
+ for (let i = 0; i < merged.length; i++) {
+ powerSum += merged[i] * merged[i];
+ }
+ const rms = Math.sqrt(powerSum / Math.max(merged.length, 1));
+
+ hasVoicedContent = false;
+ consecutiveSilentCallbacks = 0;
+ // Voiced chunk: decimate to targetRate as PCM16 and base64-encode; silent chunk: keep 0.5 s pre-roll.
+ if (rms >= minRmsThreshold) {
+ const outLen = Math.floor(merged.length / ratio);
+ const pcm16 = new Int16Array(outLen);
+ for (let i = 0; i < outLen; i++) {
+ const srcIdx = Math.floor(i * ratio);
+ const s = Math.max(-1, Math.min(1, merged[srcIdx]));
+ pcm16[i] = Math.round(s * 32767);
+ }
+ const bytes = new Uint8Array(pcm16.buffer);
+ let binary = '';
+ for (let i = 0; i < bytes.length; i++) {
+ binary += String.fromCharCode(bytes[i]);
+ }
+ pushChunk(btoa(binary), rms);
+ } else {
+ const keep = Math.min(preRollSamples, merged.length);
+ const preRoll = merged.slice(merged.length - keep);
+ chunkBuffer = [preRoll];
+ samplesCollected = keep;
+ return;
+ }
+ chunkBuffer = [];
+ samplesCollected = 0;
+ }
+ };
+ // Wire source -> processor -> silentGain -> destination (silentGain's gain is set to 0).
+ source.connect(scriptProcessor);
+ scriptProcessor.connect(silentGain);
+ silentGain.connect(ctx.destination);
+ // Register the node under this track id on window.__audioCaptureProcessors.
+ const processorsObj = (window as any).__audioCaptureProcessors as Record;
+ processorsObj[trackId] = scriptProcessor;
+ console.log(`[AudioCapture] WebRTC audio track intercepted (ScriptProcessor fallback): track=${trackId}, native=${nativeRate}Hz -> 16kHz mono`);
+ };
+
+ // Async setup escapes the surrounding try/catch, so failures are logged here instead of being lost.
+ void (async () => {
+   // The track may end (closing ctx) while the worklet module loads; skip setup in that case.
+   if (!(await useWorklet()) && ctx.state !== 'closed') useScriptProcessor();
+   if (ctx.state !== 'closed') await ctx.resume();
+ })().catch((err: unknown) => console.error('[AudioCapture] Failed to set up audio capture:', err));
// Clean up when the track ends (peer leaves, renegotiation, etc.)
event.track.addEventListener('ended', () => {
try {
- processor.disconnect();
+ if (workletNode) {
+ workletNode.disconnect();
+ }
+ if (scriptProcessor) {
+ scriptProcessor.disconnect();
+ }
source.disconnect();
silentGain.disconnect();
ctx.close();
@@ -279,12 +389,8 @@ export class AudioCaptureProcedure {
console.log(`[AudioCapture] Audio track ended: track=${trackId}, resources cleaned up`);
});
- const processorsObj = (window as any).__audioCaptureProcessors as Record;
const contextsObj = (window as any).__audioCaptureContexts as Record;
- processorsObj[trackId] = processor;
contextsObj[trackId] = ctx;
-
- console.log(`[AudioCapture] WebRTC audio track intercepted: track=${trackId}, native=${nativeRate}Hz -> 16kHz mono`);
} catch (err) {
console.error('[AudioCapture] Failed to set up audio capture:', err);
}
@@ -296,7 +402,7 @@ export class AudioCaptureProcedure {
// Copy static properties
window.RTCPeerConnection.prototype = OrigRTC.prototype;
Object.setPrototypeOf(window.RTCPeerConnection, OrigRTC);
- });
+ }, AUDIO_CAPTURE_WORKLET_CODE);
this._injected = true;
this._logger.info('[AudioCapture] RTCPeerConnection wrapper injected');