feat: migrate audio capture from ScriptProcessorNode to AudioWorkletNode with fallback
Made-with: Cursor
This commit is contained in:
parent
25f684eb58
commit
ee2dcd61f1
1 changed files with 255 additions and 149 deletions
|
|
@ -14,13 +14,108 @@ interface CapturedAudioChunk {
|
|||
captureDiagnostics?: AudioChunkDiagnostics;
|
||||
}
|
||||
|
||||
/**
 * Source text of the AudioWorklet module used for audio capture.
 *
 * The string is loaded into an AudioWorkletGlobalScope via a Blob URL and
 * registers the processor under the name 'audio-capture-processor'.
 *
 * processorOptions:
 *   - nativeRate: sample rate of the incoming audio (defaults to 48000)
 *   - targetRate: sample rate of the emitted PCM16 (defaults to 16000)
 *
 * The processor accumulates mono input, and flushes a chunk when either
 * 8 seconds have been collected, or voiced audio has been followed by about
 * one second of silence (and more than 0.5 s has been collected). A flushed
 * chunk whose overall RMS is below the threshold is not sent; its last 0.5 s
 * is kept as pre-roll for the next chunk instead.
 *
 * Emitted messages: { type: 'chunk', data: ArrayBuffer (PCM16, transferred),
 * rms: number, nativeSampleRate: number }.
 *
 * Silence is measured in samples rather than in process() calls. A worklet
 * receives small render quanta (128 frames), so a callback count tuned for
 * 8192-sample ScriptProcessor buffers would trigger after a few milliseconds.
 */
const AUDIO_CAPTURE_WORKLET_CODE = `
class AudioCaptureProcessor extends AudioWorkletProcessor {
  constructor(options) {
    super();
    const opts = (options && options.processorOptions) || {};
    this.nativeRate = opts.nativeRate || 48000;
    this.targetRate = opts.targetRate || 16000;
    // Hard cap: flush after 8 seconds of collected audio.
    this.maxSamplesPerChunk = this.nativeRate * 8;
    this.minRmsThreshold = 0.0003;
    this.preRollSamples = Math.ceil(this.nativeRate * 0.5);
    this.minFlushSamples = Math.ceil(this.nativeRate * 0.5);
    // Adaptive flush after one second of trailing silence, counted in
    // samples so the duration does not depend on the callback block size.
    this.silenceFlushSamples = this.nativeRate;
    this.ratio = this.nativeRate / this.targetRate;
    this.chunkBuffer = [];
    this.samplesCollected = 0;
    this.hasVoicedContent = false;
    this.consecutiveSilentSamples = 0;
  }

  process(inputs, outputs, parameters) {
    const input = inputs[0]?.[0];
    // Returning true keeps the processor alive while no input is connected.
    if (!input || input.length === 0) return true;

    // Per-callback voice activity detection.
    let cbPower = 0;
    for (let i = 0; i < input.length; i++) {
      cbPower += input[i] * input[i];
    }
    const cbRms = Math.sqrt(cbPower / Math.max(input.length, 1));

    if (cbRms >= this.minRmsThreshold) {
      this.hasVoicedContent = true;
      this.consecutiveSilentSamples = 0;
    } else {
      this.consecutiveSilentSamples += input.length;
    }

    // The input array is reused by the engine between calls, so copy it.
    this.chunkBuffer.push(new Float32Array(input));
    this.samplesCollected += input.length;

    const shouldFlush = (
      this.samplesCollected >= this.maxSamplesPerChunk
      || (this.hasVoicedContent
        && this.consecutiveSilentSamples >= this.silenceFlushSamples
        && this.samplesCollected > this.minFlushSamples)
    );

    if (shouldFlush) {
      const merged = new Float32Array(this.samplesCollected);
      let offset = 0;
      for (const buf of this.chunkBuffer) {
        merged.set(buf, offset);
        offset += buf.length;
      }

      let powerSum = 0;
      for (let i = 0; i < merged.length; i++) {
        powerSum += merged[i] * merged[i];
      }
      const rms = Math.sqrt(powerSum / Math.max(merged.length, 1));

      this.hasVoicedContent = false;
      this.consecutiveSilentSamples = 0;

      if (rms >= this.minRmsThreshold) {
        // Downsample by picking every ratio-th sample, then convert to PCM16.
        const outLen = Math.floor(merged.length / this.ratio);
        const pcm16 = new Int16Array(outLen);
        for (let i = 0; i < outLen; i++) {
          const srcIdx = Math.floor(i * this.ratio);
          const s = Math.max(-1, Math.min(1, merged[srcIdx]));
          pcm16[i] = Math.round(s * 32767);
        }
        // Transfer the buffer instead of copying it to the main thread.
        this.port.postMessage({
          type: 'chunk',
          data: pcm16.buffer,
          rms,
          nativeSampleRate: this.nativeRate
        }, [pcm16.buffer]);
      } else {
        // Silent chunk: keep only the tail as pre-roll for the next chunk.
        const keep = Math.min(this.preRollSamples, merged.length);
        const preRoll = merged.slice(merged.length - keep);
        this.chunkBuffer = [preRoll];
        this.samplesCollected = keep;
        return true;
      }

      this.chunkBuffer = [];
      this.samplesCollected = 0;
    }
    return true;
  }
}

registerProcessor('audio-capture-processor', AudioCaptureProcessor);
`;
|
||||
|
||||
/**
|
||||
* Captures incoming meeting audio by intercepting WebRTC RTCPeerConnection.
|
||||
*
|
||||
* How it works:
|
||||
* 1. Before page navigation, wraps window.RTCPeerConnection via addInitScript
|
||||
* 2. When Teams establishes WebRTC connections, the wrapper intercepts incoming audio tracks
|
||||
* 3. Incoming audio tracks are captured via AudioContext + ScriptProcessorNode
|
||||
* 3. Incoming audio tracks are captured via AudioContext + AudioWorkletNode (or ScriptProcessorNode fallback)
|
||||
* 4. Audio is captured at native 48kHz, downsampled to 16kHz, and converted to PCM16
|
||||
* 5. Audio chunks are buffered and the Node.js side polls for them to send to the Gateway
|
||||
*/
|
||||
|
|
@ -59,7 +154,7 @@ export class AudioCaptureProcedure {
|
|||
|
||||
this._logger.info('[AudioCapture] Injecting RTCPeerConnection wrapper...');
|
||||
|
||||
await this._page.addInitScript(() => {
|
||||
await this._page.addInitScript((workletCode: string) => {
|
||||
(window as any).__audioCaptureChunks = [] as any[];
|
||||
(window as any).__audioCaptureProcessors = {} as Record<string, any>;
|
||||
(window as any).__audioCaptureContexts = {} as Record<string, AudioContext>;
|
||||
|
|
@ -107,6 +202,7 @@ export class AudioCaptureProcedure {
|
|||
const nativeRate = ctx.sampleRate;
|
||||
const stream = new MediaStream([event.track]);
|
||||
const source = ctx.createMediaStreamSource(stream);
|
||||
const targetRate = 16000;
|
||||
|
||||
// #region agent log
|
||||
console.log(
|
||||
|
|
@ -117,157 +213,171 @@ export class AudioCaptureProcedure {
|
|||
});
|
||||
// #endregion
|
||||
|
||||
const processor = ctx.createScriptProcessor(8192, 1, 1);
|
||||
let chunkBuffer: Float32Array[] = [];
|
||||
let samplesCollected = 0;
|
||||
let skippedSilentChunks = 0;
|
||||
let callbackCount = 0;
|
||||
let totalNonZeroSamples = 0;
|
||||
const minRmsThreshold = 0.0003;
|
||||
const maxSamplesPerChunk = nativeRate * 8;
|
||||
const targetRate = 16000;
|
||||
const preRollSamples = Math.ceil(nativeRate * 0.5);
|
||||
const minFlushSamples = Math.ceil(nativeRate * 0.5);
|
||||
// Adaptive flush: after ~1s silence following voiced content
|
||||
const silenceFlushCallbacks = 6;
|
||||
let hasVoicedContent = false;
|
||||
let consecutiveSilentCallbacks = 0;
|
||||
const silentGain = ctx.createGain();
|
||||
silentGain.gain.value = 0;
|
||||
|
||||
processor.onaudioprocess = (e: AudioProcessingEvent) => {
|
||||
const input = e.inputBuffer.getChannelData(0);
|
||||
callbackCount++;
|
||||
|
||||
// #region agent log
|
||||
let nonZeroThisCallback = 0;
|
||||
for (let i = 0; i < input.length; i++) {
|
||||
if (input[i] !== 0) nonZeroThisCallback++;
|
||||
}
|
||||
totalNonZeroSamples += nonZeroThisCallback;
|
||||
|
||||
if (callbackCount <= 3 || callbackCount % 50 === 0) {
|
||||
let maxAbs = 0;
|
||||
for (let i = 0; i < input.length; i++) {
|
||||
const abs = Math.abs(input[i]);
|
||||
if (abs > maxAbs) maxAbs = abs;
|
||||
}
|
||||
console.log(
|
||||
`[AudioCapture][DIAG] onaudioprocess #${callbackCount}: bufLen=${input.length}, nonZero=${nonZeroThisCallback}/${input.length}, maxAbs=${maxAbs.toFixed(8)}, track.enabled=${event.track.enabled}, track.muted=${event.track.muted}, track.readyState=${event.track.readyState}, ctx.state=${ctx.state}, totalNonZero=${totalNonZeroSamples}`
|
||||
);
|
||||
}
|
||||
// #endregion
|
||||
|
||||
// Per-callback voice activity detection
|
||||
let cbPower = 0;
|
||||
for (let i = 0; i < input.length; i++) {
|
||||
cbPower += input[i] * input[i];
|
||||
}
|
||||
const cbRms = Math.sqrt(cbPower / Math.max(input.length, 1));
|
||||
|
||||
if (cbRms >= minRmsThreshold) {
|
||||
hasVoicedContent = true;
|
||||
consecutiveSilentCallbacks = 0;
|
||||
} else {
|
||||
consecutiveSilentCallbacks++;
|
||||
}
|
||||
|
||||
chunkBuffer.push(new Float32Array(input));
|
||||
samplesCollected += input.length;
|
||||
|
||||
// Flush: max duration reached OR voiced content followed by ~1s silence
|
||||
const shouldFlush = (
|
||||
samplesCollected >= maxSamplesPerChunk
|
||||
|| (hasVoicedContent
|
||||
&& consecutiveSilentCallbacks >= silenceFlushCallbacks
|
||||
&& samplesCollected > minFlushSamples)
|
||||
);
|
||||
|
||||
if (shouldFlush) {
|
||||
const merged = new Float32Array(samplesCollected);
|
||||
let offset = 0;
|
||||
for (const buf of chunkBuffer) {
|
||||
merged.set(buf, offset);
|
||||
offset += buf.length;
|
||||
}
|
||||
|
||||
let powerSum = 0;
|
||||
for (let i = 0; i < merged.length; i++) {
|
||||
powerSum += merged[i] * merged[i];
|
||||
}
|
||||
const rms = Math.sqrt(powerSum / Math.max(merged.length, 1));
|
||||
|
||||
hasVoicedContent = false;
|
||||
consecutiveSilentCallbacks = 0;
|
||||
|
||||
if (rms < minRmsThreshold) {
|
||||
skippedSilentChunks++;
|
||||
if (skippedSilentChunks % 10 === 0) {
|
||||
console.log(
|
||||
`[AudioCapture] silent chunk skipped: track=${trackId}, readyState=${event.track.readyState}, muted=${event.track.muted}, enabled=${event.track.enabled}, rms=${rms.toFixed(6)}, callbacks=${callbackCount}, totalNonZero=${totalNonZeroSamples}`
|
||||
);
|
||||
}
|
||||
const keep = Math.min(preRollSamples, merged.length);
|
||||
const preRoll = merged.slice(merged.length - keep);
|
||||
chunkBuffer = [preRoll];
|
||||
samplesCollected = keep;
|
||||
return;
|
||||
}
|
||||
|
||||
// Downsample from nativeRate to 16 kHz
|
||||
const ratio = nativeRate / targetRate;
|
||||
const outLen = Math.floor(merged.length / ratio);
|
||||
const pcm16 = new Int16Array(outLen);
|
||||
for (let i = 0; i < outLen; i++) {
|
||||
const srcIdx = Math.floor(i * ratio);
|
||||
const s = Math.max(-1, Math.min(1, merged[srcIdx]));
|
||||
pcm16[i] = Math.round(s * 32767);
|
||||
}
|
||||
|
||||
// Convert to base64
|
||||
const bytes = new Uint8Array(pcm16.buffer);
|
||||
let binary = '';
|
||||
for (let i = 0; i < bytes.length; i++) {
|
||||
binary += String.fromCharCode(bytes[i]);
|
||||
}
|
||||
const base64 = btoa(binary);
|
||||
|
||||
const chunks = (window as any).__audioCaptureChunks as any[];
|
||||
if (chunks.length < 60) {
|
||||
chunks.push({
|
||||
data: base64,
|
||||
sampleRate: targetRate,
|
||||
captureDiagnostics: {
|
||||
trackId,
|
||||
readyState: event.track.readyState,
|
||||
rms: Number(rms.toFixed(6)),
|
||||
nativeSampleRate: nativeRate,
|
||||
},
|
||||
});
|
||||
}
|
||||
|
||||
skippedSilentChunks = 0;
|
||||
|
||||
chunkBuffer = [];
|
||||
samplesCollected = 0;
|
||||
/**
 * Queues one encoded audio chunk on the page-global chunk list.
 *
 * @param base64Data base64 text of the chunk payload
 * @param rms RMS level of the chunk, stored rounded to 6 decimals
 *
 * The chunk is dropped when 60 or more chunks are already queued.
 */
const pushChunk = (base64Data: string, rms: number) => {
  const queue = (window as any).__audioCaptureChunks as any[];
  if (queue.length >= 60) {
    return;
  }
  const captureDiagnostics = {
    trackId,
    readyState: event.track.readyState,
    rms: Number(rms.toFixed(6)),
    nativeSampleRate: nativeRate,
  };
  queue.push({ data: base64Data, sampleRate: targetRate, captureDiagnostics });
};
|
||||
|
||||
source.connect(processor);
|
||||
// Connect to a silent gain node so the ScriptProcessor fires
|
||||
// its onaudioprocess callback without routing captured audio
|
||||
// to the speakers (which would conflict with the TTS AudioContext).
|
||||
const silentGain = ctx.createGain();
|
||||
silentGain.gain.value = 0;
|
||||
processor.connect(silentGain);
|
||||
silentGain.connect(ctx.destination);
|
||||
let workletNode: AudioWorkletNode | null = null;
|
||||
let scriptProcessor: ScriptProcessorNode | null = null;
|
||||
|
||||
// Resume the context explicitly — in authMode Chromium does
|
||||
// not set --autoplay-policy, so new AudioContexts start suspended.
|
||||
ctx.resume().catch(() => {});
|
||||
/**
 * Tries to set up capture through an AudioWorkletNode.
 *
 * Loads the worklet module from a Blob URL, creates the node, forwards each
 * 'chunk' message to pushChunk as base64, and wires
 * source -> worklet -> silentGain -> destination.
 *
 * @returns true when the worklet path is active; false when any step failed,
 *          in which case the caller is expected to use the fallback.
 */
const useWorklet = async () => {
  try {
    const blob = new Blob([workletCode], { type: 'application/javascript' });
    const blobUrl = URL.createObjectURL(blob);
    try {
      await ctx.audioWorklet.addModule(blobUrl);
    } finally {
      // Release the blob URL even when the module fails to load.
      URL.revokeObjectURL(blobUrl);
    }

    workletNode = new AudioWorkletNode(ctx, 'audio-capture-processor', {
      processorOptions: { nativeRate, targetRate },
    });

    workletNode.port.onmessage = (ev: MessageEvent) => {
      if (ev.data?.type !== 'chunk' || !ev.data.data) return;
      // ev.data.data is the transferred PCM16 ArrayBuffer; read its raw bytes.
      const bytes = new Uint8Array(ev.data.data);
      let binary = '';
      for (let i = 0; i < bytes.length; i++) {
        binary += String.fromCharCode(bytes[i]);
      }
      pushChunk(btoa(binary), ev.data.rms || 0);
    };

    source.connect(workletNode);
    workletNode.connect(silentGain);
    silentGain.connect(ctx.destination);

    const processorsObj = (window as any).__audioCaptureProcessors as Record<string, any>;
    processorsObj[trackId] = workletNode;
    console.log(`[AudioCapture] WebRTC audio track intercepted (AudioWorklet): track=${trackId}, native=${nativeRate}Hz -> 16kHz mono`);
    return true;
  } catch (err) {
    // Undo partial wiring so the fallback path does not capture in parallel.
    if (workletNode) {
      try {
        workletNode.port.onmessage = null;
        workletNode.disconnect();
      } catch {
        // Best effort: the node may not have been connected yet.
      }
      workletNode = null;
    }
    console.warn(`[AudioCapture] AudioWorklet not available, falling back to ScriptProcessor: ${err}`);
    return false;
  }
};
|
||||
|
||||
/**
 * Fallback capture path built on ScriptProcessorNode (8192-sample buffers).
 *
 * Collects mono input and flushes a chunk when 8 seconds have accumulated, or
 * when voiced audio has been followed by 6 silent callbacks and more than
 * 0.5 s has been collected. Voiced chunks are downsampled to targetRate,
 * converted to PCM16, base64-encoded and handed to pushChunk; silent chunks
 * are reduced to a 0.5 s pre-roll that is kept for the next chunk.
 */
const useScriptProcessor = () => {
  const silenceRms = 0.0003;
  const chunkCapSamples = nativeRate * 8;
  const preRollLength = Math.ceil(nativeRate * 0.5);
  const flushFloorSamples = Math.ceil(nativeRate * 0.5);
  const quietCallbacksToFlush = 6;
  const step = nativeRate / targetRate;

  // Root-mean-square level of a sample block (0 for an empty block).
  const rmsOf = (samples: Float32Array): number => {
    let energy = 0;
    for (let k = 0; k < samples.length; k++) {
      energy += samples[k] * samples[k];
    }
    return Math.sqrt(energy / Math.max(samples.length, 1));
  };

  // Downsamples by sample picking, converts to PCM16 and base64-encodes.
  const encodePcm16Base64 = (samples: Float32Array): string => {
    const frames = Math.floor(samples.length / step);
    const pcm = new Int16Array(frames);
    for (let k = 0; k < frames; k++) {
      const clamped = Math.max(-1, Math.min(1, samples[Math.floor(k * step)]));
      pcm[k] = Math.round(clamped * 32767);
    }
    const raw = new Uint8Array(pcm.buffer);
    let packed = '';
    for (let k = 0; k < raw.length; k++) {
      packed += String.fromCharCode(raw[k]);
    }
    return btoa(packed);
  };

  const node = ctx.createScriptProcessor(8192, 1, 1);
  scriptProcessor = node;

  let pending: Float32Array[] = [];
  let pendingSamples = 0;
  let sawVoice = false;
  let quietCallbacks = 0;

  node.onaudioprocess = (e: AudioProcessingEvent) => {
    const block = e.inputBuffer.getChannelData(0);

    // Per-callback voice activity detection.
    if (rmsOf(block) >= silenceRms) {
      sawVoice = true;
      quietCallbacks = 0;
    } else {
      quietCallbacks++;
    }

    pending.push(new Float32Array(block));
    pendingSamples += block.length;

    const capReached = pendingSamples >= chunkCapSamples;
    const trailingSilence =
      sawVoice && quietCallbacks >= quietCallbacksToFlush && pendingSamples > flushFloorSamples;
    if (!capReached && !trailingSilence) {
      return;
    }

    const joined = new Float32Array(pendingSamples);
    let writeAt = 0;
    for (const part of pending) {
      joined.set(part, writeAt);
      writeAt += part.length;
    }
    const level = rmsOf(joined);

    sawVoice = false;
    quietCallbacks = 0;

    if (level < silenceRms) {
      // Silent chunk: keep only the tail as pre-roll for the next chunk.
      const keep = Math.min(preRollLength, joined.length);
      pending = [joined.slice(joined.length - keep)];
      pendingSamples = keep;
      return;
    }

    pushChunk(encodePcm16Base64(joined), level);
    pending = [];
    pendingSamples = 0;
  };

  // Routed through the gain node to the destination, as in the worklet path.
  source.connect(node);
  node.connect(silentGain);
  silentGain.connect(ctx.destination);

  const registry = (window as any).__audioCaptureProcessors as Record<string, any>;
  registry[trackId] = node;
  console.log(`[AudioCapture] WebRTC audio track intercepted (ScriptProcessor fallback): track=${trackId}, native=${nativeRate}Hz -> 16kHz mono`);
};
|
||||
|
||||
// Prefer the AudioWorklet path and fall back to ScriptProcessor when it is
// unavailable. This runs detached from the surrounding synchronous code, so
// failures are caught and logged here instead of surfacing as an unhandled
// promise rejection.
void (async () => {
  const ok = await useWorklet();
  if (!ok) useScriptProcessor();

  // Best effort: resume again once the graph is wired; a rejection is ignored.
  ctx.resume().catch(() => {});
})().catch((err: unknown) => {
  console.error('[AudioCapture] Failed to set up audio capture:', err);
});
|
||||
|
||||
// Clean up when the track ends (peer leaves, renegotiation, etc.)
|
||||
event.track.addEventListener('ended', () => {
|
||||
try {
|
||||
processor.disconnect();
|
||||
if (workletNode) {
|
||||
workletNode.disconnect();
|
||||
}
|
||||
if (scriptProcessor) {
|
||||
scriptProcessor.disconnect();
|
||||
}
|
||||
source.disconnect();
|
||||
silentGain.disconnect();
|
||||
ctx.close();
|
||||
|
|
@ -279,12 +389,8 @@ export class AudioCaptureProcedure {
|
|||
console.log(`[AudioCapture] Audio track ended: track=${trackId}, resources cleaned up`);
|
||||
});
|
||||
|
||||
const processorsObj = (window as any).__audioCaptureProcessors as Record<string, any>;
|
||||
const contextsObj = (window as any).__audioCaptureContexts as Record<string, AudioContext>;
|
||||
processorsObj[trackId] = processor;
|
||||
contextsObj[trackId] = ctx;
|
||||
|
||||
console.log(`[AudioCapture] WebRTC audio track intercepted: track=${trackId}, native=${nativeRate}Hz -> 16kHz mono`);
|
||||
} catch (err) {
|
||||
console.error('[AudioCapture] Failed to set up audio capture:', err);
|
||||
}
|
||||
|
|
@ -296,7 +402,7 @@ export class AudioCaptureProcedure {
|
|||
// Copy static properties
|
||||
window.RTCPeerConnection.prototype = OrigRTC.prototype;
|
||||
Object.setPrototypeOf(window.RTCPeerConnection, OrigRTC);
|
||||
});
|
||||
}, AUDIO_CAPTURE_WORKLET_CODE);
|
||||
|
||||
this._injected = true;
|
||||
this._logger.info('[AudioCapture] RTCPeerConnection wrapper injected');
|
||||
|
|
|
|||
Loading…
Reference in a new issue