// Source listing metadata: TypeScript, 506 lines, 19 KiB.
import { Page } from 'playwright';
import { Logger } from 'winston';

/**
 * Optional diagnostic metadata attached to a captured audio chunk.
 *
 * Every field is optional; consumers must tolerate any of them being absent.
 */
interface AudioChunkDiagnostics {
  /** Identifier of the audio track the chunk was captured from. */
  trackId?: string;
  /** `readyState` of the track at the moment the chunk was queued. */
  readyState?: string;
  /** Root-mean-square level of the buffered audio the chunk was built from. */
  rms?: number;
  /** Sample rate (Hz) the audio was captured at before downsampling. */
  nativeSampleRate?: number;
}

/**
 * One audio chunk as queued by the in-page capture code and read back by the
 * Node.js side.
 */
interface CapturedAudioChunk {
  /** Base64-encoded audio payload (the capture code encodes 16-bit PCM samples). */
  data: string;
  /** Sample rate (Hz) of the samples encoded in `data`. */
  sampleRate: number;
  /** Optional diagnostics describing how/where the chunk was captured. */
  captureDiagnostics?: AudioChunkDiagnostics;
}

/**
 * Source text of the AudioWorklet module that performs in-page audio capture.
 *
 * The string is loaded as a module via `audioWorklet.addModule()`, so it must
 * be plain JavaScript (no TypeScript syntax) and must stay self-contained.
 *
 * Behaviour of the processor it registers (`audio-capture-processor`):
 * - Buffers the first channel of the first input at the native sample rate.
 * - Flushes when 8 seconds have accumulated, or when voiced audio has been
 *   followed by 6 consecutive below-threshold callbacks and more than 0.5 s
 *   is buffered.
 * - On flush, if the RMS of the whole buffer reaches the threshold, the buffer
 *   is downsampled by nearest-sample decimation (no low-pass filter), converted
 *   to 16-bit PCM and posted as `{ type: 'chunk', data, rms, nativeSampleRate }`
 *   with the ArrayBuffer transferred.
 * - After every flush (emitted or discarded) up to the last 1 s of the buffer
 *   is retained as pre-roll, so consecutive emitted chunks may overlap.
 */
const AUDIO_CAPTURE_WORKLET_CODE = `
class AudioCaptureProcessor extends AudioWorkletProcessor {
  constructor(options) {
    super();
    // Tolerate a missing options object / processorOptions bag.
    const opts = (options && options.processorOptions) || {};
    this.nativeRate = opts.nativeRate || 48000;
    this.targetRate = opts.targetRate || 16000;
    this.maxSamplesPerChunk = this.nativeRate * 8;
    this.minRmsThreshold = 0.0003;
    this.preRollSamples = Math.ceil(this.nativeRate * 1.0);
    this.minFlushSamples = Math.ceil(this.nativeRate * 0.5);
    this.silenceFlushCallbacks = 6;
    this.ratio = this.nativeRate / this.targetRate;
    this.chunkBuffer = [];
    this.samplesCollected = 0;
    this.hasVoicedContent = false;
    this.consecutiveSilentCallbacks = 0;
  }

  process(inputs, outputs, parameters) {
    const input = inputs[0]?.[0];
    if (!input || input.length === 0) return true;

    // Per-callback level, used only for voiced/silent bookkeeping.
    let cbPower = 0;
    for (let i = 0; i < input.length; i++) {
      cbPower += input[i] * input[i];
    }
    const cbRms = Math.sqrt(cbPower / Math.max(input.length, 1));

    if (cbRms >= this.minRmsThreshold) {
      this.hasVoicedContent = true;
      this.consecutiveSilentCallbacks = 0;
    } else {
      this.consecutiveSilentCallbacks++;
    }

    // Copy: the engine reuses the input buffer between callbacks.
    this.chunkBuffer.push(new Float32Array(input));
    this.samplesCollected += input.length;

    const shouldFlush = (
      this.samplesCollected >= this.maxSamplesPerChunk
      || (this.hasVoicedContent
        && this.consecutiveSilentCallbacks >= this.silenceFlushCallbacks
        && this.samplesCollected > this.minFlushSamples)
    );
    if (!shouldFlush) return true;

    const merged = new Float32Array(this.samplesCollected);
    let offset = 0;
    for (const buf of this.chunkBuffer) {
      merged.set(buf, offset);
      offset += buf.length;
    }

    let powerSum = 0;
    for (let i = 0; i < merged.length; i++) {
      powerSum += merged[i] * merged[i];
    }
    const rms = Math.sqrt(powerSum / Math.max(merged.length, 1));

    this.hasVoicedContent = false;
    this.consecutiveSilentCallbacks = 0;

    // Only buffers whose overall level reaches the threshold are emitted.
    if (rms >= this.minRmsThreshold) {
      const outLen = Math.floor(merged.length / this.ratio);
      const pcm16 = new Int16Array(outLen);
      for (let i = 0; i < outLen; i++) {
        const srcIdx = Math.floor(i * this.ratio);
        const s = Math.max(-1, Math.min(1, merged[srcIdx]));
        pcm16[i] = Math.round(s * 32767);
      }
      this.port.postMessage({
        type: 'chunk',
        data: pcm16.buffer,
        rms,
        nativeSampleRate: this.nativeRate
      }, [pcm16.buffer]);
    }

    // Emitted or discarded, keep the tail of the buffer as pre-roll.
    const keep = Math.min(this.preRollSamples, merged.length);
    this.chunkBuffer = [merged.slice(merged.length - keep)];
    this.samplesCollected = keep;
    return true;
  }
}

registerProcessor('audio-capture-processor', AudioCaptureProcessor);
`;

/**
 * Captures incoming meeting audio by intercepting WebRTC RTCPeerConnection.
 *
 * How it works:
 * 1. Before page navigation, wraps window.RTCPeerConnection via addInitScript
 * 2. When Teams establishes WebRTC connections, the wrapper intercepts incoming audio tracks
 * 3. Incoming audio tracks are captured via AudioContext + AudioWorkletNode (or ScriptProcessorNode fallback)
 * 4. Audio is captured at the AudioContext's native rate, downsampled to 16kHz, and converted to PCM16
 * 5. Audio chunks are buffered in the page (at most 60 pending; further chunks are dropped)
 *    and the Node.js side polls for them every 500 ms and hands them to the `onAudioChunk` callback
 */
export class AudioCaptureProcedure {
  private readonly _page: Page;
  private readonly _logger: Logger;
  private readonly _onAudioChunk: (
    base64Data: string,
    sampleRate: number,
    captureDiagnostics?: AudioChunkDiagnostics
  ) => void;
  private _isCapturing: boolean = false;
  private _pollInterval: ReturnType<typeof setInterval> | null = null;
  private _injected: boolean = false;
  /** True while a poll tick is awaiting the page; prevents overlapping ticks. */
  private _pollInFlight: boolean = false;

  /**
   * @param page Playwright page the capture script is injected into and polled from.
   * @param logger Logger used for Node.js-side status and diagnostic messages.
   * @param onAudioChunk Invoked once per captured chunk with the base64 payload,
   *   its sample rate and any capture diagnostics.
   */
  constructor(
    page: Page,
    logger: Logger,
    onAudioChunk: (
      base64Data: string,
      sampleRate: number,
      captureDiagnostics?: AudioChunkDiagnostics
    ) => void,
  ) {
    this._page = page;
    this._logger = logger;
    this._onAudioChunk = onAudioChunk;
  }

  /**
   * Inject the RTCPeerConnection wrapper BEFORE any page navigation.
   * Must be called before navigating to Teams.
   *
   * Idempotent: subsequent calls after a successful injection are no-ops.
   */
  async injectCaptureOverride(): Promise<void> {
    if (this._injected) return;

    this._logger.info('[AudioCapture] Injecting RTCPeerConnection wrapper...');

    // NOTE: the callback below is serialized and executed inside the page, so it
    // must not reference anything from this module except its `workletCode` argument.
    await this._page.addInitScript((workletCode: string) => {
      (window as any).__audioCaptureChunks = [] as any[];
      (window as any).__audioCaptureProcessors = {} as Record<string, any>;
      (window as any).__audioCaptureContexts = {} as Record<string, AudioContext>;
      (window as any).__audioCapturePeerConnections = [] as RTCPeerConnection[];

      const OrigRTC = window.RTCPeerConnection;
      // Nothing to wrap in contexts without WebRTC; avoid throwing from the init script.
      if (!OrigRTC) return;

      const targetRate = 16000;
      const maxPendingChunks = 60;

      /** Encode the raw bytes of a PCM16 buffer as base64. */
      const pcm16ToBase64 = (pcm16: Int16Array): string => {
        const bytes = new Uint8Array(pcm16.buffer, pcm16.byteOffset, pcm16.byteLength);
        let binary = '';
        for (let i = 0; i < bytes.length; i++) {
          binary += String.fromCharCode(bytes[i]);
        }
        return btoa(binary);
      };

      // @ts-ignore — wrapping constructor
      window.RTCPeerConnection = function (this: RTCPeerConnection, ...args: any[]) {
        const pc: RTCPeerConnection = new (OrigRTC as any)(...args);
        try {
          const pcs = (window as any).__audioCapturePeerConnections as RTCPeerConnection[];
          pcs.push(pc);
          // #region agent log
          console.log(`[AudioCapture][DIAG] New RTCPeerConnection created (total: ${pcs.length}), config:`, JSON.stringify(args[0] || {}).substring(0, 200));
          // #endregion
        } catch {
          // ignore
        }

        pc.addEventListener('track', (event: RTCTrackEvent) => {
          if (event.track.kind !== 'audio') return;

          const trackId = event.track.id || `audio-track-${Date.now()}`;
          const processors = (window as any).__audioCaptureProcessors as Record<string, any>;
          // A processor already registered for this track id means it is being captured.
          if (processors[trackId]) {
            return;
          }

          // #region agent log
          console.log(
            `[AudioCapture][DIAG] Track received: id=${trackId}, enabled=${event.track.enabled}, muted=${event.track.muted}, readyState=${event.track.readyState}, label=${event.track.label}`
          );
          event.track.addEventListener('mute', () => {
            console.log(`[AudioCapture][DIAG] Track MUTED: id=${trackId}`);
          });
          event.track.addEventListener('unmute', () => {
            console.log(`[AudioCapture][DIAG] Track UNMUTED: id=${trackId}`);
          });
          // #endregion

          try {
            const AudioCtx = window.AudioContext || (window as any).webkitAudioContext;
            const ctx: AudioContext = new AudioCtx();
            const nativeRate = ctx.sampleRate;
            const stream = new MediaStream([event.track]);
            const source = ctx.createMediaStreamSource(stream);

            // #region agent log
            console.log(
              `[AudioCapture][DIAG] AudioContext: state=${ctx.state}, sampleRate=${nativeRate}, stream.active=${stream.active}, streamTracks=${stream.getAudioTracks().length}`
            );
            ctx.addEventListener('statechange', () => {
              console.log(`[AudioCapture][DIAG] AudioContext statechange: ${ctx.state} for track=${trackId}`);
            });
            // #endregion

            // The graph is routed to the destination through a zero gain so
            // nothing captured is played back audibly.
            const silentGain = ctx.createGain();
            silentGain.gain.value = 0;

            /** Queue one encoded chunk for the Node.js poller; drops it when the queue is full. */
            const pushChunk = (base64Data: string, rms: number) => {
              const chunks = (window as any).__audioCaptureChunks as any[];
              if (chunks.length < maxPendingChunks) {
                chunks.push({
                  data: base64Data,
                  sampleRate: targetRate,
                  captureDiagnostics: {
                    trackId,
                    readyState: event.track.readyState,
                    rms: Number(rms.toFixed(6)),
                    nativeSampleRate: nativeRate,
                  },
                });
              }
            };

            let workletNode: AudioWorkletNode | null = null;
            let scriptProcessor: ScriptProcessorNode | null = null;

            /** Try to capture through an AudioWorklet; resolves false when that is not possible. */
            const useWorklet = async (): Promise<boolean> => {
              try {
                const blob = new Blob([workletCode], { type: 'application/javascript' });
                const blobUrl = URL.createObjectURL(blob);
                try {
                  await ctx.audioWorklet.addModule(blobUrl);
                } finally {
                  // Release the blob URL even when the module fails to load.
                  URL.revokeObjectURL(blobUrl);
                }

                const node = new AudioWorkletNode(ctx, 'audio-capture-processor', {
                  processorOptions: { nativeRate, targetRate },
                });
                workletNode = node;

                node.port.onmessage = (ev: MessageEvent) => {
                  if (ev.data?.type !== 'chunk' || !ev.data.data) return;
                  pushChunk(pcm16ToBase64(new Int16Array(ev.data.data)), ev.data.rms || 0);
                };

                source.connect(node);
                node.connect(silentGain);
                silentGain.connect(ctx.destination);

                const processorsObj = (window as any).__audioCaptureProcessors as Record<string, any>;
                processorsObj[trackId] = node;
                console.log(`[AudioCapture] WebRTC audio track intercepted (AudioWorklet): track=${trackId}, native=${nativeRate}Hz -> 16kHz mono`);
                return true;
              } catch (err) {
                console.warn(`[AudioCapture] AudioWorklet not available, falling back to ScriptProcessor: ${err}`);
                return false;
              }
            };

            /** Fallback capture path; mirrors the worklet's buffering/flush algorithm on the main thread. */
            const useScriptProcessor = () => {
              const minRmsThreshold = 0.0003;
              const maxSamplesPerChunk = nativeRate * 8;
              const preRollSamples = Math.ceil(nativeRate * 1.0);
              const minFlushSamples = Math.ceil(nativeRate * 0.5);
              const silenceFlushCallbacks = 6;
              const ratio = nativeRate / targetRate;

              const node = ctx.createScriptProcessor(8192, 1, 1);
              scriptProcessor = node;
              let chunkBuffer: Float32Array[] = [];
              let samplesCollected = 0;
              let hasVoicedContent = false;
              let consecutiveSilentCallbacks = 0;

              node.onaudioprocess = (e: AudioProcessingEvent) => {
                const input = e.inputBuffer.getChannelData(0);
                let cbPower = 0;
                for (let i = 0; i < input.length; i++) {
                  cbPower += input[i] * input[i];
                }
                const cbRms = Math.sqrt(cbPower / Math.max(input.length, 1));

                if (cbRms >= minRmsThreshold) {
                  hasVoicedContent = true;
                  consecutiveSilentCallbacks = 0;
                } else {
                  consecutiveSilentCallbacks++;
                }

                chunkBuffer.push(new Float32Array(input));
                samplesCollected += input.length;

                const shouldFlush = (
                  samplesCollected >= maxSamplesPerChunk
                  || (hasVoicedContent
                    && consecutiveSilentCallbacks >= silenceFlushCallbacks
                    && samplesCollected > minFlushSamples)
                );
                if (!shouldFlush) return;

                const merged = new Float32Array(samplesCollected);
                let offset = 0;
                for (const buf of chunkBuffer) {
                  merged.set(buf, offset);
                  offset += buf.length;
                }

                let powerSum = 0;
                for (let i = 0; i < merged.length; i++) {
                  powerSum += merged[i] * merged[i];
                }
                const rms = Math.sqrt(powerSum / Math.max(merged.length, 1));

                hasVoicedContent = false;
                consecutiveSilentCallbacks = 0;

                // Only buffers whose overall level reaches the threshold are emitted.
                if (rms >= minRmsThreshold) {
                  const outLen = Math.floor(merged.length / ratio);
                  const pcm16 = new Int16Array(outLen);
                  for (let i = 0; i < outLen; i++) {
                    const srcIdx = Math.floor(i * ratio);
                    const s = Math.max(-1, Math.min(1, merged[srcIdx]));
                    pcm16[i] = Math.round(s * 32767);
                  }
                  pushChunk(pcm16ToBase64(pcm16), rms);
                }

                // Emitted or discarded, keep the tail of the buffer as pre-roll.
                const keep = Math.min(preRollSamples, merged.length);
                chunkBuffer = [merged.slice(merged.length - keep)];
                samplesCollected = keep;
              };

              source.connect(node);
              node.connect(silentGain);
              silentGain.connect(ctx.destination);

              const processorsObj = (window as any).__audioCaptureProcessors as Record<string, any>;
              processorsObj[trackId] = node;
              console.log(`[AudioCapture] WebRTC audio track intercepted (ScriptProcessor fallback): track=${trackId}, native=${nativeRate}Hz -> 16kHz mono`);
            };

            // Fire-and-forget setup; failures are logged instead of surfacing as
            // unhandled promise rejections.
            void (async () => {
              try {
                const ok = await useWorklet();
                if (!ok) useScriptProcessor();

                ctx.resume().catch(() => {});
              } catch (err) {
                console.error('[AudioCapture] Failed to set up audio capture:', err);
              }
            })();

            // Clean up when the track ends (peer leaves, renegotiation, etc.)
            event.track.addEventListener('ended', () => {
              try {
                if (workletNode) {
                  workletNode.disconnect();
                }
                if (scriptProcessor) {
                  scriptProcessor.disconnect();
                }
                source.disconnect();
                silentGain.disconnect();
                ctx.close();
              } catch { /* already closed */ }
              const processorsObj = (window as any).__audioCaptureProcessors as Record<string, any>;
              const contextsObj = (window as any).__audioCaptureContexts as Record<string, AudioContext>;
              delete processorsObj[trackId];
              delete contextsObj[trackId];
              console.log(`[AudioCapture] Audio track ended: track=${trackId}, resources cleaned up`);
            });

            const contextsObj = (window as any).__audioCaptureContexts as Record<string, AudioContext>;
            contextsObj[trackId] = ctx;
          } catch (err) {
            console.error('[AudioCapture] Failed to set up audio capture:', err);
          }
        });

        return pc;
      } as any;

      // Copy static properties
      window.RTCPeerConnection.prototype = OrigRTC.prototype;
      Object.setPrototypeOf(window.RTCPeerConnection, OrigRTC);
    }, AUDIO_CAPTURE_WORKLET_CODE);

    this._injected = true;
    this._logger.info('[AudioCapture] RTCPeerConnection wrapper injected');
  }

  /**
   * Start polling for captured audio chunks and forwarding them to the callback.
   *
   * Polls every 500 ms. A tick is skipped while the previous one is still
   * waiting on the page, so chunks are never drained by two ticks at once.
   * Errors from the page (navigation, closed page) are swallowed on purpose.
   */
  async startCapture(): Promise<void> {
    if (this._isCapturing) return;
    this._isCapturing = true;

    this._logger.info('[AudioCapture] Starting audio chunk polling...');

    // #region agent log
    let pollCount = 0;
    // #endregion
    this._pollInterval = setInterval(async () => {
      if (this._pollInFlight) return;
      this._pollInFlight = true;
      try {
        // #region agent log
        pollCount++;
        // Emit a diagnostics snapshot on the first tick and every 60th tick after it.
        if (pollCount % 60 === 1) {
          const diagInfo = await this._page.evaluate(() => {
            const pcs = (window as any).__audioCapturePeerConnections as RTCPeerConnection[] || [];
            const procs = (window as any).__audioCaptureProcessors as Record<string, any> || {};
            const ctxs = (window as any).__audioCaptureContexts as Record<string, AudioContext> || {};
            const procKeys = Object.keys(procs);
            const ctxStates = Object.entries(ctxs).map(([k, c]) => `${k}:${c.state}`);
            return {
              peerConnections: pcs.length,
              pcStates: pcs.map((p: RTCPeerConnection) => p.connectionState || 'unknown'),
              processors: procKeys.length,
              processorTrackIds: procKeys,
              audioContextStates: ctxStates,
            };
          });
          this._logger.info(`[AudioCapture][DIAG] Periodic: ${JSON.stringify(diagInfo)}`);
        }
        // #endregion

        // Drain the in-page queue; an absent queue simply yields no chunks.
        const chunks = await this._page.evaluate(() => {
          const buf = (window as any).__audioCaptureChunks as CapturedAudioChunk[] | undefined;
          return buf ? buf.splice(0, buf.length) : [];
        });

        for (const chunk of chunks) {
          this._onAudioChunk(
            chunk.data,
            chunk.sampleRate || 16000,
            chunk.captureDiagnostics
          );
        }
      } catch {
        // Page might be navigating or closed
      } finally {
        this._pollInFlight = false;
      }
    }, 500);
  }

  /**
   * Stop capturing audio.
   *
   * Stops the poller, then best-effort disconnects every registered processor
   * and closes every registered AudioContext in the page, and resets both
   * registries. Failures (e.g. the page is already closed) are ignored.
   */
  async stopCapture(): Promise<void> {
    this._isCapturing = false;

    if (this._pollInterval) {
      clearInterval(this._pollInterval);
      this._pollInterval = null;
    }

    try {
      await this._page.evaluate(() => {
        const processors = (window as any).__audioCaptureProcessors as Record<string, any>;
        const contexts = (window as any).__audioCaptureContexts as Record<string, AudioContext>;
        Object.keys(processors || {}).forEach((trackId) => {
          try {
            processors[trackId]?.disconnect();
          } catch {
            // ignore
          }
        });
        Object.keys(contexts || {}).forEach((trackId) => {
          try {
            contexts[trackId]?.close();
          } catch {
            // ignore
          }
        });
        (window as any).__audioCaptureProcessors = {};
        (window as any).__audioCaptureContexts = {};
      });
    } catch {
      // Page might already be closed
    }

    this._logger.info('[AudioCapture] Audio capture stopped');
  }
}