import { Page } from 'playwright';
import { Logger } from 'winston';

/** Diagnostics recorded by the in-page capture code for one emitted chunk. */
interface AudioChunkDiagnostics {
  trackId?: string;
  readyState?: string;
  rms?: number;
  nativeSampleRate?: number;
}

/** One entry of the page-side `window.__audioCaptureChunks` queue. */
interface CapturedAudioChunk {
  /** Base64-encoded PCM16 samples (byte order of the capturing platform). */
  data: string;
  sampleRate: number;
  captureDiagnostics?: AudioChunkDiagnostics;
}

/** Callback invoked on the Node.js side for every chunk drained from the page. */
type AudioChunkHandler = (
  base64Data: string,
  sampleRate: number,
  captureDiagnostics?: AudioChunkDiagnostics
) => void;

/** How often the Node.js side drains the page-side chunk queue. */
const POLL_INTERVAL_MS = 500;

/** A diagnostics snapshot is logged on polls 1, 61, 121, ... */
const DIAG_EVERY_N_POLLS = 60;

/**
 * Source of the AudioWorklet processor, loaded in the page through a Blob URL.
 *
 * The processor accumulates input frames and flushes either when 8 s of audio
 * has been collected, or when voiced audio is followed by 6 consecutive
 * below-threshold callbacks and more than 0.5 s has been collected. A flushed
 * buffer whose overall RMS is below the threshold is discarded; otherwise it
 * is decimated to the target rate, converted to PCM16 and posted to the main
 * thread as `{ type: 'chunk', data, rms, nativeSampleRate }`.
 *
 * This is a template literal: it must not contain backticks or `${`.
 */
const AUDIO_CAPTURE_WORKLET_CODE = `
class AudioCaptureProcessor extends AudioWorkletProcessor {
  constructor(options) {
    super();
    const opts = (options && options.processorOptions) || {};
    this.nativeRate = opts.nativeRate || 48000;
    this.targetRate = opts.targetRate || 16000;
    this.maxSamplesPerChunk = this.nativeRate * 8;
    this.minRmsThreshold = 0.0003;
    this.preRollSamples = Math.ceil(this.nativeRate * 1.0);
    this.minFlushSamples = Math.ceil(this.nativeRate * 0.5);
    // NOTE(review): counted in process() callbacks (render quanta), not in
    // time, so this window is far shorter here than in the ScriptProcessor
    // fallback, which uses the same count with 8192-sample callbacks.
    this.silenceFlushCallbacks = 6;
    this.ratio = this.nativeRate / this.targetRate;
    this.chunkBuffer = [];
    this.samplesCollected = 0;
    this.hasVoicedContent = false;
    this.consecutiveSilentCallbacks = 0;
  }

  process(inputs, outputs, parameters) {
    const input = inputs[0]?.[0];
    if (!input || input.length === 0) return true;

    let cbPower = 0;
    for (let i = 0; i < input.length; i++) {
      cbPower += input[i] * input[i];
    }
    const cbRms = Math.sqrt(cbPower / Math.max(input.length, 1));
    if (cbRms >= this.minRmsThreshold) {
      this.hasVoicedContent = true;
      this.consecutiveSilentCallbacks = 0;
    } else {
      this.consecutiveSilentCallbacks++;
    }

    // Copy the frame: the engine reuses the underlying input buffer.
    this.chunkBuffer.push(new Float32Array(input));
    this.samplesCollected += input.length;

    const shouldFlush = (
      this.samplesCollected >= this.maxSamplesPerChunk ||
      (this.hasVoicedContent &&
        this.consecutiveSilentCallbacks >= this.silenceFlushCallbacks &&
        this.samplesCollected > this.minFlushSamples)
    );
    if (!shouldFlush) return true;

    const merged = new Float32Array(this.samplesCollected);
    let offset = 0;
    for (const buf of this.chunkBuffer) {
      merged.set(buf, offset);
      offset += buf.length;
    }

    let powerSum = 0;
    for (let i = 0; i < merged.length; i++) {
      powerSum += merged[i] * merged[i];
    }
    const rms = Math.sqrt(powerSum / Math.max(merged.length, 1));

    this.hasVoicedContent = false;
    this.consecutiveSilentCallbacks = 0;

    if (rms >= this.minRmsThreshold) {
      // Nearest-sample decimation (no low-pass filter) followed by PCM16 conversion.
      const outLen = Math.floor(merged.length / this.ratio);
      const pcm16 = new Int16Array(outLen);
      for (let i = 0; i < outLen; i++) {
        const srcIdx = Math.floor(i * this.ratio);
        const s = Math.max(-1, Math.min(1, merged[srcIdx]));
        pcm16[i] = Math.round(s * 32767);
      }
      this.port.postMessage(
        { type: 'chunk', data: pcm16.buffer, rms, nativeSampleRate: this.nativeRate },
        [pcm16.buffer]
      );
    }

    // Keep up to preRollSamples of the tail as the start of the next buffer.
    // NOTE(review): this also happens after a chunk was emitted, so the next
    // chunk can begin with audio that was already sent - confirm intended.
    const keep = Math.min(this.preRollSamples, merged.length);
    this.chunkBuffer = [merged.slice(merged.length - keep)];
    this.samplesCollected = keep;
    return true;
  }
}
registerProcessor('audio-capture-processor', AudioCaptureProcessor);
`;

/**
 * Page-side installer: wraps `window.RTCPeerConnection` so that every incoming
 * audio track is tapped and its audio queued on `window.__audioCaptureChunks`.
 *
 * IMPORTANT: this function is handed to `page.addInitScript`, i.e. it is
 * serialized and executed inside the browser. It must stay fully
 * self-contained: no references to module-level values, imports or helpers
 * (type-only references are fine because they are erased).
 *
 * @param workletCode Source text of the AudioWorklet processor module.
 */
function installRtcAudioCapture(workletCode: string): void {
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- ad-hoc globals shared with the Node.js side
  const w = window as any;

  // Installing twice in the same window would double-wrap the constructor and
  // reset the chunk queue, so make the installer idempotent.
  if (w.__audioCaptureInstalled) return;
  w.__audioCaptureInstalled = true;

  w.__audioCaptureChunks = [] as CapturedAudioChunk[];
  w.__audioCaptureProcessors = {} as Record<string, AudioNode>;
  w.__audioCaptureContexts = {} as Record<string, AudioContext>;
  w.__audioCapturePeerConnections = [] as RTCPeerConnection[];

  const OrigRTC = window.RTCPeerConnection;
  // Nothing to wrap in windows that do not expose WebRTC.
  if (typeof OrigRTC !== 'function') return;

  const TARGET_RATE = 16000;
  const MAX_QUEUED_CHUNKS = 60;
  const MIN_RMS_THRESHOLD = 0.0003;
  const SILENCE_FLUSH_CALLBACKS = 6;

  /** Closes a context, ignoring both synchronous throws and promise rejections. */
  const closeQuietly = (ctx: AudioContext): void => {
    try {
      void Promise.resolve(ctx.close()).catch(() => {});
    } catch {
      /* already closed */
    }
  };

  /** Encodes the raw bytes of a PCM16 buffer as base64. */
  const pcm16ToBase64 = (pcm16: Int16Array): string => {
    const bytes = new Uint8Array(pcm16.buffer, pcm16.byteOffset, pcm16.byteLength);
    let binary = '';
    for (let i = 0; i < bytes.length; i++) {
      binary += String.fromCharCode(bytes[i]);
    }
    return btoa(binary);
  };

  /** Builds the Web Audio graph for one track and starts capturing from it. */
  const startPipeline = (track: MediaStreamTrack, trackId: string, ctx: AudioContext): void => {
    const nativeRate = ctx.sampleRate;
    const stream = new MediaStream([track]);
    const source = ctx.createMediaStreamSource(stream);

    console.log(
      `[AudioCapture][DIAG] AudioContext: state=${ctx.state}, sampleRate=${nativeRate}, stream.active=${stream.active}, streamTracks=${stream.getAudioTracks().length}`
    );
    ctx.addEventListener('statechange', () => {
      console.log(`[AudioCapture][DIAG] AudioContext statechange: ${ctx.state} for track=${trackId}`);
    });

    // The processor is routed to the destination through a zero-gain node, so
    // the graph is connected end-to-end without the capture being audible.
    const silentGain = ctx.createGain();
    silentGain.gain.value = 0;

    let processorNode: AudioNode | null = null;
    // Set once the track has ended; stops the async setup from reviving the pipeline.
    let disposed = false;

    const registerProcessor = (node: AudioNode): void => {
      processorNode = node;
      (w.__audioCaptureProcessors as Record<string, AudioNode>)[trackId] = node;
    };

    const pushChunk = (base64Data: string, rms: number): void => {
      const chunks = w.__audioCaptureChunks as CapturedAudioChunk[];
      // Bounded queue: when the Node.js side is not draining, new chunks are dropped.
      if (chunks.length >= MAX_QUEUED_CHUNKS) return;
      chunks.push({
        data: base64Data,
        sampleRate: TARGET_RATE,
        captureDiagnostics: {
          trackId,
          readyState: track.readyState,
          rms: Number(rms.toFixed(6)),
          nativeSampleRate: nativeRate,
        },
      });
    };

    /** Resolves to false when the AudioWorklet path is unavailable and the fallback is needed. */
    const startWorklet = async (): Promise<boolean> => {
      try {
        const blobUrl = URL.createObjectURL(
          new Blob([workletCode], { type: 'application/javascript' })
        );
        try {
          await ctx.audioWorklet.addModule(blobUrl);
        } finally {
          // Release the URL even when the module fails to load.
          URL.revokeObjectURL(blobUrl);
        }
        // The track ended or the context was closed while the module was loading.
        if (disposed || ctx.state === 'closed') return true;

        const node = new AudioWorkletNode(ctx, 'audio-capture-processor', {
          processorOptions: { nativeRate, targetRate: TARGET_RATE },
        });
        node.port.onmessage = (ev: MessageEvent) => {
          if (ev.data?.type !== 'chunk' || !ev.data.data) return;
          pushChunk(pcm16ToBase64(new Int16Array(ev.data.data)), ev.data.rms || 0);
        };
        source.connect(node);
        node.connect(silentGain);
        silentGain.connect(ctx.destination);
        registerProcessor(node);
        console.log(`[AudioCapture] WebRTC audio track intercepted (AudioWorklet): track=${trackId}, native=${nativeRate}Hz -> 16kHz mono`);
        return true;
      } catch (err) {
        console.warn(`[AudioCapture] AudioWorklet not available, falling back to ScriptProcessor: ${err}`);
        return false;
      }
    };

    /** Main-thread fallback that mirrors the buffering logic of the worklet processor. */
    const startScriptProcessor = (): void => {
      const maxSamplesPerChunk = nativeRate * 8;
      const preRollSamples = Math.ceil(nativeRate * 1.0);
      const minFlushSamples = Math.ceil(nativeRate * 0.5);
      const ratio = nativeRate / TARGET_RATE;

      const node = ctx.createScriptProcessor(8192, 1, 1);
      let chunkBuffer: Float32Array[] = [];
      let samplesCollected = 0;
      let hasVoicedContent = false;
      let consecutiveSilentCallbacks = 0;

      node.onaudioprocess = (e: AudioProcessingEvent) => {
        const input = e.inputBuffer.getChannelData(0);

        let cbPower = 0;
        for (let i = 0; i < input.length; i++) {
          cbPower += input[i] * input[i];
        }
        const cbRms = Math.sqrt(cbPower / Math.max(input.length, 1));
        if (cbRms >= MIN_RMS_THRESHOLD) {
          hasVoicedContent = true;
          consecutiveSilentCallbacks = 0;
        } else {
          consecutiveSilentCallbacks++;
        }

        chunkBuffer.push(new Float32Array(input));
        samplesCollected += input.length;

        const shouldFlush =
          samplesCollected >= maxSamplesPerChunk ||
          (hasVoicedContent &&
            consecutiveSilentCallbacks >= SILENCE_FLUSH_CALLBACKS &&
            samplesCollected > minFlushSamples);
        if (!shouldFlush) return;

        const merged = new Float32Array(samplesCollected);
        let offset = 0;
        for (const buf of chunkBuffer) {
          merged.set(buf, offset);
          offset += buf.length;
        }

        let powerSum = 0;
        for (let i = 0; i < merged.length; i++) {
          powerSum += merged[i] * merged[i];
        }
        const rms = Math.sqrt(powerSum / Math.max(merged.length, 1));

        hasVoicedContent = false;
        consecutiveSilentCallbacks = 0;

        if (rms >= MIN_RMS_THRESHOLD) {
          const outLen = Math.floor(merged.length / ratio);
          const pcm16 = new Int16Array(outLen);
          for (let i = 0; i < outLen; i++) {
            const srcIdx = Math.floor(i * ratio);
            const s = Math.max(-1, Math.min(1, merged[srcIdx]));
            pcm16[i] = Math.round(s * 32767);
          }
          pushChunk(pcm16ToBase64(pcm16), rms);
        }

        // Same pre-roll policy as the worklet: keep the tail for the next buffer.
        const keep = Math.min(preRollSamples, merged.length);
        chunkBuffer = [merged.slice(merged.length - keep)];
        samplesCollected = keep;
      };

      source.connect(node);
      node.connect(silentGain);
      silentGain.connect(ctx.destination);
      registerProcessor(node);
      console.log(`[AudioCapture] WebRTC audio track intercepted (ScriptProcessor fallback): track=${trackId}, native=${nativeRate}Hz -> 16kHz mono`);
    };

    // Clean up when the track ends (peer leaves, renegotiation, etc.)
    track.addEventListener('ended', () => {
      disposed = true;
      for (const node of [processorNode, source, silentGain]) {
        try {
          node?.disconnect();
        } catch {
          /* already disconnected */
        }
      }
      closeQuietly(ctx);
      // Look the maps up again: stopCapture() replaces them with fresh objects.
      delete (w.__audioCaptureProcessors as Record<string, AudioNode>)[trackId];
      delete (w.__audioCaptureContexts as Record<string, AudioContext>)[trackId];
      console.log(`[AudioCapture] Audio track ended: track=${trackId}, resources cleaned up`);
    });

    // Started last so that nothing above can throw once the async setup is running.
    void (async () => {
      const workletStarted = await startWorklet();
      if (disposed || ctx.state === 'closed') return;
      if (!workletStarted) startScriptProcessor();
      ctx.resume().catch(() => {});
    })().catch((err) => {
      console.error('[AudioCapture] Failed to set up audio capture:', err);
    });
  };

  /** Entry point for an incoming audio track: de-duplicates, then builds the pipeline. */
  const captureTrack = (track: MediaStreamTrack): void => {
    const trackId = track.id || `audio-track-${Date.now()}`;
    const processors = w.__audioCaptureProcessors as Record<string, AudioNode>;
    const contexts = w.__audioCaptureContexts as Record<string, AudioContext>;
    // The processor is registered asynchronously, the context synchronously;
    // checking both closes the window in which a repeated event could build a
    // second pipeline for the same track.
    if (processors[trackId] || contexts[trackId]) return;

    console.log(
      `[AudioCapture][DIAG] Track received: id=${trackId}, enabled=${track.enabled}, muted=${track.muted}, readyState=${track.readyState}, label=${track.label}`
    );
    track.addEventListener('mute', () => {
      console.log(`[AudioCapture][DIAG] Track MUTED: id=${trackId}`);
    });
    track.addEventListener('unmute', () => {
      console.log(`[AudioCapture][DIAG] Track UNMUTED: id=${trackId}`);
    });

    let ctx: AudioContext | null = null;
    try {
      const AudioCtx = window.AudioContext || w.webkitAudioContext;
      ctx = new AudioCtx() as AudioContext;
      contexts[trackId] = ctx;
      startPipeline(track, trackId, ctx);
    } catch (err) {
      console.error('[AudioCapture] Failed to set up audio capture:', err);
      if (ctx) {
        // Do not leak a half-initialised context or block a later retry.
        if (contexts[trackId] === ctx) delete contexts[trackId];
        closeQuietly(ctx);
      }
    }
  };

  // Plain function (not a class) so it can be invoked with or without `new`.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- constructor wrapper with a replaced prototype
  const WrappedRTC = function (this: RTCPeerConnection, ...args: any[]) {
    // Forward new.target so that page code subclassing RTCPeerConnection
    // still receives instances carrying its own prototype.
    const pc: RTCPeerConnection = Reflect.construct(OrigRTC, args, new.target || OrigRTC);
    try {
      const pcs = w.__audioCapturePeerConnections as RTCPeerConnection[];
      pcs.push(pc);
      console.log(`[AudioCapture][DIAG] New RTCPeerConnection created (total: ${pcs.length}), config:`, JSON.stringify(args[0] || {}).substring(0, 200));
    } catch {
      // Diagnostics only; never let them break connection setup.
    }
    pc.addEventListener('track', (event: RTCTrackEvent) => {
      if (event.track.kind !== 'audio') return;
      captureTrack(event.track);
    });
    return pc;
  } as any;

  // Share the prototype (instanceof, prototype patches) and inherit static members.
  WrappedRTC.prototype = OrigRTC.prototype;
  Object.setPrototypeOf(WrappedRTC, OrigRTC);
  w.RTCPeerConnection = WrappedRTC;
}

/**
 * Captures incoming meeting audio by intercepting WebRTC RTCPeerConnection.
 *
 * How it works:
 * 1. Before page navigation, `window.RTCPeerConnection` is wrapped via `addInitScript`.
 * 2. When Teams establishes WebRTC connections, the wrapper intercepts incoming audio tracks.
 * 3. Each incoming audio track is tapped via AudioContext + AudioWorkletNode
 *    (or a ScriptProcessorNode fallback).
 * 4. Audio is captured at the AudioContext's native rate, downsampled to 16 kHz
 *    and converted to PCM16.
 * 5. Chunks are queued in the page; the Node.js side polls the queue and hands
 *    every chunk to the `onAudioChunk` callback.
 */
export class AudioCaptureProcedure {
  private readonly _page: Page;
  private readonly _logger: Logger;
  private readonly _onAudioChunk: AudioChunkHandler;
  private _isCapturing = false;
  private _pollInterval: ReturnType<typeof setInterval> | null = null;
  private _injected = false;

  /**
   * @param page Playwright page the capture script is injected into.
   * @param logger Logger used for lifecycle and diagnostic messages.
   * @param onAudioChunk Called once per captured chunk with the base64 PCM16
   *   payload, its sample rate and optional capture diagnostics.
   */
  constructor(page: Page, logger: Logger, onAudioChunk: AudioChunkHandler) {
    this._page = page;
    this._logger = logger;
    this._onAudioChunk = onAudioChunk;
  }

  /**
   * Inject the RTCPeerConnection wrapper BEFORE any page navigation.
   * Must be called before navigating to Teams. Subsequent (and concurrent)
   * calls are no-ops.
   *
   * @throws Whatever `page.addInitScript` rejects with; injection can then be retried.
   */
  async injectCaptureOverride(): Promise<void> {
    if (this._injected) return;
    // Set before awaiting so that concurrent callers cannot inject twice.
    this._injected = true;

    this._logger.info('[AudioCapture] Injecting RTCPeerConnection wrapper...');
    try {
      await this._page.addInitScript(installRtcAudioCapture, AUDIO_CAPTURE_WORKLET_CODE);
    } catch (err) {
      this._injected = false;
      throw err;
    }
    this._logger.info('[AudioCapture] RTCPeerConnection wrapper injected');
  }

  /**
   * Start polling for captured audio chunks and forwarding them to the callback.
   * Calling it while a capture is already running is a no-op.
   */
  async startCapture(): Promise<void> {
    if (this._isCapturing) return;
    this._isCapturing = true;

    this._logger.info('[AudioCapture] Starting audio chunk polling...');

    let pollCount = 0;
    let pollInFlight = false;

    this._pollInterval = setInterval(() => {
      // Never overlap polls: a slow evaluate() could otherwise let a later
      // batch overtake an earlier one and deliver audio out of order.
      if (pollInFlight) return;
      pollInFlight = true;
      pollCount++;
      void this._pollOnce(pollCount).finally(() => {
        pollInFlight = false;
      });
    }, POLL_INTERVAL_MS);
  }

  /**
   * Stop capturing audio: stops polling, then disconnects every capture
   * processor and closes every capture AudioContext in the page.
   */
  async stopCapture(): Promise<void> {
    this._isCapturing = false;

    if (this._pollInterval) {
      clearInterval(this._pollInterval);
      this._pollInterval = null;
    }

    try {
      await this._page.evaluate(() => {
        // eslint-disable-next-line @typescript-eslint/no-explicit-any -- ad-hoc capture globals
        const w = window as any;
        const processors = (w.__audioCaptureProcessors || {}) as Record<string, AudioNode>;
        const contexts = (w.__audioCaptureContexts || {}) as Record<string, AudioContext>;

        for (const node of Object.values(processors)) {
          try {
            node?.disconnect();
          } catch {
            // ignore
          }
        }
        for (const ctx of Object.values(contexts)) {
          try {
            // close() rejects on an already-closed context; try/catch alone would miss that.
            void Promise.resolve(ctx?.close()).catch(() => {});
          } catch {
            // ignore
          }
        }

        w.__audioCaptureProcessors = {};
        w.__audioCaptureContexts = {};
      });
    } catch (err) {
      // Page might already be closed
      this._logger.debug(`[AudioCapture] Page-side cleanup skipped: ${AudioCaptureProcedure._describe(err)}`);
    }

    this._logger.info('[AudioCapture] Audio capture stopped');
  }

  /**
   * Runs one poll cycle: optionally logs a diagnostics snapshot, drains the
   * page-side queue and forwards every chunk to the callback. Never rejects.
   *
   * @param pollCount 1-based number of this poll, used to pace diagnostics.
   */
  private async _pollOnce(pollCount: number): Promise<void> {
    let chunks: CapturedAudioChunk[];
    try {
      if (pollCount % DIAG_EVERY_N_POLLS === 1) {
        const diagInfo = await this._page.evaluate(() => {
          // eslint-disable-next-line @typescript-eslint/no-explicit-any -- ad-hoc capture globals
          const w = window as any;
          const pcs = (w.__audioCapturePeerConnections || []) as RTCPeerConnection[];
          const procs = (w.__audioCaptureProcessors || {}) as Record<string, AudioNode>;
          const ctxs = (w.__audioCaptureContexts || {}) as Record<string, AudioContext>;
          const procKeys = Object.keys(procs);
          return {
            peerConnections: pcs.length,
            pcStates: pcs.map((p) => p.connectionState || 'unknown'),
            processors: procKeys.length,
            processorTrackIds: procKeys,
            audioContextStates: Object.entries(ctxs).map(([k, c]) => `${k}:${c.state}`),
          };
        });
        this._logger.info(`[AudioCapture][DIAG] Periodic: ${JSON.stringify(diagInfo)}`);
      }

      chunks = await this._page.evaluate(() => {
        // eslint-disable-next-line @typescript-eslint/no-explicit-any -- ad-hoc capture globals
        const buf = (window as any).__audioCaptureChunks as CapturedAudioChunk[] | undefined;
        // The queue does not exist until the init script has run in this document.
        return Array.isArray(buf) ? buf.splice(0, buf.length) : [];
      });
    } catch (err) {
      // Page might be navigating or closed
      this._logger.debug(`[AudioCapture] Poll skipped: ${AudioCaptureProcedure._describe(err)}`);
      return;
    }

    for (const chunk of chunks) {
      try {
        this._onAudioChunk(chunk.data, chunk.sampleRate || 16000, chunk.captureDiagnostics);
      } catch (err) {
        // A failing consumer must not cost us the remaining chunks of the batch.
        this._logger.error(`[AudioCapture] onAudioChunk callback failed: ${AudioCaptureProcedure._describe(err)}`);
      }
    }
  }

  /** Renders an unknown thrown value for log output. */
  private static _describe(err: unknown): string {
    return err instanceof Error ? err.message : String(err);
  }
}