From 79c1555e0cffc207ef2fbe3dcc116e016cdc7823 Mon Sep 17 00:00:00 2001 From: ValueOn AG Date: Tue, 24 Feb 2026 14:55:16 +0100 Subject: [PATCH] fix: capture active teams audio track with diagnostics Replace first-track audio lock with multi-track capture management, add RMS-based activity detection, and propagate capture diagnostics (trackId, readyState, rms, nativeSampleRate) to gateway logs for root-cause analysis. Co-authored-by: Cursor --- src/bot/audioCaptureProcedure.ts | 123 +++++++++++++++++++++++++------ src/bot/orchestrator.ts | 22 +++++- src/types/index.ts | 6 ++ 3 files changed, 126 insertions(+), 25 deletions(-) diff --git a/src/bot/audioCaptureProcedure.ts b/src/bot/audioCaptureProcedure.ts index a0cca6b..739539e 100644 --- a/src/bot/audioCaptureProcedure.ts +++ b/src/bot/audioCaptureProcedure.ts @@ -1,6 +1,19 @@ import { Page } from 'playwright'; import { Logger } from 'winston'; +interface AudioChunkDiagnostics { + trackId?: string; + readyState?: string; + rms?: number; + nativeSampleRate?: number; +} + +interface CapturedAudioChunk { + data: string; + sampleRate: number; + captureDiagnostics?: AudioChunkDiagnostics; +} + /** * Captures incoming meeting audio by intercepting WebRTC RTCPeerConnection. * @@ -14,7 +27,11 @@ import { Logger } from 'winston'; export class AudioCaptureProcedure { private _page: Page; private _logger: Logger; - private _onAudioChunk: (base64Data: string, sampleRate: number) => void; + private _onAudioChunk: ( + base64Data: string, + sampleRate: number, + captureDiagnostics?: AudioChunkDiagnostics + ) => void; private _isCapturing: boolean = false; private _pollInterval: ReturnType | null = null; private _injected: boolean = false; @@ -22,7 +39,11 @@ export class AudioCaptureProcedure { constructor( page: Page, logger: Logger, - onAudioChunk: (base64Data: string, sampleRate: number) => void, + onAudioChunk: ( + base64Data: string, + sampleRate: number, + captureDiagnostics?: AudioChunkDiagnostics + ) => void, ) { this._page = page; this._logger = logger; @@ -39,8 +60,9 @@ export class AudioCaptureProcedure { this._logger.info('[AudioCapture] Injecting RTCPeerConnection wrapper...'); await this._page.addInitScript(() => { - (window as any).__audioCaptureChunks = [] as string[]; - (window as any).__audioCaptureActive = false; + (window as any).__audioCaptureChunks = [] as any[]; + (window as any).__audioCaptureProcessors = {} as Record; + (window as any).__audioCaptureContexts = {} as Record; const OrigRTC = window.RTCPeerConnection; @@ -50,8 +72,12 @@ export class AudioCaptureProcedure { pc.addEventListener('track', (event: RTCTrackEvent) => { if (event.track.kind !== 'audio') return; - if ((window as any).__audioCaptureActive) return; - (window as any).__audioCaptureActive = true; + + const trackId = event.track.id || `audio-track-${Date.now()}`; + const processors = (window as any).__audioCaptureProcessors as Record; + if (processors[trackId]) { + return; + } try { const AudioCtx = window.AudioContext || (window as any).webkitAudioContext; @@ -67,6 +93,8 @@ export class AudioCaptureProcedure { const processor = ctx.createScriptProcessor(8192, 1, 1); let chunkBuffer: Float32Array[] = []; let samplesCollected = 0; + let skippedSilentChunks = 0; + const minRmsThreshold = 0.0015; // Collect ~1 second of audio at native rate before emitting const samplesPerChunk = nativeRate; const targetRate = 16000; @@ -85,6 +113,25 @@ export class AudioCaptureProcedure { offset += buf.length; } + // Calculate RMS to detect real audio activity + let powerSum = 0; + for (let i = 0; i < merged.length; i++) { + powerSum += merged[i] * merged[i]; + } + const rms = Math.sqrt(powerSum / Math.max(merged.length, 1)); + + if (rms < minRmsThreshold) { + skippedSilentChunks++; + if (skippedSilentChunks % 10 === 0) { + console.log( + `[AudioCapture] silent chunk skipped: track=${trackId}, readyState=${event.track.readyState}, rms=${rms.toFixed(6)}` + ); + } + chunkBuffer = []; + samplesCollected = 0; + return; + } + // Downsample from nativeRate to 16 kHz const ratio = nativeRate / targetRate; const outLen = Math.floor(merged.length / ratio); @@ -103,11 +150,22 @@ export class AudioCaptureProcedure { } const base64 = btoa(binary); - const chunks = (window as any).__audioCaptureChunks as string[]; - if (chunks.length < 30) { - chunks.push(base64); + const chunks = (window as any).__audioCaptureChunks as any[]; + if (chunks.length < 60) { + chunks.push({ + data: base64, + sampleRate: targetRate, + captureDiagnostics: { + trackId, + readyState: event.track.readyState, + rms: Number(rms.toFixed(6)), + nativeSampleRate: nativeRate, + }, + }); } + skippedSilentChunks = 0; + chunkBuffer = []; samplesCollected = 0; } @@ -134,17 +192,21 @@ export class AudioCaptureProcedure { silentGain.disconnect(); ctx.close(); } catch { /* already closed */ } - (window as any).__audioCaptureActive = false; - console.log('[AudioCapture] Audio track ended, resources cleaned up'); + const processorsObj = (window as any).__audioCaptureProcessors as Record; + const contextsObj = (window as any).__audioCaptureContexts as Record; + delete processorsObj[trackId]; + delete contextsObj[trackId]; + console.log(`[AudioCapture] Audio track ended: track=${trackId}, resources cleaned up`); }); - (window as any).__audioCaptureCtx = ctx; - (window as any).__audioCaptureProcessor = processor; + const processorsObj = (window as any).__audioCaptureProcessors as Record; + const contextsObj = (window as any).__audioCaptureContexts as Record; + processorsObj[trackId] = processor; + contextsObj[trackId] = ctx; - console.log(`[AudioCapture] WebRTC audio track intercepted, native ${nativeRate}Hz -> 16kHz mono`); + console.log(`[AudioCapture] WebRTC audio track intercepted: track=${trackId}, native=${nativeRate}Hz -> 16kHz mono`); } catch (err) { console.error('[AudioCapture] Failed to set up audio capture:', err); - (window as any).__audioCaptureActive = false; } }); @@ -172,13 +234,17 @@ export class AudioCaptureProcedure { this._pollInterval = setInterval(async () => { try { const chunks = await this._page.evaluate(() => { - const buf = (window as any).__audioCaptureChunks as string[]; + const buf = (window as any).__audioCaptureChunks as CapturedAudioChunk[]; const result = buf.splice(0, buf.length); return result; }); for (const chunk of chunks) { - this._onAudioChunk(chunk, 16000); + this._onAudioChunk( + chunk.data, + chunk.sampleRate || 16000, + chunk.captureDiagnostics + ); } } catch { // Page might be navigating or closed @@ -199,11 +265,24 @@ export class AudioCaptureProcedure { try { await this._page.evaluate(() => { - (window as any).__audioCaptureActive = false; - const proc = (window as any).__audioCaptureProcessor; - if (proc) try { proc.disconnect(); } catch { /* ok */ } - const ctx = (window as any).__audioCaptureCtx as AudioContext; - if (ctx) ctx.close(); + const processors = (window as any).__audioCaptureProcessors as Record; + const contexts = (window as any).__audioCaptureContexts as Record; + Object.keys(processors || {}).forEach((trackId) => { + try { + processors[trackId]?.disconnect(); + } catch { + // ignore + } + }); + Object.keys(contexts || {}).forEach((trackId) => { + try { + contexts[trackId]?.close(); + } catch { + // ignore + } + }); + (window as any).__audioCaptureProcessors = {}; + (window as any).__audioCaptureContexts = {}; }); } catch { // Page might already be closed diff --git a/src/bot/orchestrator.ts b/src/bot/orchestrator.ts index e68a810..eb9e7fb 100644 --- a/src/bot/orchestrator.ts +++ b/src/bot/orchestrator.ts @@ -864,6 +864,12 @@ export class BotOrchestrator { }); this._page = await this._context.newPage(); + this._page.on('console', (msg) => { + const text = msg.text(); + if (text.includes('[AudioCapture]') || text.includes('[AudioPlayback]')) { + this._logger.info(`[PageConsole] ${text}`); + } + }); // Stealth: Override browser properties that reveal automation. // Teams checks these to detect headless/automated browsers and @@ -906,8 +912,8 @@ export class BotOrchestrator { this._audioCaptureProcedure = new AudioCaptureProcedure( this._page, this._logger, - (base64Data, sampleRate) => { - this._sendAudioChunk(base64Data, sampleRate); + (base64Data, sampleRate, captureDiagnostics) => { + this._sendAudioChunk(base64Data, sampleRate, captureDiagnostics); }, ); this._captionsProcedure = new CaptionsProcedure( @@ -1176,7 +1182,16 @@ export class BotOrchestrator { /** * Send an audio chunk to the Gateway for STT processing. */ - private _sendAudioChunk(base64Data: string, sampleRate: number): void { + private _sendAudioChunk( + base64Data: string, + sampleRate: number, + captureDiagnostics?: { + trackId?: string; + readyState?: string; + rms?: number; + nativeSampleRate?: number; + }, + ): void { const message: AudioChunkMessage = { type: 'audioChunk', sessionId: this._sessionId, @@ -1185,6 +1200,7 @@ export class BotOrchestrator { sampleRate, data: base64Data, timestamp: new Date().toISOString(), + captureDiagnostics, }, }; this._sendToGateway(message); diff --git a/src/types/index.ts b/src/types/index.ts index 2d5033e..c0a6004 100644 --- a/src/types/index.ts +++ b/src/types/index.ts @@ -63,6 +63,12 @@ export interface AudioChunkMessage { sampleRate: number; data: string; // base64 encoded timestamp: string; + captureDiagnostics?: { + trackId?: string; + readyState?: string; + rms?: number; + nativeSampleRate?: number; + }; }; }