fix: capture active Teams audio track with diagnostics

Replace first-track audio lock with multi-track capture management, add RMS-based silence gating (chunks whose RMS falls below the threshold are dropped), and propagate capture diagnostics (trackId, readyState, rms, nativeSampleRate) to gateway logs for root-cause analysis.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-24 14:55:16 +01:00 · 2026-02-24 14:55:16 +01:00 · 79c1555e0c
commit 79c1555e0c
parent f47b3c5682
3 changed files with 126 additions and 25 deletions
--- a/src/bot/audioCaptureProcedure.ts
+++ b/src/bot/audioCaptureProcedure.ts
@ -1,6 +1,19 @@
 import { Page } from 'playwright';
 import { Logger } from 'winston';
 /**
  * Per-chunk capture metadata attached by the in-page audio processor.
  * Every field is optional.
  */
 interface AudioChunkDiagnostics {
  /** ID of the captured track: `event.track.id`, or a generated `audio-track-<timestamp>` fallback when that is empty. */
  trackId?: string;
  /** `event.track.readyState` as read at the moment the chunk was queued. */
  readyState?: string;
  /** RMS of the merged native-rate samples (computed before downsampling), rounded to 6 decimals. */
  rms?: number;
  /** Sample rate of the audio before downsampling — presumably the AudioContext's rate; TODO confirm. */
  nativeSampleRate?: number;
 }
 /**
  * One queued audio chunk, in the shape the page script pushes onto
  * `window.__audioCaptureChunks` and the polling loop drains.
  */
 interface CapturedAudioChunk {
  /** Base64-encoded audio payload (output of `btoa`). */
  data: string;
  /** Sample rate of `data` in Hz; the page script sets this to its 16 kHz target rate. */
  sampleRate: number;
  /** Optional capture metadata describing the track and signal level for this chunk. */
  captureDiagnostics?: AudioChunkDiagnostics;
 }
 /**
 * Captures incoming meeting audio by intercepting WebRTC RTCPeerConnection.
 *
@ -14,7 +27,11 @@ import { Logger } from 'winston';
 export class AudioCaptureProcedure {
  private _page: Page;
  private _logger: Logger;
-  private _onAudioChunk: (base64Data: string, sampleRate: number) => void;
+  private _onAudioChunk: (
    base64Data: string,
    sampleRate: number,
    captureDiagnostics?: AudioChunkDiagnostics
  ) => void;
  private _isCapturing: boolean = false;
  private _pollInterval: ReturnType<typeof setInterval> | null = null;
  private _injected: boolean = false;
@ -22,7 +39,11 @@ export class AudioCaptureProcedure {
  constructor(
    page: Page,
    logger: Logger,
-    onAudioChunk: (base64Data: string, sampleRate: number) => void,
+    onAudioChunk: (
      base64Data: string,
      sampleRate: number,
      captureDiagnostics?: AudioChunkDiagnostics
    ) => void,
  ) {
    this._page = page;
    this._logger = logger;
@ -39,8 +60,9 @@ export class AudioCaptureProcedure {
    this._logger.info('[AudioCapture] Injecting RTCPeerConnection wrapper...');
    await this._page.addInitScript(() => {
-      (window as any).__audioCaptureChunks = [] as string[];
+      (window as any).__audioCaptureChunks = [] as any[];
-      (window as any).__audioCaptureActive = false;
+      (window as any).__audioCaptureProcessors = {} as Record<string, any>;
      (window as any).__audioCaptureContexts = {} as Record<string, AudioContext>;
      const OrigRTC = window.RTCPeerConnection;
@ -50,8 +72,12 @@ export class AudioCaptureProcedure {
        pc.addEventListener('track', (event: RTCTrackEvent) => {
          if (event.track.kind !== 'audio') return;
-          if ((window as any).__audioCaptureActive) return;
+
-          (window as any).__audioCaptureActive = true;
+          const trackId = event.track.id || `audio-track-${Date.now()}`;
          const processors = (window as any).__audioCaptureProcessors as Record<string, any>;
          if (processors[trackId]) {
            return;
          }
          try {
            const AudioCtx = window.AudioContext || (window as any).webkitAudioContext;
@ -67,6 +93,8 @@ export class AudioCaptureProcedure {
            const processor = ctx.createScriptProcessor(8192, 1, 1);
            let chunkBuffer: Float32Array[] = [];
            let samplesCollected = 0;
            let skippedSilentChunks = 0;
            const minRmsThreshold = 0.0015;
            // Collect ~1 second of audio at native rate before emitting
            const samplesPerChunk = nativeRate;
            const targetRate = 16000;
@ -85,6 +113,25 @@ export class AudioCaptureProcedure {
                  offset += buf.length;
                }
                // Calculate RMS to detect real audio activity
                let powerSum = 0;
                for (let i = 0; i < merged.length; i++) {
                  powerSum += merged[i] * merged[i];
                }
                const rms = Math.sqrt(powerSum / Math.max(merged.length, 1));
                if (rms < minRmsThreshold) {
                  skippedSilentChunks++;
                  if (skippedSilentChunks % 10 === 0) {
                    console.log(
                      `[AudioCapture] silent chunk skipped: track=${trackId}, readyState=${event.track.readyState}, rms=${rms.toFixed(6)}`
                    );
                  }
                  chunkBuffer = [];
                  samplesCollected = 0;
                  return;
                }
                // Downsample from nativeRate to 16 kHz
                const ratio = nativeRate / targetRate;
                const outLen = Math.floor(merged.length / ratio);
@ -103,11 +150,22 @@ export class AudioCaptureProcedure {
                }
                const base64 = btoa(binary);
-                const chunks = (window as any).__audioCaptureChunks as string[];
+                const chunks = (window as any).__audioCaptureChunks as any[];
-                if (chunks.length < 30) {
+                if (chunks.length < 60) {
-                  chunks.push(base64);
+                  chunks.push({
                    data: base64,
                    sampleRate: targetRate,
                    captureDiagnostics: {
                      trackId,
                      readyState: event.track.readyState,
                      rms: Number(rms.toFixed(6)),
                      nativeSampleRate: nativeRate,
                    },
                  });
                }
                skippedSilentChunks = 0;
                chunkBuffer = [];
                samplesCollected = 0;
              }
@ -134,17 +192,21 @@ export class AudioCaptureProcedure {
                silentGain.disconnect();
                ctx.close();
              } catch { /* already closed */ }
-              (window as any).__audioCaptureActive = false;
+              const processorsObj = (window as any).__audioCaptureProcessors as Record<string, any>;
-              console.log('[AudioCapture] Audio track ended, resources cleaned up');
+              const contextsObj = (window as any).__audioCaptureContexts as Record<string, AudioContext>;
              delete processorsObj[trackId];
              delete contextsObj[trackId];
              console.log(`[AudioCapture] Audio track ended: track=${trackId}, resources cleaned up`);
            });
-            (window as any).__audioCaptureCtx = ctx;
+            const processorsObj = (window as any).__audioCaptureProcessors as Record<string, any>;
-            (window as any).__audioCaptureProcessor = processor;
+            const contextsObj = (window as any).__audioCaptureContexts as Record<string, AudioContext>;
            processorsObj[trackId] = processor;
            contextsObj[trackId] = ctx;
-            console.log(`[AudioCapture] WebRTC audio track intercepted, native ${nativeRate}Hz -> 16kHz mono`);
+            console.log(`[AudioCapture] WebRTC audio track intercepted: track=${trackId}, native=${nativeRate}Hz -> 16kHz mono`);
          } catch (err) {
            // Setup for this track failed. Nothing is registered for the track
            // until setup completes, so there is no per-track state to roll back.
            // The old global `__audioCaptureActive` flag is no longer initialised
            // or read anywhere in the capture script, so it is not reset here.
            console.error('[AudioCapture] Failed to set up audio capture:', err);
          }
        });
@ -172,13 +234,17 @@ export class AudioCaptureProcedure {
    this._pollInterval = setInterval(async () => {
      try {
        const chunks = await this._page.evaluate(() => {
-          const buf = (window as any).__audioCaptureChunks as string[];
+          const buf = (window as any).__audioCaptureChunks as CapturedAudioChunk[];
          const result = buf.splice(0, buf.length);
          return result;
        });
        for (const chunk of chunks) {
-          this._onAudioChunk(chunk, 16000);
+          this._onAudioChunk(
            chunk.data,
            chunk.sampleRate || 16000,
            chunk.captureDiagnostics
          );
        }
      } catch {
        // Page might be navigating or closed
@ -199,11 +265,24 @@ export class AudioCaptureProcedure {
    try {
      await this._page.evaluate(() => {
-        (window as any).__audioCaptureActive = false;
+        const processors = (window as any).__audioCaptureProcessors as Record<string, any>;
-        const proc = (window as any).__audioCaptureProcessor;
+        const contexts = (window as any).__audioCaptureContexts as Record<string, AudioContext>;
-        if (proc) try { proc.disconnect(); } catch { /* ok */ }
+        Object.keys(processors || {}).forEach((trackId) => {
-        const ctx = (window as any).__audioCaptureCtx as AudioContext;
+          try {
-        if (ctx) ctx.close();
+            processors[trackId]?.disconnect();
          } catch {
            // ignore
          }
        });
        Object.keys(contexts || {}).forEach((trackId) => {
          try {
            contexts[trackId]?.close();
          } catch {
            // ignore
          }
        });
        (window as any).__audioCaptureProcessors = {};
        (window as any).__audioCaptureContexts = {};
      });
    } catch {
      // Page might already be closed
--- a/src/bot/orchestrator.ts
+++ b/src/bot/orchestrator.ts
@ -864,6 +864,12 @@ export class BotOrchestrator {
    });
    this._page = await this._context.newPage();
    this._page.on('console', (msg) => {
      const text = msg.text();
      if (text.includes('[AudioCapture]') || text.includes('[AudioPlayback]')) {
        this._logger.info(`[PageConsole] ${text}`);
      }
    });
    // Stealth: Override browser properties that reveal automation.
    // Teams checks these to detect headless/automated browsers and
@ -906,8 +912,8 @@ export class BotOrchestrator {
    this._audioCaptureProcedure = new AudioCaptureProcedure(
      this._page,
      this._logger,
-      (base64Data, sampleRate) => {
+      (base64Data, sampleRate, captureDiagnostics) => {
-        this._sendAudioChunk(base64Data, sampleRate);
+        this._sendAudioChunk(base64Data, sampleRate, captureDiagnostics);
      },
    );
    this._captionsProcedure = new CaptionsProcedure(
@ -1176,7 +1182,16 @@ export class BotOrchestrator {
  /**
   * Send an audio chunk to the Gateway for STT processing.
   */
-  private _sendAudioChunk(base64Data: string, sampleRate: number): void {
+  private _sendAudioChunk(
    base64Data: string,
    sampleRate: number,
    captureDiagnostics?: {
      trackId?: string;
      readyState?: string;
      rms?: number;
      nativeSampleRate?: number;
    },
  ): void {
    const message: AudioChunkMessage = {
      type: 'audioChunk',
      sessionId: this._sessionId,
@ -1185,6 +1200,7 @@ export class BotOrchestrator {
        sampleRate,
        data: base64Data,
        timestamp: new Date().toISOString(),
        captureDiagnostics,
      },
    };
    this._sendToGateway(message);
--- a/src/types/index.ts
+++ b/src/types/index.ts
@ -63,6 +63,12 @@ export interface AudioChunkMessage {
    sampleRate: number;
    data: string; // base64 encoded
    timestamp: string;
    captureDiagnostics?: {
      trackId?: string;
      readyState?: string;
      rms?: number;
      nativeSampleRate?: number;
    };
  };
 }