fix: capture active Teams audio track with diagnostics

Replace the first-track audio lock with multi-track capture management, add RMS-based activity detection (chunks below the RMS threshold are skipped as silent), and propagate capture diagnostics (trackId, readyState, rms, nativeSampleRate) to gateway logs for root-cause analysis.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-24 14:55:16 +01:00 · 2026-02-24 14:55:16 +01:00 · 79c1555e0c
commit 79c1555e0c
parent f47b3c5682
3 changed files with 126 additions and 25 deletions
--- a/src/bot/audioCaptureProcedure.ts
+++ b/src/bot/audioCaptureProcedure.ts
@ -1,6 +1,19 @@
 import { Page } from 'playwright';
 import { Logger } from 'winston';

+interface AudioChunkDiagnostics {
+  trackId?: string;
+  readyState?: string;
+  rms?: number;
+  nativeSampleRate?: number;
+}
+
+interface CapturedAudioChunk {
+  data: string;
+  sampleRate: number;
+  captureDiagnostics?: AudioChunkDiagnostics;
+}
+
 /**
 * Captures incoming meeting audio by intercepting WebRTC RTCPeerConnection.
 *
@ -14,7 +27,11 @@ import { Logger } from 'winston';
 export class AudioCaptureProcedure {
  private _page: Page;
  private _logger: Logger;
-  private _onAudioChunk: (base64Data: string, sampleRate: number) => void;
+  private _onAudioChunk: (
+    base64Data: string,
+    sampleRate: number,
+    captureDiagnostics?: AudioChunkDiagnostics
+  ) => void;
  private _isCapturing: boolean = false;
  private _pollInterval: ReturnType<typeof setInterval> | null = null;
  private _injected: boolean = false;
@ -22,7 +39,11 @@ export class AudioCaptureProcedure {
  constructor(
    page: Page,
    logger: Logger,
-    onAudioChunk: (base64Data: string, sampleRate: number) => void,
+    onAudioChunk: (
+      base64Data: string,
+      sampleRate: number,
+      captureDiagnostics?: AudioChunkDiagnostics
+    ) => void,
  ) {
    this._page = page;
    this._logger = logger;
@ -39,8 +60,9 @@ export class AudioCaptureProcedure {
    this._logger.info('[AudioCapture] Injecting RTCPeerConnection wrapper...');

    await this._page.addInitScript(() => {
-      (window as any).__audioCaptureChunks = [] as string[];
-      (window as any).__audioCaptureActive = false;
+      (window as any).__audioCaptureChunks = [] as any[];
+      (window as any).__audioCaptureProcessors = {} as Record<string, any>;
+      (window as any).__audioCaptureContexts = {} as Record<string, AudioContext>;

      const OrigRTC = window.RTCPeerConnection;

@ -50,8 +72,12 @@ export class AudioCaptureProcedure {

        pc.addEventListener('track', (event: RTCTrackEvent) => {
          if (event.track.kind !== 'audio') return;
-          if ((window as any).__audioCaptureActive) return;
-          (window as any).__audioCaptureActive = true;
+
+          const trackId = event.track.id || `audio-track-${Date.now()}`;
+          const processors = (window as any).__audioCaptureProcessors as Record<string, any>;
+          if (processors[trackId]) {
+            return;
+          }

          try {
            const AudioCtx = window.AudioContext || (window as any).webkitAudioContext;
@ -67,6 +93,8 @@ export class AudioCaptureProcedure {
            const processor = ctx.createScriptProcessor(8192, 1, 1);
            let chunkBuffer: Float32Array[] = [];
            let samplesCollected = 0;
+            let skippedSilentChunks = 0;
+            const minRmsThreshold = 0.0015;
            // Collect ~1 second of audio at native rate before emitting
            const samplesPerChunk = nativeRate;
            const targetRate = 16000;
@ -85,6 +113,25 @@ export class AudioCaptureProcedure {
                  offset += buf.length;
                }

+                // Calculate RMS to detect real audio activity
+                let powerSum = 0;
+                for (let i = 0; i < merged.length; i++) {
+                  powerSum += merged[i] * merged[i];
+                }
+                const rms = Math.sqrt(powerSum / Math.max(merged.length, 1));
+
+                if (rms < minRmsThreshold) {
+                  skippedSilentChunks++;
+                  if (skippedSilentChunks % 10 === 0) {
+                    console.log(
+                      `[AudioCapture] silent chunk skipped: track=${trackId}, readyState=${event.track.readyState}, rms=${rms.toFixed(6)}`
+                    );
+                  }
+                  chunkBuffer = [];
+                  samplesCollected = 0;
+                  return;
+                }
+
                // Downsample from nativeRate to 16 kHz
                const ratio = nativeRate / targetRate;
                const outLen = Math.floor(merged.length / ratio);
@ -103,11 +150,22 @@ export class AudioCaptureProcedure {
                }
                const base64 = btoa(binary);

-                const chunks = (window as any).__audioCaptureChunks as string[];
-                if (chunks.length < 30) {
-                  chunks.push(base64);
+                const chunks = (window as any).__audioCaptureChunks as any[];
+                if (chunks.length < 60) {
+                  chunks.push({
+                    data: base64,
+                    sampleRate: targetRate,
+                    captureDiagnostics: {
+                      trackId,
+                      readyState: event.track.readyState,
+                      rms: Number(rms.toFixed(6)),
+                      nativeSampleRate: nativeRate,
+                    },
+                  });
                }

+                skippedSilentChunks = 0;
+
                chunkBuffer = [];
                samplesCollected = 0;
              }
@ -134,17 +192,21 @@ export class AudioCaptureProcedure {
                silentGain.disconnect();
                ctx.close();
              } catch { /* already closed */ }
-              (window as any).__audioCaptureActive = false;
-              console.log('[AudioCapture] Audio track ended, resources cleaned up');
+              const processorsObj = (window as any).__audioCaptureProcessors as Record<string, any>;
+              const contextsObj = (window as any).__audioCaptureContexts as Record<string, AudioContext>;
+              delete processorsObj[trackId];
+              delete contextsObj[trackId];
+              console.log(`[AudioCapture] Audio track ended: track=${trackId}, resources cleaned up`);
            });

-            (window as any).__audioCaptureCtx = ctx;
-            (window as any).__audioCaptureProcessor = processor;
+            const processorsObj = (window as any).__audioCaptureProcessors as Record<string, any>;
+            const contextsObj = (window as any).__audioCaptureContexts as Record<string, AudioContext>;
+            processorsObj[trackId] = processor;
+            contextsObj[trackId] = ctx;

-            console.log(`[AudioCapture] WebRTC audio track intercepted, native ${nativeRate}Hz -> 16kHz mono`);
+            console.log(`[AudioCapture] WebRTC audio track intercepted: track=${trackId}, native=${nativeRate}Hz -> 16kHz mono`);
          } catch (err) {
            console.error('[AudioCapture] Failed to set up audio capture:', err);
-            (window as any).__audioCaptureActive = false;
          }
        });

@ -172,13 +234,17 @@ export class AudioCaptureProcedure {
    this._pollInterval = setInterval(async () => {
      try {
        const chunks = await this._page.evaluate(() => {
-          const buf = (window as any).__audioCaptureChunks as string[];
+          const buf = (window as any).__audioCaptureChunks as CapturedAudioChunk[];
          const result = buf.splice(0, buf.length);
          return result;
        });

        for (const chunk of chunks) {
-          this._onAudioChunk(chunk, 16000);
+          this._onAudioChunk(
+            chunk.data,
+            chunk.sampleRate || 16000,
+            chunk.captureDiagnostics
+          );
        }
      } catch {
        // Page might be navigating or closed
@ -199,11 +265,24 @@ export class AudioCaptureProcedure {

    try {
      await this._page.evaluate(() => {
-        (window as any).__audioCaptureActive = false;
-        const proc = (window as any).__audioCaptureProcessor;
-        if (proc) try { proc.disconnect(); } catch { /* ok */ }
-        const ctx = (window as any).__audioCaptureCtx as AudioContext;
-        if (ctx) ctx.close();
+        const processors = (window as any).__audioCaptureProcessors as Record<string, any>;
+        const contexts = (window as any).__audioCaptureContexts as Record<string, AudioContext>;
+        Object.keys(processors || {}).forEach((trackId) => {
+          try {
+            processors[trackId]?.disconnect();
+          } catch {
+            // ignore
+          }
+        });
+        Object.keys(contexts || {}).forEach((trackId) => {
+          try {
+            contexts[trackId]?.close();
+          } catch {
+            // ignore
+          }
+        });
+        (window as any).__audioCaptureProcessors = {};
+        (window as any).__audioCaptureContexts = {};
      });
    } catch {
      // Page might already be closed
--- a/src/bot/orchestrator.ts
+++ b/src/bot/orchestrator.ts
@ -864,6 +864,12 @@ export class BotOrchestrator {
    });

    this._page = await this._context.newPage();
+    this._page.on('console', (msg) => {
+      const text = msg.text();
+      if (text.includes('[AudioCapture]') || text.includes('[AudioPlayback]')) {
+        this._logger.info(`[PageConsole] ${text}`);
+      }
+    });

    // Stealth: Override browser properties that reveal automation.
    // Teams checks these to detect headless/automated browsers and
@ -906,8 +912,8 @@ export class BotOrchestrator {
    this._audioCaptureProcedure = new AudioCaptureProcedure(
      this._page,
      this._logger,
-      (base64Data, sampleRate) => {
-        this._sendAudioChunk(base64Data, sampleRate);
+      (base64Data, sampleRate, captureDiagnostics) => {
+        this._sendAudioChunk(base64Data, sampleRate, captureDiagnostics);
      },
    );
    this._captionsProcedure = new CaptionsProcedure(
@ -1176,7 +1182,16 @@ export class BotOrchestrator {
  /**
   * Send an audio chunk to the Gateway for STT processing.
   */
-  private _sendAudioChunk(base64Data: string, sampleRate: number): void {
+  private _sendAudioChunk(
+    base64Data: string,
+    sampleRate: number,
+    captureDiagnostics?: {
+      trackId?: string;
+      readyState?: string;
+      rms?: number;
+      nativeSampleRate?: number;
+    },
+  ): void {
    const message: AudioChunkMessage = {
      type: 'audioChunk',
      sessionId: this._sessionId,
@ -1185,6 +1200,7 @@ export class BotOrchestrator {
        sampleRate,
        data: base64Data,
        timestamp: new Date().toISOString(),
+        captureDiagnostics,
      },
    };
    this._sendToGateway(message);
--- a/src/types/index.ts
+++ b/src/types/index.ts
@ -63,6 +63,12 @@ export interface AudioChunkMessage {
    sampleRate: number;
    data: string; // base64 encoded
    timestamp: string;
+    captureDiagnostics?: {
+      trackId?: string;
+      readyState?: string;
+      rms?: number;
+      nativeSampleRate?: number;
+    };
  };
 }