From 79c1555e0cffc207ef2fbe3dcc116e016cdc7823 Mon Sep 17 00:00:00 2001
From: ValueOn AG <p.motsch@valueon.ch>
Date: Tue, 24 Feb 2026 14:55:16 +0100
Subject: [PATCH] fix: capture active Teams audio track with diagnostics

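Replace the first-track audio lock with per-track capture management, add RMS-based activity detection that skips chunks below the silence threshold, forward [AudioCapture]/[AudioPlayback] page console output to the bot logger, and attach capture diagnostics (trackId, readyState, rms, nativeSampleRate) to each audio chunk sent to the Gateway for root-cause analysis.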

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 src/bot/audioCaptureProcedure.ts | 123 +++++++++++++++++++++++++------
 src/bot/orchestrator.ts          |  22 +++++-
 src/types/index.ts               |   6 ++
 3 files changed, 126 insertions(+), 25 deletions(-)
diff --git a/src/bot/audioCaptureProcedure.ts b/src/bot/audioCaptureProcedure.ts
index a0cca6b..739539e 100644
--- a/src/bot/audioCaptureProcedure.ts
+++ b/src/bot/audioCaptureProcedure.ts
@@ -1,6 +1,19 @@
 import { Page } from 'playwright';
 import { Logger } from 'winston';
 
+interface AudioChunkDiagnostics {
+  trackId?: string;
+  readyState?: string;
+  rms?: number;
+  nativeSampleRate?: number;
+}
+
+interface CapturedAudioChunk {
+  data: string;
+  sampleRate: number;
+  captureDiagnostics?: AudioChunkDiagnostics;
+}
+
 /**
  * Captures incoming meeting audio by intercepting WebRTC RTCPeerConnection.
  *
@@ -14,7 +27,11 @@ import { Logger } from 'winston';
 export class AudioCaptureProcedure {
   private _page: Page;
   private _logger: Logger;
-  private _onAudioChunk: (base64Data: string, sampleRate: number) => void;
+  private _onAudioChunk: (
+    base64Data: string,
+    sampleRate: number,
+    captureDiagnostics?: AudioChunkDiagnostics
+  ) => void;
   private _isCapturing: boolean = false;
   private _pollInterval: ReturnType<typeof setInterval> | null = null;
   private _injected: boolean = false;
@@ -22,7 +39,11 @@ export class AudioCaptureProcedure {
   constructor(
     page: Page,
     logger: Logger,
-    onAudioChunk: (base64Data: string, sampleRate: number) => void,
+    onAudioChunk: (
+      base64Data: string,
+      sampleRate: number,
+      captureDiagnostics?: AudioChunkDiagnostics
+    ) => void,
   ) {
     this._page = page;
     this._logger = logger;
@@ -39,8 +60,9 @@ export class AudioCaptureProcedure {
     this._logger.info('[AudioCapture] Injecting RTCPeerConnection wrapper...');
 
     await this._page.addInitScript(() => {
-      (window as any).__audioCaptureChunks = [] as string[];
-      (window as any).__audioCaptureActive = false;
+      (window as any).__audioCaptureChunks = [] as any[];
+      (window as any).__audioCaptureProcessors = {} as Record<string, any>;
+      (window as any).__audioCaptureContexts = {} as Record<string, AudioContext>;
 
       const OrigRTC = window.RTCPeerConnection;
 
@@ -50,8 +72,12 @@ export class AudioCaptureProcedure {
 
         pc.addEventListener('track', (event: RTCTrackEvent) => {
           if (event.track.kind !== 'audio') return;
-          if ((window as any).__audioCaptureActive) return;
-          (window as any).__audioCaptureActive = true;
+
+          const trackId = event.track.id || `audio-track-${Date.now()}`;
+          const processors = (window as any).__audioCaptureProcessors as Record<string, any>;
+          if (processors[trackId]) {
+            return;
+          }
 
           try {
             const AudioCtx = window.AudioContext || (window as any).webkitAudioContext;
@@ -67,6 +93,8 @@ export class AudioCaptureProcedure {
             const processor = ctx.createScriptProcessor(8192, 1, 1);
             let chunkBuffer: Float32Array[] = [];
             let samplesCollected = 0;
+            let skippedSilentChunks = 0;
+            const minRmsThreshold = 0.0015;
             // Collect ~1 second of audio at native rate before emitting
             const samplesPerChunk = nativeRate;
             const targetRate = 16000;
@@ -85,6 +113,25 @@ export class AudioCaptureProcedure {
                   offset += buf.length;
                 }
 
+                // Calculate RMS to detect real audio activity
+                let powerSum = 0;
+                for (let i = 0; i < merged.length; i++) {
+                  powerSum += merged[i] * merged[i];
+                }
+                const rms = Math.sqrt(powerSum / Math.max(merged.length, 1));
+
+                if (rms < minRmsThreshold) {
+                  skippedSilentChunks++;
+                  if (skippedSilentChunks % 10 === 0) {
+                    console.log(
+                      `[AudioCapture] silent chunk skipped: track=${trackId}, readyState=${event.track.readyState}, rms=${rms.toFixed(6)}`
+                    );
+                  }
+                  chunkBuffer = [];
+                  samplesCollected = 0;
+                  return;
+                }
+
                 // Downsample from nativeRate to 16 kHz
                 const ratio = nativeRate / targetRate;
                 const outLen = Math.floor(merged.length / ratio);
@@ -103,11 +150,22 @@ export class AudioCaptureProcedure {
                 }
                 const base64 = btoa(binary);
 
-                const chunks = (window as any).__audioCaptureChunks as string[];
-                if (chunks.length < 30) {
-                  chunks.push(base64);
+                const chunks = (window as any).__audioCaptureChunks as any[];
+                if (chunks.length < 60) {
+                  chunks.push({
+                    data: base64,
+                    sampleRate: targetRate,
+                    captureDiagnostics: {
+                      trackId,
+                      readyState: event.track.readyState,
+                      rms: Number(rms.toFixed(6)),
+                      nativeSampleRate: nativeRate,
+                    },
+                  });
                 }
 
+                skippedSilentChunks = 0;
+
                 chunkBuffer = [];
                 samplesCollected = 0;
               }
@@ -134,17 +192,21 @@ export class AudioCaptureProcedure {
                 silentGain.disconnect();
                 ctx.close();
               } catch { /* already closed */ }
-              (window as any).__audioCaptureActive = false;
-              console.log('[AudioCapture] Audio track ended, resources cleaned up');
+              const processorsObj = (window as any).__audioCaptureProcessors as Record<string, any>;
+              const contextsObj = (window as any).__audioCaptureContexts as Record<string, AudioContext>;
+              delete processorsObj[trackId];
+              delete contextsObj[trackId];
+              console.log(`[AudioCapture] Audio track ended: track=${trackId}, resources cleaned up`);
             });
 
-            (window as any).__audioCaptureCtx = ctx;
-            (window as any).__audioCaptureProcessor = processor;
+            const processorsObj = (window as any).__audioCaptureProcessors as Record<string, any>;
+            const contextsObj = (window as any).__audioCaptureContexts as Record<string, AudioContext>;
+            processorsObj[trackId] = processor;
+            contextsObj[trackId] = ctx;
 
-            console.log(`[AudioCapture] WebRTC audio track intercepted, native ${nativeRate}Hz -> 16kHz mono`);
+            console.log(`[AudioCapture] WebRTC audio track intercepted: track=${trackId}, native=${nativeRate}Hz -> 16kHz mono`);
           } catch (err) {
             console.error('[AudioCapture] Failed to set up audio capture:', err);
-            (window as any).__audioCaptureActive = false;
           }
         });
 
@@ -172,13 +234,17 @@ export class AudioCaptureProcedure {
     this._pollInterval = setInterval(async () => {
       try {
         const chunks = await this._page.evaluate(() => {
-          const buf = (window as any).__audioCaptureChunks as string[];
+          const buf = (window as any).__audioCaptureChunks as CapturedAudioChunk[];
           const result = buf.splice(0, buf.length);
           return result;
         });
 
         for (const chunk of chunks) {
-          this._onAudioChunk(chunk, 16000);
+          this._onAudioChunk(
+            chunk.data,
+            chunk.sampleRate || 16000,
+            chunk.captureDiagnostics
+          );
         }
       } catch {
         // Page might be navigating or closed
@@ -199,11 +265,24 @@ export class AudioCaptureProcedure {
 
     try {
       await this._page.evaluate(() => {
-        (window as any).__audioCaptureActive = false;
-        const proc = (window as any).__audioCaptureProcessor;
-        if (proc) try { proc.disconnect(); } catch { /* ok */ }
-        const ctx = (window as any).__audioCaptureCtx as AudioContext;
-        if (ctx) ctx.close();
+        const processors = (window as any).__audioCaptureProcessors as Record<string, any>;
+        const contexts = (window as any).__audioCaptureContexts as Record<string, AudioContext>;
+        Object.keys(processors || {}).forEach((trackId) => {
+          try {
+            processors[trackId]?.disconnect();
+          } catch {
+            // ignore
+          }
+        });
+        Object.keys(contexts || {}).forEach((trackId) => {
+          try {
+            contexts[trackId]?.close();
+          } catch {
+            // ignore
+          }
+        });
+        (window as any).__audioCaptureProcessors = {};
+        (window as any).__audioCaptureContexts = {};
       });
     } catch {
       // Page might already be closed
diff --git a/src/bot/orchestrator.ts b/src/bot/orchestrator.ts
index e68a810..eb9e7fb 100644
--- a/src/bot/orchestrator.ts
+++ b/src/bot/orchestrator.ts
@@ -864,6 +864,12 @@ export class BotOrchestrator {
     });
 
     this._page = await this._context.newPage();
+    this._page.on('console', (msg) => {
+      const text = msg.text();
+      if (text.includes('[AudioCapture]') || text.includes('[AudioPlayback]')) {
+        this._logger.info(`[PageConsole] ${text}`);
+      }
+    });
 
     // Stealth: Override browser properties that reveal automation.
     // Teams checks these to detect headless/automated browsers and
@@ -906,8 +912,8 @@ export class BotOrchestrator {
     this._audioCaptureProcedure = new AudioCaptureProcedure(
       this._page,
       this._logger,
-      (base64Data, sampleRate) => {
-        this._sendAudioChunk(base64Data, sampleRate);
+      (base64Data, sampleRate, captureDiagnostics) => {
+        this._sendAudioChunk(base64Data, sampleRate, captureDiagnostics);
       },
     );
     this._captionsProcedure = new CaptionsProcedure(
@@ -1176,7 +1182,16 @@ export class BotOrchestrator {
   /**
    * Send an audio chunk to the Gateway for STT processing.
    */
-  private _sendAudioChunk(base64Data: string, sampleRate: number): void {
+  private _sendAudioChunk(
+    base64Data: string,
+    sampleRate: number,
+    captureDiagnostics?: {
+      trackId?: string;
+      readyState?: string;
+      rms?: number;
+      nativeSampleRate?: number;
+    },
+  ): void {
     const message: AudioChunkMessage = {
       type: 'audioChunk',
       sessionId: this._sessionId,
@@ -1185,6 +1200,7 @@ export class BotOrchestrator {
         sampleRate,
         data: base64Data,
         timestamp: new Date().toISOString(),
+        captureDiagnostics,
       },
     };
     this._sendToGateway(message);
diff --git a/src/types/index.ts b/src/types/index.ts
index 2d5033e..c0a6004 100644
--- a/src/types/index.ts
+++ b/src/types/index.ts
@@ -63,6 +63,12 @@ export interface AudioChunkMessage {
     sampleRate: number;
     data: string; // base64 encoded
     timestamp: string;
+    captureDiagnostics?: {
+      trackId?: string;
+      readyState?: string;
+      rms?: number;
+      nativeSampleRate?: number;
+    };
   };
 }