From 79c1555e0cffc207ef2fbe3dcc116e016cdc7823 Mon Sep 17 00:00:00 2001
From: ValueOn AG
Date: Tue, 24 Feb 2026 14:55:16 +0100
Subject: [PATCH] fix: capture active Teams audio track with diagnostics
Replace first-track audio lock with multi-track capture management, add RMS-based activity detection, and propagate capture diagnostics (trackId, readyState, rms, nativeSampleRate) to gateway logs for root-cause analysis.
Co-authored-by: Cursor
---
src/bot/audioCaptureProcedure.ts | 123 +++++++++++++++++++++++++------
src/bot/orchestrator.ts | 22 +++++-
src/types/index.ts | 6 ++
3 files changed, 126 insertions(+), 25 deletions(-)
diff --git a/src/bot/audioCaptureProcedure.ts b/src/bot/audioCaptureProcedure.ts
index a0cca6b..739539e 100644
--- a/src/bot/audioCaptureProcedure.ts
+++ b/src/bot/audioCaptureProcedure.ts
@@ -1,6 +1,19 @@
import { Page } from 'playwright';
import { Logger } from 'winston';
+interface AudioChunkDiagnostics { // Per-chunk capture metadata built in the page script and handed to the onAudioChunk callback.
+ trackId?: string; // event.track.id, or a generated `audio-track-<Date.now()>` fallback when the id is empty
+ readyState?: string; // event.track.readyState read at the moment the chunk is queued
+ rms?: number; // root-mean-square of the merged pre-downsampling samples, rounded to 6 decimals
+ nativeSampleRate?: number; // the page script's nativeRate, i.e. the rate the audio is downsampled from
+}
+
+interface CapturedAudioChunk { // Shape of one entry the page script pushes onto window.__audioCaptureChunks.
+ data: string; // base64 string produced with btoa() in the page
+ sampleRate: number; // the page script sets this to its target rate (16000)
+ captureDiagnostics?: AudioChunkDiagnostics; // optional per-chunk capture metadata (track id, readyState, rms, native rate)
+}
+
/**
* Captures incoming meeting audio by intercepting WebRTC RTCPeerConnection.
*
@@ -14,7 +27,11 @@ import { Logger } from 'winston';
export class AudioCaptureProcedure {
private _page: Page;
private _logger: Logger;
- private _onAudioChunk: (base64Data: string, sampleRate: number) => void;
+ private _onAudioChunk: (
+ base64Data: string,
+ sampleRate: number,
+ captureDiagnostics?: AudioChunkDiagnostics
+ ) => void;
private _isCapturing: boolean = false;
private _pollInterval: ReturnType | null = null;
private _injected: boolean = false;
@@ -22,7 +39,11 @@ export class AudioCaptureProcedure {
constructor(
page: Page,
logger: Logger,
- onAudioChunk: (base64Data: string, sampleRate: number) => void,
+ onAudioChunk: (
+ base64Data: string,
+ sampleRate: number,
+ captureDiagnostics?: AudioChunkDiagnostics
+ ) => void,
) {
this._page = page;
this._logger = logger;
@@ -39,8 +60,9 @@ export class AudioCaptureProcedure {
this._logger.info('[AudioCapture] Injecting RTCPeerConnection wrapper...');
await this._page.addInitScript(() => {
- (window as any).__audioCaptureChunks = [] as string[];
- (window as any).__audioCaptureActive = false;
+ (window as any).__audioCaptureChunks = [] as any[];
+ (window as any).__audioCaptureProcessors = {} as Record;
+ (window as any).__audioCaptureContexts = {} as Record;
const OrigRTC = window.RTCPeerConnection;
@@ -50,8 +72,12 @@ export class AudioCaptureProcedure {
pc.addEventListener('track', (event: RTCTrackEvent) => {
if (event.track.kind !== 'audio') return;
- if ((window as any).__audioCaptureActive) return;
- (window as any).__audioCaptureActive = true;
+
+ const trackId = event.track.id || `audio-track-${Date.now()}`;
+ const processors = (window as any).__audioCaptureProcessors as Record;
+ if (processors[trackId]) {
+ return;
+ }
try {
const AudioCtx = window.AudioContext || (window as any).webkitAudioContext;
@@ -67,6 +93,8 @@ export class AudioCaptureProcedure {
const processor = ctx.createScriptProcessor(8192, 1, 1);
let chunkBuffer: Float32Array[] = [];
let samplesCollected = 0;
+ let skippedSilentChunks = 0;
+ const minRmsThreshold = 0.0015;
// Collect ~1 second of audio at native rate before emitting
const samplesPerChunk = nativeRate;
const targetRate = 16000;
@@ -85,6 +113,25 @@ export class AudioCaptureProcedure {
offset += buf.length;
}
+ // Calculate RMS to detect real audio activity
+ let powerSum = 0;
+ for (let i = 0; i < merged.length; i++) {
+ powerSum += merged[i] * merged[i];
+ }
+ const rms = Math.sqrt(powerSum / Math.max(merged.length, 1));
+
+ if (rms < minRmsThreshold) {
+ skippedSilentChunks++;
+ if (skippedSilentChunks % 10 === 0) {
+ console.log(
+ `[AudioCapture] silent chunk skipped: track=${trackId}, readyState=${event.track.readyState}, rms=${rms.toFixed(6)}`
+ );
+ }
+ chunkBuffer = [];
+ samplesCollected = 0;
+ return;
+ }
+
// Downsample from nativeRate to 16 kHz
const ratio = nativeRate / targetRate;
const outLen = Math.floor(merged.length / ratio);
@@ -103,11 +150,22 @@ export class AudioCaptureProcedure {
}
const base64 = btoa(binary);
- const chunks = (window as any).__audioCaptureChunks as string[];
- if (chunks.length < 30) {
- chunks.push(base64);
+ const chunks = (window as any).__audioCaptureChunks as any[];
+ if (chunks.length < 60) {
+ chunks.push({
+ data: base64,
+ sampleRate: targetRate,
+ captureDiagnostics: {
+ trackId,
+ readyState: event.track.readyState,
+ rms: Number(rms.toFixed(6)),
+ nativeSampleRate: nativeRate,
+ },
+ });
}
+ skippedSilentChunks = 0;
+
chunkBuffer = [];
samplesCollected = 0;
}
@@ -134,17 +192,21 @@ export class AudioCaptureProcedure {
silentGain.disconnect();
ctx.close();
} catch { /* already closed */ }
- (window as any).__audioCaptureActive = false;
- console.log('[AudioCapture] Audio track ended, resources cleaned up');
+ const processorsObj = (window as any).__audioCaptureProcessors as Record;
+ const contextsObj = (window as any).__audioCaptureContexts as Record;
+ delete processorsObj[trackId];
+ delete contextsObj[trackId];
+ console.log(`[AudioCapture] Audio track ended: track=${trackId}, resources cleaned up`);
});
- (window as any).__audioCaptureCtx = ctx;
- (window as any).__audioCaptureProcessor = processor;
+ const processorsObj = (window as any).__audioCaptureProcessors as Record;
+ const contextsObj = (window as any).__audioCaptureContexts as Record;
+ processorsObj[trackId] = processor;
+ contextsObj[trackId] = ctx;
- console.log(`[AudioCapture] WebRTC audio track intercepted, native ${nativeRate}Hz -> 16kHz mono`);
+ console.log(`[AudioCapture] WebRTC audio track intercepted: track=${trackId}, native=${nativeRate}Hz -> 16kHz mono`);
} catch (err) {
console.error('[AudioCapture] Failed to set up audio capture:', err);
- (window as any).__audioCaptureActive = false;
}
});
@@ -172,13 +234,17 @@ export class AudioCaptureProcedure {
this._pollInterval = setInterval(async () => {
try {
const chunks = await this._page.evaluate(() => {
- const buf = (window as any).__audioCaptureChunks as string[];
+ const buf = (window as any).__audioCaptureChunks as CapturedAudioChunk[];
const result = buf.splice(0, buf.length);
return result;
});
for (const chunk of chunks) {
- this._onAudioChunk(chunk, 16000);
+ this._onAudioChunk(
+ chunk.data,
+ chunk.sampleRate || 16000,
+ chunk.captureDiagnostics
+ );
}
} catch {
// Page might be navigating or closed
@@ -199,11 +265,24 @@ export class AudioCaptureProcedure {
try {
await this._page.evaluate(() => {
- (window as any).__audioCaptureActive = false;
- const proc = (window as any).__audioCaptureProcessor;
- if (proc) try { proc.disconnect(); } catch { /* ok */ }
- const ctx = (window as any).__audioCaptureCtx as AudioContext;
- if (ctx) ctx.close();
+ const processors = (window as any).__audioCaptureProcessors as Record;
+ const contexts = (window as any).__audioCaptureContexts as Record;
+ Object.keys(processors || {}).forEach((trackId) => {
+ try {
+ processors[trackId]?.disconnect();
+ } catch {
+ // ignore
+ }
+ });
+ Object.keys(contexts || {}).forEach((trackId) => {
+ try {
+ contexts[trackId]?.close();
+ } catch {
+ // ignore
+ }
+ });
+ (window as any).__audioCaptureProcessors = {};
+ (window as any).__audioCaptureContexts = {};
});
} catch {
// Page might already be closed
diff --git a/src/bot/orchestrator.ts b/src/bot/orchestrator.ts
index e68a810..eb9e7fb 100644
--- a/src/bot/orchestrator.ts
+++ b/src/bot/orchestrator.ts
@@ -864,6 +864,12 @@ export class BotOrchestrator {
});
this._page = await this._context.newPage();
+ this._page.on('console', (msg) => {
+ const text = msg.text();
+ if (text.includes('[AudioCapture]') || text.includes('[AudioPlayback]')) {
+ this._logger.info(`[PageConsole] ${text}`);
+ }
+ });
// Stealth: Override browser properties that reveal automation.
// Teams checks these to detect headless/automated browsers and
@@ -906,8 +912,8 @@ export class BotOrchestrator {
this._audioCaptureProcedure = new AudioCaptureProcedure(
this._page,
this._logger,
- (base64Data, sampleRate) => {
- this._sendAudioChunk(base64Data, sampleRate);
+ (base64Data, sampleRate, captureDiagnostics) => {
+ this._sendAudioChunk(base64Data, sampleRate, captureDiagnostics);
},
);
this._captionsProcedure = new CaptionsProcedure(
@@ -1176,7 +1182,16 @@ export class BotOrchestrator {
/**
* Send an audio chunk to the Gateway for STT processing.
*/
- private _sendAudioChunk(base64Data: string, sampleRate: number): void {
+ private _sendAudioChunk(
+ base64Data: string,
+ sampleRate: number,
+ captureDiagnostics?: {
+ trackId?: string;
+ readyState?: string;
+ rms?: number;
+ nativeSampleRate?: number;
+ },
+ ): void {
const message: AudioChunkMessage = {
type: 'audioChunk',
sessionId: this._sessionId,
@@ -1185,6 +1200,7 @@ export class BotOrchestrator {
sampleRate,
data: base64Data,
timestamp: new Date().toISOString(),
+ captureDiagnostics,
},
};
this._sendToGateway(message);
diff --git a/src/types/index.ts b/src/types/index.ts
index 2d5033e..c0a6004 100644
--- a/src/types/index.ts
+++ b/src/types/index.ts
@@ -63,6 +63,12 @@ export interface AudioChunkMessage {
sampleRate: number;
data: string; // base64 encoded
timestamp: string;
+ captureDiagnostics?: {
+ trackId?: string;
+ readyState?: string;
+ rms?: number;
+ nativeSampleRate?: number;
+ };
};
}