fix: capture active Teams audio track with diagnostics

Replace first-track audio lock with multi-track capture management, add RMS-based activity detection, and propagate capture diagnostics (trackId, readyState, rms, nativeSampleRate) to gateway logs for root-cause analysis.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
ValueOn AG 2026-02-24 14:55:16 +01:00
parent f47b3c5682
commit 79c1555e0c
3 changed files with 126 additions and 25 deletions

View file

@ -1,6 +1,19 @@
import { Page } from 'playwright';
import { Logger } from 'winston';
/**
 * Diagnostic metadata attached to a captured audio chunk so the receiving
 * side can tell which WebRTC track produced it and how loud it was.
 *
 * Every field is optional; consumers must tolerate a partially filled or
 * missing object.
 */
interface AudioChunkDiagnostics {
/**
 * Identifier of the source track: `event.track.id`, or a generated
 * `audio-track-<timestamp>` value when the track reports no id.
 */
trackId?: string;
/** Value of `event.track.readyState` read when the chunk was queued. */
readyState?: string;
/**
 * Root-mean-square level of the chunk's samples, computed before
 * downsampling and rounded to six decimal places.
 */
rms?: number;
/** Sample rate (Hz) the audio was captured at before downsampling. */
nativeSampleRate?: number;
}
/**
 * Shape of one entry in the in-page `window.__audioCaptureChunks` queue.
 *
 * The injected capture script pushes these objects; the polling loop drains
 * the queue via `page.evaluate` and forwards each entry to the
 * `onAudioChunk` callback.
 */
interface CapturedAudioChunk {
/** Base64-encoded audio payload. */
data: string;
/**
 * Sample rate (Hz) of `data`. The capture script sets this to its 16 kHz
 * target rate; the poller falls back to 16000 when the value is falsy.
 */
sampleRate: number;
/** Optional capture diagnostics describing the source track and signal level. */
captureDiagnostics?: AudioChunkDiagnostics;
}
/**
* Captures incoming meeting audio by intercepting WebRTC RTCPeerConnection.
*
@ -14,7 +27,11 @@ import { Logger } from 'winston';
export class AudioCaptureProcedure {
private _page: Page;
private _logger: Logger;
private _onAudioChunk: (base64Data: string, sampleRate: number) => void;
private _onAudioChunk: (
base64Data: string,
sampleRate: number,
captureDiagnostics?: AudioChunkDiagnostics
) => void;
private _isCapturing: boolean = false;
private _pollInterval: ReturnType<typeof setInterval> | null = null;
private _injected: boolean = false;
@ -22,7 +39,11 @@ export class AudioCaptureProcedure {
constructor(
page: Page,
logger: Logger,
onAudioChunk: (base64Data: string, sampleRate: number) => void,
onAudioChunk: (
base64Data: string,
sampleRate: number,
captureDiagnostics?: AudioChunkDiagnostics
) => void,
) {
this._page = page;
this._logger = logger;
@ -39,8 +60,9 @@ export class AudioCaptureProcedure {
this._logger.info('[AudioCapture] Injecting RTCPeerConnection wrapper...');
await this._page.addInitScript(() => {
(window as any).__audioCaptureChunks = [] as string[];
(window as any).__audioCaptureActive = false;
(window as any).__audioCaptureChunks = [] as any[];
(window as any).__audioCaptureProcessors = {} as Record<string, any>;
(window as any).__audioCaptureContexts = {} as Record<string, AudioContext>;
const OrigRTC = window.RTCPeerConnection;
@ -50,8 +72,12 @@ export class AudioCaptureProcedure {
pc.addEventListener('track', (event: RTCTrackEvent) => {
if (event.track.kind !== 'audio') return;
if ((window as any).__audioCaptureActive) return;
(window as any).__audioCaptureActive = true;
const trackId = event.track.id || `audio-track-${Date.now()}`;
const processors = (window as any).__audioCaptureProcessors as Record<string, any>;
if (processors[trackId]) {
return;
}
try {
const AudioCtx = window.AudioContext || (window as any).webkitAudioContext;
@ -67,6 +93,8 @@ export class AudioCaptureProcedure {
const processor = ctx.createScriptProcessor(8192, 1, 1);
let chunkBuffer: Float32Array[] = [];
let samplesCollected = 0;
let skippedSilentChunks = 0;
const minRmsThreshold = 0.0015;
// Collect ~1 second of audio at native rate before emitting
const samplesPerChunk = nativeRate;
const targetRate = 16000;
@ -85,6 +113,25 @@ export class AudioCaptureProcedure {
offset += buf.length;
}
// Calculate RMS to detect real audio activity
let powerSum = 0;
for (let i = 0; i < merged.length; i++) {
powerSum += merged[i] * merged[i];
}
const rms = Math.sqrt(powerSum / Math.max(merged.length, 1));
if (rms < minRmsThreshold) {
skippedSilentChunks++;
if (skippedSilentChunks % 10 === 0) {
console.log(
`[AudioCapture] silent chunk skipped: track=${trackId}, readyState=${event.track.readyState}, rms=${rms.toFixed(6)}`
);
}
chunkBuffer = [];
samplesCollected = 0;
return;
}
// Downsample from nativeRate to 16 kHz
const ratio = nativeRate / targetRate;
const outLen = Math.floor(merged.length / ratio);
@ -103,11 +150,22 @@ export class AudioCaptureProcedure {
}
const base64 = btoa(binary);
const chunks = (window as any).__audioCaptureChunks as string[];
if (chunks.length < 30) {
chunks.push(base64);
const chunks = (window as any).__audioCaptureChunks as any[];
if (chunks.length < 60) {
chunks.push({
data: base64,
sampleRate: targetRate,
captureDiagnostics: {
trackId,
readyState: event.track.readyState,
rms: Number(rms.toFixed(6)),
nativeSampleRate: nativeRate,
},
});
}
skippedSilentChunks = 0;
chunkBuffer = [];
samplesCollected = 0;
}
@ -134,17 +192,21 @@ export class AudioCaptureProcedure {
silentGain.disconnect();
ctx.close();
} catch { /* already closed */ }
(window as any).__audioCaptureActive = false;
console.log('[AudioCapture] Audio track ended, resources cleaned up');
const processorsObj = (window as any).__audioCaptureProcessors as Record<string, any>;
const contextsObj = (window as any).__audioCaptureContexts as Record<string, AudioContext>;
delete processorsObj[trackId];
delete contextsObj[trackId];
console.log(`[AudioCapture] Audio track ended: track=${trackId}, resources cleaned up`);
});
(window as any).__audioCaptureCtx = ctx;
(window as any).__audioCaptureProcessor = processor;
const processorsObj = (window as any).__audioCaptureProcessors as Record<string, any>;
const contextsObj = (window as any).__audioCaptureContexts as Record<string, AudioContext>;
processorsObj[trackId] = processor;
contextsObj[trackId] = ctx;
console.log(`[AudioCapture] WebRTC audio track intercepted, native ${nativeRate}Hz -> 16kHz mono`);
console.log(`[AudioCapture] WebRTC audio track intercepted: track=${trackId}, native=${nativeRate}Hz -> 16kHz mono`);
} catch (err) {
console.error('[AudioCapture] Failed to set up audio capture:', err);
(window as any).__audioCaptureActive = false;
}
});
@ -172,13 +234,17 @@ export class AudioCaptureProcedure {
this._pollInterval = setInterval(async () => {
try {
const chunks = await this._page.evaluate(() => {
const buf = (window as any).__audioCaptureChunks as string[];
const buf = (window as any).__audioCaptureChunks as CapturedAudioChunk[];
const result = buf.splice(0, buf.length);
return result;
});
for (const chunk of chunks) {
this._onAudioChunk(chunk, 16000);
this._onAudioChunk(
chunk.data,
chunk.sampleRate || 16000,
chunk.captureDiagnostics
);
}
} catch {
// Page might be navigating or closed
@ -199,11 +265,24 @@ export class AudioCaptureProcedure {
try {
await this._page.evaluate(() => {
(window as any).__audioCaptureActive = false;
const proc = (window as any).__audioCaptureProcessor;
if (proc) try { proc.disconnect(); } catch { /* ok */ }
const ctx = (window as any).__audioCaptureCtx as AudioContext;
if (ctx) ctx.close();
const processors = (window as any).__audioCaptureProcessors as Record<string, any>;
const contexts = (window as any).__audioCaptureContexts as Record<string, AudioContext>;
Object.keys(processors || {}).forEach((trackId) => {
try {
processors[trackId]?.disconnect();
} catch {
// ignore
}
});
Object.keys(contexts || {}).forEach((trackId) => {
try {
contexts[trackId]?.close();
} catch {
// ignore
}
});
(window as any).__audioCaptureProcessors = {};
(window as any).__audioCaptureContexts = {};
});
} catch {
// Page might already be closed

View file

@ -864,6 +864,12 @@ export class BotOrchestrator {
});
this._page = await this._context.newPage();
this._page.on('console', (msg) => {
const text = msg.text();
if (text.includes('[AudioCapture]') || text.includes('[AudioPlayback]')) {
this._logger.info(`[PageConsole] ${text}`);
}
});
// Stealth: Override browser properties that reveal automation.
// Teams checks these to detect headless/automated browsers and
@ -906,8 +912,8 @@ export class BotOrchestrator {
this._audioCaptureProcedure = new AudioCaptureProcedure(
this._page,
this._logger,
(base64Data, sampleRate) => {
this._sendAudioChunk(base64Data, sampleRate);
(base64Data, sampleRate, captureDiagnostics) => {
this._sendAudioChunk(base64Data, sampleRate, captureDiagnostics);
},
);
this._captionsProcedure = new CaptionsProcedure(
@ -1176,7 +1182,16 @@ export class BotOrchestrator {
/**
* Send an audio chunk to the Gateway for STT processing.
*/
private _sendAudioChunk(base64Data: string, sampleRate: number): void {
private _sendAudioChunk(
base64Data: string,
sampleRate: number,
captureDiagnostics?: {
trackId?: string;
readyState?: string;
rms?: number;
nativeSampleRate?: number;
},
): void {
const message: AudioChunkMessage = {
type: 'audioChunk',
sessionId: this._sessionId,
@ -1185,6 +1200,7 @@ export class BotOrchestrator {
sampleRate,
data: base64Data,
timestamp: new Date().toISOString(),
captureDiagnostics,
},
};
this._sendToGateway(message);

View file

@ -63,6 +63,12 @@ export interface AudioChunkMessage {
sampleRate: number;
data: string; // base64 encoded
timestamp: string;
captureDiagnostics?: {
trackId?: string;
readyState?: string;
rms?: number;
nativeSampleRate?: number;
};
};
}