fix: capture active Teams audio track with diagnostics

Replace first-track audio lock with multi-track capture management, add RMS-based activity detection, and propagate capture diagnostics (trackId, readyState, rms, nativeSampleRate) to gateway logs for root-cause analysis.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
ValueOn AG 2026-02-24 14:55:16 +01:00
parent f47b3c5682
commit 79c1555e0c
3 changed files with 126 additions and 25 deletions

View file

@ -1,6 +1,19 @@
import { Page } from 'playwright'; import { Page } from 'playwright';
import { Logger } from 'winston'; import { Logger } from 'winston';
/**
 * Per-chunk capture metadata attached to each emitted audio chunk so that
 * downstream logs can tell which WebRTC track produced it and how loud it was.
 * Every field is optional; the in-page capture code fills in all four when it
 * queues a chunk.
 */
interface AudioChunkDiagnostics {
/** Id of the source audio track (`event.track.id`), or a generated `audio-track-<timestamp>` fallback when the track has no id. */
trackId?: string;
/** The track's `readyState` as read at the moment the chunk was queued. */
readyState?: string;
/** Root-mean-square level of the chunk's samples, measured before downsampling and rounded to 6 decimal places. */
rms?: number;
/** Sample rate (Hz) the audio was captured at before being downsampled to the target rate. */
nativeSampleRate?: number;
}
/**
 * Shape of one entry in the in-page `window.__audioCaptureChunks` queue.
 * Entries are pushed by the injected capture script and drained by the
 * polling loop on the Node side.
 */
interface CapturedAudioChunk {
/** Base64-encoded audio payload for the chunk (already downsampled). */
data: string;
/** Sample rate (Hz) of `data`; the capture script sets this to its 16 kHz target rate. */
sampleRate: number;
/** Optional diagnostics describing the track and signal level the chunk came from. */
captureDiagnostics?: AudioChunkDiagnostics;
}
/** /**
* Captures incoming meeting audio by intercepting WebRTC RTCPeerConnection. * Captures incoming meeting audio by intercepting WebRTC RTCPeerConnection.
* *
@ -14,7 +27,11 @@ import { Logger } from 'winston';
export class AudioCaptureProcedure { export class AudioCaptureProcedure {
private _page: Page; private _page: Page;
private _logger: Logger; private _logger: Logger;
private _onAudioChunk: (base64Data: string, sampleRate: number) => void; private _onAudioChunk: (
base64Data: string,
sampleRate: number,
captureDiagnostics?: AudioChunkDiagnostics
) => void;
private _isCapturing: boolean = false; private _isCapturing: boolean = false;
private _pollInterval: ReturnType<typeof setInterval> | null = null; private _pollInterval: ReturnType<typeof setInterval> | null = null;
private _injected: boolean = false; private _injected: boolean = false;
@ -22,7 +39,11 @@ export class AudioCaptureProcedure {
constructor( constructor(
page: Page, page: Page,
logger: Logger, logger: Logger,
onAudioChunk: (base64Data: string, sampleRate: number) => void, onAudioChunk: (
base64Data: string,
sampleRate: number,
captureDiagnostics?: AudioChunkDiagnostics
) => void,
) { ) {
this._page = page; this._page = page;
this._logger = logger; this._logger = logger;
@ -39,8 +60,9 @@ export class AudioCaptureProcedure {
this._logger.info('[AudioCapture] Injecting RTCPeerConnection wrapper...'); this._logger.info('[AudioCapture] Injecting RTCPeerConnection wrapper...');
await this._page.addInitScript(() => { await this._page.addInitScript(() => {
(window as any).__audioCaptureChunks = [] as string[]; (window as any).__audioCaptureChunks = [] as any[];
(window as any).__audioCaptureActive = false; (window as any).__audioCaptureProcessors = {} as Record<string, any>;
(window as any).__audioCaptureContexts = {} as Record<string, AudioContext>;
const OrigRTC = window.RTCPeerConnection; const OrigRTC = window.RTCPeerConnection;
@ -50,8 +72,12 @@ export class AudioCaptureProcedure {
pc.addEventListener('track', (event: RTCTrackEvent) => { pc.addEventListener('track', (event: RTCTrackEvent) => {
if (event.track.kind !== 'audio') return; if (event.track.kind !== 'audio') return;
if ((window as any).__audioCaptureActive) return;
(window as any).__audioCaptureActive = true; const trackId = event.track.id || `audio-track-${Date.now()}`;
const processors = (window as any).__audioCaptureProcessors as Record<string, any>;
if (processors[trackId]) {
return;
}
try { try {
const AudioCtx = window.AudioContext || (window as any).webkitAudioContext; const AudioCtx = window.AudioContext || (window as any).webkitAudioContext;
@ -67,6 +93,8 @@ export class AudioCaptureProcedure {
const processor = ctx.createScriptProcessor(8192, 1, 1); const processor = ctx.createScriptProcessor(8192, 1, 1);
let chunkBuffer: Float32Array[] = []; let chunkBuffer: Float32Array[] = [];
let samplesCollected = 0; let samplesCollected = 0;
let skippedSilentChunks = 0;
const minRmsThreshold = 0.0015;
// Collect ~1 second of audio at native rate before emitting // Collect ~1 second of audio at native rate before emitting
const samplesPerChunk = nativeRate; const samplesPerChunk = nativeRate;
const targetRate = 16000; const targetRate = 16000;
@ -85,6 +113,25 @@ export class AudioCaptureProcedure {
offset += buf.length; offset += buf.length;
} }
// Calculate RMS to detect real audio activity
let powerSum = 0;
for (let i = 0; i < merged.length; i++) {
powerSum += merged[i] * merged[i];
}
const rms = Math.sqrt(powerSum / Math.max(merged.length, 1));
if (rms < minRmsThreshold) {
skippedSilentChunks++;
if (skippedSilentChunks % 10 === 0) {
console.log(
`[AudioCapture] silent chunk skipped: track=${trackId}, readyState=${event.track.readyState}, rms=${rms.toFixed(6)}`
);
}
chunkBuffer = [];
samplesCollected = 0;
return;
}
// Downsample from nativeRate to 16 kHz // Downsample from nativeRate to 16 kHz
const ratio = nativeRate / targetRate; const ratio = nativeRate / targetRate;
const outLen = Math.floor(merged.length / ratio); const outLen = Math.floor(merged.length / ratio);
@ -103,11 +150,22 @@ export class AudioCaptureProcedure {
} }
const base64 = btoa(binary); const base64 = btoa(binary);
const chunks = (window as any).__audioCaptureChunks as string[]; const chunks = (window as any).__audioCaptureChunks as any[];
if (chunks.length < 30) { if (chunks.length < 60) {
chunks.push(base64); chunks.push({
data: base64,
sampleRate: targetRate,
captureDiagnostics: {
trackId,
readyState: event.track.readyState,
rms: Number(rms.toFixed(6)),
nativeSampleRate: nativeRate,
},
});
} }
skippedSilentChunks = 0;
chunkBuffer = []; chunkBuffer = [];
samplesCollected = 0; samplesCollected = 0;
} }
@ -134,17 +192,21 @@ export class AudioCaptureProcedure {
silentGain.disconnect(); silentGain.disconnect();
ctx.close(); ctx.close();
} catch { /* already closed */ } } catch { /* already closed */ }
(window as any).__audioCaptureActive = false; const processorsObj = (window as any).__audioCaptureProcessors as Record<string, any>;
console.log('[AudioCapture] Audio track ended, resources cleaned up'); const contextsObj = (window as any).__audioCaptureContexts as Record<string, AudioContext>;
delete processorsObj[trackId];
delete contextsObj[trackId];
console.log(`[AudioCapture] Audio track ended: track=${trackId}, resources cleaned up`);
}); });
(window as any).__audioCaptureCtx = ctx; const processorsObj = (window as any).__audioCaptureProcessors as Record<string, any>;
(window as any).__audioCaptureProcessor = processor; const contextsObj = (window as any).__audioCaptureContexts as Record<string, AudioContext>;
processorsObj[trackId] = processor;
contextsObj[trackId] = ctx;
console.log(`[AudioCapture] WebRTC audio track intercepted, native ${nativeRate}Hz -> 16kHz mono`); console.log(`[AudioCapture] WebRTC audio track intercepted: track=${trackId}, native=${nativeRate}Hz -> 16kHz mono`);
} catch (err) { } catch (err) {
console.error('[AudioCapture] Failed to set up audio capture:', err); console.error('[AudioCapture] Failed to set up audio capture:', err);
(window as any).__audioCaptureActive = false;
} }
}); });
@ -172,13 +234,17 @@ export class AudioCaptureProcedure {
this._pollInterval = setInterval(async () => { this._pollInterval = setInterval(async () => {
try { try {
const chunks = await this._page.evaluate(() => { const chunks = await this._page.evaluate(() => {
const buf = (window as any).__audioCaptureChunks as string[]; const buf = (window as any).__audioCaptureChunks as CapturedAudioChunk[];
const result = buf.splice(0, buf.length); const result = buf.splice(0, buf.length);
return result; return result;
}); });
for (const chunk of chunks) { for (const chunk of chunks) {
this._onAudioChunk(chunk, 16000); this._onAudioChunk(
chunk.data,
chunk.sampleRate || 16000,
chunk.captureDiagnostics
);
} }
} catch { } catch {
// Page might be navigating or closed // Page might be navigating or closed
@ -199,11 +265,24 @@ export class AudioCaptureProcedure {
try { try {
await this._page.evaluate(() => { await this._page.evaluate(() => {
(window as any).__audioCaptureActive = false; const processors = (window as any).__audioCaptureProcessors as Record<string, any>;
const proc = (window as any).__audioCaptureProcessor; const contexts = (window as any).__audioCaptureContexts as Record<string, AudioContext>;
if (proc) try { proc.disconnect(); } catch { /* ok */ } Object.keys(processors || {}).forEach((trackId) => {
const ctx = (window as any).__audioCaptureCtx as AudioContext; try {
if (ctx) ctx.close(); processors[trackId]?.disconnect();
} catch {
// ignore
}
});
Object.keys(contexts || {}).forEach((trackId) => {
try {
contexts[trackId]?.close();
} catch {
// ignore
}
});
(window as any).__audioCaptureProcessors = {};
(window as any).__audioCaptureContexts = {};
}); });
} catch { } catch {
// Page might already be closed // Page might already be closed

View file

@ -864,6 +864,12 @@ export class BotOrchestrator {
}); });
this._page = await this._context.newPage(); this._page = await this._context.newPage();
this._page.on('console', (msg) => {
const text = msg.text();
if (text.includes('[AudioCapture]') || text.includes('[AudioPlayback]')) {
this._logger.info(`[PageConsole] ${text}`);
}
});
// Stealth: Override browser properties that reveal automation. // Stealth: Override browser properties that reveal automation.
// Teams checks these to detect headless/automated browsers and // Teams checks these to detect headless/automated browsers and
@ -906,8 +912,8 @@ export class BotOrchestrator {
this._audioCaptureProcedure = new AudioCaptureProcedure( this._audioCaptureProcedure = new AudioCaptureProcedure(
this._page, this._page,
this._logger, this._logger,
(base64Data, sampleRate) => { (base64Data, sampleRate, captureDiagnostics) => {
this._sendAudioChunk(base64Data, sampleRate); this._sendAudioChunk(base64Data, sampleRate, captureDiagnostics);
}, },
); );
this._captionsProcedure = new CaptionsProcedure( this._captionsProcedure = new CaptionsProcedure(
@ -1176,7 +1182,16 @@ export class BotOrchestrator {
/** /**
* Send an audio chunk to the Gateway for STT processing. * Send an audio chunk to the Gateway for STT processing.
*/ */
private _sendAudioChunk(base64Data: string, sampleRate: number): void { private _sendAudioChunk(
base64Data: string,
sampleRate: number,
captureDiagnostics?: {
trackId?: string;
readyState?: string;
rms?: number;
nativeSampleRate?: number;
},
): void {
const message: AudioChunkMessage = { const message: AudioChunkMessage = {
type: 'audioChunk', type: 'audioChunk',
sessionId: this._sessionId, sessionId: this._sessionId,
@ -1185,6 +1200,7 @@ export class BotOrchestrator {
sampleRate, sampleRate,
data: base64Data, data: base64Data,
timestamp: new Date().toISOString(), timestamp: new Date().toISOString(),
captureDiagnostics,
}, },
}; };
this._sendToGateway(message); this._sendToGateway(message);

View file

@ -63,6 +63,12 @@ export interface AudioChunkMessage {
sampleRate: number; sampleRate: number;
data: string; // base64 encoded data: string; // base64 encoded
timestamp: string; timestamp: string;
captureDiagnostics?: {
trackId?: string;
readyState?: string;
rms?: number;
nativeSampleRate?: number;
};
}; };
} }