fix: capture active Teams audio track with diagnostics
Replace the first-track audio lock with multi-track capture management, add RMS-based activity detection, and propagate capture diagnostics (trackId, readyState, rms, nativeSampleRate) to gateway logs for root-cause analysis.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
parent
f47b3c5682
commit
79c1555e0c
3 changed files with 126 additions and 25 deletions
|
|
@ -1,6 +1,19 @@
|
||||||
import { Page } from 'playwright';
|
import { Page } from 'playwright';
|
||||||
import { Logger } from 'winston';
|
import { Logger } from 'winston';
|
||||||
|
|
||||||
|
interface AudioChunkDiagnostics {
|
||||||
|
trackId?: string;
|
||||||
|
readyState?: string;
|
||||||
|
rms?: number;
|
||||||
|
nativeSampleRate?: number;
|
||||||
|
}
|
||||||
|
|
||||||
|
interface CapturedAudioChunk {
|
||||||
|
data: string;
|
||||||
|
sampleRate: number;
|
||||||
|
captureDiagnostics?: AudioChunkDiagnostics;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Captures incoming meeting audio by intercepting WebRTC RTCPeerConnection.
|
* Captures incoming meeting audio by intercepting WebRTC RTCPeerConnection.
|
||||||
*
|
*
|
||||||
|
|
@ -14,7 +27,11 @@ import { Logger } from 'winston';
|
||||||
export class AudioCaptureProcedure {
|
export class AudioCaptureProcedure {
|
||||||
private _page: Page;
|
private _page: Page;
|
||||||
private _logger: Logger;
|
private _logger: Logger;
|
||||||
private _onAudioChunk: (base64Data: string, sampleRate: number) => void;
|
private _onAudioChunk: (
|
||||||
|
base64Data: string,
|
||||||
|
sampleRate: number,
|
||||||
|
captureDiagnostics?: AudioChunkDiagnostics
|
||||||
|
) => void;
|
||||||
private _isCapturing: boolean = false;
|
private _isCapturing: boolean = false;
|
||||||
private _pollInterval: ReturnType<typeof setInterval> | null = null;
|
private _pollInterval: ReturnType<typeof setInterval> | null = null;
|
||||||
private _injected: boolean = false;
|
private _injected: boolean = false;
|
||||||
|
|
@ -22,7 +39,11 @@ export class AudioCaptureProcedure {
|
||||||
constructor(
|
constructor(
|
||||||
page: Page,
|
page: Page,
|
||||||
logger: Logger,
|
logger: Logger,
|
||||||
onAudioChunk: (base64Data: string, sampleRate: number) => void,
|
onAudioChunk: (
|
||||||
|
base64Data: string,
|
||||||
|
sampleRate: number,
|
||||||
|
captureDiagnostics?: AudioChunkDiagnostics
|
||||||
|
) => void,
|
||||||
) {
|
) {
|
||||||
this._page = page;
|
this._page = page;
|
||||||
this._logger = logger;
|
this._logger = logger;
|
||||||
|
|
@ -39,8 +60,9 @@ export class AudioCaptureProcedure {
|
||||||
this._logger.info('[AudioCapture] Injecting RTCPeerConnection wrapper...');
|
this._logger.info('[AudioCapture] Injecting RTCPeerConnection wrapper...');
|
||||||
|
|
||||||
await this._page.addInitScript(() => {
|
await this._page.addInitScript(() => {
|
||||||
(window as any).__audioCaptureChunks = [] as string[];
|
(window as any).__audioCaptureChunks = [] as any[];
|
||||||
(window as any).__audioCaptureActive = false;
|
(window as any).__audioCaptureProcessors = {} as Record<string, any>;
|
||||||
|
(window as any).__audioCaptureContexts = {} as Record<string, AudioContext>;
|
||||||
|
|
||||||
const OrigRTC = window.RTCPeerConnection;
|
const OrigRTC = window.RTCPeerConnection;
|
||||||
|
|
||||||
|
|
@ -50,8 +72,12 @@ export class AudioCaptureProcedure {
|
||||||
|
|
||||||
pc.addEventListener('track', (event: RTCTrackEvent) => {
|
pc.addEventListener('track', (event: RTCTrackEvent) => {
|
||||||
if (event.track.kind !== 'audio') return;
|
if (event.track.kind !== 'audio') return;
|
||||||
if ((window as any).__audioCaptureActive) return;
|
|
||||||
(window as any).__audioCaptureActive = true;
|
const trackId = event.track.id || `audio-track-${Date.now()}`;
|
||||||
|
const processors = (window as any).__audioCaptureProcessors as Record<string, any>;
|
||||||
|
if (processors[trackId]) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const AudioCtx = window.AudioContext || (window as any).webkitAudioContext;
|
const AudioCtx = window.AudioContext || (window as any).webkitAudioContext;
|
||||||
|
|
@ -67,6 +93,8 @@ export class AudioCaptureProcedure {
|
||||||
const processor = ctx.createScriptProcessor(8192, 1, 1);
|
const processor = ctx.createScriptProcessor(8192, 1, 1);
|
||||||
let chunkBuffer: Float32Array[] = [];
|
let chunkBuffer: Float32Array[] = [];
|
||||||
let samplesCollected = 0;
|
let samplesCollected = 0;
|
||||||
|
let skippedSilentChunks = 0;
|
||||||
|
const minRmsThreshold = 0.0015;
|
||||||
// Collect ~1 second of audio at native rate before emitting
|
// Collect ~1 second of audio at native rate before emitting
|
||||||
const samplesPerChunk = nativeRate;
|
const samplesPerChunk = nativeRate;
|
||||||
const targetRate = 16000;
|
const targetRate = 16000;
|
||||||
|
|
@ -85,6 +113,25 @@ export class AudioCaptureProcedure {
|
||||||
offset += buf.length;
|
offset += buf.length;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Calculate RMS to detect real audio activity
|
||||||
|
let powerSum = 0;
|
||||||
|
for (let i = 0; i < merged.length; i++) {
|
||||||
|
powerSum += merged[i] * merged[i];
|
||||||
|
}
|
||||||
|
const rms = Math.sqrt(powerSum / Math.max(merged.length, 1));
|
||||||
|
|
||||||
|
if (rms < minRmsThreshold) {
|
||||||
|
skippedSilentChunks++;
|
||||||
|
if (skippedSilentChunks % 10 === 0) {
|
||||||
|
console.log(
|
||||||
|
`[AudioCapture] silent chunk skipped: track=${trackId}, readyState=${event.track.readyState}, rms=${rms.toFixed(6)}`
|
||||||
|
);
|
||||||
|
}
|
||||||
|
chunkBuffer = [];
|
||||||
|
samplesCollected = 0;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
// Downsample from nativeRate to 16 kHz
|
// Downsample from nativeRate to 16 kHz
|
||||||
const ratio = nativeRate / targetRate;
|
const ratio = nativeRate / targetRate;
|
||||||
const outLen = Math.floor(merged.length / ratio);
|
const outLen = Math.floor(merged.length / ratio);
|
||||||
|
|
@ -103,11 +150,22 @@ export class AudioCaptureProcedure {
|
||||||
}
|
}
|
||||||
const base64 = btoa(binary);
|
const base64 = btoa(binary);
|
||||||
|
|
||||||
const chunks = (window as any).__audioCaptureChunks as string[];
|
const chunks = (window as any).__audioCaptureChunks as any[];
|
||||||
if (chunks.length < 30) {
|
if (chunks.length < 60) {
|
||||||
chunks.push(base64);
|
chunks.push({
|
||||||
|
data: base64,
|
||||||
|
sampleRate: targetRate,
|
||||||
|
captureDiagnostics: {
|
||||||
|
trackId,
|
||||||
|
readyState: event.track.readyState,
|
||||||
|
rms: Number(rms.toFixed(6)),
|
||||||
|
nativeSampleRate: nativeRate,
|
||||||
|
},
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
skippedSilentChunks = 0;
|
||||||
|
|
||||||
chunkBuffer = [];
|
chunkBuffer = [];
|
||||||
samplesCollected = 0;
|
samplesCollected = 0;
|
||||||
}
|
}
|
||||||
|
|
@ -134,17 +192,21 @@ export class AudioCaptureProcedure {
|
||||||
silentGain.disconnect();
|
silentGain.disconnect();
|
||||||
ctx.close();
|
ctx.close();
|
||||||
} catch { /* already closed */ }
|
} catch { /* already closed */ }
|
||||||
(window as any).__audioCaptureActive = false;
|
const processorsObj = (window as any).__audioCaptureProcessors as Record<string, any>;
|
||||||
console.log('[AudioCapture] Audio track ended, resources cleaned up');
|
const contextsObj = (window as any).__audioCaptureContexts as Record<string, AudioContext>;
|
||||||
|
delete processorsObj[trackId];
|
||||||
|
delete contextsObj[trackId];
|
||||||
|
console.log(`[AudioCapture] Audio track ended: track=${trackId}, resources cleaned up`);
|
||||||
});
|
});
|
||||||
|
|
||||||
(window as any).__audioCaptureCtx = ctx;
|
const processorsObj = (window as any).__audioCaptureProcessors as Record<string, any>;
|
||||||
(window as any).__audioCaptureProcessor = processor;
|
const contextsObj = (window as any).__audioCaptureContexts as Record<string, AudioContext>;
|
||||||
|
processorsObj[trackId] = processor;
|
||||||
|
contextsObj[trackId] = ctx;
|
||||||
|
|
||||||
console.log(`[AudioCapture] WebRTC audio track intercepted, native ${nativeRate}Hz -> 16kHz mono`);
|
console.log(`[AudioCapture] WebRTC audio track intercepted: track=${trackId}, native=${nativeRate}Hz -> 16kHz mono`);
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
console.error('[AudioCapture] Failed to set up audio capture:', err);
|
console.error('[AudioCapture] Failed to set up audio capture:', err);
|
||||||
(window as any).__audioCaptureActive = false;
|
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
@ -172,13 +234,17 @@ export class AudioCaptureProcedure {
|
||||||
this._pollInterval = setInterval(async () => {
|
this._pollInterval = setInterval(async () => {
|
||||||
try {
|
try {
|
||||||
const chunks = await this._page.evaluate(() => {
|
const chunks = await this._page.evaluate(() => {
|
||||||
const buf = (window as any).__audioCaptureChunks as string[];
|
const buf = (window as any).__audioCaptureChunks as CapturedAudioChunk[];
|
||||||
const result = buf.splice(0, buf.length);
|
const result = buf.splice(0, buf.length);
|
||||||
return result;
|
return result;
|
||||||
});
|
});
|
||||||
|
|
||||||
for (const chunk of chunks) {
|
for (const chunk of chunks) {
|
||||||
this._onAudioChunk(chunk, 16000);
|
this._onAudioChunk(
|
||||||
|
chunk.data,
|
||||||
|
chunk.sampleRate || 16000,
|
||||||
|
chunk.captureDiagnostics
|
||||||
|
);
|
||||||
}
|
}
|
||||||
} catch {
|
} catch {
|
||||||
// Page might be navigating or closed
|
// Page might be navigating or closed
|
||||||
|
|
@ -199,11 +265,24 @@ export class AudioCaptureProcedure {
|
||||||
|
|
||||||
try {
|
try {
|
||||||
await this._page.evaluate(() => {
|
await this._page.evaluate(() => {
|
||||||
(window as any).__audioCaptureActive = false;
|
const processors = (window as any).__audioCaptureProcessors as Record<string, any>;
|
||||||
const proc = (window as any).__audioCaptureProcessor;
|
const contexts = (window as any).__audioCaptureContexts as Record<string, AudioContext>;
|
||||||
if (proc) try { proc.disconnect(); } catch { /* ok */ }
|
Object.keys(processors || {}).forEach((trackId) => {
|
||||||
const ctx = (window as any).__audioCaptureCtx as AudioContext;
|
try {
|
||||||
if (ctx) ctx.close();
|
processors[trackId]?.disconnect();
|
||||||
|
} catch {
|
||||||
|
// ignore
|
||||||
|
}
|
||||||
|
});
|
||||||
|
Object.keys(contexts || {}).forEach((trackId) => {
|
||||||
|
try {
|
||||||
|
contexts[trackId]?.close();
|
||||||
|
} catch {
|
||||||
|
// ignore
|
||||||
|
}
|
||||||
|
});
|
||||||
|
(window as any).__audioCaptureProcessors = {};
|
||||||
|
(window as any).__audioCaptureContexts = {};
|
||||||
});
|
});
|
||||||
} catch {
|
} catch {
|
||||||
// Page might already be closed
|
// Page might already be closed
|
||||||
|
|
|
||||||
|
|
@ -864,6 +864,12 @@ export class BotOrchestrator {
|
||||||
});
|
});
|
||||||
|
|
||||||
this._page = await this._context.newPage();
|
this._page = await this._context.newPage();
|
||||||
|
this._page.on('console', (msg) => {
|
||||||
|
const text = msg.text();
|
||||||
|
if (text.includes('[AudioCapture]') || text.includes('[AudioPlayback]')) {
|
||||||
|
this._logger.info(`[PageConsole] ${text}`);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
// Stealth: Override browser properties that reveal automation.
|
// Stealth: Override browser properties that reveal automation.
|
||||||
// Teams checks these to detect headless/automated browsers and
|
// Teams checks these to detect headless/automated browsers and
|
||||||
|
|
@ -906,8 +912,8 @@ export class BotOrchestrator {
|
||||||
this._audioCaptureProcedure = new AudioCaptureProcedure(
|
this._audioCaptureProcedure = new AudioCaptureProcedure(
|
||||||
this._page,
|
this._page,
|
||||||
this._logger,
|
this._logger,
|
||||||
(base64Data, sampleRate) => {
|
(base64Data, sampleRate, captureDiagnostics) => {
|
||||||
this._sendAudioChunk(base64Data, sampleRate);
|
this._sendAudioChunk(base64Data, sampleRate, captureDiagnostics);
|
||||||
},
|
},
|
||||||
);
|
);
|
||||||
this._captionsProcedure = new CaptionsProcedure(
|
this._captionsProcedure = new CaptionsProcedure(
|
||||||
|
|
@ -1176,7 +1182,16 @@ export class BotOrchestrator {
|
||||||
/**
|
/**
|
||||||
* Send an audio chunk to the Gateway for STT processing.
|
* Send an audio chunk to the Gateway for STT processing.
|
||||||
*/
|
*/
|
||||||
private _sendAudioChunk(base64Data: string, sampleRate: number): void {
|
private _sendAudioChunk(
|
||||||
|
base64Data: string,
|
||||||
|
sampleRate: number,
|
||||||
|
captureDiagnostics?: {
|
||||||
|
trackId?: string;
|
||||||
|
readyState?: string;
|
||||||
|
rms?: number;
|
||||||
|
nativeSampleRate?: number;
|
||||||
|
},
|
||||||
|
): void {
|
||||||
const message: AudioChunkMessage = {
|
const message: AudioChunkMessage = {
|
||||||
type: 'audioChunk',
|
type: 'audioChunk',
|
||||||
sessionId: this._sessionId,
|
sessionId: this._sessionId,
|
||||||
|
|
@ -1185,6 +1200,7 @@ export class BotOrchestrator {
|
||||||
sampleRate,
|
sampleRate,
|
||||||
data: base64Data,
|
data: base64Data,
|
||||||
timestamp: new Date().toISOString(),
|
timestamp: new Date().toISOString(),
|
||||||
|
captureDiagnostics,
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
this._sendToGateway(message);
|
this._sendToGateway(message);
|
||||||
|
|
|
||||||
|
|
@ -63,6 +63,12 @@ export interface AudioChunkMessage {
|
||||||
sampleRate: number;
|
sampleRate: number;
|
||||||
data: string; // base64 encoded
|
data: string; // base64 encoded
|
||||||
timestamp: string;
|
timestamp: string;
|
||||||
|
captureDiagnostics?: {
|
||||||
|
trackId?: string;
|
||||||
|
readyState?: string;
|
||||||
|
rms?: number;
|
||||||
|
nativeSampleRate?: number;
|
||||||
|
};
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue