fix: capture active Teams audio track with diagnostics
Replace the first-track audio lock with multi-track capture management, add RMS-based activity detection, and propagate capture diagnostics (trackId, readyState, rms, nativeSampleRate) to gateway logs for root-cause analysis.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
parent
f47b3c5682
commit
79c1555e0c
3 changed files with 126 additions and 25 deletions
|
|
@ -1,6 +1,19 @@
|
|||
import { Page } from 'playwright';
|
||||
import { Logger } from 'winston';
|
||||
|
||||
/**
 * Per-chunk capture metadata attached to each emitted audio chunk so that
 * downstream logs can tell which WebRTC track produced it and how "live"
 * that track was at the time.
 */
interface AudioChunkDiagnostics {
|
||||
/** Id of the source audio track (`event.track.id`, or a generated `audio-track-<timestamp>` fallback). */
trackId?: string;
|
||||
/** `event.track.readyState` as read when the chunk was queued. */
readyState?: string;
|
||||
/** Root-mean-square level of the chunk's samples, computed before downsampling and rounded to 6 decimals. */
rms?: number;
|
||||
/** Sample rate (Hz) the audio was captured at before being downsampled to 16 kHz. */
nativeSampleRate?: number;
|
||||
}
|
||||
|
||||
/**
 * Shape of one entry in the page-side `window.__audioCaptureChunks` queue:
 * the injected script pushes these and the polling loop drains them.
 */
interface CapturedAudioChunk {
|
||||
/** Base64-encoded audio payload (produced with `btoa` in the page). */
data: string;
|
||||
/** Sample rate (Hz) of `data`; the injected script sets this to its 16000 Hz target rate. */
sampleRate: number;
|
||||
/** Optional diagnostics describing the track and signal level this chunk came from. */
captureDiagnostics?: AudioChunkDiagnostics;
|
||||
}
|
||||
|
||||
/**
|
||||
* Captures incoming meeting audio by intercepting WebRTC RTCPeerConnection.
|
||||
*
|
||||
|
|
@ -14,7 +27,11 @@ import { Logger } from 'winston';
|
|||
export class AudioCaptureProcedure {
|
||||
private _page: Page;
|
||||
private _logger: Logger;
|
||||
private _onAudioChunk: (base64Data: string, sampleRate: number) => void;
|
||||
private _onAudioChunk: (
|
||||
base64Data: string,
|
||||
sampleRate: number,
|
||||
captureDiagnostics?: AudioChunkDiagnostics
|
||||
) => void;
|
||||
private _isCapturing: boolean = false;
|
||||
private _pollInterval: ReturnType<typeof setInterval> | null = null;
|
||||
private _injected: boolean = false;
|
||||
|
|
@ -22,7 +39,11 @@ export class AudioCaptureProcedure {
|
|||
constructor(
|
||||
page: Page,
|
||||
logger: Logger,
|
||||
onAudioChunk: (base64Data: string, sampleRate: number) => void,
|
||||
onAudioChunk: (
|
||||
base64Data: string,
|
||||
sampleRate: number,
|
||||
captureDiagnostics?: AudioChunkDiagnostics
|
||||
) => void,
|
||||
) {
|
||||
this._page = page;
|
||||
this._logger = logger;
|
||||
|
|
@ -39,8 +60,9 @@ export class AudioCaptureProcedure {
|
|||
this._logger.info('[AudioCapture] Injecting RTCPeerConnection wrapper...');
|
||||
|
||||
await this._page.addInitScript(() => {
|
||||
(window as any).__audioCaptureChunks = [] as string[];
|
||||
(window as any).__audioCaptureActive = false;
|
||||
(window as any).__audioCaptureChunks = [] as any[];
|
||||
(window as any).__audioCaptureProcessors = {} as Record<string, any>;
|
||||
(window as any).__audioCaptureContexts = {} as Record<string, AudioContext>;
|
||||
|
||||
const OrigRTC = window.RTCPeerConnection;
|
||||
|
||||
|
|
@ -50,8 +72,12 @@ export class AudioCaptureProcedure {
|
|||
|
||||
pc.addEventListener('track', (event: RTCTrackEvent) => {
|
||||
if (event.track.kind !== 'audio') return;
|
||||
if ((window as any).__audioCaptureActive) return;
|
||||
(window as any).__audioCaptureActive = true;
|
||||
|
||||
const trackId = event.track.id || `audio-track-${Date.now()}`;
|
||||
const processors = (window as any).__audioCaptureProcessors as Record<string, any>;
|
||||
if (processors[trackId]) {
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
const AudioCtx = window.AudioContext || (window as any).webkitAudioContext;
|
||||
|
|
@ -67,6 +93,8 @@ export class AudioCaptureProcedure {
|
|||
const processor = ctx.createScriptProcessor(8192, 1, 1);
|
||||
let chunkBuffer: Float32Array[] = [];
|
||||
let samplesCollected = 0;
|
||||
let skippedSilentChunks = 0;
|
||||
const minRmsThreshold = 0.0015;
|
||||
// Collect ~1 second of audio at native rate before emitting
|
||||
const samplesPerChunk = nativeRate;
|
||||
const targetRate = 16000;
|
||||
|
|
@ -85,6 +113,25 @@ export class AudioCaptureProcedure {
|
|||
offset += buf.length;
|
||||
}
|
||||
|
||||
// Calculate RMS to detect real audio activity
|
||||
let powerSum = 0;
|
||||
for (let i = 0; i < merged.length; i++) {
|
||||
powerSum += merged[i] * merged[i];
|
||||
}
|
||||
const rms = Math.sqrt(powerSum / Math.max(merged.length, 1));
|
||||
|
||||
if (rms < minRmsThreshold) {
|
||||
skippedSilentChunks++;
|
||||
if (skippedSilentChunks % 10 === 0) {
|
||||
console.log(
|
||||
`[AudioCapture] silent chunk skipped: track=${trackId}, readyState=${event.track.readyState}, rms=${rms.toFixed(6)}`
|
||||
);
|
||||
}
|
||||
chunkBuffer = [];
|
||||
samplesCollected = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
// Downsample from nativeRate to 16 kHz
|
||||
const ratio = nativeRate / targetRate;
|
||||
const outLen = Math.floor(merged.length / ratio);
|
||||
|
|
@ -103,11 +150,22 @@ export class AudioCaptureProcedure {
|
|||
}
|
||||
const base64 = btoa(binary);
|
||||
|
||||
const chunks = (window as any).__audioCaptureChunks as string[];
|
||||
if (chunks.length < 30) {
|
||||
chunks.push(base64);
|
||||
const chunks = (window as any).__audioCaptureChunks as any[];
|
||||
if (chunks.length < 60) {
|
||||
chunks.push({
|
||||
data: base64,
|
||||
sampleRate: targetRate,
|
||||
captureDiagnostics: {
|
||||
trackId,
|
||||
readyState: event.track.readyState,
|
||||
rms: Number(rms.toFixed(6)),
|
||||
nativeSampleRate: nativeRate,
|
||||
},
|
||||
});
|
||||
}
|
||||
|
||||
skippedSilentChunks = 0;
|
||||
|
||||
chunkBuffer = [];
|
||||
samplesCollected = 0;
|
||||
}
|
||||
|
|
@ -134,17 +192,21 @@ export class AudioCaptureProcedure {
|
|||
silentGain.disconnect();
|
||||
ctx.close();
|
||||
} catch { /* already closed */ }
|
||||
(window as any).__audioCaptureActive = false;
|
||||
console.log('[AudioCapture] Audio track ended, resources cleaned up');
|
||||
const processorsObj = (window as any).__audioCaptureProcessors as Record<string, any>;
|
||||
const contextsObj = (window as any).__audioCaptureContexts as Record<string, AudioContext>;
|
||||
delete processorsObj[trackId];
|
||||
delete contextsObj[trackId];
|
||||
console.log(`[AudioCapture] Audio track ended: track=${trackId}, resources cleaned up`);
|
||||
});
|
||||
|
||||
(window as any).__audioCaptureCtx = ctx;
|
||||
(window as any).__audioCaptureProcessor = processor;
|
||||
const processorsObj = (window as any).__audioCaptureProcessors as Record<string, any>;
|
||||
const contextsObj = (window as any).__audioCaptureContexts as Record<string, AudioContext>;
|
||||
processorsObj[trackId] = processor;
|
||||
contextsObj[trackId] = ctx;
|
||||
|
||||
console.log(`[AudioCapture] WebRTC audio track intercepted, native ${nativeRate}Hz -> 16kHz mono`);
|
||||
console.log(`[AudioCapture] WebRTC audio track intercepted: track=${trackId}, native=${nativeRate}Hz -> 16kHz mono`);
|
||||
} catch (err) {
|
||||
console.error('[AudioCapture] Failed to set up audio capture:', err);
|
||||
(window as any).__audioCaptureActive = false;
|
||||
}
|
||||
});
|
||||
|
||||
|
|
@ -172,13 +234,17 @@ export class AudioCaptureProcedure {
|
|||
this._pollInterval = setInterval(async () => {
|
||||
try {
|
||||
const chunks = await this._page.evaluate(() => {
|
||||
const buf = (window as any).__audioCaptureChunks as string[];
|
||||
const buf = (window as any).__audioCaptureChunks as CapturedAudioChunk[];
|
||||
const result = buf.splice(0, buf.length);
|
||||
return result;
|
||||
});
|
||||
|
||||
for (const chunk of chunks) {
|
||||
this._onAudioChunk(chunk, 16000);
|
||||
this._onAudioChunk(
|
||||
chunk.data,
|
||||
chunk.sampleRate || 16000,
|
||||
chunk.captureDiagnostics
|
||||
);
|
||||
}
|
||||
} catch {
|
||||
// Page might be navigating or closed
|
||||
|
|
@ -199,11 +265,24 @@ export class AudioCaptureProcedure {
|
|||
|
||||
try {
|
||||
await this._page.evaluate(() => {
|
||||
(window as any).__audioCaptureActive = false;
|
||||
const proc = (window as any).__audioCaptureProcessor;
|
||||
if (proc) try { proc.disconnect(); } catch { /* ok */ }
|
||||
const ctx = (window as any).__audioCaptureCtx as AudioContext;
|
||||
if (ctx) ctx.close();
|
||||
const processors = (window as any).__audioCaptureProcessors as Record<string, any>;
|
||||
const contexts = (window as any).__audioCaptureContexts as Record<string, AudioContext>;
|
||||
Object.keys(processors || {}).forEach((trackId) => {
|
||||
try {
|
||||
processors[trackId]?.disconnect();
|
||||
} catch {
|
||||
// ignore
|
||||
}
|
||||
});
|
||||
Object.keys(contexts || {}).forEach((trackId) => {
|
||||
try {
|
||||
contexts[trackId]?.close();
|
||||
} catch {
|
||||
// ignore
|
||||
}
|
||||
});
|
||||
(window as any).__audioCaptureProcessors = {};
|
||||
(window as any).__audioCaptureContexts = {};
|
||||
});
|
||||
} catch {
|
||||
// Page might already be closed
|
||||
|
|
|
|||
|
|
@ -864,6 +864,12 @@ export class BotOrchestrator {
|
|||
});
|
||||
|
||||
this._page = await this._context.newPage();
|
||||
this._page.on('console', (msg) => {
|
||||
const text = msg.text();
|
||||
if (text.includes('[AudioCapture]') || text.includes('[AudioPlayback]')) {
|
||||
this._logger.info(`[PageConsole] ${text}`);
|
||||
}
|
||||
});
|
||||
|
||||
// Stealth: Override browser properties that reveal automation.
|
||||
// Teams checks these to detect headless/automated browsers and
|
||||
|
|
@ -906,8 +912,8 @@ export class BotOrchestrator {
|
|||
this._audioCaptureProcedure = new AudioCaptureProcedure(
|
||||
this._page,
|
||||
this._logger,
|
||||
(base64Data, sampleRate) => {
|
||||
this._sendAudioChunk(base64Data, sampleRate);
|
||||
(base64Data, sampleRate, captureDiagnostics) => {
|
||||
this._sendAudioChunk(base64Data, sampleRate, captureDiagnostics);
|
||||
},
|
||||
);
|
||||
this._captionsProcedure = new CaptionsProcedure(
|
||||
|
|
@ -1176,7 +1182,16 @@ export class BotOrchestrator {
|
|||
/**
|
||||
* Send an audio chunk to the Gateway for STT processing.
|
||||
*/
|
||||
private _sendAudioChunk(base64Data: string, sampleRate: number): void {
|
||||
private _sendAudioChunk(
|
||||
base64Data: string,
|
||||
sampleRate: number,
|
||||
captureDiagnostics?: {
|
||||
trackId?: string;
|
||||
readyState?: string;
|
||||
rms?: number;
|
||||
nativeSampleRate?: number;
|
||||
},
|
||||
): void {
|
||||
const message: AudioChunkMessage = {
|
||||
type: 'audioChunk',
|
||||
sessionId: this._sessionId,
|
||||
|
|
@ -1185,6 +1200,7 @@ export class BotOrchestrator {
|
|||
sampleRate,
|
||||
data: base64Data,
|
||||
timestamp: new Date().toISOString(),
|
||||
captureDiagnostics,
|
||||
},
|
||||
};
|
||||
this._sendToGateway(message);
|
||||
|
|
|
|||
|
|
@ -63,6 +63,12 @@ export interface AudioChunkMessage {
|
|||
sampleRate: number;
|
||||
data: string; // base64 encoded
|
||||
timestamp: string;
|
||||
captureDiagnostics?: {
|
||||
trackId?: string;
|
||||
readyState?: string;
|
||||
rms?: number;
|
||||
nativeSampleRate?: number;
|
||||
};
|
||||
};
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue