diff --git a/src/bot/audioCaptureProcedure.ts b/src/bot/audioCaptureProcedure.ts index b81a57e..a0cca6b 100644 --- a/src/bot/audioCaptureProcedure.ts +++ b/src/bot/audioCaptureProcedure.ts @@ -8,8 +8,8 @@ import { Logger } from 'winston'; * 1. Before page navigation, wraps window.RTCPeerConnection via addInitScript * 2. When Teams establishes WebRTC connections, the wrapper intercepts incoming audio tracks * 3. Incoming audio tracks are captured via AudioContext + ScriptProcessorNode - * 4. Audio chunks (PCM16, 16kHz mono) are buffered and exposed via a global callback - * 5. The Node.js side polls for chunks and sends them to the Gateway + * 4. Audio is captured at native 48kHz, downsampled to 16kHz, and converted to PCM16 + * 5. Audio chunks are buffered and the Node.js side polls for them to send to the Gateway */ export class AudioCaptureProcedure { private _page: Page; @@ -39,7 +39,6 @@ export class AudioCaptureProcedure { this._logger.info('[AudioCapture] Injecting RTCPeerConnection wrapper...'); await this._page.addInitScript(() => { - // Audio chunk buffer — Node.js polls this periodically (window as any).__audioCaptureChunks = [] as string[]; (window as any).__audioCaptureActive = false; @@ -56,15 +55,21 @@ export class AudioCaptureProcedure { try { const AudioCtx = window.AudioContext || (window as any).webkitAudioContext; - const ctx = new AudioCtx({ sampleRate: 16000 }); + // Use native sample rate (48kHz for WebRTC/Opus) to avoid + // forced resampling which destabilises the Chromium audio stack. + const ctx = new AudioCtx(); + const nativeRate = ctx.sampleRate; const stream = new MediaStream([event.track]); const source = ctx.createMediaStreamSource(stream); - // ScriptProcessor for raw PCM access (deprecated but widely supported) - const processor = ctx.createScriptProcessor(4096, 1, 1); + // ScriptProcessor with larger buffer (8192) reduces callback + // frequency and gives the renderer more breathing room. + const processor = ctx.createScriptProcessor(8192, 1, 1); let chunkBuffer: Float32Array[] = []; let samplesCollected = 0; - const samplesPerChunk = 16000; // 1 second of audio at 16kHz + // Collect ~1 second of audio at native rate before emitting + const samplesPerChunk = nativeRate; + const targetRate = 16000; processor.onaudioprocess = (e: AudioProcessingEvent) => { const input = e.inputBuffer.getChannelData(0); @@ -72,7 +77,7 @@ export class AudioCaptureProcedure { samplesCollected += input.length; if (samplesCollected >= samplesPerChunk) { - // Merge buffers into one Float32Array + // Merge buffers into one contiguous array const merged = new Float32Array(samplesCollected); let offset = 0; for (const buf of chunkBuffer) { @@ -80,11 +85,14 @@ export class AudioCaptureProcedure { offset += buf.length; } - // Convert Float32 [-1,1] to PCM16 Int16 - const pcm16 = new Int16Array(merged.length); - for (let i = 0; i < merged.length; i++) { - const s = Math.max(-1, Math.min(1, merged[i])); - pcm16[i] = s < 0 ? s * 0x8000 : s * 0x7FFF; + // Downsample from nativeRate to 16 kHz + const ratio = nativeRate / targetRate; + const outLen = Math.floor(merged.length / ratio); + const pcm16 = new Int16Array(outLen); + for (let i = 0; i < outLen; i++) { + const srcIdx = Math.floor(i * ratio); + const s = Math.max(-1, Math.min(1, merged[srcIdx])); + pcm16[i] = Math.round(s * 32767); } // Convert to base64 @@ -95,7 +103,6 @@ export class AudioCaptureProcedure { } const base64 = btoa(binary); - // Push to buffer for Node.js to poll const chunks = (window as any).__audioCaptureChunks as string[]; if (chunks.length < 30) { chunks.push(base64); @@ -107,15 +114,37 @@ export class AudioCaptureProcedure { }; source.connect(processor); - processor.connect(ctx.destination); + // Connect to a silent gain node so the ScriptProcessor fires + // its onaudioprocess callback without routing captured audio + // to the speakers (which would conflict with the TTS AudioContext). + const silentGain = ctx.createGain(); + silentGain.gain.value = 0; + processor.connect(silentGain); + silentGain.connect(ctx.destination); + + // Resume the context explicitly — in authMode Chromium does + // not set --autoplay-policy, so new AudioContexts start suspended. + ctx.resume().catch(() => {}); + + // Clean up when the track ends (peer leaves, renegotiation, etc.) + event.track.addEventListener('ended', () => { + try { + processor.disconnect(); + source.disconnect(); + silentGain.disconnect(); + ctx.close(); + } catch { /* already closed */ } + (window as any).__audioCaptureActive = false; + console.log('[AudioCapture] Audio track ended, resources cleaned up'); + }); - // Store references for cleanup (window as any).__audioCaptureCtx = ctx; (window as any).__audioCaptureProcessor = processor; - console.log('[AudioCapture] WebRTC audio track intercepted, capturing at 16kHz mono'); + console.log(`[AudioCapture] WebRTC audio track intercepted, native ${nativeRate}Hz -> 16kHz mono`); } catch (err) { console.error('[AudioCapture] Failed to set up audio capture:', err); + (window as any).__audioCaptureActive = false; } }); @@ -171,6 +200,8 @@ export class AudioCaptureProcedure { try { await this._page.evaluate(() => { (window as any).__audioCaptureActive = false; + const proc = (window as any).__audioCaptureProcessor; + if (proc) try { proc.disconnect(); } catch { /* ok */ } const ctx = (window as any).__audioCaptureCtx as AudioContext; if (ctx) ctx.close(); }); diff --git a/src/bot/orchestrator.ts b/src/bot/orchestrator.ts index dd47c3e..5d8fa25 100644 --- a/src/bot/orchestrator.ts +++ b/src/bot/orchestrator.ts @@ -837,6 +837,7 @@ export class BotOrchestrator { '--no-sandbox', '--use-fake-ui-for-media-stream', '--use-fake-device-for-media-stream', + '--autoplay-policy=no-user-gesture-required', ] : [ '--use-fake-ui-for-media-stream',