fixed voice

2026-02-23 23:01:25 +01:00 · 2026-02-23 23:01:25 +01:00 · dbecc602b7
commit dbecc602b7
parent 9e4aad973f
2 changed files with 49 additions and 17 deletions
--- a/src/bot/audioCaptureProcedure.ts
+++ b/src/bot/audioCaptureProcedure.ts
@ -8,8 +8,8 @@ import { Logger } from 'winston';
 * 1. Before page navigation, wraps window.RTCPeerConnection via addInitScript
 * 2. When Teams establishes WebRTC connections, the wrapper intercepts incoming audio tracks
 * 3. Incoming audio tracks are captured via AudioContext + ScriptProcessorNode
- * 4. Audio chunks (PCM16, 16kHz mono) are buffered and exposed via a global callback
+ * 4. Audio is captured at the AudioContext's native sample rate (typically 48kHz), downsampled to 16kHz, and converted to PCM16
- * 5. The Node.js side polls for chunks and sends them to the Gateway
+ * 5. Audio chunks are buffered and the Node.js side polls for them to send to the Gateway
 */
 export class AudioCaptureProcedure {
  private _page: Page;
@ -39,7 +39,6 @@ export class AudioCaptureProcedure {
    this._logger.info('[AudioCapture] Injecting RTCPeerConnection wrapper...');
    await this._page.addInitScript(() => {
      // Audio chunk buffer — Node.js polls this periodically
      (window as any).__audioCaptureChunks = [] as string[];
      (window as any).__audioCaptureActive = false;
@ -56,15 +55,21 @@ export class AudioCaptureProcedure {
          try {
            const AudioCtx = window.AudioContext || (window as any).webkitAudioContext;
-            const ctx = new AudioCtx({ sampleRate: 16000 });
+            // Use native sample rate (48kHz for WebRTC/Opus) to avoid
            // forced resampling which destabilises the Chromium audio stack.
            const ctx = new AudioCtx();
            const nativeRate = ctx.sampleRate;
            const stream = new MediaStream([event.track]);
            const source = ctx.createMediaStreamSource(stream);
-            // ScriptProcessor for raw PCM access (deprecated but widely supported)
+            // ScriptProcessor with larger buffer (8192) reduces callback
-            const processor = ctx.createScriptProcessor(4096, 1, 1);
+            // frequency and gives the renderer more breathing room.
            const processor = ctx.createScriptProcessor(8192, 1, 1);
            let chunkBuffer: Float32Array[] = [];
            let samplesCollected = 0;
-            const samplesPerChunk = 16000; // 1 second of audio at 16kHz
+            // Collect ~1 second of audio at native rate before emitting
            const samplesPerChunk = nativeRate;
            const targetRate = 16000;
            processor.onaudioprocess = (e: AudioProcessingEvent) => {
              const input = e.inputBuffer.getChannelData(0);
@ -72,7 +77,7 @@ export class AudioCaptureProcedure {
              samplesCollected += input.length;
              if (samplesCollected >= samplesPerChunk) {
-                // Merge buffers into one Float32Array
+                // Merge buffers into one contiguous array
                const merged = new Float32Array(samplesCollected);
                let offset = 0;
                for (const buf of chunkBuffer) {
@ -80,11 +85,14 @@ export class AudioCaptureProcedure {
                  offset += buf.length;
                }
-                // Convert Float32 [-1,1] to PCM16 Int16
+                // Downsample from nativeRate to 16 kHz
-                const pcm16 = new Int16Array(merged.length);
+                const ratio = nativeRate / targetRate;
-                for (let i = 0; i < merged.length; i++) {
+                const outLen = Math.floor(merged.length / ratio);
-                  const s = Math.max(-1, Math.min(1, merged[i]));
+                const pcm16 = new Int16Array(outLen);
-                  pcm16[i] = s < 0 ? s * 0x8000 : s * 0x7FFF;
+                for (let i = 0; i < outLen; i++) {
                  const srcIdx = Math.floor(i * ratio);
                  const s = Math.max(-1, Math.min(1, merged[srcIdx]));
                  pcm16[i] = Math.round(s * 32767);
                }
                // Convert to base64
@ -95,7 +103,6 @@ export class AudioCaptureProcedure {
                }
                const base64 = btoa(binary);
                // Push to buffer for Node.js to poll
                const chunks = (window as any).__audioCaptureChunks as string[];
                if (chunks.length < 30) {
                  chunks.push(base64);
@ -107,15 +114,37 @@ export class AudioCaptureProcedure {
            };
            source.connect(processor);
-            processor.connect(ctx.destination);
+            // Connect to a silent gain node so the ScriptProcessor fires
            // its onaudioprocess callback without routing captured audio
            // to the speakers (which would conflict with the TTS AudioContext).
            const silentGain = ctx.createGain();
            silentGain.gain.value = 0;
            processor.connect(silentGain);
            silentGain.connect(ctx.destination);
            // Resume the context explicitly — in authMode Chromium does
            // not set --autoplay-policy, so new AudioContexts start suspended.
            ctx.resume().catch(() => {});
            // Clean up when the track ends (peer leaves, renegotiation, etc.)
            event.track.addEventListener('ended', () => {
              // Detach every node independently so that one failing
              // disconnect() cannot skip the rest of the teardown.
              for (const node of [processor, source, silentGain]) {
                try {
                  node.disconnect();
                } catch { /* already disconnected */ }
              }
              // AudioContext.close() reports failure by rejecting its promise,
              // which a synchronous try/catch never sees. Skip the call when the
              // context is already closed and swallow a late rejection so the
              // page does not raise an unhandled promise rejection.
              if (ctx.state !== 'closed') {
                ctx.close().catch(() => { /* already closed */ });
              }
              (window as any).__audioCaptureActive = false;
              console.log('[AudioCapture] Audio track ended, resources cleaned up');
            });
            // Store references for cleanup
            (window as any).__audioCaptureCtx = ctx;
            (window as any).__audioCaptureProcessor = processor;
-            console.log('[AudioCapture] WebRTC audio track intercepted, capturing at 16kHz mono');
+            console.log(`[AudioCapture] WebRTC audio track intercepted, native ${nativeRate}Hz -> 16kHz mono`);
          } catch (err) {
            console.error('[AudioCapture] Failed to set up audio capture:', err);
            (window as any).__audioCaptureActive = false;
          }
        });
@ -171,6 +200,8 @@ export class AudioCaptureProcedure {
    try {
      await this._page.evaluate(() => {
        // Mark capture as inactive before releasing the audio graph.
        (window as any).__audioCaptureActive = false;
        const proc = (window as any).__audioCaptureProcessor;
        if (proc) try { proc.disconnect(); } catch { /* ok */ }
        const ctx = (window as any).__audioCaptureCtx as AudioContext | undefined;
        // The track 'ended' listener may already have closed this context;
        // close() on a closed AudioContext rejects, so guard on state and
        // absorb the rejection instead of leaving it unhandled in the page.
        if (ctx && ctx.state !== 'closed') {
          ctx.close().catch(() => { /* closed concurrently */ });
        }
      });
--- a/src/bot/orchestrator.ts
+++ b/src/bot/orchestrator.ts
@ -837,6 +837,7 @@ export class BotOrchestrator {
          '--no-sandbox',
          '--use-fake-ui-for-media-stream',
          '--use-fake-device-for-media-stream',
          '--autoplay-policy=no-user-gesture-required',
        ]
      : [
          '--use-fake-ui-for-media-stream',