From dbecc602b72dd6bfa0cc09e50a796899c158886a Mon Sep 17 00:00:00 2001
From: ValueOn AG <p.motsch@valueon.ch>
Date: Mon, 23 Feb 2026 23:01:25 +0100
Subject: [PATCH] Fix voice capture: use native-rate AudioContext, downsample to 16 kHz, clean up on track end

---
 src/bot/audioCaptureProcedure.ts | 65 +++++++++++++++++++++++---------
 src/bot/orchestrator.ts          |  1 +
 2 files changed, 49 insertions(+), 17 deletions(-)

diff --git a/src/bot/audioCaptureProcedure.ts b/src/bot/audioCaptureProcedure.ts
index b81a57e..a0cca6b 100644
--- a/src/bot/audioCaptureProcedure.ts
+++ b/src/bot/audioCaptureProcedure.ts
@@ -8,8 +8,8 @@ import { Logger } from 'winston';
  * 1. Before page navigation, wraps window.RTCPeerConnection via addInitScript
  * 2. When Teams establishes WebRTC connections, the wrapper intercepts incoming audio tracks
  * 3. Incoming audio tracks are captured via AudioContext + ScriptProcessorNode
- * 4. Audio chunks (PCM16, 16kHz mono) are buffered and exposed via a global callback
- * 5. The Node.js side polls for chunks and sends them to the Gateway
+ * 4. Audio is captured at the AudioContext's native rate (typically 48kHz), downsampled to 16kHz, and converted to PCM16
+ * 5. Audio chunks are buffered and the Node.js side polls for them to send to the Gateway
  */
 export class AudioCaptureProcedure {
   private _page: Page;
@@ -39,7 +39,6 @@ export class AudioCaptureProcedure {
     this._logger.info('[AudioCapture] Injecting RTCPeerConnection wrapper...');
 
     await this._page.addInitScript(() => {
-      // Audio chunk buffer — Node.js polls this periodically
       (window as any).__audioCaptureChunks = [] as string[];
       (window as any).__audioCaptureActive = false;
 
@@ -56,15 +55,21 @@ export class AudioCaptureProcedure {
 
           try {
             const AudioCtx = window.AudioContext || (window as any).webkitAudioContext;
-            const ctx = new AudioCtx({ sampleRate: 16000 });
+            // Use native sample rate (48kHz for WebRTC/Opus) to avoid
+            // forced resampling which destabilises the Chromium audio stack.
+            const ctx = new AudioCtx();
+            const nativeRate = ctx.sampleRate;
             const stream = new MediaStream([event.track]);
             const source = ctx.createMediaStreamSource(stream);
 
-            // ScriptProcessor for raw PCM access (deprecated but widely supported)
-            const processor = ctx.createScriptProcessor(4096, 1, 1);
+            // ScriptProcessor with larger buffer (8192) reduces callback
+            // frequency and gives the renderer more breathing room.
+            const processor = ctx.createScriptProcessor(8192, 1, 1);
             let chunkBuffer: Float32Array[] = [];
             let samplesCollected = 0;
-            const samplesPerChunk = 16000; // 1 second of audio at 16kHz
+            // Collect ~1 second of audio at native rate before emitting
+            const samplesPerChunk = nativeRate;
+            const targetRate = 16000;
 
             processor.onaudioprocess = (e: AudioProcessingEvent) => {
               const input = e.inputBuffer.getChannelData(0);
@@ -72,7 +77,7 @@ export class AudioCaptureProcedure {
               samplesCollected += input.length;
 
               if (samplesCollected >= samplesPerChunk) {
-                // Merge buffers into one Float32Array
+                // Merge buffers into one contiguous array
                 const merged = new Float32Array(samplesCollected);
                 let offset = 0;
                 for (const buf of chunkBuffer) {
@@ -80,11 +85,14 @@ export class AudioCaptureProcedure {
                   offset += buf.length;
                 }
 
-                // Convert Float32 [-1,1] to PCM16 Int16
-                const pcm16 = new Int16Array(merged.length);
-                for (let i = 0; i < merged.length; i++) {
-                  const s = Math.max(-1, Math.min(1, merged[i]));
-                  pcm16[i] = s < 0 ? s * 0x8000 : s * 0x7FFF;
+                // Downsample from nativeRate to 16 kHz by nearest-sample decimation (no low-pass filter, so aliasing is possible)
+                const ratio = nativeRate / targetRate;
+                const outLen = Math.floor(merged.length / ratio);
+                const pcm16 = new Int16Array(outLen);
+                for (let i = 0; i < outLen; i++) {
+                  const srcIdx = Math.floor(i * ratio);
+                  const s = Math.max(-1, Math.min(1, merged[srcIdx]));
+                  pcm16[i] = Math.round(s * 32767);
                 }
 
                 // Convert to base64
@@ -95,7 +103,6 @@ export class AudioCaptureProcedure {
                 }
                 const base64 = btoa(binary);
 
-                // Push to buffer for Node.js to poll
                 const chunks = (window as any).__audioCaptureChunks as string[];
                 if (chunks.length < 30) {
                   chunks.push(base64);
@@ -107,15 +114,37 @@ export class AudioCaptureProcedure {
             };
 
             source.connect(processor);
-            processor.connect(ctx.destination);
+            // Connect to a silent gain node so the ScriptProcessor fires
+            // its onaudioprocess callback without routing captured audio
+            // to the speakers (which would conflict with the TTS AudioContext).
+            const silentGain = ctx.createGain();
+            silentGain.gain.value = 0;
+            processor.connect(silentGain);
+            silentGain.connect(ctx.destination);
+
+            // Resume the context explicitly — in authMode Chromium does
+            // not set --autoplay-policy, so new AudioContexts start suspended.
+            ctx.resume().catch(() => {});
+
+            // Clean up when the track ends (peer leaves, renegotiation, etc.)
+            event.track.addEventListener('ended', () => {
+              try {
+                processor.disconnect();
+                source.disconnect();
+                silentGain.disconnect();
+                ctx.close();
+              } catch { /* already closed */ }
+              (window as any).__audioCaptureActive = false;
+              console.log('[AudioCapture] Audio track ended, resources cleaned up');
+            });
 
-            // Store references for cleanup
             (window as any).__audioCaptureCtx = ctx;
             (window as any).__audioCaptureProcessor = processor;
 
-            console.log('[AudioCapture] WebRTC audio track intercepted, capturing at 16kHz mono');
+            console.log(`[AudioCapture] WebRTC audio track intercepted, native ${nativeRate}Hz -> 16kHz mono`);
           } catch (err) {
             console.error('[AudioCapture] Failed to set up audio capture:', err);
+            (window as any).__audioCaptureActive = false;
           }
         });
 
@@ -171,6 +200,8 @@ export class AudioCaptureProcedure {
     try {
       await this._page.evaluate(() => {
         (window as any).__audioCaptureActive = false;
+        const proc = (window as any).__audioCaptureProcessor;
+        if (proc) try { proc.disconnect(); } catch { /* ok */ }
         const ctx = (window as any).__audioCaptureCtx as AudioContext;
         if (ctx) ctx.close();
       });
diff --git a/src/bot/orchestrator.ts b/src/bot/orchestrator.ts
index dd47c3e..5d8fa25 100644
--- a/src/bot/orchestrator.ts
+++ b/src/bot/orchestrator.ts
@@ -837,6 +837,7 @@ export class BotOrchestrator {
           '--no-sandbox',
           '--use-fake-ui-for-media-stream',
           '--use-fake-device-for-media-stream',
+          '--autoplay-policy=no-user-gesture-required',
         ]
       : [
           '--use-fake-ui-for-media-stream',