From dbecc602b72dd6bfa0cc09e50a796899c158886a Mon Sep 17 00:00:00 2001
From: ValueOn AG
Date: Mon, 23 Feb 2026 23:01:25 +0100
Subject: [PATCH] Fix voice capture: record at native sample rate, downsample to 16kHz, clean up on track end
---
src/bot/audioCaptureProcedure.ts | 65 +++++++++++++++++++++++---------
src/bot/orchestrator.ts | 1 +
2 files changed, 49 insertions(+), 17 deletions(-)
diff --git a/src/bot/audioCaptureProcedure.ts b/src/bot/audioCaptureProcedure.ts
index b81a57e..a0cca6b 100644
--- a/src/bot/audioCaptureProcedure.ts
+++ b/src/bot/audioCaptureProcedure.ts
@@ -8,8 +8,8 @@ import { Logger } from 'winston';
* 1. Before page navigation, wraps window.RTCPeerConnection via addInitScript
* 2. When Teams establishes WebRTC connections, the wrapper intercepts incoming audio tracks
* 3. Incoming audio tracks are captured via AudioContext + ScriptProcessorNode
- * 4. Audio chunks (PCM16, 16kHz mono) are buffered and exposed via a global callback
- * 5. The Node.js side polls for chunks and sends them to the Gateway
+ * 4. Audio is captured at the AudioContext's native sample rate (typically 48kHz), downsampled to 16kHz, and converted to PCM16
+ * 5. Audio chunks are buffered and the Node.js side polls for them to send to the Gateway
*/
export class AudioCaptureProcedure {
private _page: Page;
@@ -39,7 +39,6 @@ export class AudioCaptureProcedure {
this._logger.info('[AudioCapture] Injecting RTCPeerConnection wrapper...');
await this._page.addInitScript(() => {
- // Audio chunk buffer — Node.js polls this periodically
(window as any).__audioCaptureChunks = [] as string[];
(window as any).__audioCaptureActive = false;
@@ -56,15 +55,21 @@ export class AudioCaptureProcedure {
try {
const AudioCtx = window.AudioContext || (window as any).webkitAudioContext;
- const ctx = new AudioCtx({ sampleRate: 16000 });
+ // Use native sample rate (48kHz for WebRTC/Opus) to avoid
+ // forced resampling which destabilises the Chromium audio stack.
+ const ctx = new AudioCtx();
+ const nativeRate = ctx.sampleRate;
const stream = new MediaStream([event.track]);
const source = ctx.createMediaStreamSource(stream);
- // ScriptProcessor for raw PCM access (deprecated but widely supported)
- const processor = ctx.createScriptProcessor(4096, 1, 1);
+ // ScriptProcessor with larger buffer (8192) reduces callback
+ // frequency and gives the renderer more breathing room.
+ const processor = ctx.createScriptProcessor(8192, 1, 1);
let chunkBuffer: Float32Array[] = [];
let samplesCollected = 0;
- const samplesPerChunk = 16000; // 1 second of audio at 16kHz
+ // Collect ~1 second of audio at native rate before emitting
+ const samplesPerChunk = nativeRate;
+ const targetRate = 16000;
processor.onaudioprocess = (e: AudioProcessingEvent) => {
const input = e.inputBuffer.getChannelData(0);
@@ -72,7 +77,7 @@ export class AudioCaptureProcedure {
samplesCollected += input.length;
if (samplesCollected >= samplesPerChunk) {
- // Merge buffers into one Float32Array
+ // Merge buffers into one contiguous array
const merged = new Float32Array(samplesCollected);
let offset = 0;
for (const buf of chunkBuffer) {
@@ -80,11 +85,14 @@ export class AudioCaptureProcedure {
offset += buf.length;
}
- // Convert Float32 [-1,1] to PCM16 Int16
- const pcm16 = new Int16Array(merged.length);
- for (let i = 0; i < merged.length; i++) {
- const s = Math.max(-1, Math.min(1, merged[i]));
- pcm16[i] = s < 0 ? s * 0x8000 : s * 0x7FFF;
+ // Downsample from nativeRate to 16 kHz by nearest-sample decimation (no low-pass filter, so content above 8 kHz can alias)
+ const ratio = nativeRate / targetRate;
+ const outLen = Math.floor(merged.length / ratio);
+ const pcm16 = new Int16Array(outLen);
+ for (let i = 0; i < outLen; i++) {
+ const srcIdx = Math.floor(i * ratio);
+ const s = Math.max(-1, Math.min(1, merged[srcIdx]));
+ pcm16[i] = Math.round(s * 32767);
}
// Convert to base64
@@ -95,7 +103,6 @@ export class AudioCaptureProcedure {
}
const base64 = btoa(binary);
- // Push to buffer for Node.js to poll
const chunks = (window as any).__audioCaptureChunks as string[];
if (chunks.length < 30) {
chunks.push(base64);
@@ -107,15 +114,37 @@ export class AudioCaptureProcedure {
};
source.connect(processor);
- processor.connect(ctx.destination);
+ // Connect to a silent gain node so the ScriptProcessor fires
+ // its onaudioprocess callback without routing captured audio
+ // to the speakers (which would conflict with the TTS AudioContext).
+ const silentGain = ctx.createGain();
+ silentGain.gain.value = 0;
+ processor.connect(silentGain);
+ silentGain.connect(ctx.destination);
+
+ // Resume the context explicitly — in authMode Chromium does
+ // not set --autoplay-policy, so new AudioContexts start suspended.
+ ctx.resume().catch(() => {});
+
+ // Clean up when the track ends (peer leaves, renegotiation, etc.)
+ event.track.addEventListener('ended', () => {
+ try {
+ processor.disconnect();
+ source.disconnect();
+ silentGain.disconnect();
+ ctx.close();
+ } catch { /* already closed */ }
+ (window as any).__audioCaptureActive = false;
+ console.log('[AudioCapture] Audio track ended, resources cleaned up');
+ });
- // Store references for cleanup
(window as any).__audioCaptureCtx = ctx;
(window as any).__audioCaptureProcessor = processor;
- console.log('[AudioCapture] WebRTC audio track intercepted, capturing at 16kHz mono');
+ console.log(`[AudioCapture] WebRTC audio track intercepted, native ${nativeRate}Hz -> 16kHz mono`);
} catch (err) {
console.error('[AudioCapture] Failed to set up audio capture:', err);
+ (window as any).__audioCaptureActive = false;
}
});
@@ -171,6 +200,8 @@ export class AudioCaptureProcedure {
try {
await this._page.evaluate(() => {
(window as any).__audioCaptureActive = false;
+ const proc = (window as any).__audioCaptureProcessor;
+ if (proc) try { proc.disconnect(); } catch { /* ok */ }
const ctx = (window as any).__audioCaptureCtx as AudioContext;
if (ctx) ctx.close();
});
diff --git a/src/bot/orchestrator.ts b/src/bot/orchestrator.ts
index dd47c3e..5d8fa25 100644
--- a/src/bot/orchestrator.ts
+++ b/src/bot/orchestrator.ts
@@ -837,6 +837,7 @@ export class BotOrchestrator {
'--no-sandbox',
'--use-fake-ui-for-media-stream',
'--use-fake-device-for-media-stream',
+ '--autoplay-policy=no-user-gesture-required',
]
: [
'--use-fake-ui-for-media-stream',