fixed voice

This commit is contained in:
ValueOn AG 2026-02-23 23:01:25 +01:00
parent 9e4aad973f
commit dbecc602b7
2 changed files with 49 additions and 17 deletions

View file

@@ -8,8 +8,8 @@ import { Logger } from 'winston';
* 1. Before page navigation, wraps window.RTCPeerConnection via addInitScript * 1. Before page navigation, wraps window.RTCPeerConnection via addInitScript
* 2. When Teams establishes WebRTC connections, the wrapper intercepts incoming audio tracks * 2. When Teams establishes WebRTC connections, the wrapper intercepts incoming audio tracks
* 3. Incoming audio tracks are captured via AudioContext + ScriptProcessorNode * 3. Incoming audio tracks are captured via AudioContext + ScriptProcessorNode
* 4. Audio chunks (PCM16, 16kHz mono) are buffered and exposed via a global callback * 4. Audio is captured at native 48kHz, downsampled to 16kHz, and converted to PCM16
* 5. The Node.js side polls for chunks and sends them to the Gateway * 5. Audio chunks are buffered and the Node.js side polls for them to send to the Gateway
*/ */
export class AudioCaptureProcedure { export class AudioCaptureProcedure {
private _page: Page; private _page: Page;
@@ -39,7 +39,6 @@ export class AudioCaptureProcedure {
this._logger.info('[AudioCapture] Injecting RTCPeerConnection wrapper...'); this._logger.info('[AudioCapture] Injecting RTCPeerConnection wrapper...');
await this._page.addInitScript(() => { await this._page.addInitScript(() => {
// Audio chunk buffer — Node.js polls this periodically
(window as any).__audioCaptureChunks = [] as string[]; (window as any).__audioCaptureChunks = [] as string[];
(window as any).__audioCaptureActive = false; (window as any).__audioCaptureActive = false;
@@ -56,15 +55,21 @@ export class AudioCaptureProcedure {
try { try {
const AudioCtx = window.AudioContext || (window as any).webkitAudioContext; const AudioCtx = window.AudioContext || (window as any).webkitAudioContext;
const ctx = new AudioCtx({ sampleRate: 16000 }); // Use native sample rate (48kHz for WebRTC/Opus) to avoid
// forced resampling which destabilises the Chromium audio stack.
const ctx = new AudioCtx();
const nativeRate = ctx.sampleRate;
const stream = new MediaStream([event.track]); const stream = new MediaStream([event.track]);
const source = ctx.createMediaStreamSource(stream); const source = ctx.createMediaStreamSource(stream);
// ScriptProcessor for raw PCM access (deprecated but widely supported) // ScriptProcessor with larger buffer (8192) reduces callback
const processor = ctx.createScriptProcessor(4096, 1, 1); // frequency and gives the renderer more breathing room.
const processor = ctx.createScriptProcessor(8192, 1, 1);
let chunkBuffer: Float32Array[] = []; let chunkBuffer: Float32Array[] = [];
let samplesCollected = 0; let samplesCollected = 0;
const samplesPerChunk = 16000; // 1 second of audio at 16kHz // Collect ~1 second of audio at native rate before emitting
const samplesPerChunk = nativeRate;
const targetRate = 16000;
processor.onaudioprocess = (e: AudioProcessingEvent) => { processor.onaudioprocess = (e: AudioProcessingEvent) => {
const input = e.inputBuffer.getChannelData(0); const input = e.inputBuffer.getChannelData(0);
@@ -72,7 +77,7 @@ export class AudioCaptureProcedure {
samplesCollected += input.length; samplesCollected += input.length;
if (samplesCollected >= samplesPerChunk) { if (samplesCollected >= samplesPerChunk) {
// Merge buffers into one Float32Array // Merge buffers into one contiguous array
const merged = new Float32Array(samplesCollected); const merged = new Float32Array(samplesCollected);
let offset = 0; let offset = 0;
for (const buf of chunkBuffer) { for (const buf of chunkBuffer) {
@@ -80,11 +85,14 @@ export class AudioCaptureProcedure {
offset += buf.length; offset += buf.length;
} }
// Convert Float32 [-1,1] to PCM16 Int16 // Downsample from nativeRate to 16 kHz
const pcm16 = new Int16Array(merged.length); const ratio = nativeRate / targetRate;
for (let i = 0; i < merged.length; i++) { const outLen = Math.floor(merged.length / ratio);
const s = Math.max(-1, Math.min(1, merged[i])); const pcm16 = new Int16Array(outLen);
pcm16[i] = s < 0 ? s * 0x8000 : s * 0x7FFF; for (let i = 0; i < outLen; i++) {
const srcIdx = Math.floor(i * ratio);
const s = Math.max(-1, Math.min(1, merged[srcIdx]));
pcm16[i] = Math.round(s * 32767);
} }
// Convert to base64 // Convert to base64
@@ -95,7 +103,6 @@ export class AudioCaptureProcedure {
} }
const base64 = btoa(binary); const base64 = btoa(binary);
// Push to buffer for Node.js to poll
const chunks = (window as any).__audioCaptureChunks as string[]; const chunks = (window as any).__audioCaptureChunks as string[];
if (chunks.length < 30) { if (chunks.length < 30) {
chunks.push(base64); chunks.push(base64);
@@ -107,15 +114,37 @@ export class AudioCaptureProcedure {
}; };
source.connect(processor); source.connect(processor);
processor.connect(ctx.destination); // Connect to a silent gain node so the ScriptProcessor fires
// its onaudioprocess callback without routing captured audio
// to the speakers (which would conflict with the TTS AudioContext).
const silentGain = ctx.createGain();
silentGain.gain.value = 0;
processor.connect(silentGain);
silentGain.connect(ctx.destination);
// Resume the context explicitly — in authMode Chromium does
// not set --autoplay-policy, so new AudioContexts start suspended.
ctx.resume().catch(() => {});
// Clean up when the track ends (peer leaves, renegotiation, etc.)
event.track.addEventListener('ended', () => {
try {
processor.disconnect();
source.disconnect();
silentGain.disconnect();
ctx.close();
} catch { /* already closed */ }
(window as any).__audioCaptureActive = false;
console.log('[AudioCapture] Audio track ended, resources cleaned up');
});
// Store references for cleanup
(window as any).__audioCaptureCtx = ctx; (window as any).__audioCaptureCtx = ctx;
(window as any).__audioCaptureProcessor = processor; (window as any).__audioCaptureProcessor = processor;
console.log('[AudioCapture] WebRTC audio track intercepted, capturing at 16kHz mono'); console.log(`[AudioCapture] WebRTC audio track intercepted, native ${nativeRate}Hz -> 16kHz mono`);
} catch (err) { } catch (err) {
console.error('[AudioCapture] Failed to set up audio capture:', err); console.error('[AudioCapture] Failed to set up audio capture:', err);
(window as any).__audioCaptureActive = false;
} }
}); });
@@ -171,6 +200,8 @@ export class AudioCaptureProcedure {
try { try {
await this._page.evaluate(() => { await this._page.evaluate(() => {
(window as any).__audioCaptureActive = false; (window as any).__audioCaptureActive = false;
const proc = (window as any).__audioCaptureProcessor;
if (proc) try { proc.disconnect(); } catch { /* ok */ }
const ctx = (window as any).__audioCaptureCtx as AudioContext; const ctx = (window as any).__audioCaptureCtx as AudioContext;
if (ctx) ctx.close(); if (ctx) ctx.close();
}); });

View file

@@ -837,6 +837,7 @@ export class BotOrchestrator {
'--no-sandbox', '--no-sandbox',
'--use-fake-ui-for-media-stream', '--use-fake-ui-for-media-stream',
'--use-fake-device-for-media-stream', '--use-fake-device-for-media-stream',
'--autoplay-policy=no-user-gesture-required',
] ]
: [ : [
'--use-fake-ui-for-media-stream', '--use-fake-ui-for-media-stream',