fixed voice
This commit is contained in:
parent
9e4aad973f
commit
dbecc602b7
2 changed files with 49 additions and 17 deletions
|
|
@ -8,8 +8,8 @@ import { Logger } from 'winston';
|
||||||
* 1. Before page navigation, wraps window.RTCPeerConnection via addInitScript
|
* 1. Before page navigation, wraps window.RTCPeerConnection via addInitScript
|
||||||
* 2. When Teams establishes WebRTC connections, the wrapper intercepts incoming audio tracks
|
* 2. When Teams establishes WebRTC connections, the wrapper intercepts incoming audio tracks
|
||||||
* 3. Incoming audio tracks are captured via AudioContext + ScriptProcessorNode
|
* 3. Incoming audio tracks are captured via AudioContext + ScriptProcessorNode
|
||||||
* 4. Audio chunks (PCM16, 16kHz mono) are buffered and exposed via a global callback
|
* 4. Audio is captured at native 48kHz, downsampled to 16kHz, and converted to PCM16
|
||||||
* 5. The Node.js side polls for chunks and sends them to the Gateway
|
* 5. Audio chunks are buffered and the Node.js side polls for them to send to the Gateway
|
||||||
*/
|
*/
|
||||||
export class AudioCaptureProcedure {
|
export class AudioCaptureProcedure {
|
||||||
private _page: Page;
|
private _page: Page;
|
||||||
|
|
@ -39,7 +39,6 @@ export class AudioCaptureProcedure {
|
||||||
this._logger.info('[AudioCapture] Injecting RTCPeerConnection wrapper...');
|
this._logger.info('[AudioCapture] Injecting RTCPeerConnection wrapper...');
|
||||||
|
|
||||||
await this._page.addInitScript(() => {
|
await this._page.addInitScript(() => {
|
||||||
// Audio chunk buffer — Node.js polls this periodically
|
|
||||||
(window as any).__audioCaptureChunks = [] as string[];
|
(window as any).__audioCaptureChunks = [] as string[];
|
||||||
(window as any).__audioCaptureActive = false;
|
(window as any).__audioCaptureActive = false;
|
||||||
|
|
||||||
|
|
@ -56,15 +55,21 @@ export class AudioCaptureProcedure {
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const AudioCtx = window.AudioContext || (window as any).webkitAudioContext;
|
const AudioCtx = window.AudioContext || (window as any).webkitAudioContext;
|
||||||
const ctx = new AudioCtx({ sampleRate: 16000 });
|
// Use native sample rate (48kHz for WebRTC/Opus) to avoid
|
||||||
|
// forced resampling which destabilises the Chromium audio stack.
|
||||||
|
const ctx = new AudioCtx();
|
||||||
|
const nativeRate = ctx.sampleRate;
|
||||||
const stream = new MediaStream([event.track]);
|
const stream = new MediaStream([event.track]);
|
||||||
const source = ctx.createMediaStreamSource(stream);
|
const source = ctx.createMediaStreamSource(stream);
|
||||||
|
|
||||||
// ScriptProcessor for raw PCM access (deprecated but widely supported)
|
// ScriptProcessor with larger buffer (8192) reduces callback
|
||||||
const processor = ctx.createScriptProcessor(4096, 1, 1);
|
// frequency and gives the renderer more breathing room.
|
||||||
|
const processor = ctx.createScriptProcessor(8192, 1, 1);
|
||||||
let chunkBuffer: Float32Array[] = [];
|
let chunkBuffer: Float32Array[] = [];
|
||||||
let samplesCollected = 0;
|
let samplesCollected = 0;
|
||||||
const samplesPerChunk = 16000; // 1 second of audio at 16kHz
|
// Collect ~1 second of audio at native rate before emitting
|
||||||
|
const samplesPerChunk = nativeRate;
|
||||||
|
const targetRate = 16000;
|
||||||
|
|
||||||
processor.onaudioprocess = (e: AudioProcessingEvent) => {
|
processor.onaudioprocess = (e: AudioProcessingEvent) => {
|
||||||
const input = e.inputBuffer.getChannelData(0);
|
const input = e.inputBuffer.getChannelData(0);
|
||||||
|
|
@ -72,7 +77,7 @@ export class AudioCaptureProcedure {
|
||||||
samplesCollected += input.length;
|
samplesCollected += input.length;
|
||||||
|
|
||||||
if (samplesCollected >= samplesPerChunk) {
|
if (samplesCollected >= samplesPerChunk) {
|
||||||
// Merge buffers into one Float32Array
|
// Merge buffers into one contiguous array
|
||||||
const merged = new Float32Array(samplesCollected);
|
const merged = new Float32Array(samplesCollected);
|
||||||
let offset = 0;
|
let offset = 0;
|
||||||
for (const buf of chunkBuffer) {
|
for (const buf of chunkBuffer) {
|
||||||
|
|
@ -80,11 +85,14 @@ export class AudioCaptureProcedure {
|
||||||
offset += buf.length;
|
offset += buf.length;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Convert Float32 [-1,1] to PCM16 Int16
|
// Downsample from nativeRate to 16 kHz
|
||||||
const pcm16 = new Int16Array(merged.length);
|
const ratio = nativeRate / targetRate;
|
||||||
for (let i = 0; i < merged.length; i++) {
|
const outLen = Math.floor(merged.length / ratio);
|
||||||
const s = Math.max(-1, Math.min(1, merged[i]));
|
const pcm16 = new Int16Array(outLen);
|
||||||
pcm16[i] = s < 0 ? s * 0x8000 : s * 0x7FFF;
|
for (let i = 0; i < outLen; i++) {
|
||||||
|
const srcIdx = Math.floor(i * ratio);
|
||||||
|
const s = Math.max(-1, Math.min(1, merged[srcIdx]));
|
||||||
|
pcm16[i] = Math.round(s * 32767);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Convert to base64
|
// Convert to base64
|
||||||
|
|
@ -95,7 +103,6 @@ export class AudioCaptureProcedure {
|
||||||
}
|
}
|
||||||
const base64 = btoa(binary);
|
const base64 = btoa(binary);
|
||||||
|
|
||||||
// Push to buffer for Node.js to poll
|
|
||||||
const chunks = (window as any).__audioCaptureChunks as string[];
|
const chunks = (window as any).__audioCaptureChunks as string[];
|
||||||
if (chunks.length < 30) {
|
if (chunks.length < 30) {
|
||||||
chunks.push(base64);
|
chunks.push(base64);
|
||||||
|
|
@ -107,15 +114,37 @@ export class AudioCaptureProcedure {
|
||||||
};
|
};
|
||||||
|
|
||||||
source.connect(processor);
|
source.connect(processor);
|
||||||
processor.connect(ctx.destination);
|
// Connect to a silent gain node so the ScriptProcessor fires
|
||||||
|
// its onaudioprocess callback without routing captured audio
|
||||||
|
// to the speakers (which would conflict with the TTS AudioContext).
|
||||||
|
const silentGain = ctx.createGain();
|
||||||
|
silentGain.gain.value = 0;
|
||||||
|
processor.connect(silentGain);
|
||||||
|
silentGain.connect(ctx.destination);
|
||||||
|
|
||||||
|
// Resume the context explicitly — in authMode Chromium does
|
||||||
|
// not set --autoplay-policy, so new AudioContexts start suspended.
|
||||||
|
ctx.resume().catch(() => {});
|
||||||
|
|
||||||
|
// Clean up when the track ends (peer leaves, renegotiation, etc.)
|
||||||
|
event.track.addEventListener('ended', () => {
|
||||||
|
try {
|
||||||
|
processor.disconnect();
|
||||||
|
source.disconnect();
|
||||||
|
silentGain.disconnect();
|
||||||
|
ctx.close();
|
||||||
|
} catch { /* already closed */ }
|
||||||
|
(window as any).__audioCaptureActive = false;
|
||||||
|
console.log('[AudioCapture] Audio track ended, resources cleaned up');
|
||||||
|
});
|
||||||
|
|
||||||
// Store references for cleanup
|
|
||||||
(window as any).__audioCaptureCtx = ctx;
|
(window as any).__audioCaptureCtx = ctx;
|
||||||
(window as any).__audioCaptureProcessor = processor;
|
(window as any).__audioCaptureProcessor = processor;
|
||||||
|
|
||||||
console.log('[AudioCapture] WebRTC audio track intercepted, capturing at 16kHz mono');
|
console.log(`[AudioCapture] WebRTC audio track intercepted, native ${nativeRate}Hz -> 16kHz mono`);
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
console.error('[AudioCapture] Failed to set up audio capture:', err);
|
console.error('[AudioCapture] Failed to set up audio capture:', err);
|
||||||
|
(window as any).__audioCaptureActive = false;
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
@ -171,6 +200,8 @@ export class AudioCaptureProcedure {
|
||||||
try {
|
try {
|
||||||
await this._page.evaluate(() => {
|
await this._page.evaluate(() => {
|
||||||
(window as any).__audioCaptureActive = false;
|
(window as any).__audioCaptureActive = false;
|
||||||
|
const proc = (window as any).__audioCaptureProcessor;
|
||||||
|
if (proc) try { proc.disconnect(); } catch { /* ok */ }
|
||||||
const ctx = (window as any).__audioCaptureCtx as AudioContext;
|
const ctx = (window as any).__audioCaptureCtx as AudioContext;
|
||||||
if (ctx) ctx.close();
|
if (ctx) ctx.close();
|
||||||
});
|
});
|
||||||
|
|
|
||||||
|
|
@ -837,6 +837,7 @@ export class BotOrchestrator {
|
||||||
'--no-sandbox',
|
'--no-sandbox',
|
||||||
'--use-fake-ui-for-media-stream',
|
'--use-fake-ui-for-media-stream',
|
||||||
'--use-fake-device-for-media-stream',
|
'--use-fake-device-for-media-stream',
|
||||||
|
'--autoplay-policy=no-user-gesture-required',
|
||||||
]
|
]
|
||||||
: [
|
: [
|
||||||
'--use-fake-ui-for-media-stream',
|
'--use-fake-ui-for-media-stream',
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue