447 lines
16 KiB
TypeScript
447 lines
16 KiB
TypeScript
import { Page } from 'playwright';
|
|
import { Logger } from 'winston';
|
|
import { poweronMediaPatchInstall } from './mediaGetUserMediaPatch';
|
|
|
|
/**
|
|
* Handles audio playback in the Teams meeting.
|
|
*
|
|
* Architecture:
|
|
* - Browser launches with --use-fake-device-for-media-stream so Teams sees
|
|
* real-looking devices (no "no audio/video" modal).
|
|
* - Before any page loads, we inject an init script that wraps getUserMedia.
|
|
* - When Teams calls getUserMedia, the wrapper:
|
|
* 1. Calls the REAL getUserMedia (which returns Chromium's fake device stream)
|
|
* 2. Replaces the audio track with one from our MediaStreamDestination
|
|
* 3. Returns the modified stream; optional canvas video track instead of fake video
|
|
* - When TTS audio is played, it's piped into the MediaStreamDestination,
|
|
* and Teams sends it via WebRTC to other meeting participants.
|
|
*/
|
|
/**
 * Optional settings accepted by the `AudioProcedure` constructor.
 * Every field may be omitted; blank or whitespace-only strings are treated
 * as "not provided" by the constructor and replaced with the defaults below.
 */
export type AudioProcedureOptions = {
    /**
     * When true, outbound video is replaced with a canvas-rendered avatar
     * instead of Chromium's fake video device. Defaults to false.
     */
    useCanvasVideo?: boolean;
    /** Shown in the center of the canvas (e.g. bot display name). Defaults to 'Bot'. */
    displayLabel?: string;
    /** Hex/CSS color of the static avatar background (default: light blue, '#a8d4f0'). */
    avatarBgColor?: string;
    /** Hex/CSS color of the centered display label (default: dark blue, '#1a3552'). */
    avatarTextColor?: string;
};
|
|
|
|
/**
 * Drives TTS audio playback (and the optional canvas avatar video) inside the
 * meeting page.
 *
 * Node-side state is limited to a FIFO queue of clips and a few flags; all
 * Web Audio / WebRTC work happens in the browser through `page.evaluate`.
 * The browser-side globals used here (`__ttsAudioContext`, `__ttsStreamDest`,
 * `__forceTtsTrackToSenders`, `__forceVideoTrackToSenders`, ...) are expected
 * to be installed by the `poweronMediaPatchInstall` init script.
 */
export class AudioProcedure {
    /** Sample rate used when wrapping raw `pcm` payloads (16-bit signed, one channel). */
    private static readonly PCM_SAMPLE_RATE = 16000;

    private _page: Page;
    private _logger: Logger;
    private _useCanvasVideo: boolean;
    private _displayLabel: string;
    private _avatarBgColor: string;
    private _avatarTextColor: string;
    /** True once `initialize()` has resumed/created the browser audio context. */
    private _audioContextReady = false;
    private _initScriptInjected = false;
    private _audioQueue: Array<{ audioData: string; format: 'mp3' | 'wav' | 'pcm' }> = [];
    private _isPlaying = false;
    /**
     * Incremented on every stop request. A clip remembers the value it was
     * dequeued under and is skipped if the value changed before playback
     * starts, so a stop never affects clips queued after it.
     */
    private _stopGeneration = 0;

    /**
     * @param page    Playwright page hosting the meeting.
     * @param logger  Logger used for diagnostics.
     * @param options Optional avatar/video settings; blank strings fall back to defaults.
     */
    constructor(page: Page, logger: Logger, options?: AudioProcedureOptions) {
        this._page = page;
        this._logger = logger;
        this._useCanvasVideo = !!options?.useCanvasVideo;
        this._displayLabel = (options?.displayLabel || 'Bot').trim() || 'Bot';
        this._avatarBgColor = (options?.avatarBgColor || '').trim() || '#a8d4f0';
        this._avatarTextColor = (options?.avatarTextColor || '').trim() || '#1a3552';
    }

    /** Builds the argument object handed to `poweronMediaPatchInstall`. */
    private _mediaPatchOptions(): {
        useCanvasVideo: boolean;
        displayLabel: string;
        avatarBgColor: string;
        avatarTextColor: string;
    } {
        return {
            useCanvasVideo: this._useCanvasVideo,
            displayLabel: this._displayLabel,
            avatarBgColor: this._avatarBgColor,
            avatarTextColor: this._avatarTextColor,
        };
    }

    /**
     * Inject the getUserMedia wrapper BEFORE any page navigation.
     * This MUST be called before navigating to Teams.
     * Uses browserContext.addInitScript so the hook runs in the main page and
     * in embedded frames (Teams often runs media/WebRTC in an iframe; page-only
     * injection would miss getUserMedia and you would only see the fake device).
     * Calling it again after a successful injection is a no-op.
     */
    async injectAudioOverride(): Promise<void> {
        if (this._initScriptInjected) {
            return;
        }

        this._logger.info(
            `Injecting audio getUserMedia override (canvasVideo=${this._useCanvasVideo}, label="${this._displayLabel}")...`,
        );

        await this._page.context().addInitScript(poweronMediaPatchInstall, this._mediaPatchOptions());

        this._initScriptInjected = true;
        this._logger.info('Audio getUserMedia override injected');
    }

    /**
     * Re-run the media patch in every frame. Needed when Teams replaces the document
     * in an iframe (addInitScript runs too early) or overwrites getUserMedia.
     * Frames that reject the evaluation are logged and skipped.
     */
    async reinstallMediaPatchInAllFrames(): Promise<void> {
        const payload = this._mediaPatchOptions();
        for (const frame of this._page.frames()) {
            try {
                await frame.evaluate(poweronMediaPatchInstall, payload);
            } catch (e) {
                this._logger.info(`[mediaPatch] frame skipped: ${e}`);
            }
        }
        await this._forceCanvasVideoInAllFrames('reinstall');
    }

    /**
     * Replace outbound video in every frame. Teams may run WebRTC in a subframe;
     * only touching the main window leaves Chromium's default fake (green) video.
     * Does nothing unless canvas video is enabled. Emits one summary log line.
     *
     * @param phase Label included in the log line (e.g. 'init', 'tts', 'reinstall').
     */
    private async _forceCanvasVideoInAllFrames(phase: string): Promise<void> {
        if (!this._useCanvasVideo) {
            return;
        }

        const parts: string[] = [];
        for (const frame of this._page.frames()) {
            try {
                const result = await frame.evaluate(async () => {
                    const w = window as any;
                    w.__startBotAvatarStream?.();
                    return w.__forceVideoTrackToSenders?.();
                });

                let shortUrl: string;
                try {
                    shortUrl = frame.url().substring(0, 100);
                } catch {
                    shortUrl = '(no-url)';
                }

                const rr: any = result || {};
                const videoStats = (rr.videoStats || []) as any[];
                const vs = videoStats.length
                    ? videoStats
                        .map(v => `${v.kind}:b=${v.bytes},p=${v.packets},fEnc=${v.framesEncoded},fSent=${v.framesSent},fps=${v.fps},${v.w}x${v.h}`)
                        .join(' | ')
                    : 'none';

                // Trim only affects the tail: the reason is optional and would
                // otherwise leave a trailing space.
                parts.push(
                    (
                        `[${shortUrl}] r=${rr.replaced ?? 0} add=${rr.added ?? 0} pcs=${rr.pcs ?? 0} `
                        + `tx=${rr.totalTransceivers ?? 0} vidTx=${rr.videoTransceivers ?? 0} `
                        + `vidWith=${rr.videoSendersWithTrack ?? 0} vidNoTrack=${rr.videoSendersWithoutTrack ?? 0} `
                        + `dirB=[${(rr.directionsBefore || []).join(',')}] dirA=[${(rr.directionsAfter || []).join(',')}] `
                        + `cd=[${(rr.currentDirections || []).join(',')}] `
                        + `track=${rr.trackId || 'n/a'}(en=${rr.trackEnabled},rs=${rr.trackReady},mu=${rr.trackMuted}) `
                        + `vstats=[${vs}] ${rr.reason || ''}`
                    ).trim(),
                );
            } catch (e: any) {
                parts.push(`err=${String(e?.message || e).slice(0, 64)}`);
            }
        }
        this._logger.info(`Canvas video ${phase}: ${parts.join(' | ')}`);
    }

    /**
     * Initialize the audio context in the browser for TTS playback.
     * Must be called after joining the meeting (user gesture context).
     * Idempotent until `cleanup()` is called.
     */
    async initialize(): Promise<void> {
        if (this._audioContextReady) {
            return;
        }

        this._logger.info('Initializing audio context...');

        await this._page.evaluate(() => {
            const w = window as any;
            // The __ttsAudioContext is normally created by the init script.
            const existing = w.__ttsAudioContext as AudioContext | undefined;

            if (existing && existing.state !== 'closed') {
                if (existing.state === 'suspended') {
                    // Resume now (requires user gesture - joining meeting counts).
                    // Not awaited: the promise may stay pending until the browser
                    // allows audio, and playback resumes the context again anyway.
                    existing.resume().catch(() => undefined);
                }
                return;
            }

            // Init script didn't run (e.g. page navigated before injection) or the
            // previous context was closed by cleanup(): create fallback infrastructure.
            const AudioContextClass = window.AudioContext || w.webkitAudioContext;
            const newCtx = new AudioContextClass();
            const streamDest = newCtx.createMediaStreamDestination();
            w.__ttsAudioContext = newCtx;
            w.__ttsStreamDest = streamDest;
            w.__ttsAudioStream = streamDest.stream;
        });

        if (this._useCanvasVideo) {
            await this._forceCanvasVideoInAllFrames('init');
        }

        this._audioContextReady = true;
        this._logger.info('Audio context initialized');
    }

    /**
     * Queue audio for sequential playback.
     * Audio is never played in parallel -- each clip waits for the previous one to finish.
     *
     * The returned promise resolves once the queue has drained when this call
     * started the drain; if a drain is already running it resolves right after
     * the clip has been queued.
     *
     * @param audioData Base64 encoded audio data
     * @param format Audio format (mp3, wav, pcm)
     */
    async playAudio(audioData: string, format: 'mp3' | 'wav' | 'pcm'): Promise<void> {
        this._audioQueue.push({ audioData, format });
        this._logger.info(`Audio queued (queue size: ${this._audioQueue.length}, playing: ${this._isPlaying})`);

        if (!this._isPlaying) {
            await this._processAudioQueue();
        }
    }

    /**
     * Process the audio queue sequentially until it is empty.
     * A failing clip is logged and does not prevent later clips from playing.
     * Stop requests empty the queue themselves, so anything found here was
     * queued after the most recent stop and is meant to be played.
     */
    private async _processAudioQueue(): Promise<void> {
        if (this._isPlaying) return;
        this._isPlaying = true;

        try {
            let item = this._audioQueue.shift();
            while (item !== undefined) {
                try {
                    await this._playAudioInternal(item.audioData, item.format);
                } catch (error) {
                    this._logger.error('Error playing queued audio:', error);
                }
                item = this._audioQueue.shift();
            }
        } finally {
            // Always release the flag, otherwise later playAudio calls would only queue.
            this._isPlaying = false;
        }
    }

    /**
     * Stop all audio immediately: stop current playback and clear the queue.
     * Called when a user says "<botname> STOP" or similar.
     * Clips queued after this call are played normally.
     */
    async stopAllAudio(): Promise<void> {
        this._logger.info('Stop all audio requested');
        this._audioQueue = [];
        await this._haltCurrentPlayback();
    }

    /**
     * Cancels the clip that is currently being prepared or played.
     *
     * Node side: bumps the stop generation so a clip still in its pre-playback
     * phase is skipped. Browser side: bumps `__ttsStopEpoch` (checked after
     * decoding) and invokes `__ttsStopCurrent`, which stops the active source
     * node and settles the pending playback promise directly instead of
     * waiting for an `ended` event. Errors from the page are ignored.
     */
    private async _haltCurrentPlayback(): Promise<void> {
        this._stopGeneration++;
        try {
            await this._page.evaluate(() => {
                const w = window as any;
                w.__ttsStopEpoch = (w.__ttsStopEpoch || 0) + 1;
                const halt = w.__ttsStopCurrent;
                if (typeof halt === 'function') {
                    halt();
                }
            });
        } catch {
            // Page might not be ready
        }
    }

    /**
     * Sums outbound audio RTP counters over all tracked peer connections and
     * snapshots the state of each audio sender track.
     * Reads `window.__audioCapturePeerConnections` in the main frame; senders
     * whose stats cannot be read are skipped.
     */
    private async _collectAudioSenderStats(): Promise<{
        bytesSent: number;
        packetsSent: number;
        senderInfo: Array<{ id: string; enabled: boolean; readyState: string; muted: boolean }>;
    }> {
        return this._page.evaluate(async () => {
            const pcs = ((window as any).__audioCapturePeerConnections || []) as RTCPeerConnection[];
            let bytesSent = 0;
            let packetsSent = 0;
            for (const pc of pcs) {
                for (const sender of (pc.getSenders?.() || [])) {
                    if (sender?.track?.kind !== 'audio') continue;
                    try {
                        const report = await sender.getStats();
                        report.forEach((entry: any) => {
                            if (entry.type === 'outbound-rtp' && entry.kind === 'audio') {
                                bytesSent += Number(entry.bytesSent || 0);
                                packetsSent += Number(entry.packetsSent || 0);
                            }
                        });
                    } catch {
                        // stats unavailable for this sender
                    }
                }
            }

            // Snapshot sender track state after the counters have been read.
            const senderInfo: Array<{ id: string; enabled: boolean; readyState: string; muted: boolean }> = [];
            for (const pc of pcs) {
                for (const sender of (pc.getSenders?.() || [])) {
                    const track = sender?.track;
                    if (!track || track.kind !== 'audio') continue;
                    senderInfo.push({
                        id: track.id,
                        enabled: track.enabled,
                        readyState: track.readyState,
                        muted: track.muted,
                    });
                }
            }
            return { bytesSent, packetsSent, senderInfo };
        });
    }

    /**
     * Internal: Play audio in the browser (single clip, no queuing).
     * Audio is piped into the MediaStreamDestination that Teams uses as mic input.
     * Before playback, forces all WebRTC audio senders to use the TTS track
     * (sender.replaceTrack) so Teams transmits our audio to participants.
     *
     * Resolves when the clip ended, was stopped, or was skipped because a stop
     * request arrived before playback started.
     *
     * @param audioData Base64 encoded audio data
     * @param format    'pcm' is wrapped as 16-bit mono at PCM_SAMPLE_RATE; other formats go through decodeAudioData
     * @throws Re-throws any page/evaluation error after logging it.
     */
    private async _playAudioInternal(audioData: string, format: 'mp3' | 'wav' | 'pcm'): Promise<void> {
        // Remember which stop generation this clip belongs to.
        const generation = this._stopGeneration;

        if (!this._audioContextReady) {
            await this.initialize();
        }

        this._logger.info(`Playing audio (format: ${format}, size: ${audioData.length} bytes base64)`);

        try {
            // Force all outgoing audio senders to use the TTS track
            const senderInjectInfo = await this._page.evaluate(async () => {
                const forceFn = (window as any).__forceTtsTrackToSenders;
                if (typeof forceFn === 'function') {
                    return await forceFn();
                }
                return { replaced: 0, pcs: 0, reason: 'force-function-missing' };
            });
            // #region agent log
            const diag = senderInjectInfo?.diag || {};
            this._logger.info(
                `TTS sender injection: replaced=${senderInjectInfo?.replaced ?? 0}, pcs=${senderInjectInfo?.pcs ?? 0}, reason=${senderInjectInfo?.reason || 'n/a'} ` +
                `ttsTrack=${diag.ttsTrackId || 'n/a'}(enabled=${diag.ttsTrackEnabled},state=${diag.ttsTrackReadyState},muted=${diag.ttsTrackMuted}) ` +
                `beforeSenders=[${(diag.beforeSenderTrackIds || []).join(',')}] afterSenders=[${(diag.afterSenderTrackIds || []).join(',')}] ` +
                `afterEnabled=${diag.afterSenderTrackEnabled} afterState=${diag.afterSenderTrackReadyState} forcedEnabled=${diag.forcedEnabled || false}`,
            );
            // #endregion

            if (this._useCanvasVideo) {
                await this._forceCanvasVideoInAllFrames('tts');
            }

            // Collect WebRTC stats BEFORE playback
            // #region agent log
            const statsBefore = await this._collectAudioSenderStats();
            // #endregion

            // A stop may have arrived while the steps above were awaiting.
            if (generation !== this._stopGeneration) {
                this._logger.info('Audio clip skipped due to stop request');
                return;
            }

            await this._page.evaluate(async ({ audioData, format, pcmSampleRate }) => {
                const w = window as any;
                const ctx = w.__ttsAudioContext as AudioContext | undefined;
                const streamDest = w.__ttsStreamDest as MediaStreamAudioDestinationNode | undefined;

                if (!ctx || !streamDest) {
                    throw new Error('Audio context not initialized');
                }

                const epochAtStart: number = w.__ttsStopEpoch || 0;

                if (ctx.state === 'suspended') {
                    await ctx.resume();
                }

                const binaryString = atob(audioData);
                const bytes = new Uint8Array(binaryString.length);
                for (let i = 0; i < binaryString.length; i++) {
                    bytes[i] = binaryString.charCodeAt(i);
                }

                let audioBuffer: AudioBuffer;
                if (format === 'pcm') {
                    // Ignore a trailing odd byte: Int16Array needs whole samples.
                    const sampleCount = Math.floor(bytes.byteLength / 2);
                    if (sampleCount === 0) {
                        // createBuffer rejects zero-length buffers; nothing to play.
                        return;
                    }
                    const pcmData = new Int16Array(bytes.buffer, 0, sampleCount);
                    audioBuffer = ctx.createBuffer(1, sampleCount, pcmSampleRate);
                    const channelData = audioBuffer.getChannelData(0);
                    for (let i = 0; i < sampleCount; i++) {
                        channelData[i] = pcmData[i] / 32768;
                    }
                } else {
                    audioBuffer = await ctx.decodeAudioData(bytes.buffer.slice(0));
                }

                // A stop request may have landed while decoding.
                if ((w.__ttsStopEpoch || 0) !== epochAtStart) {
                    return;
                }

                const source = ctx.createBufferSource();
                source.buffer = audioBuffer;
                source.connect(streamDest);

                await new Promise<void>((resolve) => {
                    let settled = false;
                    const halt = () => {
                        try {
                            source.stop();
                        } catch {
                            // not started or already stopped
                        }
                        finish();
                    };
                    const finish = () => {
                        if (settled) return;
                        settled = true;
                        if (w.__ttsStopCurrent === halt) {
                            w.__ttsStopCurrent = null;
                        }
                        try {
                            source.disconnect();
                        } catch {
                            // already disconnected
                        }
                        resolve();
                    };
                    source.onended = finish;
                    // Exposed so stop requests can end this clip without relying on 'ended'.
                    w.__ttsStopCurrent = halt;
                    source.start(0);
                });
            }, { audioData, format, pcmSampleRate: AudioProcedure.PCM_SAMPLE_RATE });

            // Collect WebRTC stats AFTER playback
            // #region agent log
            const statsAfter = await this._collectAudioSenderStats();
            this._logger.info(
                `[Voice] WebRTC stats: before(bytes=${statsBefore.bytesSent},pkts=${statsBefore.packetsSent}) ` +
                `after(bytes=${statsAfter.bytesSent},pkts=${statsAfter.packetsSent}) ` +
                `delta(bytes=${statsAfter.bytesSent - statsBefore.bytesSent},pkts=${statsAfter.packetsSent - statsBefore.packetsSent}) ` +
                `senders=${JSON.stringify(statsAfter.senderInfo)}`,
            );
            // #endregion

            this._logger.info('Audio playback completed');
        } catch (error) {
            this._logger.error('Error playing audio:', error);
            throw error;
        }
    }

    /**
     * Stop any currently playing audio.
     * Only the current clip is cancelled; queued clips remain and continue to
     * play in order. Use `stopAllAudio()` to drop the queue as well.
     */
    async stopAudio(): Promise<void> {
        await this._haltCurrentPlayback();
    }

    /**
     * Clean up audio resources.
     * Drops queued clips, cancels the in-flight clip, and in every reachable
     * frame stops the avatar drawing/stream and closes the TTS audio context.
     * Frames that cannot be evaluated are skipped. `initialize()` may be
     * called again afterwards.
     */
    async cleanup(): Promise<void> {
        this._audioQueue = [];
        this._stopGeneration++;

        try {
            for (const frame of this._page.frames()) {
                try {
                    await frame.evaluate(() => {
                        const w = window as any;

                        // Settle a pending playback promise so the queue loop can finish.
                        w.__ttsStopEpoch = (w.__ttsStopEpoch || 0) + 1;
                        if (typeof w.__ttsStopCurrent === 'function') {
                            w.__ttsStopCurrent();
                        }

                        if (w.__botAvatarDrawInterval) {
                            clearInterval(w.__botAvatarDrawInterval);
                            w.__botAvatarDrawInterval = null;
                        }
                        if (w.__botAvatarVideoTrack) {
                            try {
                                w.__botAvatarVideoTrack.stop();
                            } catch {
                                // ignore
                            }
                            w.__botAvatarVideoTrack = null;
                        }
                        if (w.__botAvatarCanvas && w.__botAvatarCanvas.remove) {
                            w.__botAvatarCanvas.remove();
                            w.__botAvatarCanvas = null;
                        }
                        w.__botAvatarStreamStarted = false;

                        const actx = w.__ttsAudioContext as AudioContext | undefined;
                        if (actx && actx.state !== 'closed') {
                            // close() rejects on an already-closed context; never let it surface.
                            actx.close().catch(() => undefined);
                        }
                    });
                } catch {
                    // cross-origin or closed frame
                }
            }
        } catch {
            // Page might be closed
        }
        this._audioContextReady = false;
    }
}
|