214 lines
7.1 KiB
TypeScript
214 lines
7.1 KiB
TypeScript
import { Page } from 'playwright';
|
|
import { Logger } from 'winston';
|
|
|
|
/**
 * Shape of the globals that the injected scripts stash on `window`.
 *
 * Type-only (erased at compile time), so it is safe to reference from the
 * callbacks that Playwright serializes and executes inside the browser page.
 */
interface TtsAudioGlobals {
  /** AudioContext that drives all TTS playback. */
  __ttsAudioContext?: AudioContext;
  /** Destination node whose stream is handed out as the microphone. */
  __ttsStreamDest?: MediaStreamAudioDestinationNode;
  /** Convenience handle to `__ttsStreamDest.stream`. */
  __ttsAudioStream?: MediaStream;
  /** In-flight playbacks: source node -> callback that settles its playAudio() promise. */
  __ttsActivePlaybacks?: Map<AudioBufferSourceNode, () => void>;
  /** Legacy WebKit constructor, used only as a fallback. */
  webkitAudioContext?: typeof AudioContext;
}

type TtsWindow = Window & typeof globalThis & TtsAudioGlobals;

/**
 * Handles audio playback in the Teams meeting.
 *
 * Architecture:
 * - Before any page loads, we inject an init script that overrides getUserMedia
 *   to return a MediaStream fed by a MediaStreamDestination we control.
 * - When Teams calls getUserMedia({audio: true}), it gets our custom stream
 *   (as cloned tracks, so Teams stopping its track does not end ours).
 * - When TTS audio is played, it's piped into the same MediaStreamDestination,
 *   so Teams picks it up as microphone input and sends it via WebRTC.
 *
 * NOTE: every callback passed to `addInitScript` / `evaluate` runs in the
 * browser, not in Node. Such callbacks must be self-contained: they cannot
 * call helpers or read variables defined in this module.
 */
export class AudioProcedure {
  private readonly _page: Page;
  private readonly _logger: Logger;
  /** True once `initialize()` has verified or created the in-page audio context. */
  private _audioContext = false;
  /** True once the getUserMedia override has been registered on the page. */
  private _initScriptInjected = false;

  /**
   * @param page Playwright page that will host the Teams meeting.
   * @param logger Logger used for progress and error reporting.
   */
  constructor(page: Page, logger: Logger) {
    this._page = page;
    this._logger = logger;
  }

  /**
   * Inject the getUserMedia override BEFORE any page navigation.
   * This MUST be called before navigating to Teams.
   * Uses page.addInitScript so it runs in every new document context.
   * Calling it more than once is a no-op.
   */
  async injectAudioOverride(): Promise<void> {
    if (this._initScriptInjected) {
      return;
    }

    this._logger.info('Injecting audio getUserMedia override...');

    await this._page.addInitScript(() => {
      const win = window as TtsWindow;
      const mediaDevices = navigator.mediaDevices;
      const AudioContextClass = win.AudioContext || win.webkitAudioContext;

      // Init scripts run in every document, including about:blank and
      // insecure frames where these APIs are missing. Bail out quietly there
      // instead of throwing inside the page.
      if (!AudioContextClass || !mediaDevices || typeof mediaDevices.getUserMedia !== 'function') {
        return;
      }

      // Shared AudioContext and MediaStreamDestination; these persist for the
      // lifetime of the document.
      const ctx = new AudioContextClass();
      const streamDest = ctx.createMediaStreamDestination();

      // Store globally for later TTS injection
      win.__ttsAudioContext = ctx;
      win.__ttsStreamDest = streamDest;
      win.__ttsAudioStream = streamDest.stream;

      const originalGetUserMedia = mediaDevices.getUserMedia.bind(mediaDevices);

      // Override getUserMedia to return our controlled stream for audio requests
      mediaDevices.getUserMedia = async (constraints?: MediaStreamConstraints) => {
        if (!constraints || !constraints.audio) {
          return originalGetUserMedia(constraints);
        }

        // Hand out clones: a consumer calling track.stop() (e.g. on device
        // switch) must not permanently end the source track for later callers.
        const stream = new MediaStream(streamDest.stream.getAudioTracks().map((t) => t.clone()));

        // If video is also requested, combine our audio with the real/fake video
        if (constraints.video) {
          try {
            const videoStream = await originalGetUserMedia({ video: constraints.video });
            videoStream.getVideoTracks().forEach((t) => stream.addTrack(t));
          } catch {
            // Video is best-effort: if it fails, return audio only.
          }
        }

        return stream;
      };
    });

    this._initScriptInjected = true;
    this._logger.info('Audio getUserMedia override injected');
  }

  /**
   * Initialize the audio context in the browser for TTS playback.
   * Must be called after joining the meeting (user gesture context).
   *
   * If the init script never ran in the current document (or its context was
   * closed by `cleanup()`), a fallback context is created so playback does not
   * throw; a warning is logged because that fallback stream is not the one
   * returned by the getUserMedia override.
   */
  async initialize(): Promise<void> {
    if (this._audioContext) {
      return;
    }

    this._logger.info('Initializing audio context...');

    const usedFallback = await this._page.evaluate(() => {
      const win = window as TtsWindow;
      const existing = win.__ttsAudioContext;

      if (existing && existing.state !== 'closed') {
        if (existing.state === 'suspended') {
          // Deliberately not awaited: resume() may stay pending until the
          // browser grants autoplay permission, and playAudio() awaits it
          // again. The catch only prevents an unhandled rejection.
          existing.resume().catch(() => undefined);
        }
        return false;
      }

      // No usable context: create fallback audio infrastructure.
      const AudioContextClass = win.AudioContext || win.webkitAudioContext;
      if (!AudioContextClass) {
        throw new Error('AudioContext is not available in this page');
      }
      const newCtx = new AudioContextClass();
      const streamDest = newCtx.createMediaStreamDestination();
      win.__ttsAudioContext = newCtx;
      win.__ttsStreamDest = streamDest;
      win.__ttsAudioStream = streamDest.stream;
      return true;
    });

    if (usedFallback) {
      this._logger.warn(
        'Audio init script was not active in this document; created a fallback audio context ' +
          'that is not wired to the getUserMedia override',
      );
    }

    this._audioContext = true;
    this._logger.info('Audio context initialized');
  }

  /**
   * Play audio in the browser.
   * Audio is piped into the MediaStreamDestination that Teams uses as mic input.
   * Resolves when playback ends, either naturally or via `stopAudio()` /
   * `cleanup()`.
   *
   * @param audioData Base64 encoded audio data
   * @param format Audio format (mp3, wav, pcm). `pcm` is interpreted as
   *   16-bit little-endian mono at 16 kHz; a trailing odd byte is ignored and
   *   a payload with no complete sample plays nothing.
   * @throws Rethrows any in-page failure (missing context, invalid base64,
   *   undecodable mp3/wav data) after logging it.
   */
  async playAudio(audioData: string, format: 'mp3' | 'wav' | 'pcm'): Promise<void> {
    if (!this._audioContext) {
      await this.initialize();
    }

    this._logger.info(`Playing audio (format: ${format}, size: ${audioData.length} bytes base64)`);

    try {
      await this._page.evaluate(async ({ audioData, format }) => {
        const win = window as TtsWindow;
        const ctx = win.__ttsAudioContext;
        const streamDest = win.__ttsStreamDest;

        if (!ctx || !streamDest) {
          throw new Error('Audio context not initialized');
        }

        // Resume context if suspended
        if (ctx.state === 'suspended') {
          await ctx.resume();
        }

        // Decode base64 to raw bytes
        const binaryString = atob(audioData);
        const bytes = new Uint8Array(binaryString.length);
        for (let i = 0; i < binaryString.length; i++) {
          bytes[i] = binaryString.charCodeAt(i);
        }

        let audioBuffer: AudioBuffer;

        if (format === 'pcm') {
          // PCM: assume 16-bit little-endian mono 16kHz. Count whole samples
          // only, so an odd trailing byte cannot make the typed view throw.
          const sampleCount = Math.floor(bytes.byteLength / 2);
          if (sampleCount === 0) {
            // createBuffer() rejects zero-length buffers; nothing to play.
            return;
          }
          const view = new DataView(bytes.buffer);
          audioBuffer = ctx.createBuffer(1, sampleCount, 16000);
          const channelData = audioBuffer.getChannelData(0);
          for (let i = 0; i < sampleCount; i++) {
            channelData[i] = view.getInt16(i * 2, true) / 32768;
          }
        } else {
          // MP3/WAV: let the browser decode. `bytes` is not used afterwards,
          // so the buffer can be handed over without a defensive copy.
          audioBuffer = await ctx.decodeAudioData(bytes.buffer);
        }

        // Play through the MediaStreamDestination -> Teams mic input
        const source = ctx.createBufferSource();
        source.buffer = audioBuffer;
        source.connect(streamDest);

        const playbacks = win.__ttsActivePlaybacks || new Map<AudioBufferSourceNode, () => void>();
        win.__ttsActivePlaybacks = playbacks;

        return new Promise<void>((resolve) => {
          // Single exit path shared by natural end and stopAudio()/cleanup().
          const finish = () => {
            playbacks.delete(source);
            source.onended = null;
            source.disconnect();
            resolve();
          };
          source.onended = finish;
          playbacks.set(source, finish);
          source.start(0);
        });
      }, { audioData, format });

      this._logger.info('Audio playback completed');
    } catch (error) {
      this._logger.error('Error playing audio:', error);
      throw error;
    }
  }

  /**
   * Stop any currently playing audio.
   *
   * Stops the individual source nodes rather than suspending the whole
   * context: suspending would leave pending `playAudio()` calls unresolved and
   * let the interrupted clip continue on the next resume. Best-effort: errors
   * (e.g. the page is already closed) are logged at debug level, not thrown.
   */
  async stopAudio(): Promise<void> {
    try {
      await this._page.evaluate(() => {
        const playbacks = (window as TtsWindow).__ttsActivePlaybacks;
        if (!playbacks) {
          return;
        }
        // Iterate over a snapshot: finish() removes entries from the map.
        Array.from(playbacks.entries()).forEach(([source, finish]) => {
          try {
            source.stop();
          } catch {
            // Source already stopped; still settle its promise below.
          }
          finish();
        });
      });
    } catch (error) {
      this._logger.debug('Ignoring error while stopping audio:', error);
    }
  }

  /**
   * Clean up audio resources.
   *
   * Settles any in-flight playback, closes the in-page AudioContext and clears
   * the window globals so a later `initialize()` starts from a clean state.
   * Best-effort: errors (e.g. the page is already closed) are logged at debug
   * level, not thrown.
   */
  async cleanup(): Promise<void> {
    try {
      await this._page.evaluate(async () => {
        const win = window as TtsWindow;
        const ctx = win.__ttsAudioContext;
        const playbacks = win.__ttsActivePlaybacks;

        // Closing the context prevents `ended` from firing, so settle pending
        // playAudio() promises explicitly first.
        if (playbacks) {
          Array.from(playbacks.entries()).forEach(([source, finish]) => {
            try {
              source.stop();
            } catch {
              // Source already stopped.
            }
            finish();
          });
        }

        delete win.__ttsActivePlaybacks;
        delete win.__ttsAudioStream;
        delete win.__ttsStreamDest;
        delete win.__ttsAudioContext;

        // close() rejects on an already-closed context.
        if (ctx && ctx.state !== 'closed') {
          await ctx.close();
        }
      });
    } catch (error) {
      // Page might be closed
      this._logger.debug('Ignoring error during audio cleanup:', error);
    }
    this._audioContext = false;
  }
}
|