import type { Page } from 'playwright';
import type { Logger } from 'winston';

/**
 * Globals that the browser-side scripts in this file stash on `window` so that
 * separate `page.evaluate` calls can share the same audio graph.
 */
interface TtsWindowGlobals {
  webkitAudioContext?: typeof AudioContext;
  __ttsAudioContext?: AudioContext;
  __ttsStreamDest?: MediaStreamAudioDestinationNode;
  __ttsAudioStream?: MediaStream;
  /** Buffer sources that have been started and have not yet fired `ended`. */
  __ttsActiveSources?: Set<AudioBufferSourceNode>;
}

/**
 * Handles audio playback in the Teams meeting.
 *
 * Architecture:
 * - Before any page loads, we inject an init script that overrides getUserMedia
 *   to return a MediaStream from a MediaStreamDestination we control.
 * - When Teams calls getUserMedia({audio: true}), it gets our custom stream.
 * - When TTS audio is played, it's piped into the same MediaStreamDestination,
 *   so Teams picks it up as microphone input and sends it via WebRTC.
 *
 * Note: every callback handed to `addInitScript` / `evaluate` is serialized and
 * executed inside the browser, so those callbacks must be self-contained and
 * may not reference any Node-side variable or helper.
 */
export class AudioProcedure {
  private readonly _page: Page;
  private readonly _logger: Logger;
  /** True once `initialize()` has prepared the browser-side audio context. */
  private _audioContext = false;
  /** True once the getUserMedia override has been registered on the page. */
  private _initScriptInjected = false;

  constructor(page: Page, logger: Logger) {
    this._page = page;
    this._logger = logger;
  }

  /**
   * Inject the getUserMedia override BEFORE any page navigation.
   * This MUST be called before navigating to Teams.
   * Uses page.addInitScript so it runs in every new document context.
   * Calling it more than once is a no-op.
   */
  async injectAudioOverride(): Promise<void> {
    if (this._initScriptInjected) {
      return;
    }

    this._logger.info('Injecting audio getUserMedia override...');

    await this._page.addInitScript(() => {
      const w = window as typeof window & TtsWindowGlobals;

      // Create a shared AudioContext and MediaStreamDestination.
      // These persist across the page lifetime.
      const AudioContextClass = w.AudioContext ?? w.webkitAudioContext;
      if (!AudioContextClass) {
        return;
      }
      const ctx = new AudioContextClass();
      const streamDest = ctx.createMediaStreamDestination();

      // Store globally for later TTS injection.
      w.__ttsAudioContext = ctx;
      w.__ttsStreamDest = streamDest;
      w.__ttsAudioStream = streamDest.stream;

      // navigator.mediaDevices is undefined in insecure contexts; there is
      // nothing to override there, and throwing would only spam the console.
      const mediaDevices = navigator.mediaDevices;
      if (!mediaDevices || typeof mediaDevices.getUserMedia !== 'function') {
        return;
      }

      // Override getUserMedia to return our controlled stream for audio requests.
      const originalGetUserMedia = mediaDevices.getUserMedia.bind(mediaDevices);
      mediaDevices.getUserMedia = async (
        constraints?: MediaStreamConstraints,
      ): Promise<MediaStream> => {
        // Requests without audio are passed through untouched.
        if (!constraints || !constraints.audio) {
          return originalGetUserMedia(constraints);
        }

        // Audio only: hand back our TTS-injectable stream.
        if (!constraints.video) {
          return streamDest.stream;
        }

        // Audio + video: combine our audio track with the real/fake video track.
        try {
          const videoStream = await originalGetUserMedia({ video: constraints.video });
          const combinedStream = new MediaStream();
          streamDest.stream.getAudioTracks().forEach((t) => combinedStream.addTrack(t));
          videoStream.getVideoTracks().forEach((t) => combinedStream.addTrack(t));
          return combinedStream;
        } catch {
          // If video fails, just return audio.
          return streamDest.stream;
        }
      };
    });

    this._initScriptInjected = true;
    this._logger.info('Audio getUserMedia override injected');
  }

  /**
   * Initialize the audio context in the browser for TTS playback.
   * Must be called after joining the meeting (user gesture context).
   *
   * If the init script never ran in the current document (or its context has
   * since been closed), a fallback context/destination pair is created so that
   * `playAudio` does not throw. That fallback is NOT connected to the
   * getUserMedia override, so a warning is logged in that case.
   */
  async initialize(): Promise<void> {
    if (this._audioContext) {
      return;
    }

    this._logger.info('Initializing audio context...');

    const usedFallback = await this._page.evaluate(() => {
      const w = window as typeof window & TtsWindowGlobals;
      const existing = w.__ttsAudioContext;

      if (existing && existing.state !== 'closed') {
        if (existing.state === 'suspended') {
          // Deliberately not awaited: without a user gesture the browser may
          // keep this promise pending, and initialization must not block on it.
          // playAudio awaits resume() again right before playback.
          void existing.resume().catch(() => undefined);
        }
        return false;
      }

      // Init script didn't run (e.g. page navigated before injection) or the
      // context was closed by cleanup(): create fallback audio infrastructure.
      const AudioContextClass = w.AudioContext ?? w.webkitAudioContext;
      const newCtx = new AudioContextClass();
      const streamDest = newCtx.createMediaStreamDestination();
      w.__ttsAudioContext = newCtx;
      w.__ttsStreamDest = streamDest;
      w.__ttsAudioStream = streamDest.stream;
      return true;
    });

    if (usedFallback) {
      this._logger.warn(
        'Audio init script state not found; created a fallback audio context that is not wired to getUserMedia',
      );
    }

    this._audioContext = true;
    this._logger.info('Audio context initialized');
  }

  /**
   * Play audio in the browser.
   * Audio is piped into the MediaStreamDestination that Teams uses as mic input.
   * Resolves when playback has finished or has been cut short by
   * `stopAudio()` / `cleanup()`.
   *
   * @param audioData Base64 encoded audio data
   * @param format Audio format (mp3, wav, pcm). `pcm` is interpreted as
   *   16-bit little-endian mono at 16 kHz; a trailing odd byte is ignored and
   *   an empty payload is a no-op.
   * @throws Re-throws any error raised while decoding or playing (after logging it),
   *   including when the browser-side audio context is missing.
   */
  async playAudio(audioData: string, format: 'mp3' | 'wav' | 'pcm'): Promise<void> {
    if (!this._audioContext) {
      await this.initialize();
    }

    this._logger.info(`Playing audio (format: ${format}, size: ${audioData.length} bytes base64)`);

    try {
      await this._page.evaluate(
        async ({ audioData, format }) => {
          const w = window as typeof window & TtsWindowGlobals;
          const ctx = w.__ttsAudioContext;
          const streamDest = w.__ttsStreamDest;

          if (!ctx || !streamDest) {
            throw new Error('Audio context not initialized');
          }

          // Resume context if suspended.
          if (ctx.state === 'suspended') {
            await ctx.resume();
          }

          // Decode base64 to raw bytes.
          const binaryString = atob(audioData);
          const bytes = new Uint8Array(binaryString.length);
          for (let i = 0; i < binaryString.length; i++) {
            bytes[i] = binaryString.charCodeAt(i);
          }

          let audioBuffer: AudioBuffer;
          if (format === 'pcm') {
            // PCM: 16-bit mono 16kHz. Read through a DataView so that an odd
            // byte count cannot throw and the byte order is explicit.
            const sampleCount = Math.floor(bytes.byteLength / 2);
            if (sampleCount === 0) {
              // createBuffer rejects a zero-length buffer; nothing to play.
              return;
            }
            const view = new DataView(bytes.buffer, bytes.byteOffset, sampleCount * 2);
            audioBuffer = ctx.createBuffer(1, sampleCount, 16000);
            const channelData = audioBuffer.getChannelData(0);
            for (let i = 0; i < sampleCount; i++) {
              channelData[i] = view.getInt16(i * 2, true) / 32768;
            }
          } else {
            // MP3/WAV: let the browser decode the container.
            audioBuffer = await ctx.decodeAudioData(bytes.buffer);
          }

          // Play through the MediaStreamDestination -> Teams mic input.
          const source = ctx.createBufferSource();
          source.buffer = audioBuffer;
          source.connect(streamDest);

          // Track the source so stopAudio()/cleanup() can end it explicitly.
          let activeSources = w.__ttsActiveSources;
          if (!activeSources) {
            activeSources = new Set<AudioBufferSourceNode>();
            w.__ttsActiveSources = activeSources;
          }
          const tracked = activeSources;
          tracked.add(source);

          await new Promise<void>((resolve) => {
            // Register the handler before starting so `ended` cannot be missed.
            source.onended = () => {
              tracked.delete(source);
              source.disconnect();
              resolve();
            };
            source.start(0);
          });
        },
        { audioData, format },
      );

      this._logger.info('Audio playback completed');
    } catch (error: unknown) {
      this._logger.error('Error playing audio:', error);
      throw error;
    }
  }

  /**
   * Stop any currently playing audio.
   *
   * Every active buffer source is stopped, which fires its `ended` event and
   * therefore settles the matching `playAudio` call. The context itself is left
   * running so later playback is unaffected. Best-effort: errors are logged at
   * debug level and never thrown.
   */
  async stopAudio(): Promise<void> {
    try {
      await this._page.evaluate(() => {
        const w = window as typeof window & TtsWindowGlobals;
        const activeSources = w.__ttsActiveSources;
        if (!activeSources) {
          return;
        }
        // Iterate over a copy: each onended handler removes its source from the set.
        for (const source of Array.from(activeSources)) {
          try {
            source.stop();
          } catch {
            // Source was already stopped; nothing to do.
          }
        }
      });
    } catch (error: unknown) {
      // Best-effort: the page may already be gone.
      this._logger.debug('Ignoring error while stopping audio:', error);
    }
  }

  /**
   * Clean up audio resources.
   *
   * Stops in-flight playback (so pending `playAudio` calls settle), closes the
   * browser-side AudioContext and marks this instance as uninitialized.
   * Best-effort: errors (e.g. the page is already closed) are logged at debug
   * level and never thrown.
   */
  async cleanup(): Promise<void> {
    try {
      await this._page.evaluate(async () => {
        const w = window as typeof window & TtsWindowGlobals;

        for (const source of Array.from(w.__ttsActiveSources ?? [])) {
          try {
            source.stop();
          } catch {
            // Source was already stopped; nothing to do.
          }
        }

        const ctx = w.__ttsAudioContext;
        if (ctx && ctx.state !== 'closed') {
          await ctx.close();
        }
      });
    } catch (error: unknown) {
      // Page might be closed.
      this._logger.debug('Ignoring error during audio cleanup:', error);
    }
    this._audioContext = false;
  }
}