import { Page } from 'playwright';
import { Logger } from 'winston';
import { poweronMediaPatchInstall } from './mediaGetUserMediaPatch';

/** Sample rate (Hz) used when wrapping raw 16-bit mono PCM into an AudioBuffer. */
const PCM_SAMPLE_RATE_HZ = 16000;

/** Audio container/encoding accepted by {@link AudioProcedure.playAudio}. */
type AudioFormat = 'mp3' | 'wav' | 'pcm';

/** Snapshot of one outbound audio sender's track, used for diagnostics only. */
type AudioSenderState = {
  id: string;
  enabled: boolean;
  readyState: string;
  muted: boolean;
};

/** Aggregated outbound-rtp audio counters plus the state of every audio sender track. */
type AudioSenderStats = {
  bytesSent: number;
  packetsSent: number;
  senderInfo: AudioSenderState[];
};

/**
 * Browser-side diagnostics collector. It is passed to `page.evaluate`, so it is
 * serialized and executed inside the page: it must stay self-contained and must
 * not reference anything from this module's scope.
 *
 * Sums `bytesSent` / `packetsSent` of all outbound audio RTP streams on the peer
 * connections listed in `window.__audioCapturePeerConnections`, and records the
 * current state of each audio sender track.
 */
const collectAudioSenderStats = async (): Promise<AudioSenderStats> => {
  const pcs = ((window as any).__audioCapturePeerConnections || []) as RTCPeerConnection[];
  let bytesSent = 0;
  let packetsSent = 0;
  const senderInfo: AudioSenderState[] = [];

  for (const pc of pcs) {
    for (const sender of pc.getSenders?.() || []) {
      const track = sender?.track;
      if (!track || track.kind !== 'audio') continue;
      try {
        const report = await sender.getStats();
        report.forEach((r: any) => {
          if (r.type === 'outbound-rtp' && r.kind === 'audio') {
            bytesSent += Number(r.bytesSent || 0);
            packetsSent += Number(r.packetsSent || 0);
          }
        });
      } catch {
        // Stats are best-effort diagnostics; a failing sender is simply not counted.
      }
      senderInfo.push({
        id: track.id,
        enabled: track.enabled,
        readyState: track.readyState,
        muted: track.muted,
      });
    }
  }

  return { bytesSent, packetsSent, senderInfo };
};

/**
 * Options accepted by {@link AudioProcedure}.
 */
export type AudioProcedureOptions = {
  /** When true, the canvas video settings are forwarded to the media patch and the canvas track is forced onto video senders. */
  useCanvasVideo?: boolean;
  /** Shown in the center of the canvas (e.g. bot display name) */
  displayLabel?: string;
  /** Hex/CSS color of the static avatar background (default: light blue). */
  avatarBgColor?: string;
  /** Hex/CSS color of the centered display label (default: dark blue). */
  avatarTextColor?: string;
};

/**
 * Handles audio playback in the Teams meeting.
 *
 * Architecture:
 * - Browser launches with --use-fake-device-for-media-stream so Teams sees
 *   real-looking devices (no "no audio/video" modal).
 * - Before any page loads, we inject an init script that wraps getUserMedia.
 * - When Teams calls getUserMedia, the wrapper:
 *   1. Calls the REAL getUserMedia (which returns Chromium's fake device stream)
 *   2. Replaces the audio track with one from our MediaStreamDestination
 *   3. Returns the modified stream; optional canvas video track instead of fake video
 * - When TTS audio is played, it's piped into the MediaStreamDestination,
 *   and Teams sends it via WebRTC to other meeting participants.
 *
 * Page-side globals written by this class (in addition to those owned by the
 * init script): `__ttsStopCurrentClip`, a function that terminates the clip
 * currently being played and releases the Node-side await on it.
 */
export class AudioProcedure {
  private _page: Page;
  private _logger: Logger;
  private _useCanvasVideo: boolean;
  private _displayLabel: string;
  private _avatarBgColor: string;
  private _avatarTextColor: string;
  /** True once {@link initialize} has completed (reset by {@link cleanup}). */
  private _audioContext: boolean = false;
  private _initScriptInjected: boolean = false;
  private _audioQueue: Array<{ audioData: string; format: AudioFormat }> = [];
  private _isPlaying: boolean = false;
  private _stopRequested: boolean = false;

  /**
   * @param page    Playwright page hosting the meeting.
   * @param logger  Logger used for progress and diagnostics.
   * @param options Optional canvas-video/avatar settings; blank strings fall back to defaults.
   */
  constructor(page: Page, logger: Logger, options?: AudioProcedureOptions) {
    this._page = page;
    this._logger = logger;
    this._useCanvasVideo = !!options?.useCanvasVideo;
    this._displayLabel = (options?.displayLabel || 'Bot').trim() || 'Bot';
    this._avatarBgColor = (options?.avatarBgColor || '').trim() || '#a8d4f0';
    this._avatarTextColor = (options?.avatarTextColor || '').trim() || '#1a3552';
  }

  /** Arguments handed to the media patch, both as init script and on re-install. */
  private _mediaPatchPayload(): Required<AudioProcedureOptions> {
    return {
      useCanvasVideo: this._useCanvasVideo,
      displayLabel: this._displayLabel,
      avatarBgColor: this._avatarBgColor,
      avatarTextColor: this._avatarTextColor,
    };
  }

  /**
   * Inject the getUserMedia wrapper BEFORE any page navigation.
   * This MUST be called before navigating to Teams.
   * Uses browserContext.addInitScript so the hook runs in the main page and
   * in embedded frames (Teams often runs media/WebRTC in an iframe; page-only
   * injection would miss getUserMedia and you would only see the fake device).
   * Calling it more than once is a no-op.
   */
  async injectAudioOverride(): Promise<void> {
    if (this._initScriptInjected) {
      return;
    }

    this._logger.info(
      `Injecting audio getUserMedia override (canvasVideo=${this._useCanvasVideo}, label="${this._displayLabel}")...`,
    );
    await this._page.context().addInitScript(poweronMediaPatchInstall, this._mediaPatchPayload());
    this._initScriptInjected = true;
    this._logger.info('Audio getUserMedia override injected');
  }

  /**
   * Re-run the media patch in every frame. Needed when Teams replaces the document
   * in an iframe (addInitScript runs too early) or overwrites getUserMedia.
   * Frames that reject the evaluation are logged and skipped.
   */
  async reinstallMediaPatchInAllFrames(): Promise<void> {
    const payload = this._mediaPatchPayload();
    for (const frame of this._page.frames()) {
      try {
        await frame.evaluate(poweronMediaPatchInstall, payload);
      } catch (e) {
        this._logger.info(`[mediaPatch] frame skipped: ${e}`);
      }
    }
    await this._forceCanvasVideoInAllFrames('reinstall');
  }

  /**
   * Replace outbound video in every frame. Teams may run WebRTC in a subframe;
   * only touching the main window leaves Chromium's default fake (green) video.
   * Does nothing unless canvas video is enabled. Logs one summary line per call.
   *
   * @param phase Free-form tag included in the log line (e.g. 'init', 'tts').
   */
  private async _forceCanvasVideoInAllFrames(phase: string): Promise<void> {
    if (!this._useCanvasVideo) {
      return;
    }

    const parts: string[] = [];
    for (const frame of this._page.frames()) {
      try {
        const r = await frame.evaluate(async () => {
          const w = window as any;
          w.__startBotAvatarStream?.();
          return w.__forceVideoTrackToSenders?.();
        });

        let shortUrl: string;
        try {
          shortUrl = frame.url().substring(0, 100);
        } catch {
          shortUrl = '(no-url)';
        }

        const rr: any = r || {};
        const vsArr = (rr.videoStats || []) as any[];
        const vs = vsArr.length
          ? vsArr
              .map(
                (v) =>
                  `${v.kind}:b=${v.bytes},p=${v.packets},fEnc=${v.framesEncoded},fSent=${v.framesSent},fps=${v.fps},${v.w}x${v.h}`,
              )
              .join(' | ')
          : 'none';

        const summary =
          `[${shortUrl}] r=${rr.replaced ?? 0} add=${rr.added ?? 0} pcs=${rr.pcs ?? 0} ` +
          `tx=${rr.totalTransceivers ?? 0} vidTx=${rr.videoTransceivers ?? 0} ` +
          `vidWith=${rr.videoSendersWithTrack ?? 0} vidNoTrack=${rr.videoSendersWithoutTrack ?? 0} ` +
          `dirB=[${(rr.directionsBefore || []).join(',')}] dirA=[${(rr.directionsAfter || []).join(',')}] ` +
          `cd=[${(rr.currentDirections || []).join(',')}] ` +
          `track=${rr.trackId || 'n/a'}(en=${rr.trackEnabled},rs=${rr.trackReady},mu=${rr.trackMuted}) ` +
          `vstats=[${vs}] ${rr.reason || ''}`;
        // Drops the trailing space left behind when no reason is reported.
        parts.push(summary.trim());
      } catch (e: unknown) {
        const message = (e instanceof Error && e.message) || String(e);
        parts.push(`err=${message.slice(0, 64)}`);
      }
    }
    this._logger.info(`Canvas video ${phase}: ${parts.join(' | ')}`);
  }

  /**
   * Initialize the audio context in the browser for TTS playback.
   * Must be called after joining the meeting (user gesture context).
   * Subsequent calls are no-ops until {@link cleanup} is called.
   */
  async initialize(): Promise<void> {
    if (this._audioContext) {
      return;
    }

    this._logger.info('Initializing audio context...');

    await this._page.evaluate(() => {
      const w = window as any;
      // The __ttsAudioContext was created by the init script.
      const ctx = w.__ttsAudioContext as AudioContext | undefined;

      if (ctx && ctx.state !== 'closed') {
        if (ctx.state === 'suspended') {
          // Resume it now (requires user gesture - joining meeting counts).
          // Deliberately not awaited: resume() may stay pending until the
          // browser allows audio, and initialization must not block on that.
          ctx.resume().catch(() => undefined);
        }
        return;
      }

      // If init script didn't run (e.g. page navigated before injection), or the
      // context was closed by a previous cleanup, create fallback audio infrastructure.
      const AudioContextClass = window.AudioContext || w.webkitAudioContext;
      const newCtx = new AudioContextClass();
      const streamDest = newCtx.createMediaStreamDestination();
      w.__ttsAudioContext = newCtx;
      w.__ttsStreamDest = streamDest;
      w.__ttsAudioStream = streamDest.stream;
    });

    if (this._useCanvasVideo) {
      await this._forceCanvasVideoInAllFrames('init');
    }

    this._audioContext = true;
    this._logger.info('Audio context initialized');
  }

  /**
   * Queue audio for sequential playback.
   * Audio is never played in parallel -- each clip waits for the previous one to finish.
   * If nothing is playing, the returned promise settles once the queue has drained;
   * otherwise it settles immediately after enqueueing.
   *
   * @param audioData Base64 encoded audio data
   * @param format Audio format (mp3, wav, pcm)
   */
  async playAudio(audioData: string, format: 'mp3' | 'wav' | 'pcm'): Promise<void> {
    this._audioQueue.push({ audioData, format });
    this._logger.info(`Audio queued (queue size: ${this._audioQueue.length}, playing: ${this._isPlaying})`);

    if (!this._isPlaying) {
      await this._processAudioQueue();
    }
  }

  /**
   * Process the audio queue sequentially. Errors from individual clips are
   * logged and do not stop the queue.
   *
   * A stop request empties the queue (in {@link stopAllAudio}) and terminates the
   * current clip; anything found in the queue afterwards was queued AFTER the stop
   * and is therefore still played.
   */
  private async _processAudioQueue(): Promise<void> {
    if (this._isPlaying) return;

    this._isPlaying = true;
    this._stopRequested = false;

    try {
      for (let item = this._audioQueue.shift(); item !== undefined; item = this._audioQueue.shift()) {
        try {
          await this._playAudioInternal(item.audioData, item.format);
        } catch (error) {
          this._logger.error('Error playing queued audio:', error);
        }

        if (this._stopRequested) {
          // The stop has been honoured for the clip above; do not let it also
          // swallow clips that were queued after the stop request.
          this._stopRequested = false;
          this._logger.info('Audio queue cleared due to stop request');
        }
      }
    } finally {
      // Always release the "playing" latch, otherwise later playAudio calls would only queue.
      this._isPlaying = false;
      this._stopRequested = false;
    }
  }

  /**
   * Stop all audio immediately: stop current playback and clear the queue.
   * Called when a user says " STOP" or similar.
   */
  async stopAllAudio(): Promise<void> {
    this._logger.info('Stop all audio requested');
    this._stopRequested = true;
    this._audioQueue = [];

    try {
      await this._page.evaluate(() => {
        const w = window as any;
        // Terminate the clip in flight. Suspending alone would merely pause it:
        // the buffer source would carry on as soon as the context is resumed.
        w.__ttsStopCurrentClip?.();

        const ctx = w.__ttsAudioContext as AudioContext | undefined;
        if (ctx && ctx.state !== 'closed') {
          // Suspend immediately stops all audio output
          ctx.suspend().catch(() => undefined);
          // Resume after a short delay so future audio can play
          setTimeout(() => {
            ctx.resume().catch(() => undefined);
          }, 100);
        }
      });
    } catch {
      // Page might not be ready
    }
  }

  /**
   * Internal: Play audio in the browser (single clip, no queuing).
   * Audio is piped into the MediaStreamDestination that Teams uses as mic input.
   * Before playback, forces all WebRTC audio senders to use the TTS track
   * (sender.replaceTrack) so Teams transmits our audio to participants.
   * Resolves when the clip has ended or has been stopped.
   *
   * @param audioData Base64 encoded audio data
   * @param format    'pcm' is treated as 16-bit mono samples at PCM_SAMPLE_RATE_HZ;
   *                  anything else is decoded with decodeAudioData.
   * @throws Re-throws any error raised while preparing or playing the clip (after logging it).
   */
  private async _playAudioInternal(audioData: string, format: AudioFormat): Promise<void> {
    if (!this._audioContext) {
      await this.initialize();
    }

    this._logger.info(`Playing audio (format: ${format}, size: ${audioData.length} bytes base64)`);

    try {
      // Force all outgoing audio senders to use the TTS track
      const senderInjectInfo = await this._page.evaluate(async () => {
        const forceFn = (window as any).__forceTtsTrackToSenders;
        if (typeof forceFn === 'function') {
          return await forceFn();
        }
        return { replaced: 0, pcs: 0, reason: 'force-function-missing' };
      });

      // #region agent log
      const diag = senderInjectInfo?.diag || {};
      this._logger.info(
        `TTS sender injection: replaced=${senderInjectInfo?.replaced ?? 0}, pcs=${senderInjectInfo?.pcs ?? 0}, reason=${senderInjectInfo?.reason || 'n/a'} ` +
          `ttsTrack=${diag.ttsTrackId || 'n/a'}(enabled=${diag.ttsTrackEnabled},state=${diag.ttsTrackReadyState},muted=${diag.ttsTrackMuted}) ` +
          `beforeSenders=[${(diag.beforeSenderTrackIds || []).join(',')}] afterSenders=[${(diag.afterSenderTrackIds || []).join(',')}] ` +
          `afterEnabled=${diag.afterSenderTrackEnabled} afterState=${diag.afterSenderTrackReadyState} forcedEnabled=${diag.forcedEnabled || false}`,
      );
      // #endregion

      if (this._useCanvasVideo) {
        await this._forceCanvasVideoInAllFrames('tts');
      }

      // Collect WebRTC stats BEFORE playback
      // #region agent log
      const statsBefore = await this._page.evaluate(collectAudioSenderStats);
      // #endregion

      if (this._stopRequested) {
        // A stop arrived while the clip was still being prepared; never start it.
        this._logger.info('Audio playback skipped due to stop request');
        return;
      }

      await this._page.evaluate(
        async ({ audioData, format, pcmSampleRate }) => {
          const w = window as any;
          const ctx = w.__ttsAudioContext as AudioContext | undefined;
          const streamDest = w.__ttsStreamDest as MediaStreamAudioDestinationNode | undefined;

          if (!ctx || !streamDest) {
            throw new Error('Audio context not initialized');
          }

          if (ctx.state === 'suspended') {
            await ctx.resume();
          }

          // base64 -> raw bytes
          const binaryString = atob(audioData);
          const bytes = new Uint8Array(binaryString.length);
          for (let i = 0; i < binaryString.length; i++) {
            bytes[i] = binaryString.charCodeAt(i);
          }

          let audioBuffer: AudioBuffer;
          if (format === 'pcm') {
            // Ignore a dangling odd byte instead of letting Int16Array throw.
            const sampleCount = bytes.byteLength >> 1;
            if (sampleCount === 0) {
              // createBuffer rejects zero-length buffers; nothing to play.
              return;
            }
            const pcmData = new Int16Array(bytes.buffer, 0, sampleCount);
            audioBuffer = ctx.createBuffer(1, sampleCount, pcmSampleRate);
            const channelData = audioBuffer.getChannelData(0);
            for (let i = 0; i < sampleCount; i++) {
              channelData[i] = pcmData[i] / 32768;
            }
          } else {
            audioBuffer = await ctx.decodeAudioData(bytes.buffer.slice(0));
          }

          const source = ctx.createBufferSource();
          source.buffer = audioBuffer;
          source.connect(streamDest);

          await new Promise<void>((resolve) => {
            let finished = false;
            // Single exit path for both "clip ended" and "clip stopped". It resolves
            // directly rather than waiting for `ended`, which is not delivered while
            // the context is suspended.
            const finish = (): void => {
              if (finished) return;
              finished = true;
              if (w.__ttsStopCurrentClip === finish) {
                w.__ttsStopCurrentClip = null;
              }
              source.onended = null;
              try {
                source.stop();
              } catch {
                // not started or already stopped
              }
              try {
                source.disconnect();
              } catch {
                // already disconnected
              }
              resolve();
            };

            source.onended = finish;
            w.__ttsStopCurrentClip = finish;
            source.start(0);
          });
        },
        { audioData, format, pcmSampleRate: PCM_SAMPLE_RATE_HZ },
      );

      // Collect WebRTC stats AFTER playback
      // #region agent log
      const statsAfter = await this._page.evaluate(collectAudioSenderStats);
      this._logger.info(
        `[Voice] WebRTC stats: before(bytes=${statsBefore.bytesSent},pkts=${statsBefore.packetsSent}) ` +
          `after(bytes=${statsAfter.bytesSent},pkts=${statsAfter.packetsSent}) ` +
          `delta(bytes=${statsAfter.bytesSent - statsBefore.bytesSent},pkts=${statsAfter.packetsSent - statsBefore.packetsSent}) ` +
          `senders=${JSON.stringify(statsAfter.senderInfo)}`,
      );
      // #endregion

      this._logger.info('Audio playback completed');
    } catch (error) {
      this._logger.error('Error playing audio:', error);
      throw error;
    }
  }

  /**
   * Stop any currently playing audio. The queue is left untouched; the audio
   * context stays suspended until the next clip resumes it.
   */
  async stopAudio(): Promise<void> {
    try {
      await this._page.evaluate(() => {
        const w = window as any;
        // Release the clip in flight first: once the context is suspended its
        // `ended` event would never arrive and the playback queue would stall.
        w.__ttsStopCurrentClip?.();

        const ctx = w.__ttsAudioContext as AudioContext | undefined;
        if (ctx && ctx.state !== 'closed') {
          ctx.suspend().catch(() => undefined);
        }
      });
    } catch {
      // Ignore errors
    }
  }

  /**
   * Clean up audio resources: drops queued clips and, in every frame, stops the
   * avatar draw loop/track/canvas, terminates any clip in flight and closes the
   * TTS audio context. Frames that cannot be evaluated are skipped.
   */
  async cleanup(): Promise<void> {
    this._audioQueue = [];

    try {
      for (const frame of this._page.frames()) {
        try {
          await frame.evaluate(() => {
            const w = window as any;

            if (w.__botAvatarDrawInterval) {
              clearInterval(w.__botAvatarDrawInterval);
              w.__botAvatarDrawInterval = null;
            }
            if (w.__botAvatarVideoTrack) {
              try {
                w.__botAvatarVideoTrack.stop();
              } catch {
                // ignore
              }
              w.__botAvatarVideoTrack = null;
            }
            if (w.__botAvatarCanvas && w.__botAvatarCanvas.remove) {
              w.__botAvatarCanvas.remove();
              w.__botAvatarCanvas = null;
            }
            w.__botAvatarStreamStarted = false;

            // Release a pending playback await before the context goes away.
            w.__ttsStopCurrentClip?.();

            const actx = w.__ttsAudioContext as AudioContext | undefined;
            if (actx && actx.state !== 'closed') {
              actx.close().catch(() => undefined);
            }
          });
        } catch {
          // cross-origin or closed frame
        }
      }
    } catch {
      // Page might be closed
    }
    this._audioContext = false;
  }
}