service-teams-browser-bot/src/bot/audioProcedure.ts
2026-02-15 22:28:51 +01:00

214 lines
7.1 KiB
TypeScript

import { Page } from 'playwright';
import { Logger } from 'winston';
/**
 * Handles audio playback in the Teams meeting.
 *
 * Architecture:
 * - Before any page loads, we inject an init script that overrides getUserMedia
 *   to hand out audio sourced from a MediaStreamDestination we control.
 * - When Teams calls getUserMedia({audio: true}), it gets a stream carrying a
 *   clone of that destination's audio track.
 * - When TTS audio is played, it's piped into the same MediaStreamDestination,
 *   so Teams picks it up as microphone input and sends it via WebRTC.
 *
 * Page-side state is kept on `window`:
 * - `__ttsAudioContext`, `__ttsStreamDest`, `__ttsAudioStream`: the audio graph.
 * - `__ttsActiveSources`: Map of currently playing AudioBufferSourceNode to its
 *   completion callback; used to stop playback and settle pending playAudio calls.
 */
export class AudioProcedure {
  private readonly _page: Page;
  private readonly _logger: Logger;
  /** True once initialize() has completed for this instance. */
  private _audioContext: boolean = false;
  /** True once the getUserMedia override init script has been registered. */
  private _initScriptInjected: boolean = false;

  constructor(page: Page, logger: Logger) {
    this._page = page;
    this._logger = logger;
  }

  /**
   * Inject the getUserMedia override BEFORE any page navigation.
   * This MUST be called before navigating to Teams.
   * Uses page.addInitScript so it runs in every new document context.
   * Calling it more than once is a no-op.
   */
  async injectAudioOverride(): Promise<void> {
    if (this._initScriptInjected) {
      return;
    }
    this._logger.info('Injecting audio getUserMedia override...');
    // NOTE: this callback is serialized and executed in the page, so it must be
    // self-contained and cannot reference anything from the Node.js scope.
    await this._page.addInitScript(() => {
      // Create a shared AudioContext and MediaStreamDestination.
      // These persist across the page lifetime.
      const AudioContextClass = window.AudioContext || (window as any).webkitAudioContext;
      const ctx = new AudioContextClass();
      const streamDest = ctx.createMediaStreamDestination();
      // Store globally for later TTS injection
      (window as any).__ttsAudioContext = ctx;
      (window as any).__ttsStreamDest = streamDest;
      (window as any).__ttsAudioStream = streamDest.stream;

      // Some documents expose no mediaDevices; there is nothing to override
      // there, and dereferencing it would throw.
      const mediaDevices = navigator.mediaDevices;
      if (!mediaDevices || typeof mediaDevices.getUserMedia !== 'function') {
        return;
      }
      const originalGetUserMedia = mediaDevices.getUserMedia.bind(mediaDevices);

      // Hand out CLONES of our audio track. If the page later calls stop() on
      // the track it received, only that clone ends; the destination's own
      // track stays live for subsequent getUserMedia calls.
      const buildStream = (videoTracks: MediaStreamTrack[] = []): MediaStream => {
        const audioTracks = streamDest.stream.getAudioTracks().map((track) => track.clone());
        return new MediaStream(audioTracks.concat(videoTracks));
      };

      mediaDevices.getUserMedia = async (constraints?: MediaStreamConstraints) => {
        // Requests without audio are passed through untouched.
        if (!constraints || !constraints.audio) {
          return originalGetUserMedia(constraints);
        }
        // If video is also requested, combine our audio with real/fake video.
        if (constraints.video) {
          try {
            const videoStream = await originalGetUserMedia({ video: constraints.video });
            return buildStream(videoStream.getVideoTracks());
          } catch (err) {
            // If video fails, fall through and return audio only.
            console.warn('TTS getUserMedia override: video capture failed, returning audio only', err);
          }
        }
        return buildStream();
      };
    });
    this._initScriptInjected = true;
    this._logger.info('Audio getUserMedia override injected');
  }

  /**
   * Initialize the audio context in the browser for TTS playback.
   * Must be called after joining the meeting (user gesture context).
   *
   * If the init script state is missing (e.g. the page navigated before
   * injection) or the context was closed, fallback audio infrastructure is
   * created and a warning is logged, because that fallback is not connected to
   * anything getUserMedia already handed out.
   */
  async initialize(): Promise<void> {
    if (this._audioContext) {
      return;
    }
    this._logger.info('Initializing audio context...');
    const usedFallback = await this._page.evaluate(() => {
      const w = window as any;
      const existing = w.__ttsAudioContext as AudioContext | undefined;
      if (existing && existing.state !== 'closed') {
        if (existing.state === 'suspended') {
          // Deliberately not awaited so a context that cannot resume yet does
          // not stall initialization; playAudio() awaits resume() again before
          // playing. The catch only prevents an unhandled rejection.
          existing.resume().catch(() => undefined);
        }
        return false;
      }
      // No usable context: create fallback audio infrastructure.
      const AudioContextClass = window.AudioContext || w.webkitAudioContext;
      const newCtx = new AudioContextClass();
      const streamDest = newCtx.createMediaStreamDestination();
      w.__ttsAudioContext = newCtx;
      w.__ttsStreamDest = streamDest;
      w.__ttsAudioStream = streamDest.stream;
      return true;
    });
    if (usedFallback) {
      this._logger.warn(
        'No usable injected audio context found; created a fallback that is not wired to getUserMedia, so the meeting may not receive TTS audio',
      );
    }
    this._audioContext = true;
    this._logger.info('Audio context initialized');
  }

  /**
   * Play audio in the browser.
   * Audio is piped into the MediaStreamDestination that Teams uses as mic input.
   * Resolves when playback has finished or has been stopped via stopAudio()/cleanup().
   * Empty audio data is skipped with a warning.
   *
   * @param audioData Base64 encoded audio data
   * @param format Audio format (mp3, wav, pcm). PCM is read as 16-bit
   *   little-endian mono at 16kHz; a trailing odd byte is ignored.
   * @throws If the page-side audio context is missing or closed, or decoding fails.
   */
  async playAudio(audioData: string, format: 'mp3' | 'wav' | 'pcm'): Promise<void> {
    if (audioData.length === 0) {
      // Nothing to decode; createBuffer/decodeAudioData would only throw.
      this._logger.warn('Skipping audio playback: audio data is empty');
      return;
    }
    if (!this._audioContext) {
      await this.initialize();
    }
    this._logger.info(`Playing audio (format: ${format}, size: ${audioData.length} bytes base64)`);
    try {
      await this._page.evaluate(async ({ audioData, format }) => {
        const w = window as any;
        const ctx = w.__ttsAudioContext as AudioContext;
        const streamDest = w.__ttsStreamDest as MediaStreamAudioDestinationNode;
        if (!ctx || !streamDest) {
          throw new Error('Audio context not initialized');
        }
        if (ctx.state === 'closed') {
          // A closed context never plays, so "ended" would never fire.
          throw new Error('Audio context is closed');
        }
        // Resume context if suspended
        if (ctx.state === 'suspended') {
          await ctx.resume();
        }
        // Decode base64 to bytes
        const binaryString = atob(audioData);
        const bytes = new Uint8Array(binaryString.length);
        for (let i = 0; i < binaryString.length; i++) {
          bytes[i] = binaryString.charCodeAt(i);
        }
        let audioBuffer: AudioBuffer;
        if (format === 'pcm') {
          // PCM: assume 16-bit little-endian mono 16kHz. Only whole samples are
          // used, so a truncated chunk with an odd byte count does not throw.
          const sampleCount = Math.floor(bytes.byteLength / 2);
          if (sampleCount === 0) {
            return;
          }
          const view = new DataView(bytes.buffer);
          audioBuffer = ctx.createBuffer(1, sampleCount, 16000);
          const channelData = audioBuffer.getChannelData(0);
          for (let i = 0; i < sampleCount; i++) {
            channelData[i] = view.getInt16(i * 2, true) / 32768;
          }
        } else {
          // MP3/WAV: Use decodeAudioData
          audioBuffer = await ctx.decodeAudioData(bytes.buffer.slice(0));
        }
        // Registry of playing sources so playback can be stopped from outside.
        const active: Map<AudioBufferSourceNode, () => void> =
          w.__ttsActiveSources || (w.__ttsActiveSources = new Map());
        // Play through the MediaStreamDestination -> Teams mic input
        const source = ctx.createBufferSource();
        source.buffer = audioBuffer;
        source.connect(streamDest);
        await new Promise<void>((resolve) => {
          // Invoked on natural end and on explicit stop; safe to call twice.
          const finish = () => {
            active.delete(source);
            source.onended = null;
            resolve();
          };
          source.onended = finish;
          active.set(source, finish);
          source.start(0);
        });
      }, { audioData, format });
      this._logger.info('Audio playback completed');
    } catch (error) {
      this._logger.error('Error playing audio:', error);
      throw error;
    }
  }

  /**
   * Stop any currently playing audio.
   *
   * Stops the active source nodes instead of suspending the AudioContext:
   * suspending only pauses, so pending playAudio() calls would never settle
   * and the old audio would continue on the next resume. Best effort: errors
   * (e.g. page already closed) are logged at debug level and not thrown.
   */
  async stopAudio(): Promise<void> {
    try {
      await this._page.evaluate(() => {
        const active = (window as any).__ttsActiveSources as
          | Map<AudioBufferSourceNode, () => void>
          | undefined;
        if (!active) {
          return;
        }
        // Iterate over a snapshot because finish() removes entries from the map.
        Array.from(active.entries()).forEach(([source, finish]) => {
          try {
            source.stop();
          } catch {
            // Source was already stopped; nothing to do.
          }
          // Settle the pending playAudio() promise without waiting for "ended".
          finish();
        });
      });
    } catch (error) {
      this._logger.debug('Ignoring error while stopping audio:', error);
    }
  }

  /**
   * Clean up audio resources: stops active playback, closes the page-side
   * AudioContext, and resets this instance so initialize() runs again on next use.
   */
  async cleanup(): Promise<void> {
    // End in-flight playback first so pending playAudio() calls settle.
    await this.stopAudio();
    try {
      await this._page.evaluate(async () => {
        const ctx = (window as any).__ttsAudioContext as AudioContext | undefined;
        // close() rejects on an already-closed context, so check the state first.
        if (ctx && ctx.state !== 'closed') {
          await ctx.close();
        }
      });
    } catch {
      // Page might be closed
    }
    this._audioContext = false;
  }
}