// service-teams-browser-bot/src/bot/audioProcedure.ts

import { Page } from 'playwright';
import { Logger } from 'winston';

/**
 * Handles audio playback in the Teams meeting.
 *
 * Architecture:
 * - Before any page loads, we inject an init script that overrides getUserMedia
 *   to return a MediaStream fed by a MediaStreamDestination we control.
 * - When Teams calls getUserMedia({audio: true}), it gets our custom stream.
 * - When TTS audio is played, it is piped into the same MediaStreamDestination,
 *   so Teams picks it up as microphone input and sends it via WebRTC.
 *
 * Browser-side state lives on `window`:
 * - `__ttsAudioContext`  the shared AudioContext
 * - `__ttsStreamDest`    the MediaStreamAudioDestinationNode TTS audio is routed into
 * - `__ttsAudioStream`   the destination's MediaStream
 * - `__ttsActiveStops`   stop callbacks for playbacks that are currently in flight
 *
 * NOTE: every function handed to `page.addInitScript` / `page.evaluate` is
 * serialized and executed inside the browser, so those functions must stay
 * self-contained (no references to Node-side variables or helpers).
 */
export class AudioProcedure {
  private readonly _page: Page;
  private readonly _logger: Logger;
  // Despite the name this is a flag, not the context: true once initialize() has run.
  private _audioContext: boolean = false;
  private _initScriptInjected: boolean = false;

  constructor(page: Page, logger: Logger) {
    this._page = page;
    this._logger = logger;
  }

  /**
   * Inject the getUserMedia override BEFORE any page navigation.
   * This MUST be called before navigating to Teams.
   * Uses page.addInitScript so it runs in every new document context.
   * Calling it more than once is a no-op.
   */
  async injectAudioOverride(): Promise<void> {
    if (this._initScriptInjected) {
      return;
    }

    this._logger.info('Injecting audio getUserMedia override...');

    await this._page.addInitScript(() => {
      const win = window as any;
      const AudioContextClass = window.AudioContext || win.webkitAudioContext;

      // The script runs in every document; some of them (e.g. insecure or
      // blank documents) expose no media APIs. Leave those untouched instead
      // of throwing inside the init script.
      if (
        !AudioContextClass ||
        !navigator.mediaDevices ||
        typeof navigator.mediaDevices.getUserMedia !== 'function'
      ) {
        return;
      }

      // Shared AudioContext and MediaStreamDestination; they persist for the
      // lifetime of the document.
      const ctx = new AudioContextClass();
      const streamDest = ctx.createMediaStreamDestination();

      // Store globally for later TTS injection
      win.__ttsAudioContext = ctx;
      win.__ttsStreamDest = streamDest;
      win.__ttsAudioStream = streamDest.stream;

      // Hand every caller its own clones of the TTS audio track. Consumers
      // commonly call track.stop() when they release a device; stopping a
      // clone leaves the shared source track (and later requests) alive.
      const cloneTtsAudioTracks = (): MediaStreamTrack[] =>
        streamDest.stream.getAudioTracks().map((track) => track.clone());

      // Override getUserMedia to return our controlled stream for audio requests
      const originalGetUserMedia = navigator.mediaDevices.getUserMedia.bind(navigator.mediaDevices);
      navigator.mediaDevices.getUserMedia = async (constraints?: MediaStreamConstraints) => {
        if (!constraints || !constraints.audio) {
          // Not an audio request: defer to the real implementation.
          return originalGetUserMedia(constraints);
        }

        if (constraints.video) {
          // Audio + video: combine our audio with the real/fake video tracks.
          try {
            const videoStream = await originalGetUserMedia({ video: constraints.video });
            return new MediaStream(cloneTtsAudioTracks().concat(videoStream.getVideoTracks()));
          } catch {
            // Deliberate best effort: if video fails, fall through to audio only.
          }
        }

        return new MediaStream(cloneTtsAudioTracks());
      };
    });

    this._initScriptInjected = true;
    this._logger.info('Audio getUserMedia override injected');
  }

  /**
   * Initialize the audio context in the browser for TTS playback.
   * Must be called after joining the meeting (user gesture context).
   *
   * If the init script did not run in the current document (or its context was
   * closed by cleanup()), a fallback AudioContext/destination is created so
   * playback calls do not fail; that fallback is NOT wired into getUserMedia.
   */
  async initialize(): Promise<void> {
    if (this._audioContext) {
      return;
    }

    this._logger.info('Initializing audio context...');

    const createdFallback = await this._page.evaluate(() => {
      const win = window as any;
      let ctx = win.__ttsAudioContext as AudioContext | undefined;
      let fallback = false;

      // A closed context can never play again, so treat it like a missing one.
      if (!ctx || ctx.state === 'closed') {
        const AudioContextClass = window.AudioContext || win.webkitAudioContext;
        const newCtx: AudioContext = new AudioContextClass();
        const streamDest = newCtx.createMediaStreamDestination();
        win.__ttsAudioContext = newCtx;
        win.__ttsStreamDest = streamDest;
        win.__ttsAudioStream = streamDest.stream;
        ctx = newCtx;
        fallback = true;
      }

      if (ctx.state === 'suspended') {
        // Not awaited on purpose: resume() may stay pending until the page has
        // seen a user gesture, and playAudio() awaits it again before playing.
        // The catch only prevents an unhandled rejection in the page.
        ctx.resume().catch(() => undefined);
      }

      return fallback;
    });

    if (createdFallback) {
      this._logger.warn(
        'Audio init script was not active in this document; created a fallback audio context that is not connected to getUserMedia',
      );
    }

    this._audioContext = true;
    this._logger.info('Audio context initialized');
  }

  /**
   * Play audio in the browser and resolve once playback has finished or has
   * been cancelled through stopAudio()/cleanup().
   * Audio is piped into the MediaStreamDestination that Teams uses as mic input.
   *
   * @param audioData Base64 encoded audio data. Empty input is skipped with a warning.
   * @param format Audio format (mp3, wav, pcm). PCM is assumed to be 16-bit
   *   little-endian mono at 16 kHz; a trailing odd byte is ignored.
   * @throws Re-throws any browser-side failure (invalid base64, undecodable
   *   audio, missing audio context) after logging it.
   */
  async playAudio(audioData: string, format: 'mp3' | 'wav' | 'pcm'): Promise<void> {
    if (audioData.length === 0) {
      this._logger.warn('Skipping audio playback: empty audio data');
      return;
    }

    if (!this._audioContext) {
      await this.initialize();
    }

    this._logger.info(`Playing audio (format: ${format}, size: ${audioData.length} bytes base64)`);

    try {
      await this._page.evaluate(async ({ audioData, format }) => {
        const win = window as any;
        const ctx = win.__ttsAudioContext as AudioContext | undefined;
        const streamDest = win.__ttsStreamDest as MediaStreamAudioDestinationNode | undefined;

        if (!ctx || !streamDest || ctx.state === 'closed') {
          throw new Error('Audio context not initialized');
        }

        // Resume context if suspended
        if (ctx.state === 'suspended') {
          await ctx.resume();
        }

        // Decode base64 to raw bytes
        const binaryString = atob(audioData);
        const bytes = new Uint8Array(binaryString.length);
        for (let i = 0; i < binaryString.length; i++) {
          bytes[i] = binaryString.charCodeAt(i);
        }

        let audioBuffer: AudioBuffer;

        if (format === 'pcm') {
          // PCM: assume 16-bit little-endian mono 16kHz. Read through a
          // DataView so an odd byte count cannot throw (Int16Array requires an
          // even-length buffer) and the byte order is explicit.
          const sampleCount = Math.floor(bytes.length / 2);
          if (sampleCount === 0) {
            // createBuffer() rejects zero-length buffers; nothing to play.
            return;
          }
          const view = new DataView(bytes.buffer, 0, sampleCount * 2);
          audioBuffer = ctx.createBuffer(1, sampleCount, 16000);
          const channelData = audioBuffer.getChannelData(0);
          for (let i = 0; i < sampleCount; i++) {
            channelData[i] = view.getInt16(i * 2, true) / 32768;
          }
        } else {
          // MP3/WAV: let the browser decode the container/codec.
          audioBuffer = await ctx.decodeAudioData(bytes.buffer.slice(0));
        }

        // Play through the MediaStreamDestination -> Teams mic input
        const source = ctx.createBufferSource();
        source.buffer = audioBuffer;
        source.connect(streamDest);

        // Registry of in-flight playbacks so stopAudio()/cleanup() can cancel them.
        const activeStops: Array<() => void> = (win.__ttsActiveStops = win.__ttsActiveStops || []);

        return new Promise<void>((resolve) => {
          let finished = false;

          // Idempotent completion: runs on natural end or on cancellation.
          const finish = (): void => {
            if (finished) {
              return;
            }
            finished = true;
            const index = activeStops.indexOf(stopPlayback);
            if (index !== -1) {
              activeStops.splice(index, 1);
            }
            source.onended = null;
            try {
              source.disconnect();
            } catch {
              // Already disconnected; nothing to release.
            }
            resolve();
          };

          // Cancels this playback; resolves the promise without waiting for
          // the (asynchronous) ended event.
          const stopPlayback = (): void => {
            try {
              source.stop();
            } catch {
              // Source already stopped.
            }
            finish();
          };

          source.onended = finish;
          source.start(0);
          activeStops.push(stopPlayback);
        });
      }, { audioData, format });

      this._logger.info('Audio playback completed');
    } catch (error) {
      this._logger.error('Error playing audio:', error);
      throw error;
    }
  }

  /**
   * Stop any currently playing audio.
   *
   * Active sources are stopped individually and the AudioContext is left
   * running: suspending the context would leave pending playAudio() calls
   * unresolved and the interrupted clip would continue on the next resume.
   * Best effort: failures (e.g. page already closed) are logged at debug level.
   */
  async stopAudio(): Promise<void> {
    try {
      await this._page.evaluate(() => {
        const activeStops = (window as any).__ttsActiveStops as Array<() => void> | undefined;
        if (activeStops) {
          // splice(0) empties the registry first so callbacks can't re-enter it.
          activeStops.splice(0).forEach((stop) => stop());
        }
      });
    } catch (error) {
      this._logger.debug('Ignoring error while stopping audio:', error);
    }
  }

  /**
   * Clean up audio resources: cancel in-flight playback and close the
   * AudioContext. Best effort; the page might already be closed.
   * A later initialize() re-creates a (fallback) context.
   */
  async cleanup(): Promise<void> {
    try {
      await this._page.evaluate(async () => {
        const win = window as any;
        const activeStops = win.__ttsActiveStops as Array<() => void> | undefined;
        if (activeStops) {
          activeStops.splice(0).forEach((stop) => stop());
        }

        const ctx = win.__ttsAudioContext as AudioContext | undefined;
        // close() rejects on an already-closed context, so check first.
        if (ctx && ctx.state !== 'closed') {
          await ctx.close();
        }
      });
    } catch (error) {
      // Page might be closed
      this._logger.debug('Ignoring error during audio cleanup:', error);
    }
    this._audioContext = false;
  }
}