service-teams-browser-bot/src/bot/audioProcedure.ts

import { Page } from 'playwright';
import { Logger } from 'winston';
import { poweronMediaPatchInstall } from './mediaGetUserMediaPatch';

/**
 * Handles audio playback in the Teams meeting.
 *
 * Architecture:
 * - Browser launches with --use-fake-device-for-media-stream so Teams sees
 *   real-looking devices (no "no audio/video" modal).
 * - Before any page loads, we inject an init script that wraps getUserMedia.
 * - When Teams calls getUserMedia, the wrapper:
 *   1. Calls the REAL getUserMedia (which returns Chromium's fake device stream)
 *   2. Replaces the audio track with one from our MediaStreamDestination
 *   3. Returns the modified stream; optional canvas video track instead of fake video
 * - When TTS audio is played, it's piped into the MediaStreamDestination,
 *   and Teams sends it via WebRTC to other meeting participants.
 */
/**
 * Optional settings accepted by the `AudioProcedure` constructor.
 * Every field may be omitted; the constructor substitutes a fallback for
 * missing or blank values.
 */
export type AudioProcedureOptions = {
  /**
   * Enables the canvas-based outbound video track in place of the fake-device
   * video. Treated as `false` when omitted.
   */
  useCanvasVideo?: boolean;
  /** Shown in the center of the canvas (e.g. bot display name). Falls back to "Bot" when blank. */
  displayLabel?: string;
  /** Hex/CSS color of the static avatar background (default: light blue, `#a8d4f0`). */
  avatarBgColor?: string;
  /** Hex/CSS color of the centered display label (default: dark blue, `#1a3552`). */
  avatarTextColor?: string;
};

/**
 * Plays TTS audio (and optionally forces a canvas avatar video track) inside a
 * Playwright-controlled Teams page.
 *
 * Host-side responsibilities:
 * - install the getUserMedia patch (`poweronMediaPatchInstall`) as a context
 *   init script and, on demand, re-run it in every frame;
 * - keep a FIFO queue of clips so that audio is never played in parallel;
 * - stop playback on request and release browser-side resources on cleanup.
 *
 * Browser-side state lives on `window` in the main frame:
 * - `__ttsAudioContext`, `__ttsStreamDest`, `__ttsAudioStream`: audio graph
 *   (normally created by the init script; `initialize()` creates a fallback);
 * - `__ttsCurrentPlayback`: handle of the clip currently owned by this class,
 *   exposing `stop()` so a stop request can end the clip deterministically.
 */
export class AudioProcedure {
  private readonly _page: Page;
  private readonly _logger: Logger;
  private readonly _useCanvasVideo: boolean;
  private readonly _displayLabel: string;
  private readonly _avatarBgColor: string;
  private readonly _avatarTextColor: string;
  /** True once `initialize()` has prepared the browser-side audio graph. */
  private _audioContext: boolean = false;
  private _initScriptInjected: boolean = false;
  private _audioQueue: Array<{ audioData: string; format: 'mp3' | 'wav' | 'pcm' }> = [];
  private _isPlaying: boolean = false;
  private _stopRequested: boolean = false;

  /**
   * @param page Playwright page hosting the Teams meeting.
   * @param logger Logger used for progress and diagnostics.
   * @param options Optional avatar/video settings; blank strings fall back to defaults.
   */
  constructor(page: Page, logger: Logger, options?: AudioProcedureOptions) {
    this._page = page;
    this._logger = logger;
    this._useCanvasVideo = !!options?.useCanvasVideo;
    this._displayLabel = (options?.displayLabel || 'Bot').trim() || 'Bot';
    this._avatarBgColor = (options?.avatarBgColor || '').trim() || '#a8d4f0';
    this._avatarTextColor = (options?.avatarTextColor || '').trim() || '#1a3552';
  }

  /** Argument object handed to `poweronMediaPatchInstall` in every injection path. */
  private _mediaPatchOptions(): {
    useCanvasVideo: boolean;
    displayLabel: string;
    avatarBgColor: string;
    avatarTextColor: string;
  } {
    return {
      useCanvasVideo: this._useCanvasVideo,
      displayLabel: this._displayLabel,
      avatarBgColor: this._avatarBgColor,
      avatarTextColor: this._avatarTextColor,
    };
  }

  /**
   * Inject the getUserMedia wrapper BEFORE any page navigation.
   * This MUST be called before navigating to Teams.
   * Uses browserContext.addInitScript so the hook runs in the main page and
   * in embedded frames (Teams often runs media/WebRTC in an iframe; page-only
   * injection would miss getUserMedia and you would only see the fake device).
   * Subsequent calls are no-ops.
   */
  async injectAudioOverride(): Promise<void> {
    if (this._initScriptInjected) {
      return;
    }

    this._logger.info(
      `Injecting audio getUserMedia override (canvasVideo=${this._useCanvasVideo}, label="${this._displayLabel}")...`,
    );

    await this._page.context().addInitScript(poweronMediaPatchInstall, this._mediaPatchOptions());

    this._initScriptInjected = true;
    this._logger.info('Audio getUserMedia override injected');
  }

  /**
   * Re-run the media patch in every frame. Needed when Teams replaces the document
   * in an iframe (addInitScript runs too early) or overwrites getUserMedia.
   * Frames that reject the evaluation are logged and skipped.
   */
  async reinstallMediaPatchInAllFrames(): Promise<void> {
    const payload = this._mediaPatchOptions();
    for (const frame of this._page.frames()) {
      try {
        await frame.evaluate(poweronMediaPatchInstall, payload);
      } catch (e) {
        this._logger.info(`[mediaPatch] frame skipped: ${e}`);
      }
    }
    await this._forceCanvasVideoInAllFrames('reinstall');
  }

  /**
   * Replace outbound video in every frame. Teams may run WebRTC in a subframe;
   * only touching the main window leaves Chromium's default fake (green) video.
   *
   * Calls the `__startBotAvatarStream` / `__forceVideoTrackToSenders` window
   * hooks when present (presumably installed by the media patch) and logs one
   * diagnostic summary line covering all frames. No-op when canvas video is off.
   *
   * @param phase Label included in the log line to identify the caller.
   */
  private async _forceCanvasVideoInAllFrames(phase: string): Promise<void> {
    if (!this._useCanvasVideo) {
      return;
    }
    const parts: string[] = [];
    for (const frame of this._page.frames()) {
      try {
        const r = await frame.evaluate(async () => {
          const w = window as any;
          w.__startBotAvatarStream?.();
          return w.__forceVideoTrackToSenders?.();
        });
        const shortUrl = (() => {
          try {
            return frame.url().substring(0, 100);
          } catch {
            return '(no-url)';
          }
        })();
        const rr: any = r || {};
        const vsArr = (rr.videoStats || []) as any[];
        const vs = vsArr.length
          ? vsArr.map(v => `${v.kind}:b=${v.bytes},p=${v.packets},fEnc=${v.framesEncoded},fSent=${v.framesSent},fps=${v.fps},${v.w}x${v.h}`).join(' | ')
          : 'none';
        const line =
          `[${shortUrl}] r=${rr.replaced ?? 0} add=${rr.added ?? 0} pcs=${rr.pcs ?? 0} `
          + `tx=${rr.totalTransceivers ?? 0} vidTx=${rr.videoTransceivers ?? 0} `
          + `vidWith=${rr.videoSendersWithTrack ?? 0} vidNoTrack=${rr.videoSendersWithoutTrack ?? 0} `
          + `dirB=[${(rr.directionsBefore || []).join(',')}] dirA=[${(rr.directionsAfter || []).join(',')}] `
          + `cd=[${(rr.currentDirections || []).join(',')}] `
          + `track=${rr.trackId || 'n/a'}(en=${rr.trackEnabled},rs=${rr.trackReady},mu=${rr.trackMuted}) `
          + `vstats=[${vs}] ${rr.reason || ''}`;
        // Trim the whole line (drops the trailing space left by an empty reason).
        parts.push(line.trim());
      } catch (e: any) {
        parts.push(`err=${String(e?.message || e).slice(0, 64)}`);
      }
    }
    this._logger.info(`Canvas video ${phase}: ${parts.join(' | ')}`);
  }

  /**
   * Initialize the audio context in the browser for TTS playback.
   * Must be called after joining the meeting (user gesture context).
   *
   * Resumes the existing context when it is suspended. When no usable context
   * exists (init script did not run, or the previous one was closed by
   * `cleanup()`), a fallback context and MediaStreamDestination are created.
   * Subsequent calls are no-ops until `cleanup()` runs.
   */
  async initialize(): Promise<void> {
    if (this._audioContext) {
      return;
    }

    this._logger.info('Initializing audio context...');

    await this._page.evaluate(() => {
      const w = window as any;
      const existing = w.__ttsAudioContext as AudioContext | undefined;

      if (existing && existing.state !== 'closed') {
        // Resume requires a user gesture - joining the meeting counts.
        // Not awaited: the promise can stay pending until the browser allows
        // audio; the rejection is swallowed so it never surfaces as unhandled.
        if (existing.state === 'suspended') {
          void existing.resume().catch(() => undefined);
        }
        return;
      }

      // No context, or a closed one that can never be resumed:
      // create fallback audio infrastructure.
      const AudioContextClass = window.AudioContext || w.webkitAudioContext;
      const newCtx = new AudioContextClass();
      const streamDest = newCtx.createMediaStreamDestination();
      w.__ttsAudioContext = newCtx;
      w.__ttsStreamDest = streamDest;
      w.__ttsAudioStream = streamDest.stream;
    });

    await this._forceCanvasVideoInAllFrames('init');

    this._audioContext = true;
    this._logger.info('Audio context initialized');
  }

  /**
   * Queue audio for sequential playback.
   * Audio is never played in parallel -- each clip waits for the previous one to finish.
   *
   * The call that starts the queue resolves once the queue has drained; calls
   * made while playback is in progress resolve right after enqueueing.
   *
   * @param audioData Base64 encoded audio data
   * @param format Audio format (mp3, wav, pcm)
   */
  async playAudio(audioData: string, format: 'mp3' | 'wav' | 'pcm'): Promise<void> {
    this._audioQueue.push({ audioData, format });
    this._logger.info(`Audio queued (queue size: ${this._audioQueue.length}, playing: ${this._isPlaying})`);

    if (!this._isPlaying) {
      await this._processAudioQueue();
    }
  }

  /**
   * Process the audio queue sequentially until it is empty or a stop is requested.
   * A failing clip is logged and skipped so it cannot block the clips behind it.
   */
  private async _processAudioQueue(): Promise<void> {
    if (this._isPlaying) return;
    this._isPlaying = true;
    this._stopRequested = false;

    try {
      while (!this._stopRequested) {
        const item = this._audioQueue.shift();
        if (!item) break;
        try {
          await this._playAudioInternal(item.audioData, item.format);
        } catch (error) {
          this._logger.error('Error playing queued audio:', error);
        }
      }

      if (this._stopRequested) {
        this._audioQueue = [];
        this._logger.info('Audio queue cleared due to stop request');
      }
    } finally {
      // Always release the playing flag, otherwise no later clip could ever start.
      this._isPlaying = false;
      this._stopRequested = false;
    }
  }

  /**
   * Stop all audio immediately: stop current playback and clear the queue.
   * Called when a user says "<botname> STOP" or similar.
   *
   * The active clip is stopped explicitly; merely suspending and resuming the
   * context would let the clip continue from where it paused.
   */
  async stopAllAudio(): Promise<void> {
    this._logger.info('Stop all audio requested');
    this._stopRequested = true;
    this._audioQueue = [];
    await this._stopCurrentPlayback(false);
  }

  /**
   * Ends the clip registered in `window.__ttsCurrentPlayback` (if any) so the
   * pending playback promise settles, optionally suspending the audio context.
   * Errors are ignored: the page may not be ready or may already be closed.
   *
   * @param suspendContext When true, also suspend a running audio context.
   */
  private async _stopCurrentPlayback(suspendContext: boolean): Promise<void> {
    try {
      await this._page.evaluate((suspend) => {
        const w = window as any;
        const playback = w.__ttsCurrentPlayback;
        if (playback && typeof playback.stop === 'function') {
          playback.stop();
        }
        const ctx = w.__ttsAudioContext as AudioContext | undefined;
        if (suspend && ctx && ctx.state === 'running') {
          void ctx.suspend().catch(() => undefined);
        }
      }, suspendContext);
    } catch {
      // Page might not be ready
    }
  }

  /**
   * Sums outbound audio RTP counters over the peer connections listed in
   * `window.__audioCapturePeerConnections` and snapshots the state of every
   * audio sender track. Diagnostics only.
   */
  private async _collectAudioSendStats(): Promise<{
    bytesSent: number;
    packetsSent: number;
    senderInfo: Array<{ id: string; enabled: boolean; readyState: string; muted: boolean }>;
  }> {
    return this._page.evaluate(async () => {
      const pcs = ((window as any).__audioCapturePeerConnections || []) as RTCPeerConnection[];
      let bytesSent = 0;
      let packetsSent = 0;
      const senderInfo: Array<{ id: string; enabled: boolean; readyState: string; muted: boolean }> = [];
      for (const pc of pcs) {
        for (const s of (pc.getSenders?.() || [])) {
          const track = s?.track;
          if (!track || track.kind !== 'audio') continue;
          try {
            const stats = await s.getStats();
            stats.forEach((r: any) => {
              if (r.type === 'outbound-rtp' && r.kind === 'audio') {
                bytesSent += Number(r.bytesSent || 0);
                packetsSent += Number(r.packetsSent || 0);
              }
            });
          } catch {
            // stats unavailable for this sender
          }
          senderInfo.push({ id: track.id, enabled: track.enabled, readyState: track.readyState, muted: track.muted });
        }
      }
      return { bytesSent, packetsSent, senderInfo };
    });
  }

  /**
   * Internal: Play audio in the browser (single clip, no queuing).
   * Audio is piped into the MediaStreamDestination that Teams uses as mic input.
   * Before playback, forces all WebRTC audio senders to use the TTS track
   * (via the `__forceTtsTrackToSenders` window hook, when present) so Teams
   * transmits our audio to participants.
   *
   * Resolves when the clip ended naturally, was stopped, or was skipped because
   * a stop request arrived before it started.
   *
   * @param audioData Base64 encoded audio data
   * @param format `pcm` is read as 16-bit mono samples at 16 kHz; other formats go through decodeAudioData
   * @throws Rethrows any browser-side failure after logging it.
   */
  private async _playAudioInternal(audioData: string, format: 'mp3' | 'wav' | 'pcm'): Promise<void> {
    if (!this._audioContext) {
      await this.initialize();
    }

    this._logger.info(`Playing audio (format: ${format}, size: ${audioData.length} bytes base64)`);

    try {
      // Force all outgoing audio senders to use the TTS track
      const senderInjectInfo = await this._page.evaluate(async () => {
        const forceFn = (window as any).__forceTtsTrackToSenders;
        if (typeof forceFn === 'function') {
          return await forceFn();
        }
        return { replaced: 0, pcs: 0, reason: 'force-function-missing' };
      });
      const diag = senderInjectInfo?.diag || {};
      this._logger.info(
        `TTS sender injection: replaced=${senderInjectInfo?.replaced ?? 0}, pcs=${senderInjectInfo?.pcs ?? 0}, reason=${senderInjectInfo?.reason || 'n/a'} ` +
        `ttsTrack=${diag.ttsTrackId || 'n/a'}(enabled=${diag.ttsTrackEnabled},state=${diag.ttsTrackReadyState},muted=${diag.ttsTrackMuted}) ` +
        `beforeSenders=[${(diag.beforeSenderTrackIds || []).join(',')}] afterSenders=[${(diag.afterSenderTrackIds || []).join(',')}] ` +
        `afterEnabled=${diag.afterSenderTrackEnabled} afterState=${diag.afterSenderTrackReadyState} forcedEnabled=${diag.forcedEnabled || false}`,
      );

      await this._forceCanvasVideoInAllFrames('tts');

      // Collect WebRTC stats BEFORE playback
      const statsBefore = await this._collectAudioSendStats();

      // A stop that arrived during the preparation above has no clip to stop
      // in the browser yet, so honour it here instead of starting the clip.
      if (this._stopRequested) {
        this._logger.info('Audio playback skipped due to stop request');
        return;
      }

      await this._page.evaluate(async ({ audioData, format }) => {
        const w = window as any;
        const ctx = w.__ttsAudioContext as AudioContext;
        const streamDest = w.__ttsStreamDest as MediaStreamAudioDestinationNode;

        if (!ctx || !streamDest) {
          throw new Error('Audio context not initialized');
        }

        // Register the clip before any await so a stop request issued while
        // decoding is remembered and honoured once decoding finishes.
        const playback: { cancelled: boolean; stop: () => void } = {
          cancelled: false,
          stop: () => {
            playback.cancelled = true;
          },
        };
        w.__ttsCurrentPlayback = playback;

        try {
          if (ctx.state === 'suspended') {
            await ctx.resume();
          }

          const binaryString = atob(audioData);
          const bytes = new Uint8Array(binaryString.length);
          for (let i = 0; i < binaryString.length; i++) {
            bytes[i] = binaryString.charCodeAt(i);
          }

          let audioBuffer: AudioBuffer;

          if (format === 'pcm') {
            // Whole 16-bit samples only: a dangling odd byte is ignored instead
            // of making the Int16Array constructor throw.
            const sampleCount = bytes.length >> 1;
            if (sampleCount === 0) {
              return; // nothing to play; createBuffer rejects zero-length buffers
            }
            const pcmData = new Int16Array(bytes.buffer, 0, sampleCount);
            audioBuffer = ctx.createBuffer(1, sampleCount, 16000);
            const channelData = audioBuffer.getChannelData(0);
            for (let i = 0; i < sampleCount; i++) {
              channelData[i] = pcmData[i] / 32768;
            }
          } else {
            audioBuffer = await ctx.decodeAudioData(bytes.buffer.slice(0));
          }

          if (playback.cancelled) {
            return;
          }

          const source = ctx.createBufferSource();
          source.buffer = audioBuffer;
          source.connect(streamDest);

          await new Promise<void>((resolve) => {
            let settled = false;
            const finish = () => {
              if (settled) return;
              settled = true;
              try {
                source.disconnect();
              } catch {
                // already disconnected
              }
              resolve();
            };
            // Settle directly on stop: the `ended` event is not delivered
            // while the context is suspended, which would block the queue.
            playback.stop = () => {
              playback.cancelled = true;
              try {
                source.stop();
              } catch {
                // not started or already stopped
              }
              finish();
            };
            source.onended = finish;
            source.start(0);
          });
        } finally {
          if (w.__ttsCurrentPlayback === playback) {
            w.__ttsCurrentPlayback = null;
          }
        }
      }, { audioData, format });

      // Collect WebRTC stats AFTER playback
      const statsAfter = await this._collectAudioSendStats();
      this._logger.info(
        `[Voice] WebRTC stats: before(bytes=${statsBefore.bytesSent},pkts=${statsBefore.packetsSent}) ` +
        `after(bytes=${statsAfter.bytesSent},pkts=${statsAfter.packetsSent}) ` +
        `delta(bytes=${statsAfter.bytesSent - statsBefore.bytesSent},pkts=${statsAfter.packetsSent - statsBefore.packetsSent}) ` +
        `senders=${JSON.stringify(statsAfter.senderInfo)}`,
      );

      this._logger.info('Audio playback completed');
    } catch (error) {
      this._logger.error('Error playing audio:', error);
      throw error;
    }
  }

  /**
   * Stop any currently playing audio.
   *
   * Ends the active clip (so the queue can advance to the next one) and
   * suspends the audio context; the next clip resumes the context. Queued
   * clips are kept - use `stopAllAudio()` to drop them as well.
   */
  async stopAudio(): Promise<void> {
    await this._stopCurrentPlayback(true);
  }

  /**
   * Clean up audio resources.
   *
   * Drops queued clips, releases an in-flight clip, and in every frame tears
   * down the avatar canvas/track/interval and closes the audio context.
   * Frames that cannot be evaluated (cross-origin, closed) are skipped.
   * `initialize()` may be called again afterwards.
   */
  async cleanup(): Promise<void> {
    // Nothing queued may start once the context is being closed.
    this._audioQueue = [];
    if (this._isPlaying) {
      this._stopRequested = true;
    }

    try {
      for (const frame of this._page.frames()) {
        try {
          await frame.evaluate(async () => {
            const w = window as any;
            // Settle a pending clip first: a closed context will not end it.
            const playback = w.__ttsCurrentPlayback;
            if (playback && typeof playback.stop === 'function') {
              try {
                playback.stop();
              } catch {
                // ignore
              }
            }
            if (w.__botAvatarDrawInterval) {
              clearInterval(w.__botAvatarDrawInterval);
              w.__botAvatarDrawInterval = null;
            }
            if (w.__botAvatarVideoTrack) {
              try {
                w.__botAvatarVideoTrack.stop();
              } catch {
                // ignore
              }
              w.__botAvatarVideoTrack = null;
            }
            if (w.__botAvatarCanvas && w.__botAvatarCanvas.remove) {
              w.__botAvatarCanvas.remove();
              w.__botAvatarCanvas = null;
            }
            w.__botAvatarStreamStarted = false;
            const actx = w.__ttsAudioContext as AudioContext | undefined;
            if (actx && actx.state !== 'closed') {
              try {
                await actx.close();
              } catch {
                // already closing/closed
              }
            }
          });
        } catch {
          // cross-origin or closed frame
        }
      }
    } catch {
      // Page might be closed
    }
    this._audioContext = false;
  }
}