// service-teams-browser-bot/src/bot/audioCaptureProcedure.ts

import { Page } from 'playwright';
import { Logger } from 'winston';

/**
 * Optional per-chunk metadata recorded by the in-page capture code when a
 * chunk is queued. Every field may be absent.
 */
interface AudioChunkDiagnostics {
  /** Sample rate of the capturing AudioContext, i.e. before downsampling. */
  nativeSampleRate?: number;
  /** RMS level measured over the chunk's samples. */
  rms?: number;
  /** `readyState` of the source track at the moment the chunk was queued. */
  readyState?: string;
  /** Identifier of the audio track the chunk was captured from. */
  trackId?: string;
}

/**
 * One captured audio chunk as it is queued in the page and later drained by
 * the Node.js poller.
 */
interface CapturedAudioChunk {
  /** Capture metadata for this chunk, when available. */
  captureDiagnostics?: AudioChunkDiagnostics;
  /** Base64-encoded audio payload. */
  data: string;
  /** Sample rate, in Hz, of the payload carried in `data`. */
  sampleRate: number;
}

/**
 * JavaScript source of the AudioWorklet module used for in-page audio capture.
 * It is loaded in the browser through a Blob URL, so it must be plain,
 * self-contained JavaScript.
 *
 * The module registers `audio-capture-processor`, which:
 * - reads `processorOptions.nativeRate` (default 48000) and
 *   `processorOptions.targetRate` (default 16000);
 * - accumulates the first channel of its first input;
 * - flushes when 8 s of audio has been collected, or when voiced audio was
 *   seen, more than 0.5 s is buffered and the input has then stayed below the
 *   RMS threshold (0.0003) for `6 * 8192` consecutive samples;
 * - on flush, posts `{ type: 'chunk', data, rms, nativeSampleRate }` where
 *   `data` is a transferred ArrayBuffer of PCM16 samples decimated to the
 *   target rate — unless the whole buffer is below the RMS threshold, in which
 *   case nothing is posted;
 * - always retains the last second of audio as pre-roll for the next chunk.
 *
 * The silence window is measured in samples rather than in `process()` calls.
 * A per-callback count makes the window depend on the callback size: the
 * ScriptProcessor fallback uses 8192-frame buffers (6 callbacks is about one
 * second), while an AudioWorklet is normally driven with 128-frame quanta
 * (6 callbacks is about 16 ms). With the shorter window a chunk was cut at
 * every tiny gap and the retained pre-roll re-sent audio that had already been
 * emitted.
 */
const AUDIO_CAPTURE_WORKLET_CODE = `
class AudioCaptureProcessor extends AudioWorkletProcessor {
  constructor(options) {
    super();
    const opts = (options && options.processorOptions) || {};
    this.nativeRate = opts.nativeRate || 48000;
    this.targetRate = opts.targetRate || 16000;
    this.maxSamplesPerChunk = this.nativeRate * 8;
    this.minRmsThreshold = 0.0003;
    this.preRollSamples = Math.ceil(this.nativeRate * 1.0);
    this.minFlushSamples = Math.ceil(this.nativeRate * 0.5);
    // Same silence window as the ScriptProcessor fallback (6 buffers of 8192 frames),
    // expressed in samples so it does not depend on the callback size.
    this.silenceFlushSamples = 6 * 8192;
    this.ratio = this.nativeRate / this.targetRate;
    this.chunkBuffer = [];
    this.samplesCollected = 0;
    this.hasVoicedContent = false;
    this.consecutiveSilentSamples = 0;
  }

  process(inputs, outputs, parameters) {
    const input = inputs[0]?.[0];
    if (!input || input.length === 0) return true;

    let cbPower = 0;
    for (let i = 0; i < input.length; i++) {
      cbPower += input[i] * input[i];
    }
    const cbRms = Math.sqrt(cbPower / Math.max(input.length, 1));

    if (cbRms >= this.minRmsThreshold) {
      this.hasVoicedContent = true;
      this.consecutiveSilentSamples = 0;
    } else {
      this.consecutiveSilentSamples += input.length;
    }

    // The input array is reused by the engine between callbacks, so keep a copy.
    this.chunkBuffer.push(new Float32Array(input));
    this.samplesCollected += input.length;

    const shouldFlush = (
      this.samplesCollected >= this.maxSamplesPerChunk
      || (this.hasVoicedContent
          && this.consecutiveSilentSamples >= this.silenceFlushSamples
          && this.samplesCollected > this.minFlushSamples)
    );
    if (!shouldFlush) return true;

    const merged = new Float32Array(this.samplesCollected);
    let offset = 0;
    for (const buf of this.chunkBuffer) {
      merged.set(buf, offset);
      offset += buf.length;
    }

    let powerSum = 0;
    for (let i = 0; i < merged.length; i++) {
      powerSum += merged[i] * merged[i];
    }
    const rms = Math.sqrt(powerSum / Math.max(merged.length, 1));

    this.hasVoicedContent = false;
    this.consecutiveSilentSamples = 0;

    // Only buffers that carry signal are emitted; silent buffers are dropped.
    if (rms >= this.minRmsThreshold) {
      const outLen = Math.floor(merged.length / this.ratio);
      const pcm16 = new Int16Array(outLen);
      for (let i = 0; i < outLen; i++) {
        const srcIdx = Math.floor(i * this.ratio);
        const s = Math.max(-1, Math.min(1, merged[srcIdx]));
        pcm16[i] = Math.round(s * 32767);
      }
      this.port.postMessage({
        type: 'chunk',
        data: pcm16.buffer,
        rms,
        nativeSampleRate: this.nativeRate
      }, [pcm16.buffer]);
    }

    // Emitted or not, the tail of the buffer becomes the pre-roll of the next chunk.
    const keep = Math.min(this.preRollSamples, merged.length);
    this.chunkBuffer = [merged.slice(merged.length - keep)];
    this.samplesCollected = keep;
    return true;
  }
}

registerProcessor('audio-capture-processor', AudioCaptureProcessor);
`;

/**
 * Captures incoming meeting audio by intercepting WebRTC RTCPeerConnection.
 *
 * How it works:
 * 1. Before page navigation, `window.RTCPeerConnection` is wrapped via `addInitScript`.
 * 2. When the page establishes WebRTC connections, the wrapper listens for incoming audio tracks.
 * 3. Each audio track is routed through its own AudioContext into an AudioWorkletNode
 *    (or a ScriptProcessorNode when the worklet cannot be set up).
 * 4. Audio is captured at the context's native rate, downsampled to 16 kHz and converted to PCM16.
 * 5. Base64-encoded chunks are queued on `window.__audioCaptureChunks`; the Node.js side polls
 *    that queue every 500 ms and hands each chunk to the `onAudioChunk` callback.
 */
export class AudioCaptureProcedure {
  private _page: Page;
  private _logger: Logger;
  private _onAudioChunk: (
    base64Data: string,
    sampleRate: number,
    captureDiagnostics?: AudioChunkDiagnostics
  ) => void;
  private _isCapturing: boolean = false;
  private _pollInterval: ReturnType<typeof setInterval> | null = null;
  /** True while a poll is awaiting the page, so interval ticks never overlap. */
  private _pollInFlight: boolean = false;
  private _injected: boolean = false;

  /**
   * @param page Playwright page in which the meeting runs.
   * @param logger Logger used for Node-side status and diagnostics.
   * @param onAudioChunk Invoked once per captured chunk with the base64 PCM16 payload,
   *   its sample rate and optional capture diagnostics.
   */
  constructor(
    page: Page,
    logger: Logger,
    onAudioChunk: (
      base64Data: string,
      sampleRate: number,
      captureDiagnostics?: AudioChunkDiagnostics
    ) => void,
  ) {
    this._page = page;
    this._logger = logger;
    this._onAudioChunk = onAudioChunk;
  }

  /**
   * Inject the RTCPeerConnection wrapper BEFORE any page navigation.
   * Must be called before navigating to Teams. Calling it again is a no-op.
   *
   * The callback passed to `addInitScript` is serialized by Playwright and executed in
   * the page, so it cannot close over Node-side values; this is why the worklet source
   * is handed over as an argument and every helper lives inside the callback.
   */
  async injectCaptureOverride(): Promise<void> {
    if (this._injected) return;

    this._logger.info('[AudioCapture] Injecting RTCPeerConnection wrapper...');

    await this._page.addInitScript((workletCode: string) => {
      (window as any).__audioCaptureChunks = [] as any[];
      (window as any).__audioCaptureProcessors = {} as Record<string, any>;
      (window as any).__audioCaptureContexts = {} as Record<string, AudioContext>;
      (window as any).__audioCapturePeerConnections = [] as RTCPeerConnection[];

      const OrigRTC = window.RTCPeerConnection;
      if (typeof OrigRTC !== 'function') {
        // Nothing to wrap in this context; bail out instead of throwing below.
        console.warn('[AudioCapture] RTCPeerConnection is not available in this context; audio capture disabled');
        return;
      }

      // Upper bound on chunks waiting to be polled; protects the page if polling stalls.
      const maxBufferedChunks = 60;
      let droppedChunks = 0;

      // Encodes the raw bytes of a PCM16 buffer as base64.
      const toBase64 = (buffer: ArrayBufferLike): string => {
        const bytes = new Uint8Array(buffer);
        let binary = '';
        for (let i = 0; i < bytes.length; i++) {
          binary += String.fromCharCode(bytes[i]);
        }
        return btoa(binary);
      };

      // @ts-ignore — wrapping constructor
      window.RTCPeerConnection = function (this: RTCPeerConnection, ...args: any[]) {
        const pc = new OrigRTC(...args);
        try {
          const pcs = (window as any).__audioCapturePeerConnections as RTCPeerConnection[];
          pcs.push(pc);
          // #region agent log
          console.log(`[AudioCapture][DIAG] New RTCPeerConnection created (total: ${pcs.length}), config:`, JSON.stringify(args[0] || {}).substring(0, 200));
          // #endregion
        } catch {
          // ignore
        }

        pc.addEventListener('track', (event: RTCTrackEvent) => {
          if (event.track.kind !== 'audio') return;

          const trackId = event.track.id || `audio-track-${Date.now()}`;
          const processors = (window as any).__audioCaptureProcessors as Record<string, any>;
          const contexts = (window as any).__audioCaptureContexts as Record<string, AudioContext>;
          // The processor is registered only after the asynchronous worklet load, whereas
          // the context is registered synchronously below. Checking both closes the window
          // in which a repeated 'track' event would start a second pipeline for this track.
          if (processors[trackId] || contexts[trackId]) {
            return;
          }

          // #region agent log
          console.log(
            `[AudioCapture][DIAG] Track received: id=${trackId}, enabled=${event.track.enabled}, muted=${event.track.muted}, readyState=${event.track.readyState}, label=${event.track.label}`
          );
          event.track.addEventListener('mute', () => {
            console.log(`[AudioCapture][DIAG] Track MUTED: id=${trackId}`);
          });
          event.track.addEventListener('unmute', () => {
            console.log(`[AudioCapture][DIAG] Track UNMUTED: id=${trackId}`);
          });
          // #endregion

          try {
            const AudioCtx = window.AudioContext || (window as any).webkitAudioContext;
            const ctx = new AudioCtx();
            const nativeRate = ctx.sampleRate;
            const stream = new MediaStream([event.track]);
            const source = ctx.createMediaStreamSource(stream);
            const targetRate = 16000;

            // #region agent log
            console.log(
              `[AudioCapture][DIAG] AudioContext: state=${ctx.state}, sampleRate=${nativeRate}, stream.active=${stream.active}, streamTracks=${stream.getAudioTracks().length}`
            );
            ctx.addEventListener('statechange', () => {
              console.log(`[AudioCapture][DIAG] AudioContext statechange: ${ctx.state} for track=${trackId}`);
            });
            // #endregion

            // The graph must reach the destination to be processed; zero gain keeps it inaudible.
            const silentGain = ctx.createGain();
            silentGain.gain.value = 0;

            const pushChunk = (base64Data: string, rms: number) => {
              const chunks = (window as any).__audioCaptureChunks as any[];
              if (chunks.length >= maxBufferedChunks) {
                droppedChunks++;
                // Throttled so a stalled poller does not flood the console.
                if (droppedChunks % 20 === 1) {
                  console.warn(`[AudioCapture] Chunk buffer full (${maxBufferedChunks}); dropped ${droppedChunks} chunk(s) so far`);
                }
                return;
              }
              chunks.push({
                data: base64Data,
                sampleRate: targetRate,
                captureDiagnostics: {
                  trackId,
                  readyState: event.track.readyState,
                  rms: Number(rms.toFixed(6)),
                  nativeSampleRate: nativeRate,
                },
              });
            };

            let workletNode: AudioWorkletNode | null = null;
            let scriptProcessor: ScriptProcessorNode | null = null;
            let trackEnded = false;

            // Disconnects the graph, closes the context and unregisters this track.
            // Safe to call more than once.
            const teardown = () => {
              try {
                if (workletNode) {
                  workletNode.disconnect();
                }
                if (scriptProcessor) {
                  scriptProcessor.disconnect();
                }
                source.disconnect();
                silentGain.disconnect();
              } catch { /* already disconnected */ }
              // close() reports failure through its promise, which try/catch cannot see.
              if (ctx.state !== 'closed') {
                ctx.close().catch(() => { /* already closed */ });
              }
              // Re-read the maps: stopCapture may have replaced them in the meantime, and
              // only entries that belong to this pipeline may be removed.
              const processorsObj = (window as any).__audioCaptureProcessors as Record<string, any>;
              const contextsObj = (window as any).__audioCaptureContexts as Record<string, AudioContext>;
              const registered = processorsObj[trackId];
              if (registered && (registered === workletNode || registered === scriptProcessor)) {
                delete processorsObj[trackId];
              }
              if (contextsObj[trackId] === ctx) {
                delete contextsObj[trackId];
              }
            };

            const useWorklet = async () => {
              try {
                const blob = new Blob([workletCode], { type: 'application/javascript' });
                const blobUrl = URL.createObjectURL(blob);
                try {
                  await ctx.audioWorklet.addModule(blobUrl);
                } finally {
                  // Release the URL whether or not the module loaded.
                  URL.revokeObjectURL(blobUrl);
                }

                workletNode = new AudioWorkletNode(ctx, 'audio-capture-processor', {
                  processorOptions: { nativeRate, targetRate },
                });

                workletNode.port.onmessage = (ev: MessageEvent) => {
                  if (ev.data?.type !== 'chunk' || !ev.data.data) return;
                  pushChunk(toBase64(ev.data.data), ev.data.rms || 0);
                };

                source.connect(workletNode);
                workletNode.connect(silentGain);
                silentGain.connect(ctx.destination);

                const processorsObj = (window as any).__audioCaptureProcessors as Record<string, any>;
                processorsObj[trackId] = workletNode;
                console.log(`[AudioCapture] WebRTC audio track intercepted (AudioWorklet): track=${trackId}, native=${nativeRate}Hz -> 16kHz mono`);
                return true;
              } catch (err) {
                console.warn(`[AudioCapture] AudioWorklet not available, falling back to ScriptProcessor: ${err}`);
                return false;
              }
            };

            const useScriptProcessor = () => {
              const minRmsThreshold = 0.0003;
              const maxSamplesPerChunk = nativeRate * 8;
              const preRollSamples = Math.ceil(nativeRate * 1.0);
              const minFlushSamples = Math.ceil(nativeRate * 0.5);
              const silenceFlushCallbacks = 6;
              const ratio = nativeRate / targetRate;

              scriptProcessor = ctx.createScriptProcessor(8192, 1, 1);
              let chunkBuffer: Float32Array[] = [];
              let samplesCollected = 0;
              let hasVoicedContent = false;
              let consecutiveSilentCallbacks = 0;

              scriptProcessor.onaudioprocess = (e: AudioProcessingEvent) => {
                const input = e.inputBuffer.getChannelData(0);
                let cbPower = 0;
                for (let i = 0; i < input.length; i++) {
                  cbPower += input[i] * input[i];
                }
                const cbRms = Math.sqrt(cbPower / Math.max(input.length, 1));

                if (cbRms >= minRmsThreshold) {
                  hasVoicedContent = true;
                  consecutiveSilentCallbacks = 0;
                } else {
                  consecutiveSilentCallbacks++;
                }

                chunkBuffer.push(new Float32Array(input));
                samplesCollected += input.length;

                // Flush on the 8 s cap, or once voiced audio is followed by enough silence.
                const shouldFlush = (
                  samplesCollected >= maxSamplesPerChunk
                  || (hasVoicedContent
                      && consecutiveSilentCallbacks >= silenceFlushCallbacks
                      && samplesCollected > minFlushSamples)
                );
                if (!shouldFlush) return;

                const merged = new Float32Array(samplesCollected);
                let offset = 0;
                for (const buf of chunkBuffer) {
                  merged.set(buf, offset);
                  offset += buf.length;
                }

                let powerSum = 0;
                for (let i = 0; i < merged.length; i++) {
                  powerSum += merged[i] * merged[i];
                }
                const rms = Math.sqrt(powerSum / Math.max(merged.length, 1));

                hasVoicedContent = false;
                consecutiveSilentCallbacks = 0;

                // Buffers that are silent overall are discarded rather than emitted.
                if (rms >= minRmsThreshold) {
                  const outLen = Math.floor(merged.length / ratio);
                  const pcm16 = new Int16Array(outLen);
                  for (let i = 0; i < outLen; i++) {
                    const srcIdx = Math.floor(i * ratio);
                    const s = Math.max(-1, Math.min(1, merged[srcIdx]));
                    pcm16[i] = Math.round(s * 32767);
                  }
                  pushChunk(toBase64(pcm16.buffer), rms);
                }

                // Emitted or not, keep the tail as pre-roll for the next chunk.
                const keep = Math.min(preRollSamples, merged.length);
                chunkBuffer = [merged.slice(merged.length - keep)];
                samplesCollected = keep;
              };

              source.connect(scriptProcessor);
              scriptProcessor.connect(silentGain);
              silentGain.connect(ctx.destination);

              const processorsObj = (window as any).__audioCaptureProcessors as Record<string, any>;
              processorsObj[trackId] = scriptProcessor;
              console.log(`[AudioCapture] WebRTC audio track intercepted (ScriptProcessor fallback): track=${trackId}, native=${nativeRate}Hz -> 16kHz mono`);
            };

            // Registered before the asynchronous setup starts so the duplicate guard above
            // already sees this track while the worklet module is still loading.
            contexts[trackId] = ctx;

            // Clean up when the track ends (peer leaves, renegotiation, etc.)
            event.track.addEventListener('ended', () => {
              trackEnded = true;
              teardown();
              console.log(`[AudioCapture] Audio track ended: track=${trackId}, resources cleaned up`);
            });

            void (async () => {
              try {
                const ok = await useWorklet();
                if (!ok) useScriptProcessor();
              } catch (err) {
                console.error(`[AudioCapture] Failed to start capture pipeline for track=${trackId}:`, err);
                teardown();
                return;
              }

              // The track may have ended while the worklet module was loading; in that case
              // the nodes registered just now must be released again.
              if (trackEnded) {
                teardown();
                return;
              }

              ctx.resume().catch(() => {});
            })();
          } catch (err) {
            console.error('[AudioCapture] Failed to set up audio capture:', err);
          }
        });

        return pc;
      } as any;

      // Copy static properties
      window.RTCPeerConnection.prototype = OrigRTC.prototype;
      Object.setPrototypeOf(window.RTCPeerConnection, OrigRTC);
    }, AUDIO_CAPTURE_WORKLET_CODE);

    this._injected = true;
    this._logger.info('[AudioCapture] RTCPeerConnection wrapper injected');
  }

  /**
   * Start polling for captured audio chunks and forwarding them to the callback.
   *
   * Polls every 500 ms. A tick is skipped while the previous poll is still waiting on
   * the page. Chunks are removed from the page before the callback runs, so a throwing
   * callback is logged per chunk and does not discard the chunks that follow it.
   */
  async startCapture(): Promise<void> {
    if (this._isCapturing) return;
    this._isCapturing = true;

    this._logger.info('[AudioCapture] Starting audio chunk polling...');

    // #region agent log
    let pollCount = 0;
    // #endregion
    this._pollInterval = setInterval(async () => {
      if (this._pollInFlight) return;
      this._pollInFlight = true;
      try {
        // #region agent log
        pollCount++;
        if (pollCount % 60 === 1) {
          const diagInfo = await this._page.evaluate(() => {
            const pcs = (window as any).__audioCapturePeerConnections as RTCPeerConnection[] || [];
            const procs = (window as any).__audioCaptureProcessors as Record<string, any> || {};
            const ctxs = (window as any).__audioCaptureContexts as Record<string, AudioContext> || {};
            const procKeys = Object.keys(procs);
            const ctxStates = Object.entries(ctxs).map(([k, c]) => `${k}:${c.state}`);
            return {
              peerConnections: pcs.length,
              pcStates: pcs.map((p: RTCPeerConnection) => p.connectionState || 'unknown'),
              processors: procKeys.length,
              processorTrackIds: procKeys,
              audioContextStates: ctxStates,
            };
          });
          this._logger.info(`[AudioCapture][DIAG] Periodic: ${JSON.stringify(diagInfo)}`);
        }
        // #endregion
        const chunks = await this._page.evaluate(() => {
          // The buffer is missing when the init script has not run in this document.
          const buf = (window as any).__audioCaptureChunks as CapturedAudioChunk[] | undefined;
          return buf ? buf.splice(0, buf.length) : [];
        });

        for (const chunk of chunks) {
          try {
            this._onAudioChunk(
              chunk.data,
              chunk.sampleRate || 16000,
              chunk.captureDiagnostics
            );
          } catch (err) {
            const reason = err instanceof Error ? err.message : String(err);
            this._logger.error(`[AudioCapture] onAudioChunk callback failed: ${reason}`);
          }
        }
      } catch (err) {
        // Page might be navigating or closed; keep polling, but leave a trace.
        const reason = err instanceof Error ? err.message : String(err);
        this._logger.debug(`[AudioCapture] Poll skipped: ${reason}`);
      } finally {
        this._pollInFlight = false;
      }
    }, 500);
  }

  /**
   * Stop capturing audio: cancels polling, then disconnects every registered processor
   * and closes every registered AudioContext in the page. Failures while talking to the
   * page are ignored because the page may already be closed.
   */
  async stopCapture(): Promise<void> {
    this._isCapturing = false;

    if (this._pollInterval) {
      clearInterval(this._pollInterval);
      this._pollInterval = null;
    }

    try {
      await this._page.evaluate(() => {
        const processors = (window as any).__audioCaptureProcessors as Record<string, any>;
        const contexts = (window as any).__audioCaptureContexts as Record<string, AudioContext>;
        Object.keys(processors || {}).forEach((trackId) => {
          try {
            processors[trackId]?.disconnect();
          } catch {
            // ignore
          }
        });
        Object.keys(contexts || {}).forEach((trackId) => {
          try {
            // close() rejects asynchronously on an already-closed context.
            void contexts[trackId]?.close().catch(() => { /* already closed */ });
          } catch {
            // ignore
          }
        });
        (window as any).__audioCaptureProcessors = {};
        (window as any).__audioCaptureContexts = {};
      });
    } catch {
      // Page might already be closed
    }

    this._logger.info('[AudioCapture] Audio capture stopped');
  }
}