From 5f7dc60376e3f4d91858577f2cdd29cc207dbecf Mon Sep 17 00:00:00 2001
From: ValueOn AG <p.motsch@valueon.ch>
Date: Fri, 27 Feb 2026 12:51:24 +0100
Subject: [PATCH] Voice: restore sender.replaceTrack for TTS injection, remove
 broken iframe logic

Made-with: Cursor
---
 src/bot/audioProcedure.ts | 165 ++++++++++----------------------------
 1 file changed, 42 insertions(+), 123 deletions(-)
diff --git a/src/bot/audioProcedure.ts b/src/bot/audioProcedure.ts
index a89dacc..25faafd 100644
--- a/src/bot/audioProcedure.ts
+++ b/src/bot/audioProcedure.ts
@@ -1,4 +1,4 @@
-import { Frame, Page } from 'playwright';
+import { Page } from 'playwright';
 import { Logger } from 'winston';
 
 /**
@@ -86,47 +86,36 @@ export class AudioProcedure {
         return realStream;
       };
 
+      // Point every RTCPeerConnection sender that currently carries an audio track at our TTS track
+      // (first audio track of streamDest), intended so Teams sends our audio even if the getUserMedia
+      // override ran in a different context or was renegotiated. Resolves to { replaced, pcs, reason }.
+      (window as any).__forceTtsTrackToSenders = async () => {
+        const pcs = ((window as any).__audioCapturePeerConnections || []) as RTCPeerConnection[];
+        const ttsTrack = streamDest.stream.getAudioTracks()?.[0];
+        if (!ttsTrack) return { replaced: 0, pcs: pcs?.length || 0, reason: 'no-tts-track' };
+
+        let replaced = 0;
+        for (const pc of pcs) {
+          try {
+            const senders = pc.getSenders?.() || [];
+            for (const sender of senders) {
+              if (sender?.track?.kind === 'audio') {
+                await sender.replaceTrack(ttsTrack);
+                replaced++;
+              }
+            }
+          } catch {
+            // Best-effort: an error skips the rest of this connection's senders; other connections still run.
+          }
+        }
+        return { replaced, pcs: pcs?.length || 0, reason: 'ok' };
+      };
     });
 
     this._initScriptInjected = true;
     this._logger.info('Audio getUserMedia override injected');
   }
 
-  /**
-   * Find the frame whose MediaStreamDestination track is used by the RTCPeerConnection.
-   * Teams meeting often runs in an iframe; page.evaluate runs in main frame, so we'd
-   * play into the wrong streamDest. Returns the frame to use, or null for main page.
-   */
-  private async _getTtsFrame(): Promise<Frame | null> {
-    const frames = this._page.frames();
-    for (const frame of frames) {
-      try {
-        const match = await frame.evaluate(() => {
-          const pcs = ((window as any).__audioCapturePeerConnections || []) as RTCPeerConnection[];
-          const streamDest = (window as any).__ttsStreamDest as MediaStreamAudioDestinationNode;
-          if (!pcs.length || !streamDest) return false;
-          const ttsTrackId = streamDest.stream.getAudioTracks()[0]?.id;
-          if (!ttsTrackId) return false;
-          for (const pc of pcs) {
-            const senders = pc.getSenders?.() || [];
-            for (const s of senders) {
-              if (s?.track?.kind === 'audio' && s.track.id === ttsTrackId) return true;
-            }
-          }
-          return false;
-        });
-        if (match) {
-          this._logger.info(`[Voice] Using frame for TTS (track match): ${frame.url().substring(0, 80)}`);
-          return frame;
-        }
-      } catch {
-        // Frame may be detached
-      }
-    }
-    this._logger.warn('[Voice] No frame with matching TTS track; using main page (voice may not reach participants)');
-    return null;
-  }
-
   /**
    * Initialize the audio context in the browser for TTS playback.
    * Must be called after joining the meeting (user gesture context).
@@ -233,8 +222,8 @@ export class AudioProcedure {
   /**
    * Internal: Play audio in the browser (single clip, no queuing).
    * Audio is piped into the MediaStreamDestination that Teams uses as mic input.
-   * Teams meeting may run in an iframe; we must play in the frame that has the
-   * RTCPeerConnection (otherwise trackMatch=false, voice does not reach participants).
+   * Before playback, forces all WebRTC audio senders to use the TTS track
+   * (sender.replaceTrack) so Teams transmits our audio to participants.
    */
   private async _playAudioInternal(audioData: string, format: 'mp3' | 'wav' | 'pcm'): Promise<void> {
     if (!this._audioContext) {
@@ -243,66 +232,31 @@ export class AudioProcedure {
 
     this._logger.info(`Playing audio (format: ${format}, size: ${audioData.length} bytes base64)`);
 
-    const targetFrame = await this._getTtsFrame();
-    const evalTarget = targetFrame || this._page;
-
     try {
-      const playbackDiag = await evalTarget.evaluate(async ({ audioData, format }) => {
+      // Force all outgoing audio senders to use the TTS track
+      const senderInjectInfo = await this._page.evaluate(async () => {
+        const forceFn = (window as any).__forceTtsTrackToSenders;
+        if (typeof forceFn === 'function') {
+          return await forceFn();
+        }
+        return { replaced: 0, pcs: 0, reason: 'force-function-missing' };
+      });
+      this._logger.info(
+        `TTS sender injection: replaced=${senderInjectInfo?.replaced ?? 0}, pcs=${senderInjectInfo?.pcs ?? 0}, reason=${senderInjectInfo?.reason || 'n/a'}`,
+      );
+
+      await this._page.evaluate(async ({ audioData, format }) => {
         const ctx = (window as any).__ttsAudioContext as AudioContext;
         const streamDest = (window as any).__ttsStreamDest as MediaStreamAudioDestinationNode;
-        const pcs = ((window as any).__audioCapturePeerConnections || []) as RTCPeerConnection[];
 
         if (!ctx || !streamDest) {
           throw new Error('Audio context not initialized');
         }
 
-        const collectWebRtcAudioStats = async () => {
-          let senderCount = 0;
-          let bytesSentTotal = 0;
-          let packetsSentTotal = 0;
-          const tracks: Array<Record<string, any>> = [];
-
-          for (const pc of pcs) {
-            const senders = pc.getSenders?.() || [];
-            for (const sender of senders) {
-              if (!sender?.track || sender.track.kind !== 'audio') continue;
-              senderCount++;
-              tracks.push({
-                id: sender.track.id,
-                label: sender.track.label,
-                enabled: sender.track.enabled,
-                muted: sender.track.muted,
-                readyState: sender.track.readyState,
-              });
-              try {
-                const stats = await sender.getStats();
-                stats.forEach((report) => {
-                  if (report.type === 'outbound-rtp' && (report as any).kind === 'audio') {
-                    bytesSentTotal += Number((report as any).bytesSent || 0);
-                    packetsSentTotal += Number((report as any).packetsSent || 0);
-                  }
-                });
-              } catch {
-                // ignore stats errors per sender
-              }
-            }
-          }
-
-          return {
-            pcs: pcs.length,
-            senderCount,
-            bytesSentTotal,
-            packetsSentTotal,
-            tracks,
-          };
-        };
-
-        // Resume context if suspended
         if (ctx.state === 'suspended') {
           await ctx.resume();
         }
 
-        // Decode base64 to ArrayBuffer
         const binaryString = atob(audioData);
         const bytes = new Uint8Array(binaryString.length);
         for (let i = 0; i < binaryString.length; i++) {
@@ -312,7 +266,6 @@ export class AudioProcedure {
         let audioBuffer: AudioBuffer;
 
         if (format === 'pcm') {
-          // PCM: Assume 16-bit mono 16kHz
           const pcmData = new Int16Array(bytes.buffer);
           audioBuffer = ctx.createBuffer(1, pcmData.length, 16000);
           const channelData = audioBuffer.getChannelData(0);
@@ -320,60 +273,26 @@ export class AudioProcedure {
             channelData[i] = pcmData[i] / 32768;
           }
         } else {
-          // MP3/WAV: Use decodeAudioData (slice to avoid detached buffer)
           audioBuffer = await ctx.decodeAudioData(bytes.buffer.slice(0));
         }
 
-        const before = await collectWebRtcAudioStats();
-
-        // Hypothesis B: verify TTS track matches PC sender track
-        const ttsTrack = streamDest.stream.getAudioTracks()[0];
-        const ttsTrackId = ttsTrack?.id || null;
-        const senderTrackIds: string[] = [];
-        for (const pc of pcs) {
-          const senders = pc.getSenders?.() || [];
-          for (const s of senders) {
-            if (s?.track?.kind === 'audio') senderTrackIds.push(s.track.id);
-          }
-        }
-        const trackMatch = ttsTrackId && senderTrackIds.includes(ttsTrackId);
-
-        // Play through the MediaStreamDestination -> Teams mic input
         const source = ctx.createBufferSource();
         source.buffer = audioBuffer;
         source.connect(streamDest);
         source.start(0);
 
-        return new Promise((resolve) => {
+        return new Promise<void>((resolve) => {
           source.onended = () => {
             try {
               source.disconnect();
             } catch {
               // already disconnected
             }
-            resolve(null);
-          };
-        }).then(async () => {
-          const after = await collectWebRtcAudioStats();
-          return {
-            before,
-            after,
-            deltaBytes: after.bytesSentTotal - before.bytesSentTotal,
-            deltaPackets: after.packetsSentTotal - before.packetsSentTotal,
-            ttsTrackId,
-            senderTrackIds,
-            trackMatch,
+            resolve();
           };
         });
       }, { audioData, format });
 
-      this._logger.info(
-        `[Voice] TTS track vs PC: ttsTrackId=${playbackDiag?.ttsTrackId ?? 'n/a'} senderTrackIds=[${(playbackDiag?.senderTrackIds ?? []).join(',')}] trackMatch=${playbackDiag?.trackMatch ?? false}`,
-      );
-      this._logger.info(
-        `TTS WebRTC diagnostics: pcs=${playbackDiag?.after?.pcs ?? 0}, senders=${playbackDiag?.after?.senderCount ?? 0}, ` +
-        `deltaBytes=${playbackDiag?.deltaBytes ?? 0}, deltaPackets=${playbackDiag?.deltaPackets ?? 0}`,
-      );
       this._logger.info('Audio playback completed');
     } catch (error) {
       this._logger.error('Error playing audio:', error);