From 533e976039155bbafa46baf6343bbe3590ad6c63 Mon Sep 17 00:00:00 2001
From: ValueOn AG <p.motsch@valueon.ch>
Date: Fri, 27 Feb 2026 12:27:02 +0100
Subject: [PATCH] Voice: play TTS in correct frame (iframe), add mic toggle
 fallbacks

Made-with: Cursor
---
 src/bot/audioProcedure.ts | 42 ++++++++++++++++++++++++++++++++++++++-
 src/bot/orchestrator.ts   | 35 +++++++++++++++++++++++++-------
 2 files changed, 69 insertions(+), 8 deletions(-)
diff --git a/src/bot/audioProcedure.ts b/src/bot/audioProcedure.ts
index 37a900c..0b9ce6c 100644
--- a/src/bot/audioProcedure.ts
+++ b/src/bot/audioProcedure.ts
@@ -92,6 +92,41 @@ export class AudioProcedure {
     this._logger.info('Audio getUserMedia override injected');
   }
 
+  /**
+   * Find the frame whose `__ttsStreamDest` audio track is sent by one of its RTCPeerConnections.
+   * Teams often runs the meeting in an iframe while page.evaluate runs in the main frame, so playing
+   * there would feed the wrong streamDest. Returns the first matching frame, or null if none matches.
+   */
+  private async _getTtsFrame(): Promise<{ evaluate: typeof this._page.evaluate } | null> {
+    const frames = this._page.frames();
+    for (const frame of frames) { // probed one at a time, in the order frames() returns them
+      try {
+        const match = await frame.evaluate(() => { // callback reads this frame's own `window` globals
+          const pcs = ((window as any).__audioCapturePeerConnections || []) as RTCPeerConnection[];
+          const streamDest = (window as any).__ttsStreamDest as MediaStreamAudioDestinationNode;
+          if (!pcs.length || !streamDest) return false; // no recorded peer connections or no TTS destination here
+          const ttsTrackId = streamDest.stream.getAudioTracks()[0]?.id;
+          if (!ttsTrackId) return false; // destination stream exposes no audio track to compare against
+          for (const pc of pcs) {
+            const senders = pc.getSenders?.() || []; // tolerate entries that lack getSenders
+            for (const s of senders) {
+              if (s?.track?.kind === 'audio' && s.track.id === ttsTrackId) return true; // TTS track is being sent
+            }
+          }
+          return false;
+        });
+        if (match) {
+          this._logger.info(`[Voice] Using frame for TTS (track match): ${frame.url().substring(0, 80)}`); // URL cut to 80 chars
+          return frame;
+        }
+      } catch {
+        // Best effort: evaluate may throw (e.g. the frame was detached); ignore it and try the next frame
+      }
+    }
+    this._logger.warn('[Voice] No frame with matching TTS track; using main page (voice may not reach participants)');
+    return null; // no match found; the warning above is the only signal to the operator
+  }
+
   /**
    * Initialize the audio context in the browser for TTS playback.
    * Must be called after joining the meeting (user gesture context).
@@ -198,6 +233,8 @@ export class AudioProcedure {
   /**
    * Internal: Play audio in the browser (single clip, no queuing).
    * Audio is piped into the MediaStreamDestination that Teams uses as mic input.
+   * Teams meeting may run in an iframe; we must play in the frame that has the
+   * RTCPeerConnection (otherwise trackMatch=false, voice does not reach participants).
    */
   private async _playAudioInternal(audioData: string, format: 'mp3' | 'wav' | 'pcm'): Promise<void> {
     if (!this._audioContext) {
@@ -206,8 +243,11 @@ export class AudioProcedure {
 
     this._logger.info(`Playing audio (format: ${format}, size: ${audioData.length} bytes base64)`);
 
+    const targetFrame = await this._getTtsFrame();
+    const evalTarget = targetFrame || this._page;
+
     try {
-      const playbackDiag = await this._page.evaluate(async ({ audioData, format }) => {
+      const playbackDiag = await evalTarget.evaluate(async ({ audioData, format }) => {
         const ctx = (window as any).__ttsAudioContext as AudioContext;
         const streamDest = (window as any).__ttsStreamDest as MediaStreamAudioDestinationNode;
         const pcs = ((window as any).__audioCapturePeerConnections || []) as RTCPeerConnection[];
diff --git a/src/bot/orchestrator.ts b/src/bot/orchestrator.ts
index 3b44f20..39bca99 100644
--- a/src/bot/orchestrator.ts
+++ b/src/bot/orchestrator.ts
@@ -493,6 +493,12 @@ export class BotOrchestrator {
       const micToggle = await this._pollForElement([
         'input[data-tid="toggle-audio"]',
         '[data-tid="toggle-audio"]',
+        'input[data-tid="toggle-mute"]',
+        '[data-tid="toggle-mute"]',
+        'button[id="microphone-button"]',
+        'button[data-inp="microphone-button"]',
+        'button[aria-label*="microphone" i]',
+        'button[aria-label*="Mikrofon" i]',
         'input[role="switch"][title*="microphone" i]',
         'input[role="switch"][title*="Mikrofon" i]',
         'input[role="switch"][title*="mic" i]',
@@ -501,14 +507,29 @@ export class BotOrchestrator {
 
       if (!micToggle) return;
 
-      const state = await micToggle.evaluate((el: HTMLInputElement) => ({
-        checked: el.checked,
-        dataCid: el.getAttribute('data-cid') || '',
-        title: el.getAttribute('title') || '',
-      }));
-      this._logger.info(`Mic state: checked=${state.checked}, data-cid="${state.dataCid}", title="${state.title}"`);
+      const state = await micToggle.evaluate((el: HTMLElement) => {
+        const input = el as HTMLInputElement;
+        const label = (el.getAttribute('aria-label') || el.getAttribute('title') || '').toLowerCase();
+        const isInput = el.tagName === 'INPUT';
+        const checked = isInput ? input.checked : undefined;
+        const looksMuted = !isInput && (
+          label.includes('unmute') || label.includes('einschalten') ||
+          label.includes('turn on') || label.includes('turn microphone on')
+        );
+        return {
+          checked,
+          dataCid: el.getAttribute('data-cid') || '',
+          title: el.getAttribute('title') || '',
+          tagName: el.tagName,
+          looksMuted,
+        };
+      });
+      this._logger.info(`Mic state: checked=${state.checked}, data-cid="${state.dataCid}", tag=${state.tagName}, looksMuted=${state.looksMuted}`);
 
-      if (!state.checked) {
+      const needsClick = state.tagName === 'INPUT'
+        ? !state.checked
+        : (state.looksMuted === true);
+      if (needsClick) {
         await micToggle.click();
         this._logger.info('Mic toggled ON');
       } else {