fix: restore fake-device flag, wrap getUserMedia to swap audio track, handle no-audio modal

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-15 22:40:48 +01:00 · 2026-02-15 22:40:48 +01:00 · 39c8012358
commit 39c8012358
parent bd63dfc40a
3 changed files with 72 additions and 31 deletions
--- a/src/bot/audioProcedure.ts
+++ b/src/bot/audioProcedure.ts
@ -5,11 +5,15 @@ import { Logger } from 'winston';
 * Handles audio playback in the Teams meeting.
 * 
 * Architecture:
- * - Before any page loads, we inject an init script that overrides getUserMedia
- *   to return a MediaStream from a MediaStreamDestination we control.
- * - When Teams calls getUserMedia({audio: true}), it gets our custom stream.
- * - When TTS audio is played, it's piped into the same MediaStreamDestination,
- *   so Teams picks it up as microphone input and sends it via WebRTC.
+ * - Browser launches with --use-fake-device-for-media-stream so Teams sees
+ *   real-looking devices (no "no audio/video" modal).
+ * - Before any page loads, we inject an init script that wraps getUserMedia.
+ * - When Teams calls getUserMedia, the wrapper:
+ *   1. Calls the REAL getUserMedia (which returns Chromium's fake device stream)
+ *   2. If audio was requested, swaps the audio track for one from our MediaStreamDestination
+ *   3. Returns the result (our audio + Chromium's fake video), or the real stream unchanged if no audio was requested
+ * - When TTS audio is played, it's piped into the MediaStreamDestination,
+ *   and Teams sends it via WebRTC to other meeting participants.
 */
 export class AudioProcedure {
  private _page: Page;
@ -23,7 +27,7 @@ export class AudioProcedure {
  }

  /**
-   * Inject the getUserMedia override BEFORE any page navigation.
+   * Inject the getUserMedia wrapper BEFORE any page navigation.
   * This MUST be called before navigating to Teams.
   * Uses page.addInitScript so it runs in every new document context.
   */
@ -35,8 +39,7 @@ export class AudioProcedure {
    this._logger.info('Injecting audio getUserMedia override...');

    await this._page.addInitScript(() => {
-      // Create a shared AudioContext and MediaStreamDestination
-      // These persist across the page lifetime
+      // Create a shared AudioContext and MediaStreamDestination for TTS injection
      const AudioContextClass = window.AudioContext || (window as any).webkitAudioContext;
      const ctx = new AudioContextClass();
      const streamDest = ctx.createMediaStreamDestination();
@ -46,29 +49,27 @@ export class AudioProcedure {
      (window as any).__ttsStreamDest = streamDest;
      (window as any).__ttsAudioStream = streamDest.stream;

-      // Override getUserMedia to return our controlled stream for audio requests
+      // Wrap getUserMedia to replace audio tracks with our TTS-injectable stream
      const originalGetUserMedia = navigator.mediaDevices.getUserMedia.bind(navigator.mediaDevices);
      navigator.mediaDevices.getUserMedia = async (constraints?: MediaStreamConstraints) => {
+        // Get the real stream (from Chromium's fake devices)
+        const realStream = await originalGetUserMedia(constraints);
+
        if (constraints && constraints.audio) {
-          // Return our TTS-injectable audio stream
-          // If video is also requested, combine our audio with real/fake video
-          if (constraints.video) {
-            try {
-              const videoStream = await originalGetUserMedia({ video: constraints.video });
+          // Build a new stream: our TTS audio track + their video tracks
          const combinedStream = new MediaStream();
-              // Add our audio track
+
+          // Add our controlled audio track (TTS will be piped here)
          streamDest.stream.getAudioTracks().forEach(t => combinedStream.addTrack(t));
-              // Add their video track
-              videoStream.getVideoTracks().forEach(t => combinedStream.addTrack(t));
+
+          // Keep the real video tracks (from fake camera)
+          realStream.getVideoTracks().forEach(t => combinedStream.addTrack(t));
+
          return combinedStream;
-            } catch {
-              // If video fails, just return audio
-              return streamDest.stream;
        }
-          }
-          return streamDest.stream;
-        }
-        return originalGetUserMedia(constraints);
+
+        // No audio requested - return the real stream as-is
+        return realStream;
      };
    });

@ -157,7 +158,7 @@ export class AudioProcedure {
            channelData[i] = pcmData[i] / 32768;
          }
        } else {
-          // MP3/WAV: Use decodeAudioData
+          // MP3/WAV: Use decodeAudioData (slice to avoid detached buffer)
          audioBuffer = await ctx.decodeAudioData(bytes.buffer.slice(0));
        }

--- a/src/bot/joinProcedure.ts
+++ b/src/bot/joinProcedure.ts
@ -174,10 +174,17 @@ export class JoinProcedure {
  /**
   * Click the "Join now" button.
   * Primary selector: button:has-text("Join now") (confirmed by Recall.ai).
+   * 
+   * IMPORTANT: Teams may show a "no audio/video" modal that blocks the Join button.
+   * This happens when getUserMedia doesn't return real-looking devices.
+   * We dismiss it before clicking Join and check for it again shortly after the click.
   */
  private async _clickJoinNow(): Promise<void> {
    this._logger.info('Clicking Join now...');

+    // First, dismiss any "no audio/video" modal that may be blocking
+    await this._dismissNoAudioVideoModal();
+
    // Primary selector - confirmed working by Recall.ai (Jan 2025)
    const primarySelector = 'button:has-text("Join now")';

@ -185,6 +192,10 @@ export class JoinProcedure {
      await this._page.waitForSelector(primarySelector, { timeout: 15000 });
      await this._page.click(primarySelector);
      this._logger.info('Clicked "Join now" button');
+
+      // After clicking Join, Teams may show the modal again. Dismiss if present.
+      await this._page.waitForTimeout(2000);
+      await this._dismissNoAudioVideoModal();
      return;
    } catch {
      this._logger.info('Primary join button selector not found, trying fallbacks...');
@ -204,6 +215,8 @@ export class JoinProcedure {
        if (button) {
          await button.click();
          this._logger.info(`Clicked join button (fallback: ${selector})`);
+          await this._page.waitForTimeout(2000);
+          await this._dismissNoAudioVideoModal();
          return;
        }
      } catch {
@ -223,6 +236,35 @@ export class JoinProcedure {
    );
  }

+  /**
+   * Dismiss the "Are you sure you don't want audio or video?" modal, which Teams shows when it can't access camera/mic devices.
+   * Tries English and German button labels (full label first, then shorter partial labels) and clicks the first match.
+   * Best-effort: resolves without error if no button is found or if a lookup/click throws.
+   */
+  private async _dismissNoAudioVideoModal(): Promise<void> {
+    const modalSelectors = [
+      'button:has-text("Continue without audio or video")',
+      'button:has-text("Ohne Audio oder Video fortfahren")',
+      'button:has-text("Continue without")',
+      'button:has-text("Ohne Audio")',
+    ];
+
+    for (const selector of modalSelectors) {
+      try {
+        const button = await this._page.$(selector);
+        if (button) {
+          await button.click();
+          this._logger.info(`Dismissed no-audio modal: ${selector}`);
+          await this._page.waitForTimeout(1000); // presumably gives the modal time to close
+          return;
+        }
+      } catch {
+        // Lookup or click failed for this selector - ignore it and try the next one
+      }
+    }
+    // Nothing was clicked - presumably no modal is showing because devices were detected
+  }
+
  /**
   * Check if the bot is currently in the lobby (waiting to be admitted).
   * Primary check: text "Someone will let you in shortly" (confirmed by Recall.ai).
--- a/src/bot/orchestrator.ts
+++ b/src/bot/orchestrator.ts
@ -437,9 +437,7 @@ export class BotOrchestrator {
      headless: config.botHeadless,
      args: [
        '--use-fake-ui-for-media-stream', // Auto-accept media permissions
-        // NOTE: --use-fake-device-for-media-stream is intentionally NOT used.
-        // We override getUserMedia via addInitScript to return a MediaStreamDestination
-        // that we control, so TTS audio can be injected into Teams' mic input.
+        '--use-fake-device-for-media-stream', // Provide fake camera/mic so Teams sees devices
        '--disable-web-security',
        '--disable-features=IsolateOrigins,site-per-process',
        '--autoplay-policy=no-user-gesture-required',