fix: auth join detection, caption language dropdown, audio injection via getUserMedia override

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-15 22:28:51 +01:00 · 2026-02-15 22:28:51 +01:00 · bd63dfc40a
commit bd63dfc40a
parent 13bf75bea7
4 changed files with 242 additions and 47 deletions
--- a/src/bot/audioProcedure.ts
+++ b/src/bot/audioProcedure.ts
@ -3,12 +3,19 @@ import { Logger } from 'winston';

 /**
 * Handles audio playback in the Teams meeting.
- * Injects TTS audio into the browser to be played through the meeting.
+ * 
+ * Architecture:
+ * - Before any page loads, we inject an init script that overrides getUserMedia
+ *   to return a MediaStream from a MediaStreamDestination we control.
+ * - When Teams calls getUserMedia({audio: true}), it gets our custom stream.
+ * - When TTS audio is played, it's piped into the same MediaStreamDestination,
+ *   so Teams picks it up as microphone input and sends it via WebRTC.
 */
 export class AudioProcedure {
  private _page: Page;
  private _logger: Logger;
  private _audioContext: boolean = false;
+  private _initScriptInjected: boolean = false;

  constructor(page: Page, logger: Logger) {
    this._page = page;
@ -16,8 +23,62 @@ export class AudioProcedure {
  }

  /**
-   * Initialize the audio context in the browser.
-   * Must be called after user interaction (joining meeting counts).
+   * Inject the getUserMedia override BEFORE any page navigation.
+   * This MUST be called before navigating to Teams.
+   * Uses page.addInitScript so it runs in every new document context.
+   */
+  async injectAudioOverride(): Promise<void> {
+    if (this._initScriptInjected) {
+      return;
+    }
+
+    this._logger.info('Injecting audio getUserMedia override...');
+
+    await this._page.addInitScript(() => {
+      // Create a shared AudioContext and MediaStreamDestination
+      // These persist across the page lifetime
+      const AudioContextClass = window.AudioContext || (window as any).webkitAudioContext;
+      const ctx = new AudioContextClass();
+      const streamDest = ctx.createMediaStreamDestination();
+
+      // Store globally for later TTS injection
+      (window as any).__ttsAudioContext = ctx;
+      (window as any).__ttsStreamDest = streamDest;
+      (window as any).__ttsAudioStream = streamDest.stream;
+
+      // Override getUserMedia to return our controlled stream for audio requests
+      const originalGetUserMedia = navigator.mediaDevices.getUserMedia.bind(navigator.mediaDevices);
+      navigator.mediaDevices.getUserMedia = async (constraints?: MediaStreamConstraints) => {
+        if (constraints && constraints.audio) {
+          // Return our TTS-injectable audio stream
+          // If video is also requested, combine our audio with real/fake video
+          if (constraints.video) {
+            try {
+              const videoStream = await originalGetUserMedia({ video: constraints.video });
+              const combinedStream = new MediaStream();
+              // Add our audio track
+              streamDest.stream.getAudioTracks().forEach(t => combinedStream.addTrack(t));
+              // Add their video track
+              videoStream.getVideoTracks().forEach(t => combinedStream.addTrack(t));
+              return combinedStream;
+            } catch {
+              // If video fails, just return audio
+              return streamDest.stream;
+            }
+          }
+          return streamDest.stream;
+        }
+        return originalGetUserMedia(constraints);
+      };
+    });
+
+    this._initScriptInjected = true;
+    this._logger.info('Audio getUserMedia override injected');
+  }
+
+  /**
+   * Initialize the audio context in the browser for TTS playback.
+   * Must be called after joining the meeting (user gesture context).
   */
  async initialize(): Promise<void> {
    if (this._audioContext) {
@ -27,30 +88,23 @@ export class AudioProcedure {
    this._logger.info('Initializing audio context...');

    await this._page.evaluate(() => {
-      // Create a global audio context
-      const AudioContext = window.AudioContext || (window as any).webkitAudioContext;
-      const ctx = new AudioContext();
-      (window as any).__audioContext = ctx;
-      (window as any).__audioQueue = [];
-      (window as any).__isPlaying = false;
+      // The __ttsAudioContext was created by the init script.
+      // Resume it now (requires user gesture - joining meeting counts).
+      const ctx = (window as any).__ttsAudioContext as AudioContext;
+      if (ctx && ctx.state === 'suspended') {
+        ctx.resume();
+      }

-      // Create a MediaStream destination so audio is routed into the
-      // browser's virtual microphone (picked up by Teams) instead of
-      // the default speaker output (ctx.destination).
-      const streamDest = ctx.createMediaStreamDestination();
-      (window as any).__audioStreamDest = streamDest;
-
-      // Expose the stream so headless Chromium can pipe it as mic input.
-      // navigator.mediaDevices.getUserMedia will be overridden to return this stream.
-      const audioStream = streamDest.stream;
-      const originalGetUserMedia = navigator.mediaDevices.getUserMedia.bind(navigator.mediaDevices);
-      navigator.mediaDevices.getUserMedia = async (constraints?: MediaStreamConstraints) => {
-        // If requesting audio only, return our TTS stream
-        if (constraints && constraints.audio && !constraints.video) {
-          return audioStream;
-        }
-        return originalGetUserMedia(constraints);
-      };
+      // If init script didn't run (e.g. page navigated before injection),
+      // create fallback audio infrastructure
+      if (!ctx) {
+        const AudioContextClass = window.AudioContext || (window as any).webkitAudioContext;
+        const newCtx = new AudioContextClass();
+        const streamDest = newCtx.createMediaStreamDestination();
+        (window as any).__ttsAudioContext = newCtx;
+        (window as any).__ttsStreamDest = streamDest;
+        (window as any).__ttsAudioStream = streamDest.stream;
+      }
    });

    this._audioContext = true;
@ -59,7 +113,7 @@ export class AudioProcedure {

  /**
   * Play audio in the browser.
-   * The audio will be heard by other meeting participants.
+   * Audio is piped into the MediaStreamDestination that Teams uses as mic input.
   * 
   * @param audioData Base64 encoded audio data
   * @param format Audio format (mp3, wav, pcm)
@ -73,7 +127,12 @@ export class AudioProcedure {

    try {
      await this._page.evaluate(async ({ audioData, format }) => {
-        const ctx = (window as any).__audioContext as AudioContext;
+        const ctx = (window as any).__ttsAudioContext as AudioContext;
+        const streamDest = (window as any).__ttsStreamDest as MediaStreamAudioDestinationNode;
+
+        if (!ctx || !streamDest) {
+          throw new Error('Audio context not initialized');
+        }

        // Resume context if suspended
        if (ctx.state === 'suspended') {
@ -95,22 +154,19 @@ export class AudioProcedure {
          audioBuffer = ctx.createBuffer(1, pcmData.length, 16000);
          const channelData = audioBuffer.getChannelData(0);
          for (let i = 0; i < pcmData.length; i++) {
-            channelData[i] = pcmData[i] / 32768; // Convert to float
+            channelData[i] = pcmData[i] / 32768;
          }
        } else {
          // MP3/WAV: Use decodeAudioData
-          audioBuffer = await ctx.decodeAudioData(bytes.buffer);
+          audioBuffer = await ctx.decodeAudioData(bytes.buffer.slice(0));
        }

-        // Create source and play through the MediaStream destination
-        // so audio is routed into the Teams microphone input, not speakers
+        // Play through the MediaStreamDestination -> Teams mic input
        const source = ctx.createBufferSource();
        source.buffer = audioBuffer;
-        const streamDest = (window as any).__audioStreamDest as MediaStreamAudioDestinationNode;
-        source.connect(streamDest || ctx.destination);
+        source.connect(streamDest);
        source.start(0);

-        // Return a promise that resolves when playback ends
        return new Promise<void>((resolve) => {
          source.onended = () => resolve();
        });
@ -129,7 +185,7 @@ export class AudioProcedure {
  async stopAudio(): Promise<void> {
    try {
      await this._page.evaluate(() => {
-        const ctx = (window as any).__audioContext as AudioContext;
+        const ctx = (window as any).__ttsAudioContext as AudioContext;
        if (ctx) {
          ctx.suspend();
        }
@ -145,7 +201,7 @@ export class AudioProcedure {
  async cleanup(): Promise<void> {
    try {
      await this._page.evaluate(() => {
-        const ctx = (window as any).__audioContext as AudioContext;
+        const ctx = (window as any).__ttsAudioContext as AudioContext;
        if (ctx) {
          ctx.close();
        }
--- a/src/bot/captionsProcedure.ts
+++ b/src/bot/captionsProcedure.ts
@ -332,6 +332,32 @@ export class CaptionsProcedure {

      // Look for the spoken language dropdown/combobox
      let languageSet = false;
+
+      // First, log what's visible in the settings panel for debugging
+      const panelInfo = await this._page.evaluate(() => {
+        const selects = document.querySelectorAll('select');
+        const comboboxes = document.querySelectorAll('[role="combobox"]');
+        const listboxes = document.querySelectorAll('[role="listbox"]');
+        const dropdowns = document.querySelectorAll('[class*="dropdown" i], [class*="Dropdown" i]');
+        const allButtons = document.querySelectorAll('button');
+        const buttonsWithText = Array.from(allButtons)
+          .map(b => `${b.tagName}[${b.getAttribute('aria-label') || b.textContent?.trim().substring(0, 40)}]`)
+          .filter(t => t.length > 10)
+          .slice(0, 10);
+        return {
+          selects: selects.length,
+          comboboxes: comboboxes.length,
+          listboxes: listboxes.length,
+          dropdowns: dropdowns.length,
+          buttons: buttonsWithText,
+          bodySnippet: document.body?.innerText?.substring(0, 800) || '',
+        };
+      });
+      this._logger.info(`Caption settings panel - selects: ${panelInfo.selects}, comboboxes: ${panelInfo.comboboxes}, listboxes: ${panelInfo.listboxes}, dropdowns: ${panelInfo.dropdowns}`);
+      this._logger.info(`Panel buttons: ${JSON.stringify(panelInfo.buttons)}`);
+      this._logger.debug(`Panel text: ${panelInfo.bodySnippet.substring(0, 300)}`);
+
+      // Strategy A: Standard selectors
      const dropdownSelectors = [
        'select[aria-label*="spoken language" i]',
        'select[aria-label*="Meeting spoken language" i]',
@ -339,7 +365,7 @@ export class CaptionsProcedure {
        '[data-tid="spoken-language-dropdown"]',
        'div[role="combobox"]',
        'div[role="listbox"]',
-        'select', // Generic fallback
+        'select',
      ];

      for (const selector of dropdownSelectors) {
@ -350,7 +376,6 @@ export class CaptionsProcedure {
            const tagName = await dropdown.evaluate(el => el.tagName.toLowerCase());
            
            if (tagName === 'select') {
-              // Native select element
              for (const name of targetNames) {
                try {
                  await this._page.selectOption(selector, { label: name });
@ -368,7 +393,6 @@ export class CaptionsProcedure {
              
              for (const name of targetNames) {
                try {
-                  // Try role="option" first, then generic text search
                  const optionSelectors = [
                    `[role="option"]:has-text("${name}")`,
                    `li:has-text("${name}")`,
@ -397,6 +421,69 @@ export class CaptionsProcedure {
        }
      }

+      // Strategy B: DOM evaluation fallback - find any dropdown-like element and interact
+      if (!languageSet) {
+        this._logger.info('Standard dropdown selectors failed, trying DOM evaluation fallback...');
+        
+        languageSet = await this._page.evaluate((names: string[]) => {
+          // Find all elements that could be dropdowns (Fluent UI uses various patterns)
+          const candidates = document.querySelectorAll(
+            '[role="combobox"], [role="listbox"], select, ' +
+            '[class*="dropdown" i], [class*="Dropdown"], ' +
+            'button[aria-haspopup="listbox"], button[aria-haspopup="true"], ' +
+            '[aria-expanded]'
+          );
+
+          for (let i = 0; i < candidates.length; i++) {
+            const el = candidates[i] as HTMLElement;
+            const label = el.getAttribute('aria-label') || '';
+            const nearbyText = el.parentElement?.innerText || '';
+            
+            // Check if this dropdown is related to language
+            const isLanguageRelated = 
+              label.toLowerCase().includes('language') ||
+              label.toLowerCase().includes('sprache') ||
+              nearbyText.toLowerCase().includes('spoken language') ||
+              nearbyText.toLowerCase().includes('gesprochene sprache');
+            
+            if (isLanguageRelated || candidates.length === 1) {
+              // Click to open the dropdown
+              el.click();
+              
+              // Wait a frame for options to render
+              return new Promise<boolean>((resolve) => {
+                requestAnimationFrame(() => {
+                  requestAnimationFrame(() => {
+                    // Look for options
+                    const options = document.querySelectorAll(
+                      '[role="option"], [role="menuitem"], li[class*="option" i]'
+                    );
+                    
+                    for (let j = 0; j < options.length; j++) {
+                      const opt = options[j] as HTMLElement;
+                      const optText = opt.innerText?.trim() || '';
+                      
+                      if (names.some(n => optText.includes(n))) {
+                        opt.click();
+                        resolve(true);
+                        return;
+                      }
+                    }
+                    resolve(false);
+                  });
+                });
+              });
+            }
+          }
+          return Promise.resolve(false);
+        }, targetNames);
+
+        if (languageSet) {
+          this._logger.info('Selected spoken language via DOM evaluation fallback');
+          await this._page.waitForTimeout(500);
+        }
+      }
+
      if (!languageSet) {
        this._logger.warn('Could not find/select spoken language in dropdown');
      }
--- a/src/bot/joinProcedure.ts
+++ b/src/bot/joinProcedure.ts
@ -264,12 +264,14 @@ export class JoinProcedure {
   * Check if the bot is currently in the meeting (admitted from lobby).
   * Primary selector: button[id="hangup-button"] (confirmed by Recall.ai).
   * Note: Teams uses `id` (not `data-tid`) for the hangup button since 2025 redesign.
+   * 
+   * For authenticated joins, Teams v2 sometimes renders differently.
+   * Additional fallback: check the URL for meeting patterns and DOM for call UI.
   */
  async isInMeeting(options: { waitForSeconds?: number } = {}): Promise<boolean> {
    const timeout = (options.waitForSeconds || 5) * 1000;

-    // Primary selector - confirmed by Recall.ai (Jan 2025)
-    // Note: Teams now uses id="hangup-button" instead of data-tid="hangup-button"
+    // Primary selectors - known meeting UI elements
    const inMeetingSelectors = [
      'button[id="hangup-button"]',
      'button[id="callingButtons-showMoreBtn"]',
@ -278,6 +280,16 @@ export class JoinProcedure {
      '[data-tid="call-composite"]',
      'button[aria-label*="Leave"]',
      '[data-tid="callingButtons-showMoreBtn"]',
+      // Teams v2 (2025+) additional selectors
+      '[data-tid="call-controls"]',
+      '[data-tid="meeting-composite"]',
+      'div[data-tid="video-gallery"]',
+      'button[aria-label*="Hang up"]',
+      'button[aria-label*="leave" i]',
+      // Mic/Camera toggle buttons are only visible in an active call
+      'button[id="microphone-button"]',
+      'button[data-tid="toggle-mute"]',
+      '[data-tid="microphone-button"]',
    ];

    try {
@ -287,8 +299,35 @@ export class JoinProcedure {
      });
      return true;
    } catch {
-      return false;
+      // Selector-based detection failed, try DOM evaluation as fallback
    }
+
+    // Fallback: evaluate the page for meeting indicators
+    try {
+      const inMeeting = await this._page.evaluate(() => {
+        // Check for call-related aria roles and meeting elements
+        const bodyText = document.body?.innerText || '';
+        const meetingIndicators = [
+          'Leave',         // Leave button text
+          'Mute',          // Mic mute button
+          'Unmute',        // Mic unmute button
+          'Turn off camera', // Camera control
+          'Turn on camera',
+          'Share',         // Share screen
+        ];
+        const found = meetingIndicators.filter(ind => bodyText.includes(ind));
+        // Need at least 2 meeting indicators to confirm we're in a meeting
+        return found.length >= 2;
+      });
+      if (inMeeting) {
+        this._logger.info('Detected meeting via DOM text analysis (fallback)');
+        return true;
+      }
+    } catch {
+      // Page may not be ready
+    }
+
+    return false;
  }

  /**
--- a/src/bot/orchestrator.ts
+++ b/src/bot/orchestrator.ts
@ -437,7 +437,9 @@ export class BotOrchestrator {
      headless: config.botHeadless,
      args: [
        '--use-fake-ui-for-media-stream', // Auto-accept media permissions
-        '--use-fake-device-for-media-stream', // Use fake devices
+        // NOTE: --use-fake-device-for-media-stream is intentionally NOT used.
+        // We override getUserMedia via addInitScript to return a MediaStreamDestination
+        // that we control, so TTS audio can be injected into Teams' mic input.
        '--disable-web-security',
        '--disable-features=IsolateOrigins,site-per-process',
        '--autoplay-policy=no-user-gesture-required',
@ -468,6 +470,10 @@ export class BotOrchestrator {
    );
    this._audioProcedure = new AudioProcedure(this._page, this._logger);

+    // Inject audio getUserMedia override BEFORE any navigation
+    // This ensures Teams gets our controlled audio stream when it calls getUserMedia
+    await this._audioProcedure.injectAudioOverride();
+
    // Handle page errors
    this._page.on('pageerror', (error) => {
      this._logger.error('Page error:', error);
@ -537,11 +543,18 @@ export class BotOrchestrator {
      // - Page is transitioning between states
      // Only give up after several consecutive cycles with no signal
      consecutiveNoSignal++;
-      this._logger.info(`No lobby/meeting signal detected (attempt ${consecutiveNoSignal}/${maxNoSignal}), waiting...`);
+      const currentUrl = this._page?.url() || 'unknown';
+      this._logger.info(`No lobby/meeting signal detected (attempt ${consecutiveNoSignal}/${maxNoSignal}), URL: ${currentUrl}`);

      if (consecutiveNoSignal >= maxNoSignal) {
-        // Take a screenshot for debugging before giving up
+        // Take a screenshot and log page content for debugging before giving up
        await this._takeScreenshot('no-meeting-signal');
+        try {
+          const bodySnippet = await this._page?.evaluate(() =>
+            document.body?.innerText?.substring(0, 500) || '(empty)'
+          );
+          this._logger.warn(`Page content before giving up: ${bodySnippet}`);
+        } catch { /* ignore */ }
        throw new Error('Bot was removed from lobby or meeting ended');
      }
    }