fix: auth join detection, caption language dropdown, audio injection via getUserMedia override

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-15 22:28:51 +01:00 · 2026-02-15 22:28:51 +01:00 · bd63dfc40a
commit bd63dfc40a
parent 13bf75bea7
4 changed files with 242 additions and 47 deletions
--- a/src/bot/audioProcedure.ts
+++ b/src/bot/audioProcedure.ts
@ -3,12 +3,19 @@ import { Logger } from 'winston';
 /**
 * Handles audio playback in the Teams meeting.
- * Injects TTS audio into the browser to be played through the meeting.
+ * 
 * Architecture:
 * - Before any page loads, we inject an init script that overrides getUserMedia
 *   to return a MediaStream from a MediaStreamDestination we control.
 * - When Teams calls getUserMedia({audio: true}), it gets our custom stream.
 * - When TTS audio is played, it's piped into the same MediaStreamDestination,
 *   so Teams picks it up as microphone input and sends it via WebRTC.
 */
 export class AudioProcedure {
  private _page: Page;
  private _logger: Logger;
  private _audioContext: boolean = false;
  private _initScriptInjected: boolean = false;
  constructor(page: Page, logger: Logger) {
    this._page = page;
@ -16,8 +23,62 @@ export class AudioProcedure {
  }
  /**
-   * Initialize the audio context in the browser.
+   * Inject the getUserMedia override BEFORE any page navigation.
-   * Must be called after user interaction (joining meeting counts).
+   * This MUST be called before navigating to Teams.
   * Uses page.addInitScript so it runs in every new document context.
   */
  async injectAudioOverride(): Promise<void> {
    // Init scripts accumulate on the page; registering twice would stack two overrides.
    if (this._initScriptInjected) {
      return;
    }

    this._logger.info('Injecting audio getUserMedia override...');

    // The callback below is serialized and executed inside the browser for each new
    // document, so it must be fully self-contained (no references to outer scope).
    await this._page.addInitScript(() => {
      const AudioContextClass = window.AudioContext || (window as any).webkitAudioContext;
      const mediaDevices = navigator.mediaDevices;

      // navigator.mediaDevices is missing in documents without media support
      // (e.g. insecure contexts). Bail out quietly instead of throwing a TypeError
      // into every such document.
      if (!AudioContextClass || !mediaDevices || typeof mediaDevices.getUserMedia !== 'function') {
        return;
      }

      // Shared AudioContext + MediaStreamDestination; they live as long as the document.
      const ctx = new AudioContextClass();
      const streamDest = ctx.createMediaStreamDestination();

      // Exposed globally so later page.evaluate() calls can pipe TTS audio into streamDest.
      (window as any).__ttsAudioContext = ctx;
      (window as any).__ttsStreamDest = streamDest;
      (window as any).__ttsAudioStream = streamDest.stream;

      const originalGetUserMedia = mediaDevices.getUserMedia.bind(mediaDevices);

      // Hand out CLONES of the destination's audio track. If a consumer calls
      // track.stop() on its copy (typical when releasing the microphone), the
      // source track stays live and subsequent getUserMedia calls keep working.
      const freshAudioTracks = (): MediaStreamTrack[] =>
        streamDest.stream.getAudioTracks().map((track) => track.clone());

      mediaDevices.getUserMedia = async (constraints?: MediaStreamConstraints) => {
        // Requests without audio are passed through to the real implementation untouched.
        if (!constraints || !constraints.audio) {
          return originalGetUserMedia(constraints);
        }

        // Audio is always served from our TTS-injectable destination.
        const result = new MediaStream(freshAudioTracks());

        if (constraints.video) {
          try {
            // Video still comes from the real (or browser-faked) device.
            const videoStream = await originalGetUserMedia({ video: constraints.video });
            videoStream.getVideoTracks().forEach((track) => result.addTrack(track));
          } catch {
            // Video unavailable: deliberately fall back to an audio-only stream
            // rather than failing the whole request.
          }
        }

        return result;
      };
    });

    this._initScriptInjected = true;
    this._logger.info('Audio getUserMedia override injected');
  }
  /**
   * Initialize the audio context in the browser for TTS playback.
   * Must be called after joining the meeting (user gesture context).
   */
  async initialize(): Promise<void> {
    if (this._audioContext) {
@ -27,30 +88,23 @@ export class AudioProcedure {
    this._logger.info('Initializing audio context...');
    await this._page.evaluate(() => {
-      // Create a global audio context
+      // The __ttsAudioContext was created by the init script.
-      const AudioContext = window.AudioContext || (window as any).webkitAudioContext;
+      // Resume it now (requires user gesture - joining meeting counts).
-      const ctx = new AudioContext();
+      const ctx = (window as any).__ttsAudioContext as AudioContext;
-      (window as any).__audioContext = ctx;
+      if (ctx && ctx.state === 'suspended') {
-      (window as any).__audioQueue = [];
+        ctx.resume();
-      (window as any).__isPlaying = false;
+      }
-      // Create a MediaStream destination so audio is routed into the
+      // If init script didn't run (e.g. page navigated before injection),
-      // browser's virtual microphone (picked up by Teams) instead of
+      // create fallback audio infrastructure
-      // the default speaker output (ctx.destination).
+      if (!ctx) {
-      const streamDest = ctx.createMediaStreamDestination();
+        const AudioContextClass = window.AudioContext || (window as any).webkitAudioContext;
-      (window as any).__audioStreamDest = streamDest;
+        const newCtx = new AudioContextClass();
-
+        const streamDest = newCtx.createMediaStreamDestination();
-      // Expose the stream so headless Chromium can pipe it as mic input.
+        (window as any).__ttsAudioContext = newCtx;
-      // navigator.mediaDevices.getUserMedia will be overridden to return this stream.
+        (window as any).__ttsStreamDest = streamDest;
-      const audioStream = streamDest.stream;
+        (window as any).__ttsAudioStream = streamDest.stream;
-      const originalGetUserMedia = navigator.mediaDevices.getUserMedia.bind(navigator.mediaDevices);
+      }
      navigator.mediaDevices.getUserMedia = async (constraints?: MediaStreamConstraints) => {
        // If requesting audio only, return our TTS stream
        if (constraints && constraints.audio && !constraints.video) {
          return audioStream;
        }
        return originalGetUserMedia(constraints);
      };
    });
    this._audioContext = true;
@ -59,7 +113,7 @@ export class AudioProcedure {
  /**
   * Play audio in the browser.
-   * The audio will be heard by other meeting participants.
+   * Audio is piped into the MediaStreamDestination that Teams uses as mic input.
   * 
   * @param audioData Base64 encoded audio data
   * @param format Audio format (mp3, wav, pcm)
@ -73,8 +127,13 @@ export class AudioProcedure {
    try {
      await this._page.evaluate(async ({ audioData, format }) => {
-        const ctx = (window as any).__audioContext as AudioContext;
+        const ctx = (window as any).__ttsAudioContext as AudioContext;
-        
+        const streamDest = (window as any).__ttsStreamDest as MediaStreamAudioDestinationNode;
        if (!ctx || !streamDest) {
          throw new Error('Audio context not initialized');
        }
        // Resume context if suspended
        if (ctx.state === 'suspended') {
          await ctx.resume();
@ -95,22 +154,19 @@ export class AudioProcedure {
          audioBuffer = ctx.createBuffer(1, pcmData.length, 16000);
          const channelData = audioBuffer.getChannelData(0);
          for (let i = 0; i < pcmData.length; i++) {
-            channelData[i] = pcmData[i] / 32768; // Convert to float
+            channelData[i] = pcmData[i] / 32768;
          }
        } else {
          // MP3/WAV: Use decodeAudioData
-          audioBuffer = await ctx.decodeAudioData(bytes.buffer);
+          audioBuffer = await ctx.decodeAudioData(bytes.buffer.slice(0));
        }
-        // Create source and play through the MediaStream destination
+        // Play through the MediaStreamDestination -> Teams mic input
        // so audio is routed into the Teams microphone input, not speakers
        const source = ctx.createBufferSource();
        source.buffer = audioBuffer;
-        const streamDest = (window as any).__audioStreamDest as MediaStreamAudioDestinationNode;
+        source.connect(streamDest);
        source.connect(streamDest || ctx.destination);
        source.start(0);
        // Return a promise that resolves when playback ends
        return new Promise<void>((resolve) => {
          source.onended = () => resolve();
        });
@ -129,7 +185,7 @@ export class AudioProcedure {
  async stopAudio(): Promise<void> {
    try {
      await this._page.evaluate(() => {
-        const ctx = (window as any).__audioContext as AudioContext;
+        const ctx = (window as any).__ttsAudioContext as AudioContext;
        if (ctx) {
          ctx.suspend();
        }
@ -145,7 +201,7 @@ export class AudioProcedure {
  async cleanup(): Promise<void> {
    try {
      await this._page.evaluate(() => {
-        const ctx = (window as any).__audioContext as AudioContext;
+        const ctx = (window as any).__ttsAudioContext as AudioContext;
        if (ctx) {
          ctx.close();
        }
--- a/src/bot/captionsProcedure.ts
+++ b/src/bot/captionsProcedure.ts
@ -332,6 +332,32 @@ export class CaptionsProcedure {
      // Look for the spoken language dropdown/combobox
      let languageSet = false;
      // First, log what's visible in the settings panel for debugging
      const panelInfo = await this._page.evaluate(() => {
        const selects = document.querySelectorAll('select');
        const comboboxes = document.querySelectorAll('[role="combobox"]');
        const listboxes = document.querySelectorAll('[role="listbox"]');
        const dropdowns = document.querySelectorAll('[class*="dropdown" i], [class*="Dropdown" i]');
        const allButtons = document.querySelectorAll('button');
        const buttonsWithText = Array.from(allButtons)
          .map(b => `${b.tagName}[${b.getAttribute('aria-label') || b.textContent?.trim().substring(0, 40)}]`)
          .filter(t => t.length > 10)
          .slice(0, 10);
        return {
          selects: selects.length,
          comboboxes: comboboxes.length,
          listboxes: listboxes.length,
          dropdowns: dropdowns.length,
          buttons: buttonsWithText,
          bodySnippet: document.body?.innerText?.substring(0, 800) || '',
        };
      });
      this._logger.info(`Caption settings panel - selects: ${panelInfo.selects}, comboboxes: ${panelInfo.comboboxes}, listboxes: ${panelInfo.listboxes}, dropdowns: ${panelInfo.dropdowns}`);
      this._logger.info(`Panel buttons: ${JSON.stringify(panelInfo.buttons)}`);
      this._logger.debug(`Panel text: ${panelInfo.bodySnippet.substring(0, 300)}`);
      // Strategy A: Standard selectors
      const dropdownSelectors = [
        'select[aria-label*="spoken language" i]',
        'select[aria-label*="Meeting spoken language" i]',
@ -339,7 +365,7 @@ export class CaptionsProcedure {
        '[data-tid="spoken-language-dropdown"]',
        'div[role="combobox"]',
        'div[role="listbox"]',
-        'select', // Generic fallback
+        'select',
      ];
      for (const selector of dropdownSelectors) {
@ -350,7 +376,6 @@ export class CaptionsProcedure {
            const tagName = await dropdown.evaluate(el => el.tagName.toLowerCase());
            if (tagName === 'select') {
              // Native select element
              for (const name of targetNames) {
                try {
                  await this._page.selectOption(selector, { label: name });
@ -368,7 +393,6 @@ export class CaptionsProcedure {
              for (const name of targetNames) {
                try {
                  // Try role="option" first, then generic text search
                  const optionSelectors = [
                    `[role="option"]:has-text("${name}")`,
                    `li:has-text("${name}")`,
@ -397,6 +421,69 @@ export class CaptionsProcedure {
        }
      }
      // Strategy B: DOM evaluation fallback - find any dropdown-like element and interact
      if (!languageSet) {
        this._logger.info('Standard dropdown selectors failed, trying DOM evaluation fallback...');
        languageSet = await this._page.evaluate((names: string[]) => {
          // Find all elements that could be dropdowns (Fluent UI uses various patterns)
          const candidates = document.querySelectorAll(
            '[role="combobox"], [role="listbox"], select, ' +
            '[class*="dropdown" i], [class*="Dropdown"], ' +
            'button[aria-haspopup="listbox"], button[aria-haspopup="true"], ' +
            '[aria-expanded]'
          );
          for (let i = 0; i < candidates.length; i++) {
            const el = candidates[i] as HTMLElement;
            const label = el.getAttribute('aria-label') || '';
            const nearbyText = el.parentElement?.innerText || '';
            // Check if this dropdown is related to language
            const isLanguageRelated = 
              label.toLowerCase().includes('language') ||
              label.toLowerCase().includes('sprache') ||
              nearbyText.toLowerCase().includes('spoken language') ||
              nearbyText.toLowerCase().includes('gesprochene sprache');
            if (isLanguageRelated || candidates.length === 1) {
              // Click to open the dropdown
              el.click();
              // Wait a frame for options to render
              return new Promise<boolean>((resolve) => {
                requestAnimationFrame(() => {
                  requestAnimationFrame(() => {
                    // Look for options
                    const options = document.querySelectorAll(
                      '[role="option"], [role="menuitem"], li[class*="option" i]'
                    );
                    for (let j = 0; j < options.length; j++) {
                      const opt = options[j] as HTMLElement;
                      const optText = opt.innerText?.trim() || '';
                      if (names.some(n => optText.includes(n))) {
                        opt.click();
                        resolve(true);
                        return;
                      }
                    }
                    resolve(false);
                  });
                });
              });
            }
          }
          return Promise.resolve(false);
        }, targetNames);
        if (languageSet) {
          this._logger.info('Selected spoken language via DOM evaluation fallback');
          await this._page.waitForTimeout(500);
        }
      }
      if (!languageSet) {
        this._logger.warn('Could not find/select spoken language in dropdown');
      }
--- a/src/bot/joinProcedure.ts
+++ b/src/bot/joinProcedure.ts
@ -264,12 +264,14 @@ export class JoinProcedure {
   * Check if the bot is currently in the meeting (admitted from lobby).
   * Primary selector: button[id="hangup-button"] (confirmed by Recall.ai).
   * Note: Teams uses `id` (not `data-tid`) for the hangup button since 2025 redesign.
   * 
   * For authenticated joins, Teams v2 sometimes renders differently.
   * Additional fallback: check the URL for meeting patterns and DOM for call UI.
   */
  async isInMeeting(options: { waitForSeconds?: number } = {}): Promise<boolean> {
    const timeout = (options.waitForSeconds || 5) * 1000;
-    // Primary selector - confirmed by Recall.ai (Jan 2025)
+    // Primary selectors - known meeting UI elements
    // Note: Teams now uses id="hangup-button" instead of data-tid="hangup-button"
    const inMeetingSelectors = [
      'button[id="hangup-button"]',
      'button[id="callingButtons-showMoreBtn"]',
@ -278,6 +280,16 @@ export class JoinProcedure {
      '[data-tid="call-composite"]',
      'button[aria-label*="Leave"]',
      '[data-tid="callingButtons-showMoreBtn"]',
      // Teams v2 (2025+) additional selectors
      '[data-tid="call-controls"]',
      '[data-tid="meeting-composite"]',
      'div[data-tid="video-gallery"]',
      'button[aria-label*="Hang up"]',
      'button[aria-label*="leave" i]',
      // Mic/Camera toggle buttons are only visible in an active call
      'button[id="microphone-button"]',
      'button[data-tid="toggle-mute"]',
      '[data-tid="microphone-button"]',
    ];
    try {
@ -287,8 +299,35 @@ export class JoinProcedure {
      });
      return true;
    } catch {
-      return false;
+      // Selector-based detection failed, try DOM evaluation as fallback
    }
    // Fallback: evaluate the page for meeting indicators
    try {
      const inMeeting = await this._page.evaluate(() => {
        // Check for call-related aria roles and meeting elements
        const bodyText = document.body?.innerText || '';
        const meetingIndicators = [
          'Leave',         // Leave button text
          'Mute',          // Mic mute button
          'Unmute',        // Mic unmute button
          'Turn off camera', // Camera control
          'Turn on camera',
          'Share',         // Share screen
        ];
        const found = meetingIndicators.filter(ind => bodyText.includes(ind));
        // Need at least 2 meeting indicators to confirm we're in a meeting
        return found.length >= 2;
      });
      if (inMeeting) {
        this._logger.info('Detected meeting via DOM text analysis (fallback)');
        return true;
      }
    } catch {
      // Page may not be ready
    }
    return false;
  }
  /**
--- a/src/bot/orchestrator.ts
+++ b/src/bot/orchestrator.ts
@ -437,7 +437,9 @@ export class BotOrchestrator {
      headless: config.botHeadless,
      args: [
        '--use-fake-ui-for-media-stream', // Auto-accept media permissions
-        '--use-fake-device-for-media-stream', // Use fake devices
+        // NOTE: --use-fake-device-for-media-stream is intentionally NOT used.
        // We override getUserMedia via addInitScript to return a MediaStreamDestination
        // that we control, so TTS audio can be injected into Teams' mic input.
        '--disable-web-security',
        '--disable-features=IsolateOrigins,site-per-process',
        '--autoplay-policy=no-user-gesture-required',
@ -468,6 +470,10 @@ export class BotOrchestrator {
    );
    this._audioProcedure = new AudioProcedure(this._page, this._logger);
    // Inject audio getUserMedia override BEFORE any navigation
    // This ensures Teams gets our controlled audio stream when it calls getUserMedia
    await this._audioProcedure.injectAudioOverride();
    // Handle page errors
    this._page.on('pageerror', (error) => {
      this._logger.error('Page error:', error);
@ -537,11 +543,18 @@ export class BotOrchestrator {
      // - Page is transitioning between states
      // Only give up after several consecutive cycles with no signal
      consecutiveNoSignal++;
-      this._logger.info(`No lobby/meeting signal detected (attempt ${consecutiveNoSignal}/${maxNoSignal}), waiting...`);
+      const currentUrl = this._page?.url() || 'unknown';
      this._logger.info(`No lobby/meeting signal detected (attempt ${consecutiveNoSignal}/${maxNoSignal}), URL: ${currentUrl}`);
      if (consecutiveNoSignal >= maxNoSignal) {
-        // Take a screenshot for debugging before giving up
+        // Take a screenshot and log page content for debugging before giving up
        await this._takeScreenshot('no-meeting-signal');
        try {
          const bodySnippet = await this._page?.evaluate(() =>
            document.body?.innerText?.substring(0, 500) || '(empty)'
          );
          this._logger.warn(`Page content before giving up: ${bodySnippet}`);
        } catch { /* ignore */ }
        throw new Error('Bot was removed from lobby or meeting ended');
      }
    }