From bd63dfc40ab4c97a841ea470a956651bde4923af Mon Sep 17 00:00:00 2001
From: ValueOn AG <p.motsch@valueon.ch>
Date: Sun, 15 Feb 2026 22:28:51 +0100
Subject: [PATCH] fix: auth join detection, caption language dropdown, audio
 injection via getUserMedia override

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 src/bot/audioProcedure.ts    | 132 +++++++++++++++++++++++++----------
 src/bot/captionsProcedure.ts |  93 +++++++++++++++++++++++-
 src/bot/joinProcedure.ts     |  45 +++++++++++-
 src/bot/orchestrator.ts      |  19 ++++-
 4 files changed, 242 insertions(+), 47 deletions(-)
diff --git a/src/bot/audioProcedure.ts b/src/bot/audioProcedure.ts
index 7927657..3a90ac0 100644
--- a/src/bot/audioProcedure.ts
+++ b/src/bot/audioProcedure.ts
@@ -3,12 +3,19 @@ import { Logger } from 'winston';
 
 /**
  * Handles audio playback in the Teams meeting.
- * Injects TTS audio into the browser to be played through the meeting.
+ * 
+ * Architecture:
+ * - Before any page loads, we inject an init script that overrides getUserMedia
+ *   to return a MediaStream from a MediaStreamDestination we control.
+ * - When Teams calls getUserMedia({audio: true}), it gets our custom stream.
+ * - When TTS audio is played, it's piped into the same MediaStreamDestination,
+ *   so Teams picks it up as microphone input and sends it via WebRTC.
  */
 export class AudioProcedure {
   private _page: Page;
   private _logger: Logger;
   private _audioContext: boolean = false;
+  private _initScriptInjected: boolean = false;
 
   constructor(page: Page, logger: Logger) {
     this._page = page;
@@ -16,8 +23,62 @@ export class AudioProcedure {
   }
 
   /**
-   * Initialize the audio context in the browser.
-   * Must be called after user interaction (joining meeting counts).
+   * Inject the getUserMedia override BEFORE any page navigation.
+   * This MUST be called before navigating to Teams.
+   * Uses page.addInitScript so it runs in every new document context.
+   */
+  async injectAudioOverride(): Promise<void> {
+    if (this._initScriptInjected) {
+      return;
+    }
+
+    this._logger.info('Injecting audio getUserMedia override...');
+
+    await this._page.addInitScript(() => {
+      // The init script runs in every document; some (e.g. insecure contexts)
+      // expose no mediaDevices, so bail out instead of throwing a page error.
+      const AudioContextClass = window.AudioContext || (window as any).webkitAudioContext;
+      const mediaDevices = navigator.mediaDevices;
+      if (!AudioContextClass || !mediaDevices?.getUserMedia) {
+        return;
+      }
+      const ctx = new AudioContextClass();
+      const streamDest = ctx.createMediaStreamDestination();
+
+      // Store globally for later TTS injection
+      (window as any).__ttsAudioContext = ctx;
+      (window as any).__ttsStreamDest = streamDest;
+      (window as any).__ttsAudioStream = streamDest.stream;
+
+      // Hand out clones so a caller's track.stop() cannot end the shared TTS track.
+      const ttsTracks = () => streamDest.stream.getAudioTracks().map(t => t.clone());
+
+      // Audio requests get the TTS stream; real/fake video is merged in when asked for.
+      const originalGetUserMedia = mediaDevices.getUserMedia.bind(mediaDevices);
+      mediaDevices.getUserMedia = async (constraints?: MediaStreamConstraints) => {
+        if (!constraints?.audio) {
+          return originalGetUserMedia(constraints);
+        }
+        const tracks: MediaStreamTrack[] = ttsTracks();
+        if (constraints.video) {
+          try {
+            const videoStream = await originalGetUserMedia({ video: constraints.video });
+            tracks.push(...videoStream.getVideoTracks());
+          } catch {
+            // If video fails, fall back to audio only
+          }
+        }
+        return new MediaStream(tracks);
+      };
+    });
+
+    this._initScriptInjected = true;
+    this._logger.info('Audio getUserMedia override injected');
+  }
+
+  /**
+   * Initialize the audio context in the browser for TTS playback.
+   * Must be called after joining the meeting (user gesture context).
    */
   async initialize(): Promise<void> {
     if (this._audioContext) {
@@ -27,30 +88,23 @@ export class AudioProcedure {
     this._logger.info('Initializing audio context...');
 
     await this._page.evaluate(() => {
-      // Create a global audio context
-      const AudioContext = window.AudioContext || (window as any).webkitAudioContext;
-      const ctx = new AudioContext();
-      (window as any).__audioContext = ctx;
-      (window as any).__audioQueue = [];
-      (window as any).__isPlaying = false;
+      // The __ttsAudioContext was created by the init script.
+      // Resume it now (requires user gesture - joining meeting counts).
+      const ctx = (window as any).__ttsAudioContext as AudioContext;
+      if (ctx && ctx.state === 'suspended') {
+        ctx.resume();
+      }
 
-      // Create a MediaStream destination so audio is routed into the
-      // browser's virtual microphone (picked up by Teams) instead of
-      // the default speaker output (ctx.destination).
-      const streamDest = ctx.createMediaStreamDestination();
-      (window as any).__audioStreamDest = streamDest;
-
-      // Expose the stream so headless Chromium can pipe it as mic input.
-      // navigator.mediaDevices.getUserMedia will be overridden to return this stream.
-      const audioStream = streamDest.stream;
-      const originalGetUserMedia = navigator.mediaDevices.getUserMedia.bind(navigator.mediaDevices);
-      navigator.mediaDevices.getUserMedia = async (constraints?: MediaStreamConstraints) => {
-        // If requesting audio only, return our TTS stream
-        if (constraints && constraints.audio && !constraints.video) {
-          return audioStream;
-        }
-        return originalGetUserMedia(constraints);
-      };
+      // If init script didn't run (e.g. page navigated before injection),
+      // create fallback audio infrastructure
+      if (!ctx) {
+        const AudioContextClass = window.AudioContext || (window as any).webkitAudioContext;
+        const newCtx = new AudioContextClass();
+        const streamDest = newCtx.createMediaStreamDestination();
+        (window as any).__ttsAudioContext = newCtx;
+        (window as any).__ttsStreamDest = streamDest;
+        (window as any).__ttsAudioStream = streamDest.stream;
+      }
     });
 
     this._audioContext = true;
@@ -59,7 +113,7 @@ export class AudioProcedure {
 
   /**
    * Play audio in the browser.
-   * The audio will be heard by other meeting participants.
+   * Audio is piped into the MediaStreamDestination that Teams uses as mic input.
    * 
    * @param audioData Base64 encoded audio data
    * @param format Audio format (mp3, wav, pcm)
@@ -73,8 +127,13 @@ export class AudioProcedure {
 
     try {
       await this._page.evaluate(async ({ audioData, format }) => {
-        const ctx = (window as any).__audioContext as AudioContext;
-        
+        const ctx = (window as any).__ttsAudioContext as AudioContext;
+        const streamDest = (window as any).__ttsStreamDest as MediaStreamAudioDestinationNode;
+
+        if (!ctx || !streamDest) {
+          throw new Error('Audio context not initialized');
+        }
+
         // Resume context if suspended
         if (ctx.state === 'suspended') {
           await ctx.resume();
@@ -95,22 +154,19 @@ export class AudioProcedure {
           audioBuffer = ctx.createBuffer(1, pcmData.length, 16000);
           const channelData = audioBuffer.getChannelData(0);
           for (let i = 0; i < pcmData.length; i++) {
-            channelData[i] = pcmData[i] / 32768; // Convert to float
+            channelData[i] = pcmData[i] / 32768;
           }
         } else {
           // MP3/WAV: Use decodeAudioData
-          audioBuffer = await ctx.decodeAudioData(bytes.buffer);
+          audioBuffer = await ctx.decodeAudioData(bytes.buffer.slice(0));
         }
 
-        // Create source and play through the MediaStream destination
-        // so audio is routed into the Teams microphone input, not speakers
+        // Play through the MediaStreamDestination -> Teams mic input
         const source = ctx.createBufferSource();
         source.buffer = audioBuffer;
-        const streamDest = (window as any).__audioStreamDest as MediaStreamAudioDestinationNode;
-        source.connect(streamDest || ctx.destination);
+        source.connect(streamDest);
         source.start(0);
 
-        // Return a promise that resolves when playback ends
         return new Promise<void>((resolve) => {
           source.onended = () => resolve();
         });
@@ -129,7 +185,7 @@ export class AudioProcedure {
   async stopAudio(): Promise<void> {
     try {
       await this._page.evaluate(() => {
-        const ctx = (window as any).__audioContext as AudioContext;
+        const ctx = (window as any).__ttsAudioContext as AudioContext;
         if (ctx) {
           ctx.suspend();
         }
@@ -145,7 +201,7 @@ export class AudioProcedure {
   async cleanup(): Promise<void> {
     try {
       await this._page.evaluate(() => {
-        const ctx = (window as any).__audioContext as AudioContext;
+        const ctx = (window as any).__ttsAudioContext as AudioContext;
         if (ctx) {
           ctx.close();
         }
diff --git a/src/bot/captionsProcedure.ts b/src/bot/captionsProcedure.ts
index a79b1f3..20c58b9 100644
--- a/src/bot/captionsProcedure.ts
+++ b/src/bot/captionsProcedure.ts
@@ -332,6 +332,32 @@ export class CaptionsProcedure {
 
       // Look for the spoken language dropdown/combobox
       let languageSet = false;
+
+      // First, log what's visible in the settings panel for debugging
+      const panelInfo = await this._page.evaluate(() => {
+        const selects = document.querySelectorAll('select');
+        const comboboxes = document.querySelectorAll('[role="combobox"]');
+        const listboxes = document.querySelectorAll('[role="listbox"]');
+        const dropdowns = document.querySelectorAll('[class*="dropdown" i], [class*="Dropdown" i]');
+        const allButtons = document.querySelectorAll('button');
+        const buttonsWithText = Array.from(allButtons)
+          .map(b => `${b.tagName}[${b.getAttribute('aria-label') || b.textContent?.trim().substring(0, 40)}]`)
+          .filter(t => t.length > 10)
+          .slice(0, 10);
+        return {
+          selects: selects.length,
+          comboboxes: comboboxes.length,
+          listboxes: listboxes.length,
+          dropdowns: dropdowns.length,
+          buttons: buttonsWithText,
+          bodySnippet: document.body?.innerText?.substring(0, 800) || '',
+        };
+      });
+      this._logger.info(`Caption settings panel - selects: ${panelInfo.selects}, comboboxes: ${panelInfo.comboboxes}, listboxes: ${panelInfo.listboxes}, dropdowns: ${panelInfo.dropdowns}`);
+      this._logger.info(`Panel buttons: ${JSON.stringify(panelInfo.buttons)}`);
+      this._logger.debug(`Panel text: ${panelInfo.bodySnippet.substring(0, 300)}`);
+
+      // Strategy A: Standard selectors
       const dropdownSelectors = [
         'select[aria-label*="spoken language" i]',
         'select[aria-label*="Meeting spoken language" i]',
@@ -339,7 +365,7 @@ export class CaptionsProcedure {
         '[data-tid="spoken-language-dropdown"]',
         'div[role="combobox"]',
         'div[role="listbox"]',
-        'select', // Generic fallback
+        'select',
       ];
 
       for (const selector of dropdownSelectors) {
@@ -350,7 +376,6 @@ export class CaptionsProcedure {
             const tagName = await dropdown.evaluate(el => el.tagName.toLowerCase());
             
             if (tagName === 'select') {
-              // Native select element
               for (const name of targetNames) {
                 try {
                   await this._page.selectOption(selector, { label: name });
@@ -368,7 +393,6 @@ export class CaptionsProcedure {
               
               for (const name of targetNames) {
                 try {
-                  // Try role="option" first, then generic text search
                   const optionSelectors = [
                     `[role="option"]:has-text("${name}")`,
                     `li:has-text("${name}")`,
@@ -397,6 +421,69 @@ export class CaptionsProcedure {
         }
       }
 
+      // Strategy B: DOM evaluation fallback - find any dropdown-like element and interact
+      if (!languageSet) {
+        this._logger.info('Standard dropdown selectors failed, trying DOM evaluation fallback...');
+        
+        languageSet = await this._page.evaluate((names: string[]) => {
+          // Find all elements that could be dropdowns (Fluent UI uses various patterns)
+          const candidates = document.querySelectorAll(
+            '[role="combobox"], [role="listbox"], select, ' +
+            '[class*="dropdown" i], [class*="Dropdown"], ' +
+            'button[aria-haspopup="listbox"], button[aria-haspopup="true"], ' +
+            '[aria-expanded]'
+          );
+
+          for (let i = 0; i < candidates.length; i++) {
+            const el = candidates[i] as HTMLElement;
+            const label = el.getAttribute('aria-label') || '';
+            const nearbyText = el.parentElement?.innerText || '';
+            
+            // Check if this dropdown is related to language
+            const isLanguageRelated = 
+              label.toLowerCase().includes('language') ||
+              label.toLowerCase().includes('sprache') ||
+              nearbyText.toLowerCase().includes('spoken language') ||
+              nearbyText.toLowerCase().includes('gesprochene sprache');
+            
+            if (isLanguageRelated || candidates.length === 1) {
+              // Click to open the dropdown
+              el.click();
+              
+              // Wait two animation frames for options to render
+              return new Promise<boolean>((resolve) => {
+                requestAnimationFrame(() => {
+                  requestAnimationFrame(() => {
+                    // Look for options
+                    const options = document.querySelectorAll(
+                      '[role="option"], [role="menuitem"], li[class*="option" i]'
+                    );
+                    
+                    for (let j = 0; j < options.length; j++) {
+                      const opt = options[j] as HTMLElement;
+                      const optText = opt.innerText?.trim() || '';
+                      
+                      if (names.some(n => optText.includes(n))) {
+                        opt.click();
+                        resolve(true);
+                        return;
+                      }
+                    }
+                    resolve(false);
+                  });
+                });
+              });
+            }
+          }
+          return Promise.resolve(false);
+        }, targetNames);
+
+        if (languageSet) {
+          this._logger.info('Selected spoken language via DOM evaluation fallback');
+          await this._page.waitForTimeout(500);
+        }
+      }
+
       if (!languageSet) {
         this._logger.warn('Could not find/select spoken language in dropdown');
       }
diff --git a/src/bot/joinProcedure.ts b/src/bot/joinProcedure.ts
index debd78d..0665986 100644
--- a/src/bot/joinProcedure.ts
+++ b/src/bot/joinProcedure.ts
@@ -264,12 +264,14 @@ export class JoinProcedure {
    * Check if the bot is currently in the meeting (admitted from lobby).
    * Primary selector: button[id="hangup-button"] (confirmed by Recall.ai).
    * Note: Teams uses `id` (not `data-tid`) for the hangup button since 2025 redesign.
+   * 
+   * For authenticated joins, Teams v2 sometimes renders differently.
+   * Additional fallback: scan the page's visible text for in-call control labels.
    */
   async isInMeeting(options: { waitForSeconds?: number } = {}): Promise<boolean> {
     const timeout = (options.waitForSeconds || 5) * 1000;
 
-    // Primary selector - confirmed by Recall.ai (Jan 2025)
-    // Note: Teams now uses id="hangup-button" instead of data-tid="hangup-button"
+    // Primary selectors - known meeting UI elements
     const inMeetingSelectors = [
       'button[id="hangup-button"]',
       'button[id="callingButtons-showMoreBtn"]',
@@ -278,6 +280,16 @@ export class JoinProcedure {
       '[data-tid="call-composite"]',
       'button[aria-label*="Leave"]',
       '[data-tid="callingButtons-showMoreBtn"]',
+      // Teams v2 (2025+) additional selectors
+      '[data-tid="call-controls"]',
+      '[data-tid="meeting-composite"]',
+      'div[data-tid="video-gallery"]',
+      'button[aria-label*="Hang up"]',
+      'button[aria-label*="leave" i]',
+      // Mic/Camera toggle buttons are only visible in an active call
+      'button[id="microphone-button"]',
+      'button[data-tid="toggle-mute"]',
+      '[data-tid="microphone-button"]',
     ];
 
     try {
@@ -287,8 +299,35 @@ export class JoinProcedure {
       });
       return true;
     } catch {
-      return false;
+      // Selector-based detection failed, try DOM evaluation as fallback
     }
+
+    // Fallback: evaluate the page for meeting indicators
+    try {
+      const inMeeting = await this._page.evaluate(() => {
+        // Scan the visible page text for labels of in-call controls
+        const bodyText = document.body?.innerText || '';
+        const meetingIndicators = [
+          'Leave',         // Leave button text
+          'Mute',          // Mic mute button
+          'Unmute',        // Mic unmute button
+          'Turn off camera', // Camera control
+          'Turn on camera',
+          'Share',         // Share screen
+        ];
+        const found = meetingIndicators.filter(ind => bodyText.includes(ind));
+        // Need at least 2 meeting indicators to confirm we're in a meeting
+        return found.length >= 2;
+      });
+      if (inMeeting) {
+        this._logger.info('Detected meeting via DOM text analysis (fallback)');
+        return true;
+      }
+    } catch {
+      // Page may not be ready
+    }
+
+    return false;
   }
 
   /**
diff --git a/src/bot/orchestrator.ts b/src/bot/orchestrator.ts
index 44803ab..0eb347a 100644
--- a/src/bot/orchestrator.ts
+++ b/src/bot/orchestrator.ts
@@ -437,7 +437,9 @@ export class BotOrchestrator {
       headless: config.botHeadless,
       args: [
         '--use-fake-ui-for-media-stream', // Auto-accept media permissions
-        '--use-fake-device-for-media-stream', // Use fake devices
+        // NOTE: --use-fake-device-for-media-stream is intentionally NOT used.
+        // We override getUserMedia via addInitScript to return a MediaStreamDestination
+        // that we control, so TTS audio can be injected into Teams' mic input.
         '--disable-web-security',
         '--disable-features=IsolateOrigins,site-per-process',
         '--autoplay-policy=no-user-gesture-required',
@@ -468,6 +470,10 @@ export class BotOrchestrator {
     );
     this._audioProcedure = new AudioProcedure(this._page, this._logger);
 
+    // Inject audio getUserMedia override BEFORE any navigation
+    // This ensures Teams gets our controlled audio stream when it calls getUserMedia
+    await this._audioProcedure.injectAudioOverride();
+
     // Handle page errors
     this._page.on('pageerror', (error) => {
       this._logger.error('Page error:', error);
@@ -537,11 +543,18 @@ export class BotOrchestrator {
       // - Page is transitioning between states
       // Only give up after several consecutive cycles with no signal
       consecutiveNoSignal++;
-      this._logger.info(`No lobby/meeting signal detected (attempt ${consecutiveNoSignal}/${maxNoSignal}), waiting...`);
+      const currentUrl = this._page?.url() || 'unknown';
+      this._logger.info(`No lobby/meeting signal detected (attempt ${consecutiveNoSignal}/${maxNoSignal}), URL: ${currentUrl}`);
 
       if (consecutiveNoSignal >= maxNoSignal) {
-        // Take a screenshot for debugging before giving up
+        // Take a screenshot and log page content for debugging before giving up
         await this._takeScreenshot('no-meeting-signal');
+        try {
+          const bodySnippet = await this._page?.evaluate(() =>
+            document.body?.innerText?.substring(0, 500) || '(empty)'
+          );
+          this._logger.warn(`Page content before giving up: ${bodySnippet}`);
+        } catch { /* ignore */ }
         throw new Error('Bot was removed from lobby or meeting ended');
       }
     }