From 39c8012358b318e05918e296e4a9a7cc3a236cb4 Mon Sep 17 00:00:00 2001 From: ValueOn AG Date: Sun, 15 Feb 2026 22:40:48 +0100 Subject: [PATCH] fix: restore fake-device flag, wrap getUserMedia to swap audio track, handle no-audio modal Co-authored-by: Cursor --- src/bot/audioProcedure.ts | 57 ++++++++++++++++++++------------------- src/bot/joinProcedure.ts | 42 +++++++++++++++++++++++++++++ src/bot/orchestrator.ts | 4 +-- 3 files changed, 72 insertions(+), 31 deletions(-) diff --git a/src/bot/audioProcedure.ts b/src/bot/audioProcedure.ts index 3a90ac0..5b242f9 100644 --- a/src/bot/audioProcedure.ts +++ b/src/bot/audioProcedure.ts @@ -5,11 +5,15 @@ import { Logger } from 'winston'; * Handles audio playback in the Teams meeting. * * Architecture: - * - Before any page loads, we inject an init script that overrides getUserMedia - * to return a MediaStream from a MediaStreamDestination we control. - * - When Teams calls getUserMedia({audio: true}), it gets our custom stream. - * - When TTS audio is played, it's piped into the same MediaStreamDestination, - * so Teams picks it up as microphone input and sends it via WebRTC. + * - Browser launches with --use-fake-device-for-media-stream so Teams sees + * real-looking devices (no "no audio/video" modal). + * - Before any page loads, we inject an init script that wraps getUserMedia. + * - When Teams calls getUserMedia, the wrapper: + * 1. Calls the REAL getUserMedia (which returns Chromium's fake device stream) + * 2. Replaces the audio track with one from our MediaStreamDestination + * 3. Returns the modified stream (our audio + Chromium's fake video) + * - When TTS audio is played, it's piped into the MediaStreamDestination, + * and Teams sends it via WebRTC to other meeting participants. */ export class AudioProcedure { private _page: Page; @@ -23,7 +27,7 @@ export class AudioProcedure { } /** - * Inject the getUserMedia override BEFORE any page navigation. + * Inject the getUserMedia wrapper BEFORE any page navigation. * This MUST be called before navigating to Teams. * Uses page.addInitScript so it runs in every new document context. */ @@ -35,8 +39,7 @@ export class AudioProcedure { this._logger.info('Injecting audio getUserMedia override...'); await this._page.addInitScript(() => { - // Create a shared AudioContext and MediaStreamDestination - // These persist across the page lifetime + // Create a shared AudioContext and MediaStreamDestination for TTS injection const AudioContextClass = window.AudioContext || (window as any).webkitAudioContext; const ctx = new AudioContextClass(); const streamDest = ctx.createMediaStreamDestination(); @@ -46,29 +49,27 @@ export class AudioProcedure { (window as any).__ttsStreamDest = streamDest; (window as any).__ttsAudioStream = streamDest.stream; - // Override getUserMedia to return our controlled stream for audio requests + // Wrap getUserMedia to replace audio tracks with our TTS-injectable stream const originalGetUserMedia = navigator.mediaDevices.getUserMedia.bind(navigator.mediaDevices); navigator.mediaDevices.getUserMedia = async (constraints?: MediaStreamConstraints) => { + // Get the real stream (from Chromium's fake devices) + const realStream = await originalGetUserMedia(constraints); + if (constraints && constraints.audio) { - // Return our TTS-injectable audio stream - // If video is also requested, combine our audio with real/fake video - if (constraints.video) { - try { - const videoStream = await originalGetUserMedia({ video: constraints.video }); - const combinedStream = new MediaStream(); - // Add our audio track - streamDest.stream.getAudioTracks().forEach(t => combinedStream.addTrack(t)); - // Add their video track - videoStream.getVideoTracks().forEach(t => combinedStream.addTrack(t)); - return combinedStream; - } catch { - // If video fails, just return audio - return streamDest.stream; - } - } - return streamDest.stream; + // Build a new stream: our TTS audio track + their video tracks + const combinedStream = new MediaStream(); + + // Add our controlled audio track (TTS will be piped here) + streamDest.stream.getAudioTracks().forEach(t => combinedStream.addTrack(t)); + + // Keep the real video tracks (from fake camera) + realStream.getVideoTracks().forEach(t => combinedStream.addTrack(t)); + + return combinedStream; } - return originalGetUserMedia(constraints); + + // No audio requested - return the real stream as-is + return realStream; }; }); @@ -157,7 +158,7 @@ export class AudioProcedure { channelData[i] = pcmData[i] / 32768; } } else { - // MP3/WAV: Use decodeAudioData + // MP3/WAV: Use decodeAudioData (slice to avoid detached buffer) audioBuffer = await ctx.decodeAudioData(bytes.buffer.slice(0)); } diff --git a/src/bot/joinProcedure.ts b/src/bot/joinProcedure.ts index 0665986..9567d53 100644 --- a/src/bot/joinProcedure.ts +++ b/src/bot/joinProcedure.ts @@ -174,10 +174,17 @@ export class JoinProcedure { /** * Click the "Join now" button. * Primary selector: button:has-text("Join now") (confirmed by Recall.ai). + * + * IMPORTANT: Teams may show a "no audio/video" modal that blocks the Join button. + * This happens when getUserMedia doesn't return real-looking devices. + * We handle this by dismissing the modal first. */ private async _clickJoinNow(): Promise { this._logger.info('Clicking Join now...'); + // First, dismiss any "no audio/video" modal that may be blocking + await this._dismissNoAudioVideoModal(); + // Primary selector - confirmed working by Recall.ai (Jan 2025) const primarySelector = 'button:has-text("Join now")'; @@ -185,6 +192,10 @@ export class JoinProcedure { await this._page.waitForSelector(primarySelector, { timeout: 15000 }); await this._page.click(primarySelector); this._logger.info('Clicked "Join now" button'); + + // After clicking Join, Teams may show the modal again. Dismiss if present. + await this._page.waitForTimeout(2000); + await this._dismissNoAudioVideoModal(); return; } catch { this._logger.info('Primary join button selector not found, trying fallbacks...'); @@ -204,6 +215,8 @@ export class JoinProcedure { if (button) { await button.click(); this._logger.info(`Clicked join button (fallback: ${selector})`); + await this._page.waitForTimeout(2000); + await this._dismissNoAudioVideoModal(); return; } } catch { @@ -223,6 +236,35 @@ export class JoinProcedure { ); } + /** + * Dismiss the "Are you sure you don't want audio or video?" modal. + * Teams shows this when it can't access camera/mic devices. + * We click "Continue without audio or video" to proceed. + */ + private async _dismissNoAudioVideoModal(): Promise { + const modalSelectors = [ + 'button:has-text("Continue without audio or video")', + 'button:has-text("Ohne Audio oder Video fortfahren")', + 'button:has-text("Continue without")', + 'button:has-text("Ohne Audio")', + ]; + + for (const selector of modalSelectors) { + try { + const button = await this._page.$(selector); + if (button) { + await button.click(); + this._logger.info(`Dismissed no-audio modal: ${selector}`); + await this._page.waitForTimeout(1000); + return; + } + } catch { + // Continue + } + } + // No modal found - that's fine, it means devices were detected properly + } + /** * Check if the bot is currently in the lobby (waiting to be admitted). * Primary check: text "Someone will let you in shortly" (confirmed by Recall.ai). diff --git a/src/bot/orchestrator.ts b/src/bot/orchestrator.ts index 0eb347a..d34e261 100644 --- a/src/bot/orchestrator.ts +++ b/src/bot/orchestrator.ts @@ -437,9 +437,7 @@ export class BotOrchestrator { headless: config.botHeadless, args: [ '--use-fake-ui-for-media-stream', // Auto-accept media permissions - // NOTE: --use-fake-device-for-media-stream is intentionally NOT used. - // We override getUserMedia via addInitScript to return a MediaStreamDestination - // that we control, so TTS audio can be injected into Teams' mic input. + '--use-fake-device-for-media-stream', // Provide fake camera/mic so Teams sees devices '--disable-web-security', '--disable-features=IsolateOrigins,site-per-process', '--autoplay-policy=no-user-gesture-required',