From 39c8012358b318e05918e296e4a9a7cc3a236cb4 Mon Sep 17 00:00:00 2001
From: ValueOn AG
Date: Sun, 15 Feb 2026 22:40:48 +0100
Subject: [PATCH] fix: restore fake-device flag, wrap getUserMedia to swap
audio track, handle no-audio modal
Co-authored-by: Cursor
---
src/bot/audioProcedure.ts | 57 ++++++++++++++++++++-------------------
src/bot/joinProcedure.ts | 42 +++++++++++++++++++++++++++++
src/bot/orchestrator.ts | 4 +--
3 files changed, 72 insertions(+), 31 deletions(-)
diff --git a/src/bot/audioProcedure.ts b/src/bot/audioProcedure.ts
index 3a90ac0..5b242f9 100644
--- a/src/bot/audioProcedure.ts
+++ b/src/bot/audioProcedure.ts
@@ -5,11 +5,15 @@ import { Logger } from 'winston';
* Handles audio playback in the Teams meeting.
*
* Architecture:
- * - Before any page loads, we inject an init script that overrides getUserMedia
- * to return a MediaStream from a MediaStreamDestination we control.
- * - When Teams calls getUserMedia({audio: true}), it gets our custom stream.
- * - When TTS audio is played, it's piped into the same MediaStreamDestination,
- * so Teams picks it up as microphone input and sends it via WebRTC.
+ * - Browser launches with --use-fake-device-for-media-stream so Teams sees
+ * real-looking devices (no "no audio/video" modal).
+ * - Before any page loads, we inject an init script that wraps getUserMedia.
+ * - When Teams calls getUserMedia, the wrapper:
+ * 1. Calls the REAL getUserMedia (which returns Chromium's fake device stream)
+ * 2. If audio was requested, swaps the audio track for the one from our MediaStreamDestination
+ * 3. Returns the result (our audio + Chromium's fake video); requests without audio get the real stream unchanged
+ * - When TTS audio is played, it's piped into the MediaStreamDestination,
+ * and Teams sends it via WebRTC to other meeting participants.
*/
export class AudioProcedure {
private _page: Page;
@@ -23,7 +27,7 @@ export class AudioProcedure {
}
/**
- * Inject the getUserMedia override BEFORE any page navigation.
+ * Inject the getUserMedia wrapper BEFORE any page navigation.
* This MUST be called before navigating to Teams.
* Uses page.addInitScript so it runs in every new document context.
*/
@@ -35,8 +39,7 @@ export class AudioProcedure {
this._logger.info('Injecting audio getUserMedia override...');
await this._page.addInitScript(() => {
- // Create a shared AudioContext and MediaStreamDestination
- // These persist across the page lifetime
+ // Create a shared AudioContext and MediaStreamDestination for TTS injection
const AudioContextClass = window.AudioContext || (window as any).webkitAudioContext;
const ctx = new AudioContextClass();
const streamDest = ctx.createMediaStreamDestination();
@@ -46,29 +49,27 @@ export class AudioProcedure {
(window as any).__ttsStreamDest = streamDest;
(window as any).__ttsAudioStream = streamDest.stream;
- // Override getUserMedia to return our controlled stream for audio requests
+ // Wrap getUserMedia to replace audio tracks with our TTS-injectable stream
const originalGetUserMedia = navigator.mediaDevices.getUserMedia.bind(navigator.mediaDevices);
navigator.mediaDevices.getUserMedia = async (constraints?: MediaStreamConstraints) => {
+ // Get the real stream (from Chromium's fake devices)
+ const realStream = await originalGetUserMedia(constraints);
+
if (constraints && constraints.audio) {
- // Return our TTS-injectable audio stream
- // If video is also requested, combine our audio with real/fake video
- if (constraints.video) {
- try {
- const videoStream = await originalGetUserMedia({ video: constraints.video });
- const combinedStream = new MediaStream();
- // Add our audio track
- streamDest.stream.getAudioTracks().forEach(t => combinedStream.addTrack(t));
- // Add their video track
- videoStream.getVideoTracks().forEach(t => combinedStream.addTrack(t));
- return combinedStream;
- } catch {
- // If video fails, just return audio
- return streamDest.stream;
- }
- }
- return streamDest.stream;
+ // Build a new stream: our TTS audio track + their video tracks
+ const combinedStream = new MediaStream();
+
+ // Add our controlled audio track (TTS will be piped here)
+ streamDest.stream.getAudioTracks().forEach(t => combinedStream.addTrack(t));
+
+ // Keep the real video tracks (from fake camera)
+ realStream.getVideoTracks().forEach(t => combinedStream.addTrack(t));
+
+ return combinedStream;
}
- return originalGetUserMedia(constraints);
+
+ // No audio requested - return the real stream as-is
+ return realStream;
};
});
@@ -157,7 +158,7 @@ export class AudioProcedure {
channelData[i] = pcmData[i] / 32768;
}
} else {
- // MP3/WAV: Use decodeAudioData
+ // MP3/WAV: Use decodeAudioData (slice to avoid detached buffer)
audioBuffer = await ctx.decodeAudioData(bytes.buffer.slice(0));
}
diff --git a/src/bot/joinProcedure.ts b/src/bot/joinProcedure.ts
index 0665986..9567d53 100644
--- a/src/bot/joinProcedure.ts
+++ b/src/bot/joinProcedure.ts
@@ -174,10 +174,17 @@ export class JoinProcedure {
/**
* Click the "Join now" button.
* Primary selector: button:has-text("Join now") (confirmed by Recall.ai).
+ *
+ * IMPORTANT: Teams may show a "no audio/video" modal that blocks the Join button.
+ * This happens when getUserMedia doesn't return real-looking devices.
+ * We handle this by dismissing the modal first.
*/
private async _clickJoinNow(): Promise {
this._logger.info('Clicking Join now...');
+ // First, dismiss any "no audio/video" modal that may be blocking
+ await this._dismissNoAudioVideoModal();
+
// Primary selector - confirmed working by Recall.ai (Jan 2025)
const primarySelector = 'button:has-text("Join now")';
@@ -185,6 +192,10 @@ export class JoinProcedure {
await this._page.waitForSelector(primarySelector, { timeout: 15000 });
await this._page.click(primarySelector);
this._logger.info('Clicked "Join now" button');
+
+ // After clicking Join, Teams may show the modal again. Dismiss if present.
+ await this._page.waitForTimeout(2000);
+ await this._dismissNoAudioVideoModal();
return;
} catch {
this._logger.info('Primary join button selector not found, trying fallbacks...');
@@ -204,6 +215,8 @@ export class JoinProcedure {
if (button) {
await button.click();
this._logger.info(`Clicked join button (fallback: ${selector})`);
+ await this._page.waitForTimeout(2000);
+ await this._dismissNoAudioVideoModal();
return;
}
} catch {
@@ -223,6 +236,35 @@ export class JoinProcedure {
);
}
+  /**
+   * Best-effort dismissal of Teams' "Are you sure you don't want audio or video?" modal,
+   * which Teams shows when it can't access camera/mic devices. Clicks the first matching
+   * "Continue without audio or video" button (English or German label), then pauses 1s.
+   * @returns Resolves after the first successful click, or after every selector was tried.
+   */
+  private async _dismissNoAudioVideoModal(): Promise<void> {
+    const modalSelectors = [
+      'button:has-text("Continue without audio or video")',
+      'button:has-text("Ohne Audio oder Video fortfahren")',
+      'button:has-text("Continue without")',
+      'button:has-text("Ohne Audio")',
+    ];
+    for (const selector of modalSelectors) {
+      try {
+        const button = await this._page.$(selector);
+        if (button) {
+          await button.click();
+          this._logger.info(`Dismissed no-audio modal: ${selector}`);
+          await this._page.waitForTimeout(1000); // brief fixed pause after the click
+          return;
+        }
+      } catch (err) {
+        // Best-effort: record why this selector failed, then try the next one.
+        this._logger.info(`Could not dismiss no-audio modal via ${selector}: ${err instanceof Error ? err.message : String(err)}`);
+      }
+    }
+    // No modal found - that's fine, it means devices were detected properly
+  }
+
/**
* Check if the bot is currently in the lobby (waiting to be admitted).
* Primary check: text "Someone will let you in shortly" (confirmed by Recall.ai).
diff --git a/src/bot/orchestrator.ts b/src/bot/orchestrator.ts
index 0eb347a..d34e261 100644
--- a/src/bot/orchestrator.ts
+++ b/src/bot/orchestrator.ts
@@ -437,9 +437,7 @@ export class BotOrchestrator {
headless: config.botHeadless,
args: [
'--use-fake-ui-for-media-stream', // Auto-accept media permissions
- // NOTE: --use-fake-device-for-media-stream is intentionally NOT used.
- // We override getUserMedia via addInitScript to return a MediaStreamDestination
- // that we control, so TTS audio can be injected into Teams' mic input.
+ '--use-fake-device-for-media-stream', // Provide fake camera/mic so Teams sees devices
'--disable-web-security',
'--disable-features=IsolateOrigins,site-per-process',
'--autoplay-policy=no-user-gesture-required',