fix: restore fake-device flag, wrap getUserMedia to swap audio track, handle no-audio modal
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
parent
bd63dfc40a
commit
39c8012358
3 changed files with 72 additions and 31 deletions
|
|
@ -5,11 +5,15 @@ import { Logger } from 'winston';
|
||||||
* Handles audio playback in the Teams meeting.
|
* Handles audio playback in the Teams meeting.
|
||||||
*
|
*
|
||||||
* Architecture:
|
* Architecture:
|
||||||
* - Before any page loads, we inject an init script that overrides getUserMedia
|
* - Browser launches with --use-fake-device-for-media-stream so Teams sees
|
||||||
* to return a MediaStream from a MediaStreamDestination we control.
|
* real-looking devices (no "no audio/video" modal).
|
||||||
* - When Teams calls getUserMedia({audio: true}), it gets our custom stream.
|
* - Before any page loads, we inject an init script that wraps getUserMedia.
|
||||||
* - When TTS audio is played, it's piped into the same MediaStreamDestination,
|
* - When Teams calls getUserMedia, the wrapper:
|
||||||
* so Teams picks it up as microphone input and sends it via WebRTC.
|
* 1. Calls the REAL getUserMedia (which returns Chromium's fake device stream)
|
||||||
|
* 2. Replaces the audio track with one from our MediaStreamDestination
|
||||||
|
* 3. Returns a new combined stream (our audio track + Chromium's fake video tracks)
|
||||||
|
* - When TTS audio is played, it's piped into the MediaStreamDestination,
|
||||||
|
* and Teams sends it via WebRTC to other meeting participants.
|
||||||
*/
|
*/
|
||||||
export class AudioProcedure {
|
export class AudioProcedure {
|
||||||
private _page: Page;
|
private _page: Page;
|
||||||
|
|
@ -23,7 +27,7 @@ export class AudioProcedure {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Inject the getUserMedia override BEFORE any page navigation.
|
* Inject the getUserMedia wrapper BEFORE any page navigation.
|
||||||
* This MUST be called before navigating to Teams.
|
* This MUST be called before navigating to Teams.
|
||||||
* Uses page.addInitScript so it runs in every new document context.
|
* Uses page.addInitScript so it runs in every new document context.
|
||||||
*/
|
*/
|
||||||
|
|
@ -35,8 +39,7 @@ export class AudioProcedure {
|
||||||
this._logger.info('Injecting audio getUserMedia override...');
|
this._logger.info('Injecting audio getUserMedia override...');
|
||||||
|
|
||||||
await this._page.addInitScript(() => {
|
await this._page.addInitScript(() => {
|
||||||
// Create a shared AudioContext and MediaStreamDestination
|
// Create a shared AudioContext and MediaStreamDestination for TTS injection
|
||||||
// These persist across the page lifetime
|
|
||||||
const AudioContextClass = window.AudioContext || (window as any).webkitAudioContext;
|
const AudioContextClass = window.AudioContext || (window as any).webkitAudioContext;
|
||||||
const ctx = new AudioContextClass();
|
const ctx = new AudioContextClass();
|
||||||
const streamDest = ctx.createMediaStreamDestination();
|
const streamDest = ctx.createMediaStreamDestination();
|
||||||
|
|
@ -46,29 +49,27 @@ export class AudioProcedure {
|
||||||
(window as any).__ttsStreamDest = streamDest;
|
(window as any).__ttsStreamDest = streamDest;
|
||||||
(window as any).__ttsAudioStream = streamDest.stream;
|
(window as any).__ttsAudioStream = streamDest.stream;
|
||||||
|
|
||||||
// Override getUserMedia to return our controlled stream for audio requests
|
// Wrap getUserMedia to replace audio tracks with our TTS-injectable stream
|
||||||
const originalGetUserMedia = navigator.mediaDevices.getUserMedia.bind(navigator.mediaDevices);
|
const originalGetUserMedia = navigator.mediaDevices.getUserMedia.bind(navigator.mediaDevices);
|
||||||
navigator.mediaDevices.getUserMedia = async (constraints?: MediaStreamConstraints) => {
|
navigator.mediaDevices.getUserMedia = async (constraints?: MediaStreamConstraints) => {
|
||||||
|
// Get the real stream (from Chromium's fake devices)
|
||||||
|
const realStream = await originalGetUserMedia(constraints);
|
||||||
|
|
||||||
if (constraints && constraints.audio) {
|
if (constraints && constraints.audio) {
|
||||||
// Return our TTS-injectable audio stream
|
// Build a new stream: our TTS audio track + their video tracks
|
||||||
// If video is also requested, combine our audio with real/fake video
|
const combinedStream = new MediaStream();
|
||||||
if (constraints.video) {
|
|
||||||
try {
|
// Add our controlled audio track (TTS will be piped here)
|
||||||
const videoStream = await originalGetUserMedia({ video: constraints.video });
|
streamDest.stream.getAudioTracks().forEach(t => combinedStream.addTrack(t));
|
||||||
const combinedStream = new MediaStream();
|
|
||||||
// Add our audio track
|
// Keep the real video tracks (from fake camera)
|
||||||
streamDest.stream.getAudioTracks().forEach(t => combinedStream.addTrack(t));
|
realStream.getVideoTracks().forEach(t => combinedStream.addTrack(t));
|
||||||
// Add their video track
|
|
||||||
videoStream.getVideoTracks().forEach(t => combinedStream.addTrack(t));
|
return combinedStream;
|
||||||
return combinedStream;
|
|
||||||
} catch {
|
|
||||||
// If video fails, just return audio
|
|
||||||
return streamDest.stream;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return streamDest.stream;
|
|
||||||
}
|
}
|
||||||
return originalGetUserMedia(constraints);
|
|
||||||
|
// No audio requested - return the real stream as-is
|
||||||
|
return realStream;
|
||||||
};
|
};
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
@ -157,7 +158,7 @@ export class AudioProcedure {
|
||||||
channelData[i] = pcmData[i] / 32768;
|
channelData[i] = pcmData[i] / 32768;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// MP3/WAV: Use decodeAudioData
|
// MP3/WAV: Use decodeAudioData on a copy (it detaches the ArrayBuffer it is given)
|
||||||
audioBuffer = await ctx.decodeAudioData(bytes.buffer.slice(0));
|
audioBuffer = await ctx.decodeAudioData(bytes.buffer.slice(0));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -174,10 +174,17 @@ export class JoinProcedure {
|
||||||
/**
|
/**
|
||||||
* Click the "Join now" button.
|
* Click the "Join now" button.
|
||||||
* Primary selector: button:has-text("Join now") (confirmed by Recall.ai).
|
* Primary selector: button:has-text("Join now") (confirmed by Recall.ai).
|
||||||
|
*
|
||||||
|
* IMPORTANT: Teams may show a "no audio/video" modal that blocks the Join button.
|
||||||
|
* This happens when getUserMedia doesn't return real-looking devices.
|
||||||
|
* We handle this by dismissing the modal first.
|
||||||
*/
|
*/
|
||||||
private async _clickJoinNow(): Promise<void> {
|
private async _clickJoinNow(): Promise<void> {
|
||||||
this._logger.info('Clicking Join now...');
|
this._logger.info('Clicking Join now...');
|
||||||
|
|
||||||
|
// First, dismiss any "no audio/video" modal that may be blocking
|
||||||
|
await this._dismissNoAudioVideoModal();
|
||||||
|
|
||||||
// Primary selector - confirmed working by Recall.ai (Jan 2025)
|
// Primary selector - confirmed working by Recall.ai (Jan 2025)
|
||||||
const primarySelector = 'button:has-text("Join now")';
|
const primarySelector = 'button:has-text("Join now")';
|
||||||
|
|
||||||
|
|
@ -185,6 +192,10 @@ export class JoinProcedure {
|
||||||
await this._page.waitForSelector(primarySelector, { timeout: 15000 });
|
await this._page.waitForSelector(primarySelector, { timeout: 15000 });
|
||||||
await this._page.click(primarySelector);
|
await this._page.click(primarySelector);
|
||||||
this._logger.info('Clicked "Join now" button');
|
this._logger.info('Clicked "Join now" button');
|
||||||
|
|
||||||
|
// After clicking Join, Teams may show the modal again. Dismiss if present.
|
||||||
|
await this._page.waitForTimeout(2000);
|
||||||
|
await this._dismissNoAudioVideoModal();
|
||||||
return;
|
return;
|
||||||
} catch {
|
} catch {
|
||||||
this._logger.info('Primary join button selector not found, trying fallbacks...');
|
this._logger.info('Primary join button selector not found, trying fallbacks...');
|
||||||
|
|
@ -204,6 +215,8 @@ export class JoinProcedure {
|
||||||
if (button) {
|
if (button) {
|
||||||
await button.click();
|
await button.click();
|
||||||
this._logger.info(`Clicked join button (fallback: ${selector})`);
|
this._logger.info(`Clicked join button (fallback: ${selector})`);
|
||||||
|
await this._page.waitForTimeout(2000);
|
||||||
|
await this._dismissNoAudioVideoModal();
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
} catch {
|
} catch {
|
||||||
|
|
@ -223,6 +236,35 @@ export class JoinProcedure {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Dismiss the "Are you sure you don't want audio or video?" modal.
|
||||||
|
* Teams shows this when it can't access camera/mic devices.
|
||||||
|
* We click "Continue without audio or video" to proceed.
|
||||||
|
*/
|
||||||
|
private async _dismissNoAudioVideoModal(): Promise<void> {
|
||||||
|
const modalSelectors = [
|
||||||
|
'button:has-text("Continue without audio or video")',
|
||||||
|
'button:has-text("Ohne Audio oder Video fortfahren")',
|
||||||
|
'button:has-text("Continue without")',
|
||||||
|
'button:has-text("Ohne Audio")',
|
||||||
|
];
|
||||||
|
|
||||||
|
for (const selector of modalSelectors) {
|
||||||
|
try {
|
||||||
|
const button = await this._page.$(selector);
|
||||||
|
if (button) {
|
||||||
|
await button.click();
|
||||||
|
this._logger.info(`Dismissed no-audio modal: ${selector}`);
|
||||||
|
await this._page.waitForTimeout(1000);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
} catch {
|
||||||
|
// Continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// No matching button found - either no modal was shown (devices were detected) or its label is not covered by the selectors above
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Check if the bot is currently in the lobby (waiting to be admitted).
|
* Check if the bot is currently in the lobby (waiting to be admitted).
|
||||||
* Primary check: text "Someone will let you in shortly" (confirmed by Recall.ai).
|
* Primary check: text "Someone will let you in shortly" (confirmed by Recall.ai).
|
||||||
|
|
|
||||||
|
|
@ -437,9 +437,7 @@ export class BotOrchestrator {
|
||||||
headless: config.botHeadless,
|
headless: config.botHeadless,
|
||||||
args: [
|
args: [
|
||||||
'--use-fake-ui-for-media-stream', // Auto-accept media permissions
|
'--use-fake-ui-for-media-stream', // Auto-accept media permissions
|
||||||
// NOTE: --use-fake-device-for-media-stream is intentionally NOT used.
|
'--use-fake-device-for-media-stream', // Provide fake camera/mic so Teams sees devices
|
||||||
// We override getUserMedia via addInitScript to return a MediaStreamDestination
|
|
||||||
// that we control, so TTS audio can be injected into Teams' mic input.
|
|
||||||
'--disable-web-security',
|
'--disable-web-security',
|
||||||
'--disable-features=IsolateOrigins,site-per-process',
|
'--disable-features=IsolateOrigins,site-per-process',
|
||||||
'--autoplay-policy=no-user-gesture-required',
|
'--autoplay-policy=no-user-gesture-required',
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue