fix: auth join detection, caption language dropdown, audio injection via getUserMedia override
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
parent
13bf75bea7
commit
bd63dfc40a
4 changed files with 242 additions and 47 deletions
|
|
@ -3,12 +3,19 @@ import { Logger } from 'winston';
|
|||
|
||||
/**
|
||||
* Handles audio playback in the Teams meeting.
|
||||
* Injects TTS audio into the browser to be played through the meeting.
|
||||
*
|
||||
* Architecture:
|
||||
* - Before any page loads, we inject an init script that overrides getUserMedia
|
||||
* to return a MediaStream from a MediaStreamDestination we control.
|
||||
* - When Teams calls getUserMedia({audio: true}), it gets our custom stream.
|
||||
* - When TTS audio is played, it's piped into the same MediaStreamDestination,
|
||||
* so Teams picks it up as microphone input and sends it via WebRTC.
|
||||
*/
|
||||
export class AudioProcedure {
|
||||
private _page: Page;
|
||||
private _logger: Logger;
|
||||
private _audioContext: boolean = false;
|
||||
private _initScriptInjected: boolean = false;
|
||||
|
||||
constructor(page: Page, logger: Logger) {
|
||||
this._page = page;
|
||||
|
|
@ -16,8 +23,62 @@ export class AudioProcedure {
|
|||
}
|
||||
|
||||
/**
|
||||
* Initialize the audio context in the browser.
|
||||
* Must be called after user interaction (joining meeting counts).
|
||||
* Inject the getUserMedia override BEFORE any page navigation.
|
||||
* This MUST be called before navigating to Teams.
|
||||
* Uses page.addInitScript so it runs in every new document context.
|
||||
*/
|
||||
async injectAudioOverride(): Promise<void> {
  // Idempotent: init scripts registered on a page accumulate, so registering
  // twice would wrap getUserMedia twice in every document.
  if (this._initScriptInjected) {
    return;
  }

  this._logger.info('Injecting audio getUserMedia override...');

  // NOTE: the callback below is serialized and executed inside the browser for
  // every new document, so it must not reference anything from this Node scope.
  await this._page.addInitScript(() => {
    const win = window as any;
    const mediaDevices = navigator.mediaDevices;
    const AudioContextClass = window.AudioContext || win.webkitAudioContext;

    // Init scripts also run in documents where the media APIs are unavailable
    // (e.g. insecure contexts expose no navigator.mediaDevices). Skip those
    // documents instead of throwing a page error.
    if (!mediaDevices || typeof mediaDevices.getUserMedia !== 'function' || !AudioContextClass) {
      return;
    }

    // Shared AudioContext + MediaStreamDestination for the lifetime of this document.
    const ctx = new AudioContextClass();
    const streamDest = ctx.createMediaStreamDestination();

    // Exposed globally so later page.evaluate() calls can resume the context
    // and connect TTS buffer sources to the destination node.
    win.__ttsAudioContext = ctx;
    win.__ttsStreamDest = streamDest;
    win.__ttsAudioStream = streamDest.stream;

    const originalGetUserMedia = mediaDevices.getUserMedia.bind(mediaDevices);

    // Each caller gets its own clone of the destination track. If the page
    // stops "its" microphone track, only the clone ends; the source track
    // stays live and later getUserMedia calls still receive working audio.
    const cloneTtsTracks = (): MediaStreamTrack[] =>
      streamDest.stream.getAudioTracks().map((track) => track.clone());

    mediaDevices.getUserMedia = async (constraints?: MediaStreamConstraints) => {
      // Requests without audio are passed through untouched.
      if (!constraints || !constraints.audio) {
        return originalGetUserMedia(constraints);
      }

      const injectedStream = new MediaStream(cloneTtsTracks());

      if (constraints.video) {
        try {
          // Audio comes from us; video still comes from the real/fake camera.
          const videoStream = await originalGetUserMedia({ video: constraints.video });
          videoStream.getVideoTracks().forEach((track) => injectedStream.addTrack(track));
        } catch {
          // Best effort: if no camera is available, fall back to audio only.
        }
      }

      return injectedStream;
    };
  });

  this._initScriptInjected = true;
  this._logger.info('Audio getUserMedia override injected');
}
|
||||
|
||||
/**
|
||||
* Initialize the audio context in the browser for TTS playback.
|
||||
* Must be called after joining the meeting (user gesture context).
|
||||
*/
|
||||
async initialize(): Promise<void> {
|
||||
if (this._audioContext) {
|
||||
|
|
@ -27,30 +88,23 @@ export class AudioProcedure {
|
|||
this._logger.info('Initializing audio context...');
|
||||
|
||||
await this._page.evaluate(() => {
|
||||
// Create a global audio context
|
||||
const AudioContext = window.AudioContext || (window as any).webkitAudioContext;
|
||||
const ctx = new AudioContext();
|
||||
(window as any).__audioContext = ctx;
|
||||
(window as any).__audioQueue = [];
|
||||
(window as any).__isPlaying = false;
|
||||
// The __ttsAudioContext was created by the init script.
|
||||
// Resume it now (requires user gesture - joining meeting counts).
|
||||
const ctx = (window as any).__ttsAudioContext as AudioContext;
|
||||
if (ctx && ctx.state === 'suspended') {
|
||||
ctx.resume();
|
||||
}
|
||||
|
||||
// Create a MediaStream destination so audio is routed into the
|
||||
// browser's virtual microphone (picked up by Teams) instead of
|
||||
// the default speaker output (ctx.destination).
|
||||
const streamDest = ctx.createMediaStreamDestination();
|
||||
(window as any).__audioStreamDest = streamDest;
|
||||
|
||||
// Expose the stream so headless Chromium can pipe it as mic input.
|
||||
// navigator.mediaDevices.getUserMedia will be overridden to return this stream.
|
||||
const audioStream = streamDest.stream;
|
||||
const originalGetUserMedia = navigator.mediaDevices.getUserMedia.bind(navigator.mediaDevices);
|
||||
navigator.mediaDevices.getUserMedia = async (constraints?: MediaStreamConstraints) => {
|
||||
// If requesting audio only, return our TTS stream
|
||||
if (constraints && constraints.audio && !constraints.video) {
|
||||
return audioStream;
|
||||
}
|
||||
return originalGetUserMedia(constraints);
|
||||
};
|
||||
// If init script didn't run (e.g. page navigated before injection),
|
||||
// create fallback audio infrastructure
|
||||
if (!ctx) {
|
||||
const AudioContextClass = window.AudioContext || (window as any).webkitAudioContext;
|
||||
const newCtx = new AudioContextClass();
|
||||
const streamDest = newCtx.createMediaStreamDestination();
|
||||
(window as any).__ttsAudioContext = newCtx;
|
||||
(window as any).__ttsStreamDest = streamDest;
|
||||
(window as any).__ttsAudioStream = streamDest.stream;
|
||||
}
|
||||
});
|
||||
|
||||
this._audioContext = true;
|
||||
|
|
@ -59,7 +113,7 @@ export class AudioProcedure {
|
|||
|
||||
/**
|
||||
* Play audio in the browser.
|
||||
* The audio will be heard by other meeting participants.
|
||||
* Audio is piped into the MediaStreamDestination that Teams uses as mic input.
|
||||
*
|
||||
* @param audioData Base64 encoded audio data
|
||||
* @param format Audio format (mp3, wav, pcm)
|
||||
|
|
@ -73,7 +127,12 @@ export class AudioProcedure {
|
|||
|
||||
try {
|
||||
await this._page.evaluate(async ({ audioData, format }) => {
|
||||
const ctx = (window as any).__audioContext as AudioContext;
|
||||
const ctx = (window as any).__ttsAudioContext as AudioContext;
|
||||
const streamDest = (window as any).__ttsStreamDest as MediaStreamAudioDestinationNode;
|
||||
|
||||
if (!ctx || !streamDest) {
|
||||
throw new Error('Audio context not initialized');
|
||||
}
|
||||
|
||||
// Resume context if suspended
|
||||
if (ctx.state === 'suspended') {
|
||||
|
|
@ -95,22 +154,19 @@ export class AudioProcedure {
|
|||
audioBuffer = ctx.createBuffer(1, pcmData.length, 16000);
|
||||
const channelData = audioBuffer.getChannelData(0);
|
||||
for (let i = 0; i < pcmData.length; i++) {
|
||||
channelData[i] = pcmData[i] / 32768; // Convert to float
|
||||
channelData[i] = pcmData[i] / 32768;
|
||||
}
|
||||
} else {
|
||||
// MP3/WAV: Use decodeAudioData
|
||||
audioBuffer = await ctx.decodeAudioData(bytes.buffer);
|
||||
audioBuffer = await ctx.decodeAudioData(bytes.buffer.slice(0));
|
||||
}
|
||||
|
||||
// Create source and play through the MediaStream destination
|
||||
// so audio is routed into the Teams microphone input, not speakers
|
||||
// Play through the MediaStreamDestination -> Teams mic input
|
||||
const source = ctx.createBufferSource();
|
||||
source.buffer = audioBuffer;
|
||||
const streamDest = (window as any).__audioStreamDest as MediaStreamAudioDestinationNode;
|
||||
source.connect(streamDest || ctx.destination);
|
||||
source.connect(streamDest);
|
||||
source.start(0);
|
||||
|
||||
// Return a promise that resolves when playback ends
|
||||
return new Promise<void>((resolve) => {
|
||||
source.onended = () => resolve();
|
||||
});
|
||||
|
|
@ -129,7 +185,7 @@ export class AudioProcedure {
|
|||
async stopAudio(): Promise<void> {
|
||||
try {
|
||||
await this._page.evaluate(() => {
|
||||
const ctx = (window as any).__audioContext as AudioContext;
|
||||
const ctx = (window as any).__ttsAudioContext as AudioContext;
|
||||
if (ctx) {
|
||||
ctx.suspend();
|
||||
}
|
||||
|
|
@ -145,7 +201,7 @@ export class AudioProcedure {
|
|||
async cleanup(): Promise<void> {
|
||||
try {
|
||||
await this._page.evaluate(() => {
|
||||
const ctx = (window as any).__audioContext as AudioContext;
|
||||
const ctx = (window as any).__ttsAudioContext as AudioContext;
|
||||
if (ctx) {
|
||||
ctx.close();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -332,6 +332,32 @@ export class CaptionsProcedure {
|
|||
|
||||
// Look for the spoken language dropdown/combobox
|
||||
let languageSet = false;
|
||||
|
||||
// First, log what's visible in the settings panel for debugging
|
||||
const panelInfo = await this._page.evaluate(() => {
|
||||
const selects = document.querySelectorAll('select');
|
||||
const comboboxes = document.querySelectorAll('[role="combobox"]');
|
||||
const listboxes = document.querySelectorAll('[role="listbox"]');
|
||||
const dropdowns = document.querySelectorAll('[class*="dropdown" i], [class*="Dropdown" i]');
|
||||
const allButtons = document.querySelectorAll('button');
|
||||
const buttonsWithText = Array.from(allButtons)
|
||||
.map(b => `${b.tagName}[${b.getAttribute('aria-label') || b.textContent?.trim().substring(0, 40)}]`)
|
||||
.filter(t => t.length > 10)
|
||||
.slice(0, 10);
|
||||
return {
|
||||
selects: selects.length,
|
||||
comboboxes: comboboxes.length,
|
||||
listboxes: listboxes.length,
|
||||
dropdowns: dropdowns.length,
|
||||
buttons: buttonsWithText,
|
||||
bodySnippet: document.body?.innerText?.substring(0, 800) || '',
|
||||
};
|
||||
});
|
||||
this._logger.info(`Caption settings panel - selects: ${panelInfo.selects}, comboboxes: ${panelInfo.comboboxes}, listboxes: ${panelInfo.listboxes}, dropdowns: ${panelInfo.dropdowns}`);
|
||||
this._logger.info(`Panel buttons: ${JSON.stringify(panelInfo.buttons)}`);
|
||||
this._logger.debug(`Panel text: ${panelInfo.bodySnippet.substring(0, 300)}`);
|
||||
|
||||
// Strategy A: Standard selectors
|
||||
const dropdownSelectors = [
|
||||
'select[aria-label*="spoken language" i]',
|
||||
'select[aria-label*="Meeting spoken language" i]',
|
||||
|
|
@ -339,7 +365,7 @@ export class CaptionsProcedure {
|
|||
'[data-tid="spoken-language-dropdown"]',
|
||||
'div[role="combobox"]',
|
||||
'div[role="listbox"]',
|
||||
'select', // Generic fallback
|
||||
'select',
|
||||
];
|
||||
|
||||
for (const selector of dropdownSelectors) {
|
||||
|
|
@ -350,7 +376,6 @@ export class CaptionsProcedure {
|
|||
const tagName = await dropdown.evaluate(el => el.tagName.toLowerCase());
|
||||
|
||||
if (tagName === 'select') {
|
||||
// Native select element
|
||||
for (const name of targetNames) {
|
||||
try {
|
||||
await this._page.selectOption(selector, { label: name });
|
||||
|
|
@ -368,7 +393,6 @@ export class CaptionsProcedure {
|
|||
|
||||
for (const name of targetNames) {
|
||||
try {
|
||||
// Try role="option" first, then generic text search
|
||||
const optionSelectors = [
|
||||
`[role="option"]:has-text("${name}")`,
|
||||
`li:has-text("${name}")`,
|
||||
|
|
@ -397,6 +421,69 @@ export class CaptionsProcedure {
|
|||
}
|
||||
}
|
||||
|
||||
// Strategy B: DOM evaluation fallback - find any dropdown-like element and interact
|
||||
if (!languageSet) {
|
||||
this._logger.info('Standard dropdown selectors failed, trying DOM evaluation fallback...');
|
||||
|
||||
languageSet = await this._page.evaluate((names: string[]) => {
|
||||
// Find all elements that could be dropdowns (Fluent UI uses various patterns)
|
||||
const candidates = document.querySelectorAll(
|
||||
'[role="combobox"], [role="listbox"], select, ' +
|
||||
'[class*="dropdown" i], [class*="Dropdown"], ' +
|
||||
'button[aria-haspopup="listbox"], button[aria-haspopup="true"], ' +
|
||||
'[aria-expanded]'
|
||||
);
|
||||
|
||||
for (let i = 0; i < candidates.length; i++) {
|
||||
const el = candidates[i] as HTMLElement;
|
||||
const label = el.getAttribute('aria-label') || '';
|
||||
const nearbyText = el.parentElement?.innerText || '';
|
||||
|
||||
// Check if this dropdown is related to language
|
||||
const isLanguageRelated =
|
||||
label.toLowerCase().includes('language') ||
|
||||
label.toLowerCase().includes('sprache') ||
|
||||
nearbyText.toLowerCase().includes('spoken language') ||
|
||||
nearbyText.toLowerCase().includes('gesprochene sprache');
|
||||
|
||||
if (isLanguageRelated || candidates.length === 1) {
|
||||
// Click to open the dropdown
|
||||
el.click();
|
||||
|
||||
// Wait a frame for options to render
|
||||
return new Promise<boolean>((resolve) => {
|
||||
requestAnimationFrame(() => {
|
||||
requestAnimationFrame(() => {
|
||||
// Look for options
|
||||
const options = document.querySelectorAll(
|
||||
'[role="option"], [role="menuitem"], li[class*="option" i]'
|
||||
);
|
||||
|
||||
for (let j = 0; j < options.length; j++) {
|
||||
const opt = options[j] as HTMLElement;
|
||||
const optText = opt.innerText?.trim() || '';
|
||||
|
||||
if (names.some(n => optText.includes(n))) {
|
||||
opt.click();
|
||||
resolve(true);
|
||||
return;
|
||||
}
|
||||
}
|
||||
resolve(false);
|
||||
});
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
return Promise.resolve(false);
|
||||
}, targetNames);
|
||||
|
||||
if (languageSet) {
|
||||
this._logger.info('Selected spoken language via DOM evaluation fallback');
|
||||
await this._page.waitForTimeout(500);
|
||||
}
|
||||
}
|
||||
|
||||
if (!languageSet) {
|
||||
this._logger.warn('Could not find/select spoken language in dropdown');
|
||||
}
|
||||
|
|
|
|||
|
|
@ -264,12 +264,14 @@ export class JoinProcedure {
|
|||
* Check if the bot is currently in the meeting (admitted from lobby).
|
||||
* Primary selector: button[id="hangup-button"] (confirmed by Recall.ai).
|
||||
* Note: Teams uses `id` (not `data-tid`) for the hangup button since 2025 redesign.
|
||||
*
|
||||
* For authenticated joins, Teams v2 sometimes renders differently.
|
||||
* Additional fallback: check the URL for meeting patterns and DOM for call UI.
|
||||
*/
|
||||
async isInMeeting(options: { waitForSeconds?: number } = {}): Promise<boolean> {
|
||||
const timeout = (options.waitForSeconds || 5) * 1000;
|
||||
|
||||
// Primary selector - confirmed by Recall.ai (Jan 2025)
|
||||
// Note: Teams now uses id="hangup-button" instead of data-tid="hangup-button"
|
||||
// Primary selectors - known meeting UI elements
|
||||
const inMeetingSelectors = [
|
||||
'button[id="hangup-button"]',
|
||||
'button[id="callingButtons-showMoreBtn"]',
|
||||
|
|
@ -278,6 +280,16 @@ export class JoinProcedure {
|
|||
'[data-tid="call-composite"]',
|
||||
'button[aria-label*="Leave"]',
|
||||
'[data-tid="callingButtons-showMoreBtn"]',
|
||||
// Teams v2 (2025+) additional selectors
|
||||
'[data-tid="call-controls"]',
|
||||
'[data-tid="meeting-composite"]',
|
||||
'div[data-tid="video-gallery"]',
|
||||
'button[aria-label*="Hang up"]',
|
||||
'button[aria-label*="leave" i]',
|
||||
// Mic/Camera toggle buttons are only visible in an active call
|
||||
'button[id="microphone-button"]',
|
||||
'button[data-tid="toggle-mute"]',
|
||||
'[data-tid="microphone-button"]',
|
||||
];
|
||||
|
||||
try {
|
||||
|
|
@ -287,8 +299,35 @@ export class JoinProcedure {
|
|||
});
|
||||
return true;
|
||||
} catch {
|
||||
return false;
|
||||
// Selector-based detection failed, try DOM evaluation as fallback
|
||||
}
|
||||
|
||||
// Fallback: evaluate the page for meeting indicators
|
||||
try {
|
||||
const inMeeting = await this._page.evaluate(() => {
|
||||
// Check for call-related aria roles and meeting elements
|
||||
const bodyText = document.body?.innerText || '';
|
||||
const meetingIndicators = [
|
||||
'Leave', // Leave button text
|
||||
'Mute', // Mic mute button
|
||||
'Unmute', // Mic unmute button
|
||||
'Turn off camera', // Camera control
|
||||
'Turn on camera',
|
||||
'Share', // Share screen
|
||||
];
|
||||
const found = meetingIndicators.filter(ind => bodyText.includes(ind));
|
||||
// Need at least 2 meeting indicators to confirm we're in a meeting
|
||||
return found.length >= 2;
|
||||
});
|
||||
if (inMeeting) {
|
||||
this._logger.info('Detected meeting via DOM text analysis (fallback)');
|
||||
return true;
|
||||
}
|
||||
} catch {
|
||||
// Page may not be ready
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -437,7 +437,9 @@ export class BotOrchestrator {
|
|||
headless: config.botHeadless,
|
||||
args: [
|
||||
'--use-fake-ui-for-media-stream', // Auto-accept media permissions
|
||||
'--use-fake-device-for-media-stream', // Use fake devices
|
||||
// NOTE: --use-fake-device-for-media-stream is intentionally NOT used.
|
||||
// We override getUserMedia via addInitScript to return a MediaStreamDestination
|
||||
// that we control, so TTS audio can be injected into Teams' mic input.
|
||||
'--disable-web-security',
|
||||
'--disable-features=IsolateOrigins,site-per-process',
|
||||
'--autoplay-policy=no-user-gesture-required',
|
||||
|
|
@ -468,6 +470,10 @@ export class BotOrchestrator {
|
|||
);
|
||||
this._audioProcedure = new AudioProcedure(this._page, this._logger);
|
||||
|
||||
// Inject audio getUserMedia override BEFORE any navigation
|
||||
// This ensures Teams gets our controlled audio stream when it calls getUserMedia
|
||||
await this._audioProcedure.injectAudioOverride();
|
||||
|
||||
// Handle page errors
|
||||
this._page.on('pageerror', (error) => {
|
||||
this._logger.error('Page error:', error);
|
||||
|
|
@ -537,11 +543,18 @@ export class BotOrchestrator {
|
|||
// - Page is transitioning between states
|
||||
// Only give up after several consecutive cycles with no signal
|
||||
consecutiveNoSignal++;
|
||||
this._logger.info(`No lobby/meeting signal detected (attempt ${consecutiveNoSignal}/${maxNoSignal}), waiting...`);
|
||||
const currentUrl = this._page?.url() || 'unknown';
|
||||
this._logger.info(`No lobby/meeting signal detected (attempt ${consecutiveNoSignal}/${maxNoSignal}), URL: ${currentUrl}`);
|
||||
|
||||
if (consecutiveNoSignal >= maxNoSignal) {
|
||||
// Take a screenshot for debugging before giving up
|
||||
// Take a screenshot and log page content for debugging before giving up
|
||||
await this._takeScreenshot('no-meeting-signal');
|
||||
try {
|
||||
const bodySnippet = await this._page?.evaluate(() =>
|
||||
document.body?.innerText?.substring(0, 500) || '(empty)'
|
||||
);
|
||||
this._logger.warn(`Page content before giving up: ${bodySnippet}`);
|
||||
} catch { /* ignore */ }
|
||||
throw new Error('Bot was removed from lobby or meeting ended');
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue