fix: auth join detection, caption language dropdown, audio injection via getUserMedia override

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
ValueOn AG 2026-02-15 22:28:51 +01:00
parent 13bf75bea7
commit bd63dfc40a
4 changed files with 242 additions and 47 deletions

View file

@ -3,12 +3,19 @@ import { Logger } from 'winston';
/** /**
* Handles audio playback in the Teams meeting. * Handles audio playback in the Teams meeting.
* Injects TTS audio into the browser to be played through the meeting. *
* Architecture:
* - Before any page loads, we inject an init script that overrides getUserMedia
* to return a MediaStream from a MediaStreamDestination we control.
* - When Teams calls getUserMedia({audio: true}), it gets our custom stream.
* - When TTS audio is played, it's piped into the same MediaStreamDestination,
* so Teams picks it up as microphone input and sends it via WebRTC.
*/ */
export class AudioProcedure { export class AudioProcedure {
private _page: Page; private _page: Page;
private _logger: Logger; private _logger: Logger;
private _audioContext: boolean = false; private _audioContext: boolean = false;
private _initScriptInjected: boolean = false;
constructor(page: Page, logger: Logger) { constructor(page: Page, logger: Logger) {
this._page = page; this._page = page;
@ -16,8 +23,62 @@ export class AudioProcedure {
} }
/** /**
* Initialize the audio context in the browser. * Inject the getUserMedia override BEFORE any page navigation.
* Must be called after user interaction (joining meeting counts). * This MUST be called before navigating to Teams.
* Uses page.addInitScript so it runs in every new document context.
*/
async injectAudioOverride(): Promise<void> {
  // Registering the init script twice would stack two getUserMedia overrides.
  if (this._initScriptInjected) {
    return;
  }
  this._logger.info('Injecting audio getUserMedia override...');
  await this._page.addInitScript(() => {
    // Shared AudioContext + MediaStreamDestination; they live as long as the document.
    const AudioContextClass = window.AudioContext || (window as any).webkitAudioContext;
    const ctx = new AudioContextClass();
    const streamDest = ctx.createMediaStreamDestination();
    // Published on window so later page.evaluate() calls can pipe TTS audio into streamDest.
    (window as any).__ttsAudioContext = ctx;
    (window as any).__ttsStreamDest = streamDest;
    (window as any).__ttsAudioStream = streamDest.stream;

    // The init script runs in EVERY document (blank pages, sub-frames, ...).
    // navigator.mediaDevices is undefined outside secure contexts, so bail out
    // instead of throwing a TypeError into the page.
    const mediaDevices = navigator.mediaDevices;
    if (!mediaDevices || typeof mediaDevices.getUserMedia !== 'function') {
      return;
    }

    // Hand every caller its own clones of the destination's audio track(s).
    // If the page stops or disables "its" microphone track, only the clone is
    // affected; the shared destination track keeps carrying TTS audio for
    // later getUserMedia calls.
    const cloneTtsAudioTracks = (): MediaStreamTrack[] =>
      streamDest.stream.getAudioTracks().map((track) => track.clone());

    const originalGetUserMedia = mediaDevices.getUserMedia.bind(mediaDevices);
    mediaDevices.getUserMedia = async (constraints?: MediaStreamConstraints) => {
      // Requests without audio are passed through untouched.
      if (!constraints || !constraints.audio) {
        return originalGetUserMedia(constraints);
      }

      const injectedStream = new MediaStream(cloneTtsAudioTracks());
      if (constraints.video) {
        try {
          // Video still comes from the real (or fake) camera; only audio is replaced.
          const videoStream = await originalGetUserMedia({ video: constraints.video });
          videoStream.getVideoTracks().forEach((track) => injectedStream.addTrack(track));
        } catch {
          // Best effort: if the camera cannot be opened, fall back to audio only.
        }
      }
      return injectedStream;
    };
  });
  this._initScriptInjected = true;
  this._logger.info('Audio getUserMedia override injected');
}
/**
* Initialize the audio context in the browser for TTS playback.
* Must be called after joining the meeting (user gesture context).
*/ */
async initialize(): Promise<void> { async initialize(): Promise<void> {
if (this._audioContext) { if (this._audioContext) {
@ -27,30 +88,23 @@ export class AudioProcedure {
this._logger.info('Initializing audio context...'); this._logger.info('Initializing audio context...');
await this._page.evaluate(() => { await this._page.evaluate(() => {
// Create a global audio context // The __ttsAudioContext was created by the init script.
const AudioContext = window.AudioContext || (window as any).webkitAudioContext; // Resume it now (requires user gesture - joining meeting counts).
const ctx = new AudioContext(); const ctx = (window as any).__ttsAudioContext as AudioContext;
(window as any).__audioContext = ctx; if (ctx && ctx.state === 'suspended') {
(window as any).__audioQueue = []; ctx.resume();
(window as any).__isPlaying = false; }
// Create a MediaStream destination so audio is routed into the // If init script didn't run (e.g. page navigated before injection),
// browser's virtual microphone (picked up by Teams) instead of // create fallback audio infrastructure
// the default speaker output (ctx.destination). if (!ctx) {
const streamDest = ctx.createMediaStreamDestination(); const AudioContextClass = window.AudioContext || (window as any).webkitAudioContext;
(window as any).__audioStreamDest = streamDest; const newCtx = new AudioContextClass();
const streamDest = newCtx.createMediaStreamDestination();
// Expose the stream so headless Chromium can pipe it as mic input. (window as any).__ttsAudioContext = newCtx;
// navigator.mediaDevices.getUserMedia will be overridden to return this stream. (window as any).__ttsStreamDest = streamDest;
const audioStream = streamDest.stream; (window as any).__ttsAudioStream = streamDest.stream;
const originalGetUserMedia = navigator.mediaDevices.getUserMedia.bind(navigator.mediaDevices); }
navigator.mediaDevices.getUserMedia = async (constraints?: MediaStreamConstraints) => {
// If requesting audio only, return our TTS stream
if (constraints && constraints.audio && !constraints.video) {
return audioStream;
}
return originalGetUserMedia(constraints);
};
}); });
this._audioContext = true; this._audioContext = true;
@ -59,7 +113,7 @@ export class AudioProcedure {
/** /**
* Play audio in the browser. * Play audio in the browser.
* The audio will be heard by other meeting participants. * Audio is piped into the MediaStreamDestination that Teams uses as mic input.
* *
* @param audioData Base64 encoded audio data * @param audioData Base64 encoded audio data
* @param format Audio format (mp3, wav, pcm) * @param format Audio format (mp3, wav, pcm)
@ -73,8 +127,13 @@ export class AudioProcedure {
try { try {
await this._page.evaluate(async ({ audioData, format }) => { await this._page.evaluate(async ({ audioData, format }) => {
const ctx = (window as any).__audioContext as AudioContext; const ctx = (window as any).__ttsAudioContext as AudioContext;
const streamDest = (window as any).__ttsStreamDest as MediaStreamAudioDestinationNode;
if (!ctx || !streamDest) {
throw new Error('Audio context not initialized');
}
// Resume context if suspended // Resume context if suspended
if (ctx.state === 'suspended') { if (ctx.state === 'suspended') {
await ctx.resume(); await ctx.resume();
@ -95,22 +154,19 @@ export class AudioProcedure {
audioBuffer = ctx.createBuffer(1, pcmData.length, 16000); audioBuffer = ctx.createBuffer(1, pcmData.length, 16000);
const channelData = audioBuffer.getChannelData(0); const channelData = audioBuffer.getChannelData(0);
for (let i = 0; i < pcmData.length; i++) { for (let i = 0; i < pcmData.length; i++) {
channelData[i] = pcmData[i] / 32768; // Convert to float channelData[i] = pcmData[i] / 32768;
} }
} else { } else {
// MP3/WAV: Use decodeAudioData // MP3/WAV: Use decodeAudioData
audioBuffer = await ctx.decodeAudioData(bytes.buffer); audioBuffer = await ctx.decodeAudioData(bytes.buffer.slice(0));
} }
// Create source and play through the MediaStream destination // Play through the MediaStreamDestination -> Teams mic input
// so audio is routed into the Teams microphone input, not speakers
const source = ctx.createBufferSource(); const source = ctx.createBufferSource();
source.buffer = audioBuffer; source.buffer = audioBuffer;
const streamDest = (window as any).__audioStreamDest as MediaStreamAudioDestinationNode; source.connect(streamDest);
source.connect(streamDest || ctx.destination);
source.start(0); source.start(0);
// Return a promise that resolves when playback ends
return new Promise<void>((resolve) => { return new Promise<void>((resolve) => {
source.onended = () => resolve(); source.onended = () => resolve();
}); });
@ -129,7 +185,7 @@ export class AudioProcedure {
async stopAudio(): Promise<void> { async stopAudio(): Promise<void> {
try { try {
await this._page.evaluate(() => { await this._page.evaluate(() => {
const ctx = (window as any).__audioContext as AudioContext; const ctx = (window as any).__ttsAudioContext as AudioContext;
if (ctx) { if (ctx) {
ctx.suspend(); ctx.suspend();
} }
@ -145,7 +201,7 @@ export class AudioProcedure {
async cleanup(): Promise<void> { async cleanup(): Promise<void> {
try { try {
await this._page.evaluate(() => { await this._page.evaluate(() => {
const ctx = (window as any).__audioContext as AudioContext; const ctx = (window as any).__ttsAudioContext as AudioContext;
if (ctx) { if (ctx) {
ctx.close(); ctx.close();
} }

View file

@ -332,6 +332,32 @@ export class CaptionsProcedure {
// Look for the spoken language dropdown/combobox // Look for the spoken language dropdown/combobox
let languageSet = false; let languageSet = false;
// Debug aid: summarise which dropdown-like controls and buttons the settings panel currently shows
const panelInfo = await this._page.evaluate(() => {
  // Number of elements matching a selector
  const count = (query: string): number => document.querySelectorAll(query).length;

  // Describe up to ten buttons as TAG[label]; entries of 10 characters or fewer are skipped
  const described: string[] = [];
  for (const btn of Array.from(document.querySelectorAll('button'))) {
    if (described.length === 10) {
      break;
    }
    const caption = btn.getAttribute('aria-label') || btn.textContent?.trim().substring(0, 40);
    const entry = `${btn.tagName}[${caption}]`;
    if (entry.length > 10) {
      described.push(entry);
    }
  }

  const pageText = document.body?.innerText?.substring(0, 800) || '';
  return {
    selects: count('select'),
    comboboxes: count('[role="combobox"]'),
    listboxes: count('[role="listbox"]'),
    dropdowns: count('[class*="dropdown" i], [class*="Dropdown" i]'),
    buttons: described,
    bodySnippet: pageText,
  };
});
this._logger.info(`Caption settings panel - selects: ${panelInfo.selects}, comboboxes: ${panelInfo.comboboxes}, listboxes: ${panelInfo.listboxes}, dropdowns: ${panelInfo.dropdowns}`);
this._logger.info(`Panel buttons: ${JSON.stringify(panelInfo.buttons)}`);
this._logger.debug(`Panel text: ${panelInfo.bodySnippet.substring(0, 300)}`);
// Strategy A: Standard selectors
const dropdownSelectors = [ const dropdownSelectors = [
'select[aria-label*="spoken language" i]', 'select[aria-label*="spoken language" i]',
'select[aria-label*="Meeting spoken language" i]', 'select[aria-label*="Meeting spoken language" i]',
@ -339,7 +365,7 @@ export class CaptionsProcedure {
'[data-tid="spoken-language-dropdown"]', '[data-tid="spoken-language-dropdown"]',
'div[role="combobox"]', 'div[role="combobox"]',
'div[role="listbox"]', 'div[role="listbox"]',
'select', // Generic fallback 'select',
]; ];
for (const selector of dropdownSelectors) { for (const selector of dropdownSelectors) {
@ -350,7 +376,6 @@ export class CaptionsProcedure {
const tagName = await dropdown.evaluate(el => el.tagName.toLowerCase()); const tagName = await dropdown.evaluate(el => el.tagName.toLowerCase());
if (tagName === 'select') { if (tagName === 'select') {
// Native select element
for (const name of targetNames) { for (const name of targetNames) {
try { try {
await this._page.selectOption(selector, { label: name }); await this._page.selectOption(selector, { label: name });
@ -368,7 +393,6 @@ export class CaptionsProcedure {
for (const name of targetNames) { for (const name of targetNames) {
try { try {
// Try role="option" first, then generic text search
const optionSelectors = [ const optionSelectors = [
`[role="option"]:has-text("${name}")`, `[role="option"]:has-text("${name}")`,
`li:has-text("${name}")`, `li:has-text("${name}")`,
@ -397,6 +421,69 @@ export class CaptionsProcedure {
} }
} }
// Strategy B: in-page fallback - locate a language-related dropdown-like element, open it, pick an option
if (!languageSet) {
  this._logger.info('Standard dropdown selectors failed, trying DOM evaluation fallback...');

  languageSet = await this._page.evaluate(async (names: string[]) => {
    // Anything that might act as a dropdown (Fluent UI uses several patterns)
    const dropdownLike = Array.from(
      document.querySelectorAll<HTMLElement>(
        '[role="combobox"], [role="listbox"], select, ' +
        '[class*="dropdown" i], [class*="Dropdown"], ' +
        'button[aria-haspopup="listbox"], button[aria-haspopup="true"], ' +
        '[aria-expanded]'
      )
    );
    const isOnlyCandidate = dropdownLike.length === 1;

    // First candidate whose label or parent text mentions the spoken language,
    // or the sole candidate when there is exactly one
    const target = dropdownLike.find((node) => {
      const ariaLabel = (node.getAttribute('aria-label') || '').toLowerCase();
      const surrounding = (node.parentElement?.innerText || '').toLowerCase();
      return (
        ariaLabel.includes('language') ||
        ariaLabel.includes('sprache') ||
        surrounding.includes('spoken language') ||
        surrounding.includes('gesprochene sprache') ||
        isOnlyCandidate
      );
    });
    if (!target) {
      return false;
    }

    // Open the dropdown, then let two animation frames pass so the options can render
    target.click();
    await new Promise<void>((done) => {
      requestAnimationFrame(() => {
        requestAnimationFrame(() => done());
      });
    });

    // Click the first option whose text contains one of the wanted names
    const choice = Array.from(
      document.querySelectorAll<HTMLElement>(
        '[role="option"], [role="menuitem"], li[class*="option" i]'
      )
    ).find((opt) => {
      const optText = opt.innerText?.trim() || '';
      return names.some((wanted) => optText.includes(wanted));
    });
    if (!choice) {
      return false;
    }
    choice.click();
    return true;
  }, targetNames);

  if (languageSet) {
    this._logger.info('Selected spoken language via DOM evaluation fallback');
    await this._page.waitForTimeout(500);
  }
}
if (!languageSet) { if (!languageSet) {
this._logger.warn('Could not find/select spoken language in dropdown'); this._logger.warn('Could not find/select spoken language in dropdown');
} }

View file

@ -264,12 +264,14 @@ export class JoinProcedure {
* Check if the bot is currently in the meeting (admitted from lobby). * Check if the bot is currently in the meeting (admitted from lobby).
* Primary selector: button[id="hangup-button"] (confirmed by Recall.ai). * Primary selector: button[id="hangup-button"] (confirmed by Recall.ai).
* Note: Teams uses `id` (not `data-tid`) for the hangup button since 2025 redesign. * Note: Teams uses `id` (not `data-tid`) for the hangup button since 2025 redesign.
*
* For authenticated joins, Teams v2 sometimes renders differently.
* Additional fallback: check the URL for meeting patterns and DOM for call UI.
*/ */
async isInMeeting(options: { waitForSeconds?: number } = {}): Promise<boolean> { async isInMeeting(options: { waitForSeconds?: number } = {}): Promise<boolean> {
const timeout = (options.waitForSeconds || 5) * 1000; const timeout = (options.waitForSeconds || 5) * 1000;
// Primary selector - confirmed by Recall.ai (Jan 2025) // Primary selectors - known meeting UI elements
// Note: Teams now uses id="hangup-button" instead of data-tid="hangup-button"
const inMeetingSelectors = [ const inMeetingSelectors = [
'button[id="hangup-button"]', 'button[id="hangup-button"]',
'button[id="callingButtons-showMoreBtn"]', 'button[id="callingButtons-showMoreBtn"]',
@ -278,6 +280,16 @@ export class JoinProcedure {
'[data-tid="call-composite"]', '[data-tid="call-composite"]',
'button[aria-label*="Leave"]', 'button[aria-label*="Leave"]',
'[data-tid="callingButtons-showMoreBtn"]', '[data-tid="callingButtons-showMoreBtn"]',
// Teams v2 (2025+) additional selectors
'[data-tid="call-controls"]',
'[data-tid="meeting-composite"]',
'div[data-tid="video-gallery"]',
'button[aria-label*="Hang up"]',
'button[aria-label*="leave" i]',
// Mic/Camera toggle buttons are only visible in an active call
'button[id="microphone-button"]',
'button[data-tid="toggle-mute"]',
'[data-tid="microphone-button"]',
]; ];
try { try {
@ -287,8 +299,35 @@ export class JoinProcedure {
}); });
return true; return true;
} catch { } catch {
return false; // Selector-based detection failed, try DOM evaluation as fallback
} }
// Fallback: scan the page's visible text for typical in-call control labels
try {
  const looksLikeMeeting = await this._page.evaluate(() => {
    const visibleText = document.body?.innerText || '';
    // NOTE(review): matching is case-sensitive substring search; generic words such as
    // 'Share' or 'Leave' may also occur outside an active call - confirm against real pages.
    const controlLabels = [
      'Leave',            // leave button
      'Mute',             // microphone mute
      'Unmute',           // microphone unmute
      'Turn off camera',  // camera control
      'Turn on camera',
      'Share',            // screen sharing
    ];
    let hits = 0;
    for (const label of controlLabels) {
      if (visibleText.includes(label)) {
        hits += 1;
      }
    }
    // A single label is too weak a signal; require two or more
    return hits >= 2;
  });

  if (looksLikeMeeting) {
    this._logger.info('Detected meeting via DOM text analysis (fallback)');
    return true;
  }
} catch {
  // Page may not be ready
}

return false;
} }
/** /**

View file

@ -437,7 +437,9 @@ export class BotOrchestrator {
headless: config.botHeadless, headless: config.botHeadless,
args: [ args: [
'--use-fake-ui-for-media-stream', // Auto-accept media permissions '--use-fake-ui-for-media-stream', // Auto-accept media permissions
'--use-fake-device-for-media-stream', // Use fake devices // NOTE: --use-fake-device-for-media-stream is intentionally NOT used.
// We override getUserMedia via addInitScript to return a MediaStreamDestination
// that we control, so TTS audio can be injected into Teams' mic input.
'--disable-web-security', '--disable-web-security',
'--disable-features=IsolateOrigins,site-per-process', '--disable-features=IsolateOrigins,site-per-process',
'--autoplay-policy=no-user-gesture-required', '--autoplay-policy=no-user-gesture-required',
@ -468,6 +470,10 @@ export class BotOrchestrator {
); );
this._audioProcedure = new AudioProcedure(this._page, this._logger); this._audioProcedure = new AudioProcedure(this._page, this._logger);
// Inject audio getUserMedia override BEFORE any navigation
// This ensures Teams gets our controlled audio stream when it calls getUserMedia
await this._audioProcedure.injectAudioOverride();
// Handle page errors // Handle page errors
this._page.on('pageerror', (error) => { this._page.on('pageerror', (error) => {
this._logger.error('Page error:', error); this._logger.error('Page error:', error);
@ -537,11 +543,18 @@ export class BotOrchestrator {
// - Page is transitioning between states // - Page is transitioning between states
// Only give up after several consecutive cycles with no signal // Only give up after several consecutive cycles with no signal
consecutiveNoSignal++; consecutiveNoSignal++;
this._logger.info(`No lobby/meeting signal detected (attempt ${consecutiveNoSignal}/${maxNoSignal}), waiting...`); const currentUrl = this._page?.url() || 'unknown';
this._logger.info(`No lobby/meeting signal detected (attempt ${consecutiveNoSignal}/${maxNoSignal}), URL: ${currentUrl}`);
if (consecutiveNoSignal >= maxNoSignal) { if (consecutiveNoSignal >= maxNoSignal) {
// Take a screenshot for debugging before giving up // Take a screenshot and log page content for debugging before giving up
await this._takeScreenshot('no-meeting-signal'); await this._takeScreenshot('no-meeting-signal');
try {
const bodySnippet = await this._page?.evaluate(() =>
document.body?.innerText?.substring(0, 500) || '(empty)'
);
this._logger.warn(`Page content before giving up: ${bodySnippet}`);
} catch { /* ignore */ }
throw new Error('Bot was removed from lobby or meeting ended'); throw new Error('Bot was removed from lobby or meeting ended');
} }
} }