From bd63dfc40ab4c97a841ea470a956651bde4923af Mon Sep 17 00:00:00 2001 From: ValueOn AG Date: Sun, 15 Feb 2026 22:28:51 +0100 Subject: [PATCH] fix: auth join detection, caption language dropdown, audio injection via getUserMedia override Co-authored-by: Cursor --- src/bot/audioProcedure.ts | 132 +++++++++++++++++++++++++---------- src/bot/captionsProcedure.ts | 93 +++++++++++++++++++++++- src/bot/joinProcedure.ts | 45 +++++++++++- src/bot/orchestrator.ts | 19 ++++- 4 files changed, 242 insertions(+), 47 deletions(-) diff --git a/src/bot/audioProcedure.ts b/src/bot/audioProcedure.ts index 7927657..3a90ac0 100644 --- a/src/bot/audioProcedure.ts +++ b/src/bot/audioProcedure.ts @@ -3,12 +3,19 @@ import { Logger } from 'winston'; /** * Handles audio playback in the Teams meeting. - * Injects TTS audio into the browser to be played through the meeting. + * + * Architecture: + * - Before any page loads, we inject an init script that overrides getUserMedia + * to return a MediaStream from a MediaStreamDestination we control. + * - When Teams calls getUserMedia({audio: true}), it gets our custom stream. + * - When TTS audio is played, it's piped into the same MediaStreamDestination, + * so Teams picks it up as microphone input and sends it via WebRTC. */ export class AudioProcedure { private _page: Page; private _logger: Logger; private _audioContext: boolean = false; + private _initScriptInjected: boolean = false; constructor(page: Page, logger: Logger) { this._page = page; @@ -16,8 +23,62 @@ export class AudioProcedure { } /** - * Initialize the audio context in the browser. - * Must be called after user interaction (joining meeting counts). + * Inject the getUserMedia override BEFORE any page navigation. + * This MUST be called before navigating to Teams. + * Uses page.addInitScript so it runs in every new document context. + */ + async injectAudioOverride(): Promise { + if (this._initScriptInjected) { + return; + } + + this._logger.info('Injecting audio getUserMedia override...'); + + await this._page.addInitScript(() => { + // Create a shared AudioContext and MediaStreamDestination + // These persist across the page lifetime + const AudioContextClass = window.AudioContext || (window as any).webkitAudioContext; + const ctx = new AudioContextClass(); + const streamDest = ctx.createMediaStreamDestination(); + + // Store globally for later TTS injection + (window as any).__ttsAudioContext = ctx; + (window as any).__ttsStreamDest = streamDest; + (window as any).__ttsAudioStream = streamDest.stream; + + // Override getUserMedia to return our controlled stream for audio requests + const originalGetUserMedia = navigator.mediaDevices.getUserMedia.bind(navigator.mediaDevices); + navigator.mediaDevices.getUserMedia = async (constraints?: MediaStreamConstraints) => { + if (constraints && constraints.audio) { + // Return our TTS-injectable audio stream + // If video is also requested, combine our audio with real/fake video + if (constraints.video) { + try { + const videoStream = await originalGetUserMedia({ video: constraints.video }); + const combinedStream = new MediaStream(); + // Add our audio track + streamDest.stream.getAudioTracks().forEach(t => combinedStream.addTrack(t)); + // Add their video track + videoStream.getVideoTracks().forEach(t => combinedStream.addTrack(t)); + return combinedStream; + } catch { + // If video fails, just return audio + return streamDest.stream; + } + } + return streamDest.stream; + } + return originalGetUserMedia(constraints); + }; + }); + + this._initScriptInjected = true; + this._logger.info('Audio getUserMedia override injected'); + } + + /** + * Initialize the audio context in the browser for TTS playback. + * Must be called after joining the meeting (user gesture context). */ async initialize(): Promise { if (this._audioContext) { @@ -27,30 +88,23 @@ export class AudioProcedure { this._logger.info('Initializing audio context...'); await this._page.evaluate(() => { - // Create a global audio context - const AudioContext = window.AudioContext || (window as any).webkitAudioContext; - const ctx = new AudioContext(); - (window as any).__audioContext = ctx; - (window as any).__audioQueue = []; - (window as any).__isPlaying = false; + // The __ttsAudioContext was created by the init script. + // Resume it now (requires user gesture - joining meeting counts). + const ctx = (window as any).__ttsAudioContext as AudioContext; + if (ctx && ctx.state === 'suspended') { + ctx.resume(); + } - // Create a MediaStream destination so audio is routed into the - // browser's virtual microphone (picked up by Teams) instead of - // the default speaker output (ctx.destination). - const streamDest = ctx.createMediaStreamDestination(); - (window as any).__audioStreamDest = streamDest; - - // Expose the stream so headless Chromium can pipe it as mic input. - // navigator.mediaDevices.getUserMedia will be overridden to return this stream. - const audioStream = streamDest.stream; - const originalGetUserMedia = navigator.mediaDevices.getUserMedia.bind(navigator.mediaDevices); - navigator.mediaDevices.getUserMedia = async (constraints?: MediaStreamConstraints) => { - // If requesting audio only, return our TTS stream - if (constraints && constraints.audio && !constraints.video) { - return audioStream; - } - return originalGetUserMedia(constraints); - }; + // If init script didn't run (e.g. page navigated before injection), + // create fallback audio infrastructure + if (!ctx) { + const AudioContextClass = window.AudioContext || (window as any).webkitAudioContext; + const newCtx = new AudioContextClass(); + const streamDest = newCtx.createMediaStreamDestination(); + (window as any).__ttsAudioContext = newCtx; + (window as any).__ttsStreamDest = streamDest; + (window as any).__ttsAudioStream = streamDest.stream; + } }); this._audioContext = true; @@ -59,7 +113,7 @@ export class AudioProcedure { /** * Play audio in the browser. - * The audio will be heard by other meeting participants. + * Audio is piped into the MediaStreamDestination that Teams uses as mic input. * * @param audioData Base64 encoded audio data * @param format Audio format (mp3, wav, pcm) @@ -73,8 +127,13 @@ export class AudioProcedure { try { await this._page.evaluate(async ({ audioData, format }) => { - const ctx = (window as any).__audioContext as AudioContext; - + const ctx = (window as any).__ttsAudioContext as AudioContext; + const streamDest = (window as any).__ttsStreamDest as MediaStreamAudioDestinationNode; + + if (!ctx || !streamDest) { + throw new Error('Audio context not initialized'); + } + // Resume context if suspended if (ctx.state === 'suspended') { await ctx.resume(); @@ -95,22 +154,19 @@ export class AudioProcedure { audioBuffer = ctx.createBuffer(1, pcmData.length, 16000); const channelData = audioBuffer.getChannelData(0); for (let i = 0; i < pcmData.length; i++) { - channelData[i] = pcmData[i] / 32768; // Convert to float + channelData[i] = pcmData[i] / 32768; } } else { // MP3/WAV: Use decodeAudioData - audioBuffer = await ctx.decodeAudioData(bytes.buffer); + audioBuffer = await ctx.decodeAudioData(bytes.buffer.slice(0)); } - // Create source and play through the MediaStream destination - // so audio is routed into the Teams microphone input, not speakers + // Play through the MediaStreamDestination -> Teams mic input const source = ctx.createBufferSource(); source.buffer = audioBuffer; - const streamDest = (window as any).__audioStreamDest as MediaStreamAudioDestinationNode; - source.connect(streamDest || ctx.destination); + source.connect(streamDest); source.start(0); - // Return a promise that resolves when playback ends return new Promise((resolve) => { source.onended = () => resolve(); }); @@ -129,7 +185,7 @@ export class AudioProcedure { async stopAudio(): Promise { try { await this._page.evaluate(() => { - const ctx = (window as any).__audioContext as AudioContext; + const ctx = (window as any).__ttsAudioContext as AudioContext; if (ctx) { ctx.suspend(); } @@ -145,7 +201,7 @@ export class AudioProcedure { async cleanup(): Promise { try { await this._page.evaluate(() => { - const ctx = (window as any).__audioContext as AudioContext; + const ctx = (window as any).__ttsAudioContext as AudioContext; if (ctx) { ctx.close(); } diff --git a/src/bot/captionsProcedure.ts b/src/bot/captionsProcedure.ts index a79b1f3..20c58b9 100644 --- a/src/bot/captionsProcedure.ts +++ b/src/bot/captionsProcedure.ts @@ -332,6 +332,32 @@ export class CaptionsProcedure { // Look for the spoken language dropdown/combobox let languageSet = false; + + // First, log what's visible in the settings panel for debugging + const panelInfo = await this._page.evaluate(() => { + const selects = document.querySelectorAll('select'); + const comboboxes = document.querySelectorAll('[role="combobox"]'); + const listboxes = document.querySelectorAll('[role="listbox"]'); + const dropdowns = document.querySelectorAll('[class*="dropdown" i], [class*="Dropdown" i]'); + const allButtons = document.querySelectorAll('button'); + const buttonsWithText = Array.from(allButtons) + .map(b => `${b.tagName}[${b.getAttribute('aria-label') || b.textContent?.trim().substring(0, 40)}]`) + .filter(t => t.length > 10) + .slice(0, 10); + return { + selects: selects.length, + comboboxes: comboboxes.length, + listboxes: listboxes.length, + dropdowns: dropdowns.length, + buttons: buttonsWithText, + bodySnippet: document.body?.innerText?.substring(0, 800) || '', + }; + }); + this._logger.info(`Caption settings panel - selects: ${panelInfo.selects}, comboboxes: ${panelInfo.comboboxes}, listboxes: ${panelInfo.listboxes}, dropdowns: ${panelInfo.dropdowns}`); + this._logger.info(`Panel buttons: ${JSON.stringify(panelInfo.buttons)}`); + this._logger.debug(`Panel text: ${panelInfo.bodySnippet.substring(0, 300)}`); + + // Strategy A: Standard selectors const dropdownSelectors = [ 'select[aria-label*="spoken language" i]', 'select[aria-label*="Meeting spoken language" i]', @@ -339,7 +365,7 @@ export class CaptionsProcedure { '[data-tid="spoken-language-dropdown"]', 'div[role="combobox"]', 'div[role="listbox"]', - 'select', // Generic fallback + 'select', ]; for (const selector of dropdownSelectors) { @@ -350,7 +376,6 @@ export class CaptionsProcedure { const tagName = await dropdown.evaluate(el => el.tagName.toLowerCase()); if (tagName === 'select') { - // Native select element for (const name of targetNames) { try { await this._page.selectOption(selector, { label: name }); @@ -368,7 +393,6 @@ export class CaptionsProcedure { for (const name of targetNames) { try { - // Try role="option" first, then generic text search const optionSelectors = [ `[role="option"]:has-text("${name}")`, `li:has-text("${name}")`, @@ -397,6 +421,69 @@ export class CaptionsProcedure { } } + // Strategy B: DOM evaluation fallback - find any dropdown-like element and interact + if (!languageSet) { + this._logger.info('Standard dropdown selectors failed, trying DOM evaluation fallback...'); + + languageSet = await this._page.evaluate((names: string[]) => { + // Find all elements that could be dropdowns (Fluent UI uses various patterns) + const candidates = document.querySelectorAll( + '[role="combobox"], [role="listbox"], select, ' + + '[class*="dropdown" i], [class*="Dropdown"], ' + + 'button[aria-haspopup="listbox"], button[aria-haspopup="true"], ' + + '[aria-expanded]' + ); + + for (let i = 0; i < candidates.length; i++) { + const el = candidates[i] as HTMLElement; + const label = el.getAttribute('aria-label') || ''; + const nearbyText = el.parentElement?.innerText || ''; + + // Check if this dropdown is related to language + const isLanguageRelated = + label.toLowerCase().includes('language') || + label.toLowerCase().includes('sprache') || + nearbyText.toLowerCase().includes('spoken language') || + nearbyText.toLowerCase().includes('gesprochene sprache'); + + if (isLanguageRelated || candidates.length === 1) { + // Click to open the dropdown + el.click(); + + // Wait a frame for options to render + return new Promise((resolve) => { + requestAnimationFrame(() => { + requestAnimationFrame(() => { + // Look for options + const options = document.querySelectorAll( + '[role="option"], [role="menuitem"], li[class*="option" i]' + ); + + for (let j = 0; j < options.length; j++) { + const opt = options[j] as HTMLElement; + const optText = opt.innerText?.trim() || ''; + + if (names.some(n => optText.includes(n))) { + opt.click(); + resolve(true); + return; + } + } + resolve(false); + }); + }); + }); + } + } + return Promise.resolve(false); + }, targetNames); + + if (languageSet) { + this._logger.info('Selected spoken language via DOM evaluation fallback'); + await this._page.waitForTimeout(500); + } + } + if (!languageSet) { this._logger.warn('Could not find/select spoken language in dropdown'); } diff --git a/src/bot/joinProcedure.ts b/src/bot/joinProcedure.ts index debd78d..0665986 100644 --- a/src/bot/joinProcedure.ts +++ b/src/bot/joinProcedure.ts @@ -264,12 +264,14 @@ export class JoinProcedure { * Check if the bot is currently in the meeting (admitted from lobby). * Primary selector: button[id="hangup-button"] (confirmed by Recall.ai). * Note: Teams uses `id` (not `data-tid`) for the hangup button since 2025 redesign. + * + * For authenticated joins, Teams v2 sometimes renders differently. + * Additional fallback: check the URL for meeting patterns and DOM for call UI. */ async isInMeeting(options: { waitForSeconds?: number } = {}): Promise { const timeout = (options.waitForSeconds || 5) * 1000; - // Primary selector - confirmed by Recall.ai (Jan 2025) - // Note: Teams now uses id="hangup-button" instead of data-tid="hangup-button" + // Primary selectors - known meeting UI elements const inMeetingSelectors = [ 'button[id="hangup-button"]', 'button[id="callingButtons-showMoreBtn"]', @@ -278,6 +280,16 @@ export class JoinProcedure { '[data-tid="call-composite"]', 'button[aria-label*="Leave"]', '[data-tid="callingButtons-showMoreBtn"]', + // Teams v2 (2025+) additional selectors + '[data-tid="call-controls"]', + '[data-tid="meeting-composite"]', + 'div[data-tid="video-gallery"]', + 'button[aria-label*="Hang up"]', + 'button[aria-label*="leave" i]', + // Mic/Camera toggle buttons are only visible in an active call + 'button[id="microphone-button"]', + 'button[data-tid="toggle-mute"]', + '[data-tid="microphone-button"]', ]; try { @@ -287,8 +299,35 @@ export class JoinProcedure { }); return true; } catch { - return false; + // Selector-based detection failed, try DOM evaluation as fallback } + + // Fallback: evaluate the page for meeting indicators + try { + const inMeeting = await this._page.evaluate(() => { + // Check for call-related aria roles and meeting elements + const bodyText = document.body?.innerText || ''; + const meetingIndicators = [ + 'Leave', // Leave button text + 'Mute', // Mic mute button + 'Unmute', // Mic unmute button + 'Turn off camera', // Camera control + 'Turn on camera', + 'Share', // Share screen + ]; + const found = meetingIndicators.filter(ind => bodyText.includes(ind)); + // Need at least 2 meeting indicators to confirm we're in a meeting + return found.length >= 2; + }); + if (inMeeting) { + this._logger.info('Detected meeting via DOM text analysis (fallback)'); + return true; + } + } catch { + // Page may not be ready + } + + return false; } /** diff --git a/src/bot/orchestrator.ts b/src/bot/orchestrator.ts index 44803ab..0eb347a 100644 --- a/src/bot/orchestrator.ts +++ b/src/bot/orchestrator.ts @@ -437,7 +437,9 @@ export class BotOrchestrator { headless: config.botHeadless, args: [ '--use-fake-ui-for-media-stream', // Auto-accept media permissions - '--use-fake-device-for-media-stream', // Use fake devices + // NOTE: --use-fake-device-for-media-stream is intentionally NOT used. + // We override getUserMedia via addInitScript to return a MediaStreamDestination + // that we control, so TTS audio can be injected into Teams' mic input. '--disable-web-security', '--disable-features=IsolateOrigins,site-per-process', '--autoplay-policy=no-user-gesture-required', @@ -468,6 +470,10 @@ export class BotOrchestrator { ); this._audioProcedure = new AudioProcedure(this._page, this._logger); + // Inject audio getUserMedia override BEFORE any navigation + // This ensures Teams gets our controlled audio stream when it calls getUserMedia + await this._audioProcedure.injectAudioOverride(); + // Handle page errors this._page.on('pageerror', (error) => { this._logger.error('Page error:', error); @@ -537,11 +543,18 @@ export class BotOrchestrator { // - Page is transitioning between states // Only give up after several consecutive cycles with no signal consecutiveNoSignal++; - this._logger.info(`No lobby/meeting signal detected (attempt ${consecutiveNoSignal}/${maxNoSignal}), waiting...`); + const currentUrl = this._page?.url() || 'unknown'; + this._logger.info(`No lobby/meeting signal detected (attempt ${consecutiveNoSignal}/${maxNoSignal}), URL: ${currentUrl}`); if (consecutiveNoSignal >= maxNoSignal) { - // Take a screenshot for debugging before giving up + // Take a screenshot and log page content for debugging before giving up await this._takeScreenshot('no-meeting-signal'); + try { + const bodySnippet = await this._page?.evaluate(() => + document.body?.innerText?.substring(0, 500) || '(empty)' + ); + this._logger.warn(`Page content before giving up: ${bodySnippet}`); + } catch { /* ignore */ } throw new Error('Bot was removed from lobby or meeting ended'); } }