From bd63dfc40ab4c97a841ea470a956651bde4923af Mon Sep 17 00:00:00 2001
From: ValueOn AG
Date: Sun, 15 Feb 2026 22:28:51 +0100
Subject: [PATCH] fix: auth join detection, caption language dropdown, audio
injection via getUserMedia override
Co-authored-by: Cursor
---
src/bot/audioProcedure.ts | 132 +++++++++++++++++++++++++----------
src/bot/captionsProcedure.ts | 93 +++++++++++++++++++++++-
src/bot/joinProcedure.ts | 45 +++++++++++-
src/bot/orchestrator.ts | 19 ++++-
4 files changed, 242 insertions(+), 47 deletions(-)
diff --git a/src/bot/audioProcedure.ts b/src/bot/audioProcedure.ts
index 7927657..3a90ac0 100644
--- a/src/bot/audioProcedure.ts
+++ b/src/bot/audioProcedure.ts
@@ -3,12 +3,19 @@ import { Logger } from 'winston';
/**
* Handles audio playback in the Teams meeting.
- * Injects TTS audio into the browser to be played through the meeting.
+ *
+ * Architecture:
+ * - Before any page loads, we inject an init script that overrides getUserMedia
+ * to return a MediaStream from a MediaStreamDestination we control.
+ * - When Teams calls getUserMedia({audio: true}), it gets our custom stream.
+ * - When TTS audio is played, it's piped into the same MediaStreamDestination,
+ * so Teams picks it up as microphone input and sends it via WebRTC.
*/
export class AudioProcedure {
private _page: Page;
private _logger: Logger;
private _audioContext: boolean = false;
+ private _initScriptInjected: boolean = false;
constructor(page: Page, logger: Logger) {
this._page = page;
@@ -16,8 +23,62 @@ export class AudioProcedure {
}
/**
- * Initialize the audio context in the browser.
- * Must be called after user interaction (joining meeting counts).
+ * Inject the getUserMedia override BEFORE any page navigation.
+ * This MUST be called before navigating to Teams.
+ * Uses page.addInitScript so it runs in every new document context.
+ */
+ async injectAudioOverride(): Promise<void> {
+   // Init scripts registered on a page accumulate, so register this one only once.
+   if (this._initScriptInjected) {
+     return;
+   }
+
+   this._logger.info('Injecting audio getUserMedia override...');
+
+   await this._page.addInitScript(() => {
+     // Init scripts run in every new document. Some of them (about:blank,
+     // insecure origins) expose no navigator.mediaDevices, so bail out there
+     // instead of throwing a TypeError into the page.
+     const mediaDevices = navigator.mediaDevices;
+     const AudioContextClass = window.AudioContext || (window as any).webkitAudioContext;
+     if (!mediaDevices || typeof mediaDevices.getUserMedia !== 'function' || !AudioContextClass) {
+       return;
+     }
+
+     // Shared AudioContext and MediaStreamDestination for this document.
+     const ctx = new AudioContextClass();
+     const streamDest = ctx.createMediaStreamDestination();
+
+     // Stored globally so the TTS playback code can connect sources later.
+     (window as any).__ttsAudioContext = ctx;
+     (window as any).__ttsStreamDest = streamDest;
+     (window as any).__ttsAudioStream = streamDest.stream;
+
+     // Hand out clones of the destination track rather than the track itself:
+     // if the page calls stop() on a track it received, only that clone ends and
+     // later getUserMedia calls still get a live track fed by the same destination.
+     const createTtsAudioTracks = (): MediaStreamTrack[] =>
+       streamDest.stream.getAudioTracks().map((track) => track.clone());
+
+     const originalGetUserMedia = mediaDevices.getUserMedia.bind(mediaDevices);
+     mediaDevices.getUserMedia = async (constraints?: MediaStreamConstraints) => {
+       // Requests without audio are passed through untouched.
+       if (!constraints || !constraints.audio) {
+         return originalGetUserMedia(constraints);
+       }
+
+       const audioTracks = createTtsAudioTracks();
+       if (!constraints.video) {
+         return new MediaStream(audioTracks);
+       }
+
+       // Audio + video: combine our injectable audio with the browser's video.
+       try {
+         const videoStream = await originalGetUserMedia({ video: constraints.video });
+         return new MediaStream([...audioTracks, ...videoStream.getVideoTracks()]);
+       } catch {
+         // Best effort: if video cannot be acquired, still provide audio.
+         return new MediaStream(audioTracks);
+       }
+     };
+   });
+
+   this._initScriptInjected = true;
+   this._logger.info('Audio getUserMedia override injected');
+ }
+
+ /**
+ * Initialize the audio context in the browser for TTS playback.
+ * Must be called after joining the meeting (user gesture context).
*/
async initialize(): Promise {
if (this._audioContext) {
@@ -27,30 +88,23 @@ export class AudioProcedure {
this._logger.info('Initializing audio context...');
await this._page.evaluate(() => {
- // Create a global audio context
- const AudioContext = window.AudioContext || (window as any).webkitAudioContext;
- const ctx = new AudioContext();
- (window as any).__audioContext = ctx;
- (window as any).__audioQueue = [];
- (window as any).__isPlaying = false;
+ // The __ttsAudioContext was created by the init script.
+ // Resume it now (requires user gesture - joining meeting counts).
+ const ctx = (window as any).__ttsAudioContext as AudioContext;
+ if (ctx && ctx.state === 'suspended') {
+ ctx.resume();
+ }
- // Create a MediaStream destination so audio is routed into the
- // browser's virtual microphone (picked up by Teams) instead of
- // the default speaker output (ctx.destination).
- const streamDest = ctx.createMediaStreamDestination();
- (window as any).__audioStreamDest = streamDest;
-
- // Expose the stream so headless Chromium can pipe it as mic input.
- // navigator.mediaDevices.getUserMedia will be overridden to return this stream.
- const audioStream = streamDest.stream;
- const originalGetUserMedia = navigator.mediaDevices.getUserMedia.bind(navigator.mediaDevices);
- navigator.mediaDevices.getUserMedia = async (constraints?: MediaStreamConstraints) => {
- // If requesting audio only, return our TTS stream
- if (constraints && constraints.audio && !constraints.video) {
- return audioStream;
- }
- return originalGetUserMedia(constraints);
- };
+ // If init script didn't run (e.g. page navigated before injection),
+ // create fallback audio infrastructure
+ if (!ctx) {
+ const AudioContextClass = window.AudioContext || (window as any).webkitAudioContext;
+ const newCtx = new AudioContextClass();
+ const streamDest = newCtx.createMediaStreamDestination();
+ (window as any).__ttsAudioContext = newCtx;
+ (window as any).__ttsStreamDest = streamDest;
+ (window as any).__ttsAudioStream = streamDest.stream;
+ }
});
this._audioContext = true;
@@ -59,7 +113,7 @@ export class AudioProcedure {
/**
* Play audio in the browser.
- * The audio will be heard by other meeting participants.
+ * Audio is piped into the MediaStreamDestination that Teams uses as mic input.
*
* @param audioData Base64 encoded audio data
* @param format Audio format (mp3, wav, pcm)
@@ -73,8 +127,13 @@ export class AudioProcedure {
try {
await this._page.evaluate(async ({ audioData, format }) => {
- const ctx = (window as any).__audioContext as AudioContext;
-
+ const ctx = (window as any).__ttsAudioContext as AudioContext;
+ const streamDest = (window as any).__ttsStreamDest as MediaStreamAudioDestinationNode;
+
+ if (!ctx || !streamDest) {
+ throw new Error('Audio context not initialized');
+ }
+
// Resume context if suspended
if (ctx.state === 'suspended') {
await ctx.resume();
@@ -95,22 +154,19 @@ export class AudioProcedure {
audioBuffer = ctx.createBuffer(1, pcmData.length, 16000);
const channelData = audioBuffer.getChannelData(0);
for (let i = 0; i < pcmData.length; i++) {
- channelData[i] = pcmData[i] / 32768; // Convert to float
+ channelData[i] = pcmData[i] / 32768;
}
} else {
// MP3/WAV: Use decodeAudioData
- audioBuffer = await ctx.decodeAudioData(bytes.buffer);
+ audioBuffer = await ctx.decodeAudioData(bytes.buffer.slice(0));
}
- // Create source and play through the MediaStream destination
- // so audio is routed into the Teams microphone input, not speakers
+ // Play through the MediaStreamDestination -> Teams mic input
const source = ctx.createBufferSource();
source.buffer = audioBuffer;
- const streamDest = (window as any).__audioStreamDest as MediaStreamAudioDestinationNode;
- source.connect(streamDest || ctx.destination);
+ source.connect(streamDest);
source.start(0);
- // Return a promise that resolves when playback ends
return new Promise((resolve) => {
source.onended = () => resolve();
});
@@ -129,7 +185,7 @@ export class AudioProcedure {
async stopAudio(): Promise {
try {
await this._page.evaluate(() => {
- const ctx = (window as any).__audioContext as AudioContext;
+ const ctx = (window as any).__ttsAudioContext as AudioContext;
if (ctx) {
ctx.suspend();
}
@@ -145,7 +201,7 @@ export class AudioProcedure {
async cleanup(): Promise {
try {
await this._page.evaluate(() => {
- const ctx = (window as any).__audioContext as AudioContext;
+ const ctx = (window as any).__ttsAudioContext as AudioContext;
if (ctx) {
ctx.close();
}
diff --git a/src/bot/captionsProcedure.ts b/src/bot/captionsProcedure.ts
index a79b1f3..20c58b9 100644
--- a/src/bot/captionsProcedure.ts
+++ b/src/bot/captionsProcedure.ts
@@ -332,6 +332,32 @@ export class CaptionsProcedure {
// Look for the spoken language dropdown/combobox
let languageSet = false;
+
+ // Diagnostics: summarise which dropdown-like controls and buttons the page
+ // currently exposes, so selector failures can be understood from the logs.
+ const panelInfo = await this._page.evaluate(() => {
+   const countOf = (selector: string): number => document.querySelectorAll(selector).length;
+
+   // Collect up to 10 button descriptors, preferring aria-label over text.
+   const buttons: string[] = [];
+   for (const btn of Array.from(document.querySelectorAll('button'))) {
+     if (buttons.length >= 10) {
+       break;
+     }
+     const caption = btn.getAttribute('aria-label') || btn.textContent?.trim().substring(0, 40);
+     const descriptor = `${btn.tagName}[${caption}]`;
+     // Skip descriptors that are too short to carry a meaningful caption.
+     if (descriptor.length > 10) {
+       buttons.push(descriptor);
+     }
+   }
+
+   return {
+     selects: countOf('select'),
+     comboboxes: countOf('[role="combobox"]'),
+     listboxes: countOf('[role="listbox"]'),
+     dropdowns: countOf('[class*="dropdown" i], [class*="Dropdown" i]'),
+     buttons,
+     bodySnippet: document.body?.innerText?.substring(0, 800) || '',
+   };
+ });
+ const { selects, comboboxes, listboxes, dropdowns } = panelInfo;
+ this._logger.info(`Caption settings panel - selects: ${selects}, comboboxes: ${comboboxes}, listboxes: ${listboxes}, dropdowns: ${dropdowns}`);
+ this._logger.info(`Panel buttons: ${JSON.stringify(panelInfo.buttons)}`);
+ this._logger.debug(`Panel text: ${panelInfo.bodySnippet.substring(0, 300)}`);
+
+ // Strategy A: Standard selectors
const dropdownSelectors = [
'select[aria-label*="spoken language" i]',
'select[aria-label*="Meeting spoken language" i]',
@@ -339,7 +365,7 @@ export class CaptionsProcedure {
'[data-tid="spoken-language-dropdown"]',
'div[role="combobox"]',
'div[role="listbox"]',
- 'select', // Generic fallback
+ 'select',
];
for (const selector of dropdownSelectors) {
@@ -350,7 +376,6 @@ export class CaptionsProcedure {
const tagName = await dropdown.evaluate(el => el.tagName.toLowerCase());
if (tagName === 'select') {
- // Native select element
for (const name of targetNames) {
try {
await this._page.selectOption(selector, { label: name });
@@ -368,7 +393,6 @@ export class CaptionsProcedure {
for (const name of targetNames) {
try {
- // Try role="option" first, then generic text search
const optionSelectors = [
`[role="option"]:has-text("${name}")`,
`li:has-text("${name}")`,
@@ -397,6 +421,69 @@ export class CaptionsProcedure {
}
}
+ // Strategy B: DOM evaluation fallback - find any dropdown-like element and interact
+ if (!languageSet) {
+   this._logger.info('Standard dropdown selectors failed, trying DOM evaluation fallback...');
+
+   languageSet = await this._page.evaluate(async (names: string[]) => {
+     // Two animation frames give the UI a chance to render the option list
+     // after a click before we query for options.
+     const nextPaint = (): Promise<void> =>
+       new Promise<void>((resolve) => {
+         requestAnimationFrame(() => requestAnimationFrame(() => resolve()));
+       });
+
+     // Elements that could be dropdowns. The `i` flag already makes the class
+     // match case-insensitive, so a separate "Dropdown" selector is not needed.
+     const candidates = Array.from(
+       document.querySelectorAll<HTMLElement>(
+         '[role="combobox"], [role="listbox"], select, ' +
+         '[class*="dropdown" i], ' +
+         'button[aria-haspopup="listbox"], button[aria-haspopup="true"], ' +
+         '[aria-expanded]'
+       )
+     );
+
+     for (const el of candidates) {
+       const label = (el.getAttribute('aria-label') || '').toLowerCase();
+       const nearbyText = (el.parentElement?.innerText || '').toLowerCase();
+
+       // A candidate qualifies when its label or surrounding text mentions the
+       // (spoken) language, or when it is the only dropdown-like element.
+       const isLanguageRelated =
+         label.includes('language') ||
+         label.includes('sprache') ||
+         nearbyText.includes('spoken language') ||
+         nearbyText.includes('gesprochene sprache');
+       if (!isLanguageRelated && candidates.length !== 1) {
+         continue;
+       }
+
+       // Click to open the dropdown, then look for a matching option.
+       el.click();
+       await nextPaint();
+
+       const options = Array.from(
+         document.querySelectorAll<HTMLElement>(
+           '[role="option"], [role="menuitem"], li[class*="option" i]'
+         )
+       );
+       const match = options.find((opt) => {
+         const optText = opt.innerText?.trim() || '';
+         return names.some((n) => optText.includes(n));
+       });
+       if (match) {
+         match.click();
+         return true;
+       }
+       // No matching option appeared for this candidate (e.g. it was only a
+       // wrapper element): keep going instead of giving up on the first one.
+     }
+     return false;
+   }, targetNames);
+
+   if (languageSet) {
+     this._logger.info('Selected spoken language via DOM evaluation fallback');
+     await this._page.waitForTimeout(500);
+   }
+ }
+
if (!languageSet) {
this._logger.warn('Could not find/select spoken language in dropdown');
}
diff --git a/src/bot/joinProcedure.ts b/src/bot/joinProcedure.ts
index debd78d..0665986 100644
--- a/src/bot/joinProcedure.ts
+++ b/src/bot/joinProcedure.ts
@@ -264,12 +264,14 @@ export class JoinProcedure {
* Check if the bot is currently in the meeting (admitted from lobby).
* Primary selector: button[id="hangup-button"] (confirmed by Recall.ai).
* Note: Teams uses `id` (not `data-tid`) for the hangup button since 2025 redesign.
+ *
+ * For authenticated joins, Teams v2 sometimes renders differently.
+ * Additional fallback: scan the page's visible text for in-call control labels.
*/
async isInMeeting(options: { waitForSeconds?: number } = {}): Promise {
const timeout = (options.waitForSeconds || 5) * 1000;
- // Primary selector - confirmed by Recall.ai (Jan 2025)
- // Note: Teams now uses id="hangup-button" instead of data-tid="hangup-button"
+ // Primary selectors - known meeting UI elements
const inMeetingSelectors = [
'button[id="hangup-button"]',
'button[id="callingButtons-showMoreBtn"]',
@@ -278,6 +280,16 @@ export class JoinProcedure {
'[data-tid="call-composite"]',
'button[aria-label*="Leave"]',
'[data-tid="callingButtons-showMoreBtn"]',
+ // Teams v2 (2025+) additional selectors
+ '[data-tid="call-controls"]',
+ '[data-tid="meeting-composite"]',
+ 'div[data-tid="video-gallery"]',
+ 'button[aria-label*="Hang up"]',
+ 'button[aria-label*="leave" i]',
+ // Mic/Camera toggle buttons are only visible in an active call
+ 'button[id="microphone-button"]',
+ 'button[data-tid="toggle-mute"]',
+ '[data-tid="microphone-button"]',
];
try {
@@ -287,8 +299,35 @@ export class JoinProcedure {
});
return true;
} catch {
- return false;
+ // Selector-based detection failed, try DOM evaluation as fallback
}
+
+ // Fallback: scan the page's visible text for labels of in-call controls
+ try {
+   const inMeeting = await this._page.evaluate(() => {
+     const bodyText = document.body?.innerText || '';
+     const meetingIndicators = [
+       'Leave', // Leave button text
+       'Mute', // Mic mute button
+       'Unmute', // Mic unmute button
+       'Turn off camera', // Camera control
+       'Turn on camera',
+       'Share', // Share screen
+     ];
+     // Match whole words only. With plain substring checks a single "Unmute"
+     // label also satisfies "Mute", so one control would be counted twice and
+     // reach the threshold on its own.
+     const found = meetingIndicators.filter(ind => new RegExp(`\\b${ind}\\b`).test(bodyText));
+     // Need at least 2 distinct meeting indicators to confirm we're in a meeting
+     return found.length >= 2;
+   });
+   if (inMeeting) {
+     this._logger.info('Detected meeting via DOM text analysis (fallback)');
+     return true;
+   }
+ } catch {
+   // Best effort: the page may be navigating or not ready; treat as "not in meeting"
+ }
+
+ return false;
}
/**
diff --git a/src/bot/orchestrator.ts b/src/bot/orchestrator.ts
index 44803ab..0eb347a 100644
--- a/src/bot/orchestrator.ts
+++ b/src/bot/orchestrator.ts
@@ -437,7 +437,9 @@ export class BotOrchestrator {
headless: config.botHeadless,
args: [
'--use-fake-ui-for-media-stream', // Auto-accept media permissions
- '--use-fake-device-for-media-stream', // Use fake devices
+ // NOTE: --use-fake-device-for-media-stream is intentionally NOT used.
+ // We override getUserMedia via addInitScript to return a MediaStreamDestination
+ // that we control, so TTS audio can be injected into Teams' mic input.
'--disable-web-security',
'--disable-features=IsolateOrigins,site-per-process',
'--autoplay-policy=no-user-gesture-required',
@@ -468,6 +470,10 @@ export class BotOrchestrator {
);
this._audioProcedure = new AudioProcedure(this._page, this._logger);
+ // Inject audio getUserMedia override BEFORE any navigation
+ // This ensures Teams gets our controlled audio stream when it calls getUserMedia
+ await this._audioProcedure.injectAudioOverride();
+
// Handle page errors
this._page.on('pageerror', (error) => {
this._logger.error('Page error:', error);
@@ -537,11 +543,18 @@ export class BotOrchestrator {
// - Page is transitioning between states
// Only give up after several consecutive cycles with no signal
consecutiveNoSignal++;
- this._logger.info(`No lobby/meeting signal detected (attempt ${consecutiveNoSignal}/${maxNoSignal}), waiting...`);
+ const currentUrl = this._page?.url() || 'unknown';
+ this._logger.info(`No lobby/meeting signal detected (attempt ${consecutiveNoSignal}/${maxNoSignal}), URL: ${currentUrl}`);
if (consecutiveNoSignal >= maxNoSignal) {
- // Take a screenshot for debugging before giving up
+ // Take a screenshot and log page content for debugging before giving up
await this._takeScreenshot('no-meeting-signal');
+ try {
+ const bodySnippet = await this._page?.evaluate(() =>
+ document.body?.innerText?.substring(0, 500) || '(empty)'
+ );
+ this._logger.warn(`Page content before giving up: ${bodySnippet}`);
+ } catch { /* ignore */ }
throw new Error('Bot was removed from lobby or meeting ended');
}
}