From 533e976039155bbafa46baf6343bbe3590ad6c63 Mon Sep 17 00:00:00 2001
From: ValueOn AG
Date: Fri, 27 Feb 2026 12:27:02 +0100
Subject: [PATCH] Voice: play TTS in correct frame (iframe), add mic toggle
fallbacks
Made-with: Cursor
---
src/bot/audioProcedure.ts | 42 ++++++++++++++++++++++++++++++++++++++-
src/bot/orchestrator.ts | 35 +++++++++++++++++++++++++-------
2 files changed, 69 insertions(+), 8 deletions(-)
diff --git a/src/bot/audioProcedure.ts b/src/bot/audioProcedure.ts
index 37a900c..0b9ce6c 100644
--- a/src/bot/audioProcedure.ts
+++ b/src/bot/audioProcedure.ts
@@ -92,6 +92,41 @@ export class AudioProcedure {
this._logger.info('Audio getUserMedia override injected');
}
+ /**
+ * Scan every frame of the page for the one whose `window.__ttsStreamDest` audio track is attached to a sender of an RTCPeerConnection listed in `window.__audioCapturePeerConnections`.
+ * Teams meeting often runs in an iframe; page.evaluate runs in the main frame, so without this lookup we would play into the wrong streamDest.
+ * @returns the first frame whose track id matches, or null (after logging a warning) when no frame matches and the caller should use the main page.
+ */
+ private async _getTtsFrame(): Promise<{ evaluate: typeof this._page.evaluate } | null> {
+ const frames = this._page.frames();
+ for (const frame of frames) {
+ try {
+ const match = await frame.evaluate(() => {
+ const pcs = ((window as any).__audioCapturePeerConnections || []) as RTCPeerConnection[];
+ const streamDest = (window as any).__ttsStreamDest as MediaStreamAudioDestinationNode;
+ if (!pcs.length || !streamDest) return false;
+ const ttsTrackId = streamDest.stream.getAudioTracks()[0]?.id;
+ if (!ttsTrackId) return false;
+ for (const pc of pcs) {
+ const senders = pc.getSenders?.() || [];
+ for (const s of senders) {
+ if (s?.track?.kind === 'audio' && s.track.id === ttsTrackId) return true;
+ }
+ }
+ return false;
+ });
+ if (match) {
+ this._logger.info(`[Voice] Using frame for TTS (track match): ${frame.url().substring(0, 80)}`);
+ return frame;
+ }
+ } catch {
+ // Best-effort: evaluate() can throw (e.g. the frame was detached mid-scan); skip this frame and keep scanning
+ }
+ }
+ this._logger.warn('[Voice] No frame with matching TTS track; using main page (voice may not reach participants)');
+ return null;
+ }
+
/**
* Initialize the audio context in the browser for TTS playback.
* Must be called after joining the meeting (user gesture context).
@@ -198,6 +233,8 @@ export class AudioProcedure {
/**
* Internal: Play audio in the browser (single clip, no queuing).
* Audio is piped into the MediaStreamDestination that Teams uses as mic input.
+ * Teams meeting may run in an iframe; we must play in the frame that has the
+ * RTCPeerConnection (otherwise trackMatch=false, voice does not reach participants).
*/
private async _playAudioInternal(audioData: string, format: 'mp3' | 'wav' | 'pcm'): Promise {
if (!this._audioContext) {
@@ -206,8 +243,11 @@ export class AudioProcedure {
this._logger.info(`Playing audio (format: ${format}, size: ${audioData.length} bytes base64)`);
+ const targetFrame = await this._getTtsFrame();
+ const evalTarget = targetFrame || this._page;
+
try {
- const playbackDiag = await this._page.evaluate(async ({ audioData, format }) => {
+ const playbackDiag = await evalTarget.evaluate(async ({ audioData, format }) => {
const ctx = (window as any).__ttsAudioContext as AudioContext;
const streamDest = (window as any).__ttsStreamDest as MediaStreamAudioDestinationNode;
const pcs = ((window as any).__audioCapturePeerConnections || []) as RTCPeerConnection[];
diff --git a/src/bot/orchestrator.ts b/src/bot/orchestrator.ts
index 3b44f20..39bca99 100644
--- a/src/bot/orchestrator.ts
+++ b/src/bot/orchestrator.ts
@@ -493,6 +493,12 @@ export class BotOrchestrator {
const micToggle = await this._pollForElement([
'input[data-tid="toggle-audio"]',
'[data-tid="toggle-audio"]',
+ 'input[data-tid="toggle-mute"]',
+ '[data-tid="toggle-mute"]',
+ 'button[id="microphone-button"]',
+ 'button[data-inp="microphone-button"]',
+ 'button[aria-label*="microphone" i]',
+ 'button[aria-label*="Mikrofon" i]',
'input[role="switch"][title*="microphone" i]',
'input[role="switch"][title*="Mikrofon" i]',
'input[role="switch"][title*="mic" i]',
@@ -501,14 +507,29 @@ export class BotOrchestrator {
if (!micToggle) return;
- const state = await micToggle.evaluate((el: HTMLInputElement) => ({
- checked: el.checked,
- dataCid: el.getAttribute('data-cid') || '',
- title: el.getAttribute('title') || '',
- }));
- this._logger.info(`Mic state: checked=${state.checked}, data-cid="${state.dataCid}", title="${state.title}"`);
+ const state = await micToggle.evaluate((el: HTMLElement) => {
+ const input = el as HTMLInputElement;
+ const label = (el.getAttribute('aria-label') || el.getAttribute('title') || '').toLowerCase();
+ const isInput = el.tagName === 'INPUT';
+ const checked = isInput ? input.checked : undefined;
+ const looksMuted = !isInput && (
+ label.includes('unmute') || label.includes('einschalten') ||
+ label.includes('turn on') || label.includes('turn microphone on')
+ );
+ return {
+ checked,
+ dataCid: el.getAttribute('data-cid') || '',
+ title: el.getAttribute('title') || '',
+ tagName: el.tagName,
+ looksMuted,
+ };
+ });
+ this._logger.info(`Mic state: checked=${state.checked}, data-cid="${state.dataCid}", tag=${state.tagName}, looksMuted=${state.looksMuted}`);
- if (!state.checked) {
+ const needsClick = state.tagName === 'INPUT'
+ ? !state.checked
+ : (state.looksMuted === true);
+ if (needsClick) {
await micToggle.click();
this._logger.info('Mic toggled ON');
} else {