From 5f7dc60376e3f4d91858577f2cdd29cc207dbecf Mon Sep 17 00:00:00 2001 From: ValueOn AG Date: Fri, 27 Feb 2026 12:51:24 +0100 Subject: [PATCH] Voice: restore sender.replaceTrack for TTS injection, remove broken iframe logic Made-with: Cursor --- src/bot/audioProcedure.ts | 165 ++++++++++---------------------------- 1 file changed, 42 insertions(+), 123 deletions(-) diff --git a/src/bot/audioProcedure.ts b/src/bot/audioProcedure.ts index a89dacc..25faafd 100644 --- a/src/bot/audioProcedure.ts +++ b/src/bot/audioProcedure.ts @@ -1,4 +1,4 @@ -import { Frame, Page } from 'playwright'; +import { Page } from 'playwright'; import { Logger } from 'winston'; /** @@ -86,47 +86,36 @@ export class AudioProcedure { return realStream; }; + // Force all RTCPeerConnection audio senders to use our TTS track. + // This ensures Teams actually sends our audio even if getUserMedia + // override happened in a different context or was renegotiated. + (window as any).__forceTtsTrackToSenders = async () => { + const pcs = ((window as any).__audioCapturePeerConnections || []) as RTCPeerConnection[]; + const ttsTrack = streamDest.stream.getAudioTracks()?.[0]; + if (!ttsTrack) return { replaced: 0, pcs: pcs?.length || 0, reason: 'no-tts-track' }; + + let replaced = 0; + for (const pc of pcs) { + try { + const senders = pc.getSenders?.() || []; + for (const sender of senders) { + if (sender?.track?.kind === 'audio') { + await sender.replaceTrack(ttsTrack); + replaced++; + } + } + } catch { + // ignore per peer connection + } + } + return { replaced, pcs: pcs?.length || 0, reason: 'ok' }; + }; }); this._initScriptInjected = true; this._logger.info('Audio getUserMedia override injected'); } - /** - * Find the frame whose MediaStreamDestination track is used by the RTCPeerConnection. - * Teams meeting often runs in an iframe; page.evaluate runs in main frame, so we'd - * play into the wrong streamDest. Returns the frame to use, or null for main page. - */ - private async _getTtsFrame(): Promise { - const frames = this._page.frames(); - for (const frame of frames) { - try { - const match = await frame.evaluate(() => { - const pcs = ((window as any).__audioCapturePeerConnections || []) as RTCPeerConnection[]; - const streamDest = (window as any).__ttsStreamDest as MediaStreamAudioDestinationNode; - if (!pcs.length || !streamDest) return false; - const ttsTrackId = streamDest.stream.getAudioTracks()[0]?.id; - if (!ttsTrackId) return false; - for (const pc of pcs) { - const senders = pc.getSenders?.() || []; - for (const s of senders) { - if (s?.track?.kind === 'audio' && s.track.id === ttsTrackId) return true; - } - } - return false; - }); - if (match) { - this._logger.info(`[Voice] Using frame for TTS (track match): ${frame.url().substring(0, 80)}`); - return frame; - } - } catch { - // Frame may be detached - } - } - this._logger.warn('[Voice] No frame with matching TTS track; using main page (voice may not reach participants)'); - return null; - } - /** * Initialize the audio context in the browser for TTS playback. * Must be called after joining the meeting (user gesture context). @@ -233,8 +222,8 @@ export class AudioProcedure { /** * Internal: Play audio in the browser (single clip, no queuing). * Audio is piped into the MediaStreamDestination that Teams uses as mic input. - * Teams meeting may run in an iframe; we must play in the frame that has the - * RTCPeerConnection (otherwise trackMatch=false, voice does not reach participants). + * Before playback, forces all WebRTC audio senders to use the TTS track + * (sender.replaceTrack) so Teams transmits our audio to participants. */ private async _playAudioInternal(audioData: string, format: 'mp3' | 'wav' | 'pcm'): Promise { if (!this._audioContext) { @@ -243,66 +232,31 @@ export class AudioProcedure { this._logger.info(`Playing audio (format: ${format}, size: ${audioData.length} bytes base64)`); - const targetFrame = await this._getTtsFrame(); - const evalTarget = targetFrame || this._page; - try { - const playbackDiag = await evalTarget.evaluate(async ({ audioData, format }) => { + // Force all outgoing audio senders to use the TTS track + const senderInjectInfo = await this._page.evaluate(async () => { + const forceFn = (window as any).__forceTtsTrackToSenders; + if (typeof forceFn === 'function') { + return await forceFn(); + } + return { replaced: 0, pcs: 0, reason: 'force-function-missing' }; + }); + this._logger.info( + `TTS sender injection: replaced=${senderInjectInfo?.replaced ?? 0}, pcs=${senderInjectInfo?.pcs ?? 0}, reason=${senderInjectInfo?.reason || 'n/a'}`, + ); + + await this._page.evaluate(async ({ audioData, format }) => { const ctx = (window as any).__ttsAudioContext as AudioContext; const streamDest = (window as any).__ttsStreamDest as MediaStreamAudioDestinationNode; - const pcs = ((window as any).__audioCapturePeerConnections || []) as RTCPeerConnection[]; if (!ctx || !streamDest) { throw new Error('Audio context not initialized'); } - const collectWebRtcAudioStats = async () => { - let senderCount = 0; - let bytesSentTotal = 0; - let packetsSentTotal = 0; - const tracks: Array> = []; - - for (const pc of pcs) { - const senders = pc.getSenders?.() || []; - for (const sender of senders) { - if (!sender?.track || sender.track.kind !== 'audio') continue; - senderCount++; - tracks.push({ - id: sender.track.id, - label: sender.track.label, - enabled: sender.track.enabled, - muted: sender.track.muted, - readyState: sender.track.readyState, - }); - try { - const stats = await sender.getStats(); - stats.forEach((report) => { - if (report.type === 'outbound-rtp' && (report as any).kind === 'audio') { - bytesSentTotal += Number((report as any).bytesSent || 0); - packetsSentTotal += Number((report as any).packetsSent || 0); - } - }); - } catch { - // ignore stats errors per sender - } - } - } - - return { - pcs: pcs.length, - senderCount, - bytesSentTotal, - packetsSentTotal, - tracks, - }; - }; - - // Resume context if suspended if (ctx.state === 'suspended') { await ctx.resume(); } - // Decode base64 to ArrayBuffer const binaryString = atob(audioData); const bytes = new Uint8Array(binaryString.length); for (let i = 0; i < binaryString.length; i++) { @@ -312,7 +266,6 @@ export class AudioProcedure { let audioBuffer: AudioBuffer; if (format === 'pcm') { - // PCM: Assume 16-bit mono 16kHz const pcmData = new Int16Array(bytes.buffer); audioBuffer = ctx.createBuffer(1, pcmData.length, 16000); const channelData = audioBuffer.getChannelData(0); @@ -320,60 +273,26 @@ export class AudioProcedure { channelData[i] = pcmData[i] / 32768; } } else { - // MP3/WAV: Use decodeAudioData (slice to avoid detached buffer) audioBuffer = await ctx.decodeAudioData(bytes.buffer.slice(0)); } - const before = await collectWebRtcAudioStats(); - - // Hypothesis B: verify TTS track matches PC sender track - const ttsTrack = streamDest.stream.getAudioTracks()[0]; - const ttsTrackId = ttsTrack?.id || null; - const senderTrackIds: string[] = []; - for (const pc of pcs) { - const senders = pc.getSenders?.() || []; - for (const s of senders) { - if (s?.track?.kind === 'audio') senderTrackIds.push(s.track.id); - } - } - const trackMatch = ttsTrackId && senderTrackIds.includes(ttsTrackId); - - // Play through the MediaStreamDestination -> Teams mic input const source = ctx.createBufferSource(); source.buffer = audioBuffer; source.connect(streamDest); source.start(0); - return new Promise((resolve) => { + return new Promise((resolve) => { source.onended = () => { try { source.disconnect(); } catch { // already disconnected } - resolve(null); - }; - }).then(async () => { - const after = await collectWebRtcAudioStats(); - return { - before, - after, - deltaBytes: after.bytesSentTotal - before.bytesSentTotal, - deltaPackets: after.packetsSentTotal - before.packetsSentTotal, - ttsTrackId, - senderTrackIds, - trackMatch, + resolve(); }; }); }, { audioData, format }); - this._logger.info( - `[Voice] TTS track vs PC: ttsTrackId=${playbackDiag?.ttsTrackId ?? 'n/a'} senderTrackIds=[${(playbackDiag?.senderTrackIds ?? []).join(',')}] trackMatch=${playbackDiag?.trackMatch ?? false}`, - ); - this._logger.info( - `TTS WebRTC diagnostics: pcs=${playbackDiag?.after?.pcs ?? 0}, senders=${playbackDiag?.after?.senderCount ?? 0}, ` + - `deltaBytes=${playbackDiag?.deltaBytes ?? 0}, deltaPackets=${playbackDiag?.deltaPackets ?? 0}`, - ); this._logger.info('Audio playback completed'); } catch (error) { this._logger.error('Error playing audio:', error);