Voice: restore sender.replaceTrack for TTS injection, remove broken iframe logic

Made-with: Cursor
This commit is contained in:
ValueOn AG 2026-02-27 12:51:24 +01:00
parent b02bfd4c8f
commit 5f7dc60376

View file

@ -1,4 +1,4 @@
import { Frame, Page } from 'playwright'; import { Page } from 'playwright';
import { Logger } from 'winston'; import { Logger } from 'winston';
/** /**
@ -86,47 +86,36 @@ export class AudioProcedure {
return realStream; return realStream;
}; };
// Force all RTCPeerConnection audio senders to use our TTS track.
// This ensures Teams actually sends our audio even if the getUserMedia
// override happened in a different context or was renegotiated.
//
// Returns a summary object:
//   replaced - number of audio senders whose track was swapped successfully
//   failed   - number of audio senders whose replaceTrack() call rejected
//   pcs      - number of tracked peer connections that were inspected
//   reason   - 'no-tts-track' when the destination stream has no audio track,
//              otherwise 'ok'
(window as any).__forceTtsTrackToSenders = async () => {
  const pcs = ((window as any).__audioCapturePeerConnections || []) as RTCPeerConnection[];
  const ttsTrack = streamDest.stream.getAudioTracks()[0];
  if (!ttsTrack) {
    return { replaced: 0, failed: 0, pcs: pcs.length, reason: 'no-tts-track' };
  }

  let replaced = 0;
  let failed = 0;
  for (const pc of pcs) {
    let senders: RTCRtpSender[];
    try {
      senders = pc.getSenders?.() || [];
    } catch {
      // Best effort: a peer connection that cannot list its senders is skipped.
      continue;
    }
    for (const sender of senders) {
      if (sender?.track?.kind !== 'audio') continue;
      // Isolate failures per sender so one rejected replaceTrack() does not
      // prevent the remaining audio senders of this connection from being updated.
      try {
        await sender.replaceTrack(ttsTrack);
        replaced++;
      } catch {
        failed++;
      }
    }
  }
  return { replaced, failed, pcs: pcs.length, reason: 'ok' };
};
}); });
this._initScriptInjected = true; this._initScriptInjected = true;
this._logger.info('Audio getUserMedia override injected'); this._logger.info('Audio getUserMedia override injected');
} }
/**
 * Pick the frame in which TTS audio has to be played.
 *
 * Every frame of the page is probed in turn: inside the frame we look at the
 * peer connections recorded in `window.__audioCapturePeerConnections` and check
 * whether any of their audio senders carries the first audio track of
 * `window.__ttsStreamDest`. The first frame where that holds is returned.
 * Frames whose evaluation throws (e.g. because they were detached) are skipped.
 *
 * @returns The matching frame, or null when no frame matches and the caller
 *          should fall back to the main page.
 */
private async _getTtsFrame(): Promise<Frame | null> {
  for (const candidate of this._page.frames()) {
    try {
      const carriesTtsTrack = await candidate.evaluate(() => {
        const win = window as any;
        const connections = (win.__audioCapturePeerConnections || []) as RTCPeerConnection[];
        const destination = win.__ttsStreamDest as MediaStreamAudioDestinationNode;
        if (!(connections.length && destination)) return false;

        const wantedTrackId = destination.stream.getAudioTracks()[0]?.id;
        if (!wantedTrackId) return false;

        // True as soon as one audio sender on any connection uses the TTS track.
        return connections.some((connection) =>
          (connection.getSenders?.() || []).some(
            (sender) => sender?.track?.kind === 'audio' && sender.track.id === wantedTrackId,
          ),
        );
      });
      if (carriesTtsTrack) {
        this._logger.info(`[Voice] Using frame for TTS (track match): ${candidate.url().substring(0, 80)}`);
        return candidate;
      }
    } catch {
      // Frame may be detached
    }
  }

  this._logger.warn('[Voice] No frame with matching TTS track; using main page (voice may not reach participants)');
  return null;
}
/** /**
* Initialize the audio context in the browser for TTS playback. * Initialize the audio context in the browser for TTS playback.
* Must be called after joining the meeting (user gesture context). * Must be called after joining the meeting (user gesture context).
@ -233,8 +222,8 @@ export class AudioProcedure {
/** /**
* Internal: Play audio in the browser (single clip, no queuing). * Internal: Play audio in the browser (single clip, no queuing).
* Audio is piped into the MediaStreamDestination that Teams uses as mic input. * Audio is piped into the MediaStreamDestination that Teams uses as mic input.
* Teams meeting may run in an iframe; we must play in the frame that has the * Before playback, forces all WebRTC audio senders to use the TTS track
* RTCPeerConnection (otherwise trackMatch=false, voice does not reach participants). * (sender.replaceTrack) so Teams transmits our audio to participants.
*/ */
private async _playAudioInternal(audioData: string, format: 'mp3' | 'wav' | 'pcm'): Promise<void> { private async _playAudioInternal(audioData: string, format: 'mp3' | 'wav' | 'pcm'): Promise<void> {
if (!this._audioContext) { if (!this._audioContext) {
@ -243,66 +232,31 @@ export class AudioProcedure {
this._logger.info(`Playing audio (format: ${format}, size: ${audioData.length} bytes base64)`); this._logger.info(`Playing audio (format: ${format}, size: ${audioData.length} bytes base64)`);
const targetFrame = await this._getTtsFrame();
const evalTarget = targetFrame || this._page;
try { try {
const playbackDiag = await evalTarget.evaluate(async ({ audioData, format }) => { // Force all outgoing audio senders to use the TTS track
const senderInjectInfo = await this._page.evaluate(async () => {
const forceFn = (window as any).__forceTtsTrackToSenders;
if (typeof forceFn === 'function') {
return await forceFn();
}
return { replaced: 0, pcs: 0, reason: 'force-function-missing' };
});
this._logger.info(
`TTS sender injection: replaced=${senderInjectInfo?.replaced ?? 0}, pcs=${senderInjectInfo?.pcs ?? 0}, reason=${senderInjectInfo?.reason || 'n/a'}`,
);
await this._page.evaluate(async ({ audioData, format }) => {
const ctx = (window as any).__ttsAudioContext as AudioContext; const ctx = (window as any).__ttsAudioContext as AudioContext;
const streamDest = (window as any).__ttsStreamDest as MediaStreamAudioDestinationNode; const streamDest = (window as any).__ttsStreamDest as MediaStreamAudioDestinationNode;
const pcs = ((window as any).__audioCapturePeerConnections || []) as RTCPeerConnection[];
if (!ctx || !streamDest) { if (!ctx || !streamDest) {
throw new Error('Audio context not initialized'); throw new Error('Audio context not initialized');
} }
// Snapshot of outgoing audio across all tracked peer connections (`pcs`):
// counts audio senders, records basic state of each sender's track, and sums
// bytesSent / packetsSent over the senders' audio 'outbound-rtp' stats reports.
// A sender whose getStats() fails is still counted; only its totals are missing.
const collectWebRtcAudioStats = async () => {
  const tracks: Array<Record<string, any>> = [];
  let senderCount = 0;
  let bytesSentTotal = 0;
  let packetsSentTotal = 0;

  for (const connection of pcs) {
    const connectionSenders = connection.getSenders?.() || [];
    for (const rtpSender of connectionSenders) {
      const track = rtpSender?.track;
      if (!track || track.kind !== 'audio') continue;

      senderCount += 1;
      const { id, label, enabled, muted, readyState } = track;
      tracks.push({ id, label, enabled, muted, readyState });

      try {
        const reports = await rtpSender.getStats();
        reports.forEach((entry) => {
          const rtp = entry as any;
          if (entry.type !== 'outbound-rtp' || rtp.kind !== 'audio') return;
          bytesSentTotal += Number(rtp.bytesSent || 0);
          packetsSentTotal += Number(rtp.packetsSent || 0);
        });
      } catch {
        // ignore stats errors per sender
      }
    }
  }

  return {
    pcs: pcs.length,
    senderCount,
    bytesSentTotal,
    packetsSentTotal,
    tracks,
  };
};
// Resume context if suspended
if (ctx.state === 'suspended') { if (ctx.state === 'suspended') {
await ctx.resume(); await ctx.resume();
} }
// Decode base64 to ArrayBuffer
const binaryString = atob(audioData); const binaryString = atob(audioData);
const bytes = new Uint8Array(binaryString.length); const bytes = new Uint8Array(binaryString.length);
for (let i = 0; i < binaryString.length; i++) { for (let i = 0; i < binaryString.length; i++) {
@ -312,7 +266,6 @@ export class AudioProcedure {
let audioBuffer: AudioBuffer; let audioBuffer: AudioBuffer;
if (format === 'pcm') { if (format === 'pcm') {
// PCM: Assume 16-bit mono 16kHz
const pcmData = new Int16Array(bytes.buffer); const pcmData = new Int16Array(bytes.buffer);
audioBuffer = ctx.createBuffer(1, pcmData.length, 16000); audioBuffer = ctx.createBuffer(1, pcmData.length, 16000);
const channelData = audioBuffer.getChannelData(0); const channelData = audioBuffer.getChannelData(0);
@ -320,60 +273,26 @@ export class AudioProcedure {
channelData[i] = pcmData[i] / 32768; channelData[i] = pcmData[i] / 32768;
} }
} else { } else {
// MP3/WAV: Use decodeAudioData (slice to avoid detached buffer)
audioBuffer = await ctx.decodeAudioData(bytes.buffer.slice(0)); audioBuffer = await ctx.decodeAudioData(bytes.buffer.slice(0));
} }
const before = await collectWebRtcAudioStats();
// Hypothesis B: verify TTS track matches PC sender track
const ttsTrack = streamDest.stream.getAudioTracks()[0];
const ttsTrackId = ttsTrack?.id || null;
const senderTrackIds: string[] = [];
for (const pc of pcs) {
const senders = pc.getSenders?.() || [];
for (const s of senders) {
if (s?.track?.kind === 'audio') senderTrackIds.push(s.track.id);
}
}
const trackMatch = ttsTrackId && senderTrackIds.includes(ttsTrackId);
// Play through the MediaStreamDestination -> Teams mic input
const source = ctx.createBufferSource(); const source = ctx.createBufferSource();
source.buffer = audioBuffer; source.buffer = audioBuffer;
source.connect(streamDest); source.connect(streamDest);
source.start(0); source.start(0);
return new Promise((resolve) => { return new Promise<void>((resolve) => {
source.onended = () => { source.onended = () => {
try { try {
source.disconnect(); source.disconnect();
} catch { } catch {
// already disconnected // already disconnected
} }
resolve(null); resolve();
};
}).then(async () => {
const after = await collectWebRtcAudioStats();
return {
before,
after,
deltaBytes: after.bytesSentTotal - before.bytesSentTotal,
deltaPackets: after.packetsSentTotal - before.packetsSentTotal,
ttsTrackId,
senderTrackIds,
trackMatch,
}; };
}); });
}, { audioData, format }); }, { audioData, format });
this._logger.info(
`[Voice] TTS track vs PC: ttsTrackId=${playbackDiag?.ttsTrackId ?? 'n/a'} senderTrackIds=[${(playbackDiag?.senderTrackIds ?? []).join(',')}] trackMatch=${playbackDiag?.trackMatch ?? false}`,
);
this._logger.info(
`TTS WebRTC diagnostics: pcs=${playbackDiag?.after?.pcs ?? 0}, senders=${playbackDiag?.after?.senderCount ?? 0}, ` +
`deltaBytes=${playbackDiag?.deltaBytes ?? 0}, deltaPackets=${playbackDiag?.deltaPackets ?? 0}`,
);
this._logger.info('Audio playback completed'); this._logger.info('Audio playback completed');
} catch (error) { } catch (error) {
this._logger.error('Error playing audio:', error); this._logger.error('Error playing audio:', error);