Voice: restore sender.replaceTrack for TTS injection, remove broken iframe logic
Made-with: Cursor
This commit is contained in:
parent
b02bfd4c8f
commit
5f7dc60376
1 changed file with 42 additions and 123 deletions
|
|
@ -1,4 +1,4 @@
|
|||
import { Frame, Page } from 'playwright';
|
||||
import { Page } from 'playwright';
|
||||
import { Logger } from 'winston';
|
||||
|
||||
/**
|
||||
|
|
@ -86,47 +86,36 @@ export class AudioProcedure {
|
|||
return realStream;
|
||||
};
|
||||
|
||||
// Point every outgoing audio sender of the tracked RTCPeerConnections at the
// TTS track produced by `streamDest`, so the audio we play is what gets sent
// even if the getUserMedia override happened in a different context or the
// connection was renegotiated.
//
// Resolves to `{ replaced, pcs, reason }`:
//   - replaced: number of audio senders whose replaceTrack() call succeeded
//   - pcs:      number of peer connections found in __audioCapturePeerConnections
//   - reason:   'no-tts-track' when streamDest exposes no audio track, else 'ok'
(window as any).__forceTtsTrackToSenders = async () => {
  const pcs = ((window as any).__audioCapturePeerConnections || []) as RTCPeerConnection[];
  const ttsTrack = streamDest.stream.getAudioTracks()?.[0];
  if (!ttsTrack) return { replaced: 0, pcs: pcs.length, reason: 'no-tts-track' };

  let replaced = 0;
  for (const pc of pcs) {
    let senders: RTCRtpSender[];
    try {
      senders = pc.getSenders?.() || [];
    } catch {
      // Best effort: a connection we cannot inspect is skipped, not fatal.
      continue;
    }

    for (const sender of senders) {
      // Only senders that currently carry an audio track are touched.
      if (sender?.track?.kind !== 'audio') continue;
      try {
        await sender.replaceTrack(ttsTrack);
        replaced++;
      } catch {
        // Guard each sender on its own: one rejected replaceTrack() must not
        // prevent the remaining senders on this connection from being updated.
      }
    }
  }
  return { replaced, pcs: pcs.length, reason: 'ok' };
};
|
||||
});
|
||||
|
||||
this._initScriptInjected = true;
|
||||
this._logger.info('Audio getUserMedia override injected');
|
||||
}
|
||||
|
||||
/**
 * Pick the frame in which TTS audio should be played.
 *
 * Each frame of the page is probed in order. A frame matches when one of the
 * peer connections listed in its `window.__audioCapturePeerConnections` has an
 * audio sender whose track id equals the first audio track of that frame's
 * `window.__ttsStreamDest`. Frames whose evaluation throws (for example
 * because they were detached) are skipped.
 *
 * @returns the first matching frame, or null when none matches and the caller
 *          should fall back to the main page.
 */
private async _getTtsFrame(): Promise<Frame | null> {
  // Executed inside the browser frame; must not capture anything from Node scope.
  const senderCarriesTtsTrack = (): boolean => {
    const globals = window as any;
    const connections = (globals.__audioCapturePeerConnections || []) as RTCPeerConnection[];
    const destination = globals.__ttsStreamDest as MediaStreamAudioDestinationNode;
    if (connections.length === 0 || !destination) return false;

    const wantedId = destination.stream.getAudioTracks()[0]?.id;
    if (!wantedId) return false;

    return connections.some((connection) =>
      (connection.getSenders?.() || []).some(
        (candidate) => candidate?.track?.kind === 'audio' && candidate.track.id === wantedId,
      ),
    );
  };

  for (const candidateFrame of this._page.frames()) {
    try {
      const matched = await candidateFrame.evaluate(senderCarriesTtsTrack);
      if (!matched) continue;
      this._logger.info(`[Voice] Using frame for TTS (track match): ${candidateFrame.url().substring(0, 80)}`);
      return candidateFrame;
    } catch {
      // Frame may be detached; try the next one.
    }
  }

  this._logger.warn('[Voice] No frame with matching TTS track; using main page (voice may not reach participants)');
  return null;
}
|
||||
|
||||
/**
|
||||
* Initialize the audio context in the browser for TTS playback.
|
||||
* Must be called after joining the meeting (user gesture context).
|
||||
|
|
@ -233,8 +222,8 @@ export class AudioProcedure {
|
|||
/**
|
||||
* Internal: Play audio in the browser (single clip, no queuing).
|
||||
* Audio is piped into the MediaStreamDestination that Teams uses as mic input.
|
||||
* Teams meeting may run in an iframe; we must play in the frame that has the
|
||||
* RTCPeerConnection (otherwise trackMatch=false, voice does not reach participants).
|
||||
* Before playback, forces all WebRTC audio senders to use the TTS track
|
||||
* (sender.replaceTrack) so Teams transmits our audio to participants.
|
||||
*/
|
||||
private async _playAudioInternal(audioData: string, format: 'mp3' | 'wav' | 'pcm'): Promise<void> {
|
||||
if (!this._audioContext) {
|
||||
|
|
@ -243,66 +232,31 @@ export class AudioProcedure {
|
|||
|
||||
this._logger.info(`Playing audio (format: ${format}, size: ${audioData.length} bytes base64)`);
|
||||
|
||||
const targetFrame = await this._getTtsFrame();
|
||||
const evalTarget = targetFrame || this._page;
|
||||
|
||||
try {
|
||||
const playbackDiag = await evalTarget.evaluate(async ({ audioData, format }) => {
|
||||
// Force all outgoing audio senders to use the TTS track
|
||||
const senderInjectInfo = await this._page.evaluate(async () => {
|
||||
const forceFn = (window as any).__forceTtsTrackToSenders;
|
||||
if (typeof forceFn === 'function') {
|
||||
return await forceFn();
|
||||
}
|
||||
return { replaced: 0, pcs: 0, reason: 'force-function-missing' };
|
||||
});
|
||||
this._logger.info(
|
||||
`TTS sender injection: replaced=${senderInjectInfo?.replaced ?? 0}, pcs=${senderInjectInfo?.pcs ?? 0}, reason=${senderInjectInfo?.reason || 'n/a'}`,
|
||||
);
|
||||
|
||||
await this._page.evaluate(async ({ audioData, format }) => {
|
||||
const ctx = (window as any).__ttsAudioContext as AudioContext;
|
||||
const streamDest = (window as any).__ttsStreamDest as MediaStreamAudioDestinationNode;
|
||||
const pcs = ((window as any).__audioCapturePeerConnections || []) as RTCPeerConnection[];
|
||||
|
||||
if (!ctx || !streamDest) {
|
||||
throw new Error('Audio context not initialized');
|
||||
}
|
||||
|
||||
const collectWebRtcAudioStats = async () => {
|
||||
let senderCount = 0;
|
||||
let bytesSentTotal = 0;
|
||||
let packetsSentTotal = 0;
|
||||
const tracks: Array<Record<string, any>> = [];
|
||||
|
||||
for (const pc of pcs) {
|
||||
const senders = pc.getSenders?.() || [];
|
||||
for (const sender of senders) {
|
||||
if (!sender?.track || sender.track.kind !== 'audio') continue;
|
||||
senderCount++;
|
||||
tracks.push({
|
||||
id: sender.track.id,
|
||||
label: sender.track.label,
|
||||
enabled: sender.track.enabled,
|
||||
muted: sender.track.muted,
|
||||
readyState: sender.track.readyState,
|
||||
});
|
||||
try {
|
||||
const stats = await sender.getStats();
|
||||
stats.forEach((report) => {
|
||||
if (report.type === 'outbound-rtp' && (report as any).kind === 'audio') {
|
||||
bytesSentTotal += Number((report as any).bytesSent || 0);
|
||||
packetsSentTotal += Number((report as any).packetsSent || 0);
|
||||
}
|
||||
});
|
||||
} catch {
|
||||
// ignore stats errors per sender
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
pcs: pcs.length,
|
||||
senderCount,
|
||||
bytesSentTotal,
|
||||
packetsSentTotal,
|
||||
tracks,
|
||||
};
|
||||
};
|
||||
|
||||
// Resume context if suspended
|
||||
if (ctx.state === 'suspended') {
|
||||
await ctx.resume();
|
||||
}
|
||||
|
||||
// Decode base64 to ArrayBuffer
|
||||
const binaryString = atob(audioData);
|
||||
const bytes = new Uint8Array(binaryString.length);
|
||||
for (let i = 0; i < binaryString.length; i++) {
|
||||
|
|
@ -312,7 +266,6 @@ export class AudioProcedure {
|
|||
let audioBuffer: AudioBuffer;
|
||||
|
||||
if (format === 'pcm') {
|
||||
// PCM: Assume 16-bit mono 16kHz
|
||||
const pcmData = new Int16Array(bytes.buffer);
|
||||
audioBuffer = ctx.createBuffer(1, pcmData.length, 16000);
|
||||
const channelData = audioBuffer.getChannelData(0);
|
||||
|
|
@ -320,60 +273,26 @@ export class AudioProcedure {
|
|||
channelData[i] = pcmData[i] / 32768;
|
||||
}
|
||||
} else {
|
||||
// MP3/WAV: Use decodeAudioData (slice to avoid detached buffer)
|
||||
audioBuffer = await ctx.decodeAudioData(bytes.buffer.slice(0));
|
||||
}
|
||||
|
||||
const before = await collectWebRtcAudioStats();
|
||||
|
||||
// Hypothesis B: verify TTS track matches PC sender track
|
||||
const ttsTrack = streamDest.stream.getAudioTracks()[0];
|
||||
const ttsTrackId = ttsTrack?.id || null;
|
||||
const senderTrackIds: string[] = [];
|
||||
for (const pc of pcs) {
|
||||
const senders = pc.getSenders?.() || [];
|
||||
for (const s of senders) {
|
||||
if (s?.track?.kind === 'audio') senderTrackIds.push(s.track.id);
|
||||
}
|
||||
}
|
||||
const trackMatch = ttsTrackId && senderTrackIds.includes(ttsTrackId);
|
||||
|
||||
// Play through the MediaStreamDestination -> Teams mic input
|
||||
const source = ctx.createBufferSource();
|
||||
source.buffer = audioBuffer;
|
||||
source.connect(streamDest);
|
||||
source.start(0);
|
||||
|
||||
return new Promise((resolve) => {
|
||||
return new Promise<void>((resolve) => {
|
||||
source.onended = () => {
|
||||
try {
|
||||
source.disconnect();
|
||||
} catch {
|
||||
// already disconnected
|
||||
}
|
||||
resolve(null);
|
||||
};
|
||||
}).then(async () => {
|
||||
const after = await collectWebRtcAudioStats();
|
||||
return {
|
||||
before,
|
||||
after,
|
||||
deltaBytes: after.bytesSentTotal - before.bytesSentTotal,
|
||||
deltaPackets: after.packetsSentTotal - before.packetsSentTotal,
|
||||
ttsTrackId,
|
||||
senderTrackIds,
|
||||
trackMatch,
|
||||
resolve();
|
||||
};
|
||||
});
|
||||
}, { audioData, format });
|
||||
|
||||
this._logger.info(
|
||||
`[Voice] TTS track vs PC: ttsTrackId=${playbackDiag?.ttsTrackId ?? 'n/a'} senderTrackIds=[${(playbackDiag?.senderTrackIds ?? []).join(',')}] trackMatch=${playbackDiag?.trackMatch ?? false}`,
|
||||
);
|
||||
this._logger.info(
|
||||
`TTS WebRTC diagnostics: pcs=${playbackDiag?.after?.pcs ?? 0}, senders=${playbackDiag?.after?.senderCount ?? 0}, ` +
|
||||
`deltaBytes=${playbackDiag?.deltaBytes ?? 0}, deltaPackets=${playbackDiag?.deltaPackets ?? 0}`,
|
||||
);
|
||||
this._logger.info('Audio playback completed');
|
||||
} catch (error) {
|
||||
this._logger.error('Error playing audio:', error);
|
||||
|
|
|
|||
Loading…
Reference in a new issue