Voice: restore sender.replaceTrack for TTS injection, remove broken iframe logic
Made-with: Cursor
This commit is contained in:
parent
b02bfd4c8f
commit
5f7dc60376
1 changed file with 42 additions and 123 deletions
|
|
@@ -1,4 +1,4 @@
|
||||||
import { Frame, Page } from 'playwright';
|
import { Page } from 'playwright';
|
||||||
import { Logger } from 'winston';
|
import { Logger } from 'winston';
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@@ -86,47 +86,36 @@ export class AudioProcedure {
|
||||||
return realStream;
|
return realStream;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Force all RTCPeerConnection audio senders to use our TTS track.
|
||||||
|
// This ensures Teams actually sends our audio even if getUserMedia
|
||||||
|
// override happened in a different context or was renegotiated.
|
||||||
|
(window as any).__forceTtsTrackToSenders = async () => {
|
||||||
|
const pcs = ((window as any).__audioCapturePeerConnections || []) as RTCPeerConnection[];
|
||||||
|
const ttsTrack = streamDest.stream.getAudioTracks()?.[0];
|
||||||
|
if (!ttsTrack) return { replaced: 0, pcs: pcs?.length || 0, reason: 'no-tts-track' };
|
||||||
|
|
||||||
|
let replaced = 0;
|
||||||
|
for (const pc of pcs) {
|
||||||
|
try {
|
||||||
|
const senders = pc.getSenders?.() || [];
|
||||||
|
for (const sender of senders) {
|
||||||
|
if (sender?.track?.kind === 'audio') {
|
||||||
|
await sender.replaceTrack(ttsTrack);
|
||||||
|
replaced++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch {
|
||||||
|
// ignore per peer connection
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return { replaced, pcs: pcs?.length || 0, reason: 'ok' };
|
||||||
|
};
|
||||||
});
|
});
|
||||||
|
|
||||||
this._initScriptInjected = true;
|
this._initScriptInjected = true;
|
||||||
this._logger.info('Audio getUserMedia override injected');
|
this._logger.info('Audio getUserMedia override injected');
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Find the frame whose MediaStreamDestination track is used by the RTCPeerConnection.
|
|
||||||
* Teams meeting often runs in an iframe; page.evaluate runs in main frame, so we'd
|
|
||||||
* play into the wrong streamDest. Returns the frame to use, or null for main page.
|
|
||||||
*/
|
|
||||||
private async _getTtsFrame(): Promise<Frame | null> {
|
|
||||||
const frames = this._page.frames();
|
|
||||||
for (const frame of frames) {
|
|
||||||
try {
|
|
||||||
const match = await frame.evaluate(() => {
|
|
||||||
const pcs = ((window as any).__audioCapturePeerConnections || []) as RTCPeerConnection[];
|
|
||||||
const streamDest = (window as any).__ttsStreamDest as MediaStreamAudioDestinationNode;
|
|
||||||
if (!pcs.length || !streamDest) return false;
|
|
||||||
const ttsTrackId = streamDest.stream.getAudioTracks()[0]?.id;
|
|
||||||
if (!ttsTrackId) return false;
|
|
||||||
for (const pc of pcs) {
|
|
||||||
const senders = pc.getSenders?.() || [];
|
|
||||||
for (const s of senders) {
|
|
||||||
if (s?.track?.kind === 'audio' && s.track.id === ttsTrackId) return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
});
|
|
||||||
if (match) {
|
|
||||||
this._logger.info(`[Voice] Using frame for TTS (track match): ${frame.url().substring(0, 80)}`);
|
|
||||||
return frame;
|
|
||||||
}
|
|
||||||
} catch {
|
|
||||||
// Frame may be detached
|
|
||||||
}
|
|
||||||
}
|
|
||||||
this._logger.warn('[Voice] No frame with matching TTS track; using main page (voice may not reach participants)');
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Initialize the audio context in the browser for TTS playback.
|
* Initialize the audio context in the browser for TTS playback.
|
||||||
* Must be called after joining the meeting (user gesture context).
|
* Must be called after joining the meeting (user gesture context).
|
||||||
|
|
@@ -233,8 +222,8 @@ export class AudioProcedure {
|
||||||
/**
|
/**
|
||||||
* Internal: Play audio in the browser (single clip, no queuing).
|
* Internal: Play audio in the browser (single clip, no queuing).
|
||||||
* Audio is piped into the MediaStreamDestination that Teams uses as mic input.
|
* Audio is piped into the MediaStreamDestination that Teams uses as mic input.
|
||||||
* Teams meeting may run in an iframe; we must play in the frame that has the
|
* Before playback, forces all WebRTC audio senders to use the TTS track
|
||||||
* RTCPeerConnection (otherwise trackMatch=false, voice does not reach participants).
|
* (sender.replaceTrack) so Teams transmits our audio to participants.
|
||||||
*/
|
*/
|
||||||
private async _playAudioInternal(audioData: string, format: 'mp3' | 'wav' | 'pcm'): Promise<void> {
|
private async _playAudioInternal(audioData: string, format: 'mp3' | 'wav' | 'pcm'): Promise<void> {
|
||||||
if (!this._audioContext) {
|
if (!this._audioContext) {
|
||||||
|
|
@@ -243,66 +232,31 @@ export class AudioProcedure {
|
||||||
|
|
||||||
this._logger.info(`Playing audio (format: ${format}, size: ${audioData.length} bytes base64)`);
|
this._logger.info(`Playing audio (format: ${format}, size: ${audioData.length} bytes base64)`);
|
||||||
|
|
||||||
const targetFrame = await this._getTtsFrame();
|
|
||||||
const evalTarget = targetFrame || this._page;
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const playbackDiag = await evalTarget.evaluate(async ({ audioData, format }) => {
|
// Force all outgoing audio senders to use the TTS track
|
||||||
|
const senderInjectInfo = await this._page.evaluate(async () => {
|
||||||
|
const forceFn = (window as any).__forceTtsTrackToSenders;
|
||||||
|
if (typeof forceFn === 'function') {
|
||||||
|
return await forceFn();
|
||||||
|
}
|
||||||
|
return { replaced: 0, pcs: 0, reason: 'force-function-missing' };
|
||||||
|
});
|
||||||
|
this._logger.info(
|
||||||
|
`TTS sender injection: replaced=${senderInjectInfo?.replaced ?? 0}, pcs=${senderInjectInfo?.pcs ?? 0}, reason=${senderInjectInfo?.reason || 'n/a'}`,
|
||||||
|
);
|
||||||
|
|
||||||
|
await this._page.evaluate(async ({ audioData, format }) => {
|
||||||
const ctx = (window as any).__ttsAudioContext as AudioContext;
|
const ctx = (window as any).__ttsAudioContext as AudioContext;
|
||||||
const streamDest = (window as any).__ttsStreamDest as MediaStreamAudioDestinationNode;
|
const streamDest = (window as any).__ttsStreamDest as MediaStreamAudioDestinationNode;
|
||||||
const pcs = ((window as any).__audioCapturePeerConnections || []) as RTCPeerConnection[];
|
|
||||||
|
|
||||||
if (!ctx || !streamDest) {
|
if (!ctx || !streamDest) {
|
||||||
throw new Error('Audio context not initialized');
|
throw new Error('Audio context not initialized');
|
||||||
}
|
}
|
||||||
|
|
||||||
const collectWebRtcAudioStats = async () => {
|
|
||||||
let senderCount = 0;
|
|
||||||
let bytesSentTotal = 0;
|
|
||||||
let packetsSentTotal = 0;
|
|
||||||
const tracks: Array<Record<string, any>> = [];
|
|
||||||
|
|
||||||
for (const pc of pcs) {
|
|
||||||
const senders = pc.getSenders?.() || [];
|
|
||||||
for (const sender of senders) {
|
|
||||||
if (!sender?.track || sender.track.kind !== 'audio') continue;
|
|
||||||
senderCount++;
|
|
||||||
tracks.push({
|
|
||||||
id: sender.track.id,
|
|
||||||
label: sender.track.label,
|
|
||||||
enabled: sender.track.enabled,
|
|
||||||
muted: sender.track.muted,
|
|
||||||
readyState: sender.track.readyState,
|
|
||||||
});
|
|
||||||
try {
|
|
||||||
const stats = await sender.getStats();
|
|
||||||
stats.forEach((report) => {
|
|
||||||
if (report.type === 'outbound-rtp' && (report as any).kind === 'audio') {
|
|
||||||
bytesSentTotal += Number((report as any).bytesSent || 0);
|
|
||||||
packetsSentTotal += Number((report as any).packetsSent || 0);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
} catch {
|
|
||||||
// ignore stats errors per sender
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return {
|
|
||||||
pcs: pcs.length,
|
|
||||||
senderCount,
|
|
||||||
bytesSentTotal,
|
|
||||||
packetsSentTotal,
|
|
||||||
tracks,
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
||||||
// Resume context if suspended
|
|
||||||
if (ctx.state === 'suspended') {
|
if (ctx.state === 'suspended') {
|
||||||
await ctx.resume();
|
await ctx.resume();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Decode base64 to ArrayBuffer
|
|
||||||
const binaryString = atob(audioData);
|
const binaryString = atob(audioData);
|
||||||
const bytes = new Uint8Array(binaryString.length);
|
const bytes = new Uint8Array(binaryString.length);
|
||||||
for (let i = 0; i < binaryString.length; i++) {
|
for (let i = 0; i < binaryString.length; i++) {
|
||||||
|
|
@@ -312,7 +266,6 @@ export class AudioProcedure {
|
||||||
let audioBuffer: AudioBuffer;
|
let audioBuffer: AudioBuffer;
|
||||||
|
|
||||||
if (format === 'pcm') {
|
if (format === 'pcm') {
|
||||||
// PCM: Assume 16-bit mono 16kHz
|
|
||||||
const pcmData = new Int16Array(bytes.buffer);
|
const pcmData = new Int16Array(bytes.buffer);
|
||||||
audioBuffer = ctx.createBuffer(1, pcmData.length, 16000);
|
audioBuffer = ctx.createBuffer(1, pcmData.length, 16000);
|
||||||
const channelData = audioBuffer.getChannelData(0);
|
const channelData = audioBuffer.getChannelData(0);
|
||||||
|
|
@@ -320,60 +273,26 @@ export class AudioProcedure {
|
||||||
channelData[i] = pcmData[i] / 32768;
|
channelData[i] = pcmData[i] / 32768;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// MP3/WAV: Use decodeAudioData (slice to avoid detached buffer)
|
|
||||||
audioBuffer = await ctx.decodeAudioData(bytes.buffer.slice(0));
|
audioBuffer = await ctx.decodeAudioData(bytes.buffer.slice(0));
|
||||||
}
|
}
|
||||||
|
|
||||||
const before = await collectWebRtcAudioStats();
|
|
||||||
|
|
||||||
// Hypothesis B: verify TTS track matches PC sender track
|
|
||||||
const ttsTrack = streamDest.stream.getAudioTracks()[0];
|
|
||||||
const ttsTrackId = ttsTrack?.id || null;
|
|
||||||
const senderTrackIds: string[] = [];
|
|
||||||
for (const pc of pcs) {
|
|
||||||
const senders = pc.getSenders?.() || [];
|
|
||||||
for (const s of senders) {
|
|
||||||
if (s?.track?.kind === 'audio') senderTrackIds.push(s.track.id);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
const trackMatch = ttsTrackId && senderTrackIds.includes(ttsTrackId);
|
|
||||||
|
|
||||||
// Play through the MediaStreamDestination -> Teams mic input
|
|
||||||
const source = ctx.createBufferSource();
|
const source = ctx.createBufferSource();
|
||||||
source.buffer = audioBuffer;
|
source.buffer = audioBuffer;
|
||||||
source.connect(streamDest);
|
source.connect(streamDest);
|
||||||
source.start(0);
|
source.start(0);
|
||||||
|
|
||||||
return new Promise((resolve) => {
|
return new Promise<void>((resolve) => {
|
||||||
source.onended = () => {
|
source.onended = () => {
|
||||||
try {
|
try {
|
||||||
source.disconnect();
|
source.disconnect();
|
||||||
} catch {
|
} catch {
|
||||||
// already disconnected
|
// already disconnected
|
||||||
}
|
}
|
||||||
resolve(null);
|
resolve();
|
||||||
};
|
|
||||||
}).then(async () => {
|
|
||||||
const after = await collectWebRtcAudioStats();
|
|
||||||
return {
|
|
||||||
before,
|
|
||||||
after,
|
|
||||||
deltaBytes: after.bytesSentTotal - before.bytesSentTotal,
|
|
||||||
deltaPackets: after.packetsSentTotal - before.packetsSentTotal,
|
|
||||||
ttsTrackId,
|
|
||||||
senderTrackIds,
|
|
||||||
trackMatch,
|
|
||||||
};
|
};
|
||||||
});
|
});
|
||||||
}, { audioData, format });
|
}, { audioData, format });
|
||||||
|
|
||||||
this._logger.info(
|
|
||||||
`[Voice] TTS track vs PC: ttsTrackId=${playbackDiag?.ttsTrackId ?? 'n/a'} senderTrackIds=[${(playbackDiag?.senderTrackIds ?? []).join(',')}] trackMatch=${playbackDiag?.trackMatch ?? false}`,
|
|
||||||
);
|
|
||||||
this._logger.info(
|
|
||||||
`TTS WebRTC diagnostics: pcs=${playbackDiag?.after?.pcs ?? 0}, senders=${playbackDiag?.after?.senderCount ?? 0}, ` +
|
|
||||||
`deltaBytes=${playbackDiag?.deltaBytes ?? 0}, deltaPackets=${playbackDiag?.deltaPackets ?? 0}`,
|
|
||||||
);
|
|
||||||
this._logger.info('Audio playback completed');
|
this._logger.info('Audio playback completed');
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
this._logger.error('Error playing audio:', error);
|
this._logger.error('Error playing audio:', error);
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue