diff --git a/src/bot/audioProcedure.ts b/src/bot/audioProcedure.ts
index a89dacc..25faafd 100644
--- a/src/bot/audioProcedure.ts
+++ b/src/bot/audioProcedure.ts
@@ -1,4 +1,4 @@
-import { Frame, Page } from 'playwright';
+import { Page } from 'playwright';
import { Logger } from 'winston';
/**
@@ -86,47 +86,36 @@ export class AudioProcedure {
return realStream;
};
+ // Swap every RTCPeerConnection audio sender over to our TTS track (sender.replaceTrack) so Teams
+ // sends our audio even if the getUserMedia override ran in a different context or was renegotiated.
+ // Resolves to { replaced, pcs, reason }; senders whose current track is not audio are skipped.
+ (window as any).__forceTtsTrackToSenders = async () => {
+ const pcs = ((window as any).__audioCapturePeerConnections || []) as RTCPeerConnection[];
+ const ttsTrack = streamDest.stream.getAudioTracks()?.[0];
+ if (!ttsTrack) return { replaced: 0, pcs: pcs?.length || 0, reason: 'no-tts-track' };
+
+ let replaced = 0;
+ for (const pc of pcs) {
+ try {
+ const senders = pc.getSenders?.() || [];
+ for (const sender of senders) {
+ if (sender?.track?.kind === 'audio') {
+ await sender.replaceTrack(ttsTrack);
+ replaced++;
+ }
+ }
+ } catch {
+ // Best-effort: an error skips the rest of this peer connection; the others are still tried.
+ }
+ }
+ return { replaced, pcs: pcs?.length || 0, reason: 'ok' };
+ };
});
this._initScriptInjected = true;
this._logger.info('Audio getUserMedia override injected');
}
- /**
- * Find the frame whose MediaStreamDestination track is used by the RTCPeerConnection.
- * Teams meeting often runs in an iframe; page.evaluate runs in main frame, so we'd
- * play into the wrong streamDest. Returns the frame to use, or null for main page.
- */
- private async _getTtsFrame(): Promise {
- const frames = this._page.frames();
- for (const frame of frames) {
- try {
- const match = await frame.evaluate(() => {
- const pcs = ((window as any).__audioCapturePeerConnections || []) as RTCPeerConnection[];
- const streamDest = (window as any).__ttsStreamDest as MediaStreamAudioDestinationNode;
- if (!pcs.length || !streamDest) return false;
- const ttsTrackId = streamDest.stream.getAudioTracks()[0]?.id;
- if (!ttsTrackId) return false;
- for (const pc of pcs) {
- const senders = pc.getSenders?.() || [];
- for (const s of senders) {
- if (s?.track?.kind === 'audio' && s.track.id === ttsTrackId) return true;
- }
- }
- return false;
- });
- if (match) {
- this._logger.info(`[Voice] Using frame for TTS (track match): ${frame.url().substring(0, 80)}`);
- return frame;
- }
- } catch {
- // Frame may be detached
- }
- }
- this._logger.warn('[Voice] No frame with matching TTS track; using main page (voice may not reach participants)');
- return null;
- }
-
/**
* Initialize the audio context in the browser for TTS playback.
* Must be called after joining the meeting (user gesture context).
@@ -233,8 +222,8 @@ export class AudioProcedure {
/**
* Internal: Play audio in the browser (single clip, no queuing).
* Audio is piped into the MediaStreamDestination that Teams uses as mic input.
- * Teams meeting may run in an iframe; we must play in the frame that has the
- * RTCPeerConnection (otherwise trackMatch=false, voice does not reach participants).
+ * Before playback, forces all WebRTC audio senders to use the TTS track
+ * (sender.replaceTrack) so Teams transmits our audio to participants.
*/
private async _playAudioInternal(audioData: string, format: 'mp3' | 'wav' | 'pcm'): Promise {
if (!this._audioContext) {
@@ -243,66 +232,31 @@ export class AudioProcedure {
this._logger.info(`Playing audio (format: ${format}, size: ${audioData.length} bytes base64)`);
- const targetFrame = await this._getTtsFrame();
- const evalTarget = targetFrame || this._page;
-
try {
- const playbackDiag = await evalTarget.evaluate(async ({ audioData, format }) => {
+ // Force all outgoing audio senders to use the TTS track
+ const senderInjectInfo = await this._page.evaluate(async () => {
+ const forceFn = (window as any).__forceTtsTrackToSenders;
+ if (typeof forceFn === 'function') {
+ return await forceFn();
+ }
+ return { replaced: 0, pcs: 0, reason: 'force-function-missing' };
+ });
+ this._logger.info(
+ `TTS sender injection: replaced=${senderInjectInfo?.replaced ?? 0}, pcs=${senderInjectInfo?.pcs ?? 0}, reason=${senderInjectInfo?.reason || 'n/a'}`,
+ );
+
+ await this._page.evaluate(async ({ audioData, format }) => {
const ctx = (window as any).__ttsAudioContext as AudioContext;
const streamDest = (window as any).__ttsStreamDest as MediaStreamAudioDestinationNode;
- const pcs = ((window as any).__audioCapturePeerConnections || []) as RTCPeerConnection[];
if (!ctx || !streamDest) {
throw new Error('Audio context not initialized');
}
- const collectWebRtcAudioStats = async () => {
- let senderCount = 0;
- let bytesSentTotal = 0;
- let packetsSentTotal = 0;
- const tracks: Array> = [];
-
- for (const pc of pcs) {
- const senders = pc.getSenders?.() || [];
- for (const sender of senders) {
- if (!sender?.track || sender.track.kind !== 'audio') continue;
- senderCount++;
- tracks.push({
- id: sender.track.id,
- label: sender.track.label,
- enabled: sender.track.enabled,
- muted: sender.track.muted,
- readyState: sender.track.readyState,
- });
- try {
- const stats = await sender.getStats();
- stats.forEach((report) => {
- if (report.type === 'outbound-rtp' && (report as any).kind === 'audio') {
- bytesSentTotal += Number((report as any).bytesSent || 0);
- packetsSentTotal += Number((report as any).packetsSent || 0);
- }
- });
- } catch {
- // ignore stats errors per sender
- }
- }
- }
-
- return {
- pcs: pcs.length,
- senderCount,
- bytesSentTotal,
- packetsSentTotal,
- tracks,
- };
- };
-
- // Resume context if suspended
if (ctx.state === 'suspended') {
await ctx.resume();
}
- // Decode base64 to ArrayBuffer
const binaryString = atob(audioData);
const bytes = new Uint8Array(binaryString.length);
for (let i = 0; i < binaryString.length; i++) {
@@ -312,7 +266,6 @@ export class AudioProcedure {
let audioBuffer: AudioBuffer;
if (format === 'pcm') {
- // PCM: Assume 16-bit mono 16kHz
const pcmData = new Int16Array(bytes.buffer);
audioBuffer = ctx.createBuffer(1, pcmData.length, 16000);
const channelData = audioBuffer.getChannelData(0);
@@ -320,60 +273,26 @@ export class AudioProcedure {
channelData[i] = pcmData[i] / 32768;
}
} else {
- // MP3/WAV: Use decodeAudioData (slice to avoid detached buffer)
audioBuffer = await ctx.decodeAudioData(bytes.buffer.slice(0));
}
- const before = await collectWebRtcAudioStats();
-
- // Hypothesis B: verify TTS track matches PC sender track
- const ttsTrack = streamDest.stream.getAudioTracks()[0];
- const ttsTrackId = ttsTrack?.id || null;
- const senderTrackIds: string[] = [];
- for (const pc of pcs) {
- const senders = pc.getSenders?.() || [];
- for (const s of senders) {
- if (s?.track?.kind === 'audio') senderTrackIds.push(s.track.id);
- }
- }
- const trackMatch = ttsTrackId && senderTrackIds.includes(ttsTrackId);
-
- // Play through the MediaStreamDestination -> Teams mic input
const source = ctx.createBufferSource();
source.buffer = audioBuffer;
source.connect(streamDest);
source.start(0);
- return new Promise((resolve) => {
+ return new Promise((resolve) => {
source.onended = () => {
try {
source.disconnect();
} catch {
// already disconnected
}
- resolve(null);
- };
- }).then(async () => {
- const after = await collectWebRtcAudioStats();
- return {
- before,
- after,
- deltaBytes: after.bytesSentTotal - before.bytesSentTotal,
- deltaPackets: after.packetsSentTotal - before.packetsSentTotal,
- ttsTrackId,
- senderTrackIds,
- trackMatch,
+ resolve();
};
});
}, { audioData, format });
- this._logger.info(
- `[Voice] TTS track vs PC: ttsTrackId=${playbackDiag?.ttsTrackId ?? 'n/a'} senderTrackIds=[${(playbackDiag?.senderTrackIds ?? []).join(',')}] trackMatch=${playbackDiag?.trackMatch ?? false}`,
- );
- this._logger.info(
- `TTS WebRTC diagnostics: pcs=${playbackDiag?.after?.pcs ?? 0}, senders=${playbackDiag?.after?.senderCount ?? 0}, ` +
- `deltaBytes=${playbackDiag?.deltaBytes ?? 0}, deltaPackets=${playbackDiag?.deltaPackets ?? 0}`,
- );
this._logger.info('Audio playback completed');
} catch (error) {
this._logger.error('Error playing audio:', error);