service-teams-browser-bot/src/bot/audioProcedure.ts
2026-05-12 17:49:26 +02:00

447 lines
16 KiB
TypeScript

import { Page } from 'playwright';
import { Logger } from 'winston';
import { poweronMediaPatchInstall } from './mediaGetUserMediaPatch';
/**
* Handles audio playback in the Teams meeting.
*
* Architecture:
* - Browser launches with --use-fake-device-for-media-stream so Teams sees
* real-looking devices (no "no audio/video" modal).
* - Before any page loads, we inject an init script that wraps getUserMedia.
* - When Teams calls getUserMedia, the wrapper:
* 1. Calls the REAL getUserMedia (which returns Chromium's fake device stream)
* 2. Replaces the audio track with one from our MediaStreamDestination
* 3. Returns the modified stream; optional canvas video track instead of fake video
* - When TTS audio is played, it's piped into the MediaStreamDestination,
* and Teams sends it via WebRTC to other meeting participants.
*/
/**
 * Optional settings accepted by the AudioProcedure constructor.
 * Every field may be omitted; the constructor substitutes its own defaults.
 */
export type AudioProcedureOptions = {
  /** Coerced to a boolean by the constructor and forwarded to the media patch. */
  useCanvasVideo?: boolean;

  /**
   * Text drawn in the center of the canvas, e.g. the bot's display name.
   * The constructor falls back to 'Bot' when this is missing or blank.
   */
  displayLabel?: string;

  /**
   * Hex/CSS color of the static avatar background.
   * The constructor falls back to '#a8d4f0' (light blue) when missing or blank.
   */
  avatarBgColor?: string;

  /**
   * Hex/CSS color of the centered display label.
   * The constructor falls back to '#1a3552' (dark blue) when missing or blank.
   */
  avatarTextColor?: string;
};
/**
 * Plays TTS clips into the Teams meeting through an in-page Web Audio graph and,
 * optionally, keeps a canvas-based avatar attached to the outgoing video senders.
 *
 * Clips are played strictly one after another. Every clip that is currently
 * playing registers a cancel callback in `window.__ttsActivePlaybacks`, so that
 * stop/cleanup requests can end it immediately and release the Node-side promise
 * that is waiting for the clip to finish.
 */
export class AudioProcedure {
  private readonly _page: Page;
  private readonly _logger: Logger;
  private readonly _useCanvasVideo: boolean;
  private readonly _displayLabel: string;
  private readonly _avatarBgColor: string;
  private readonly _avatarTextColor: string;
  /** True once initialize() has completed. Despite the name this is a flag, not the AudioContext. */
  private _audioContext: boolean = false;
  private _initScriptInjected: boolean = false;
  private _audioQueue: Array<{ audioData: string; format: 'mp3' | 'wav' | 'pcm' }> = [];
  private _isPlaying: boolean = false;
  private _stopRequested: boolean = false;

  /**
   * @param page    Playwright page hosting the Teams web client.
   * @param logger  Logger used for all diagnostics.
   * @param options Optional avatar/video settings; blank strings fall back to the defaults below.
   */
  constructor(page: Page, logger: Logger, options?: AudioProcedureOptions) {
    this._page = page;
    this._logger = logger;
    this._useCanvasVideo = !!options?.useCanvasVideo;
    this._displayLabel = (options?.displayLabel || 'Bot').trim() || 'Bot';
    this._avatarBgColor = (options?.avatarBgColor || '').trim() || '#a8d4f0';
    this._avatarTextColor = (options?.avatarTextColor || '').trim() || '#1a3552';
  }

  /** Builds the argument object handed to the media patch installer. */
  private _mediaPatchPayload(): {
    useCanvasVideo: boolean;
    displayLabel: string;
    avatarBgColor: string;
    avatarTextColor: string;
  } {
    return {
      useCanvasVideo: this._useCanvasVideo,
      displayLabel: this._displayLabel,
      avatarBgColor: this._avatarBgColor,
      avatarTextColor: this._avatarTextColor,
    };
  }

  /**
   * Inject the getUserMedia wrapper BEFORE any page navigation.
   * This MUST be called before navigating to Teams.
   * Uses browserContext.addInitScript so the hook runs in the main page and
   * in embedded frames (Teams often runs media/WebRTC in an iframe; page-only
   * injection would miss getUserMedia and you would only see the fake device).
   * Calling it more than once is a no-op.
   */
  async injectAudioOverride(): Promise<void> {
    if (this._initScriptInjected) {
      return;
    }
    this._logger.info(
      `Injecting audio getUserMedia override (canvasVideo=${this._useCanvasVideo}, label="${this._displayLabel}")...`,
    );
    await this._page.context().addInitScript(poweronMediaPatchInstall, this._mediaPatchPayload());
    this._initScriptInjected = true;
    this._logger.info('Audio getUserMedia override injected');
  }

  /**
   * Re-run the media patch in every frame. Needed when Teams replaces the document
   * in an iframe (addInitScript runs too early) or overwrites getUserMedia.
   * Frames that reject the evaluation are logged and skipped.
   */
  async reinstallMediaPatchInAllFrames(): Promise<void> {
    const payload = this._mediaPatchPayload();
    for (const frame of this._page.frames()) {
      try {
        await frame.evaluate(poweronMediaPatchInstall, payload);
      } catch (e) {
        this._logger.info(`[mediaPatch] frame skipped: ${e}`);
      }
    }
    await this._forceCanvasVideoInAllFrames('reinstall');
  }

  /**
   * Replace outbound video in every frame. Teams may run WebRTC in a subframe;
   * only touching the main window leaves Chromium's default fake (green) video.
   * Does nothing unless canvas video is enabled. Emits one summary log line.
   *
   * @param phase Label included in the log line to identify the caller.
   */
  private async _forceCanvasVideoInAllFrames(phase: string): Promise<void> {
    if (!this._useCanvasVideo) {
      return;
    }
    const parts: string[] = [];
    for (const frame of this._page.frames()) {
      try {
        const r = await frame.evaluate(async () => {
          const w = window as any;
          w.__startBotAvatarStream?.();
          return w.__forceVideoTrackToSenders?.();
        });
        const shortUrl = (() => {
          try {
            return frame.url().substring(0, 100);
          } catch {
            return '(no-url)';
          }
        })();
        const rr: any = r || {};
        const vsArr = (rr.videoStats || []) as any[];
        const vs = vsArr.length
          ? vsArr.map(v => `${v.kind}:b=${v.bytes},p=${v.packets},fEnc=${v.framesEncoded},fSent=${v.framesSent},fps=${v.fps},${v.w}x${v.h}`).join(' | ')
          : 'none';
        // NOTE: .trim() binds to the last fragment only; it strips the trailing
        // space that is left behind when no reason was reported.
        parts.push(
          `[${shortUrl}] r=${rr.replaced ?? 0} add=${rr.added ?? 0} pcs=${rr.pcs ?? 0} `
          + `tx=${rr.totalTransceivers ?? 0} vidTx=${rr.videoTransceivers ?? 0} `
          + `vidWith=${rr.videoSendersWithTrack ?? 0} vidNoTrack=${rr.videoSendersWithoutTrack ?? 0} `
          + `dirB=[${(rr.directionsBefore || []).join(',')}] dirA=[${(rr.directionsAfter || []).join(',')}] `
          + `cd=[${(rr.currentDirections || []).join(',')}] `
          + `track=${rr.trackId || 'n/a'}(en=${rr.trackEnabled},rs=${rr.trackReady},mu=${rr.trackMuted}) `
          + `vstats=[${vs}] ${rr.reason || ''}`.trim(),
        );
      } catch (e: any) {
        parts.push(`err=${String(e?.message || e).slice(0, 64)}`);
      }
    }
    this._logger.info(`Canvas video ${phase}: ${parts.join(' | ')}`);
  }

  /**
   * Initialize the audio context in the browser for TTS playback.
   * Must be called after joining the meeting (user gesture context).
   * Calling it again after a successful run is a no-op until cleanup() is called.
   */
  async initialize(): Promise<void> {
    if (this._audioContext) {
      return;
    }
    this._logger.info('Initializing audio context...');
    await this._page.evaluate(() => {
      const w = window as any;
      // __ttsAudioContext is normally created by the init script.
      const ctx = w.__ttsAudioContext as AudioContext | undefined;
      if (ctx && ctx.state !== 'closed') {
        if (ctx.state === 'suspended') {
          // Deliberately not awaited: resume() may stay pending until the browser
          // allows audio, and initialization must not block on that.
          ctx.resume().catch(() => undefined);
        }
        return;
      }
      // The init script did not run (e.g. page navigated before injection) or the
      // previous context was closed by cleanup(): create fallback audio infrastructure.
      const AudioContextClass = window.AudioContext || w.webkitAudioContext;
      const newCtx = new AudioContextClass();
      const streamDest = newCtx.createMediaStreamDestination();
      w.__ttsAudioContext = newCtx;
      w.__ttsStreamDest = streamDest;
      w.__ttsAudioStream = streamDest.stream;
    });
    // No-op when canvas video is disabled.
    await this._forceCanvasVideoInAllFrames('init');
    this._audioContext = true;
    this._logger.info('Audio context initialized');
  }

  /**
   * Queue audio for sequential playback.
   * Audio is never played in parallel -- each clip waits for the previous one to finish.
   *
   * If nothing is playing, the returned promise resolves once the whole queue has
   * been drained; if playback is already running, it resolves right after enqueueing.
   *
   * @param audioData Base64 encoded audio data
   * @param format Audio format (mp3, wav, pcm)
   */
  async playAudio(audioData: string, format: 'mp3' | 'wav' | 'pcm'): Promise<void> {
    this._audioQueue.push({ audioData, format });
    this._logger.info(`Audio queued (queue size: ${this._audioQueue.length}, playing: ${this._isPlaying})`);
    if (!this._isPlaying) {
      await this._processAudioQueue();
    }
  }

  /**
   * Process the audio queue sequentially until it is empty or a stop is requested.
   * A failing clip is logged and does not prevent the following clips from playing.
   */
  private async _processAudioQueue(): Promise<void> {
    if (this._isPlaying) return;
    this._isPlaying = true;
    this._stopRequested = false;
    try {
      while (this._audioQueue.length > 0 && !this._stopRequested) {
        const item = this._audioQueue.shift()!;
        try {
          await this._playAudioInternal(item.audioData, item.format);
        } catch (error) {
          this._logger.error('Error playing queued audio:', error);
        }
      }
      if (this._stopRequested) {
        this._audioQueue = [];
        this._logger.info('Audio queue cleared due to stop request');
      }
    } finally {
      // Always release the playing flag so a later playAudio() can restart the queue.
      this._isPlaying = false;
      this._stopRequested = false;
    }
  }

  /**
   * Ends playback inside the page: cancels every registered clip (which also
   * resolves the promise the Node side is awaiting), then suspends the context.
   *
   * @param resumeDelayMs Delay after which the context is resumed again, or
   *                      `null` to leave it suspended.
   */
  private async _haltPlaybackInPage(resumeDelayMs: number | null): Promise<void> {
    await this._page.evaluate((delayMs) => {
      const w = window as any;
      // Invalidate clips that are still being decoded and have not started yet.
      w.__ttsStopGeneration = Number(w.__ttsStopGeneration || 0) + 1;
      const active = w.__ttsActivePlaybacks as Set<() => void> | undefined;
      if (active) {
        // Copy first: each cancel callback removes itself from the set.
        for (const cancel of Array.from(active)) {
          cancel();
        }
      }
      const ctx = w.__ttsAudioContext as AudioContext | undefined;
      if (!ctx || ctx.state === 'closed') {
        return;
      }
      // Suspend immediately stops all audio output.
      ctx.suspend().catch(() => undefined);
      if (delayMs !== null) {
        // Resume after a short delay so future audio can play.
        setTimeout(() => {
          ctx.resume().catch(() => undefined);
        }, delayMs);
      }
    }, resumeDelayMs);
  }

  /**
   * Stop all audio immediately: stop current playback and clear the queue.
   * Called when a user says "<botname> STOP" or similar.
   * Errors from the page (e.g. not ready yet) are ignored.
   */
  async stopAllAudio(): Promise<void> {
    this._logger.info('Stop all audio requested');
    this._stopRequested = true;
    this._audioQueue = [];
    try {
      await this._haltPlaybackInPage(100);
    } catch {
      // Page might not be ready
    }
  }

  /**
   * Sums the outbound-rtp audio counters over all captured peer connections and
   * lists the state of every audio sender track. Diagnostic only.
   */
  private async _collectAudioSenderStats(): Promise<{
    bytesSent: number;
    packetsSent: number;
    senderInfo: Array<{ id: string; enabled: boolean; readyState: string; muted: boolean }>;
  }> {
    return this._page.evaluate(async () => {
      const pcs = ((window as any).__audioCapturePeerConnections || []) as RTCPeerConnection[];
      let bytesSent = 0;
      let packetsSent = 0;
      const senderInfo: Array<{ id: string; enabled: boolean; readyState: string; muted: boolean }> = [];
      for (const pc of pcs) {
        for (const s of (pc.getSenders?.() || [])) {
          const track = s?.track;
          if (!track || track.kind !== 'audio') continue;
          try {
            const stats = await s.getStats();
            stats.forEach((r: any) => {
              if (r.type === 'outbound-rtp' && r.kind === 'audio') {
                bytesSent += Number(r.bytesSent || 0);
                packetsSent += Number(r.packetsSent || 0);
              }
            });
          } catch {
            // stats unavailable for this sender; keep going
          }
          senderInfo.push({ id: track.id, enabled: track.enabled, readyState: track.readyState, muted: track.muted });
        }
      }
      return { bytesSent, packetsSent, senderInfo };
    });
  }

  /**
   * Internal: Play audio in the browser (single clip, no queuing).
   * Audio is piped into the MediaStreamDestination that Teams uses as mic input.
   * Before playback, forces all WebRTC audio senders to use the TTS track
   * (sender.replaceTrack) so Teams transmits our audio to participants.
   *
   * Resolves when the clip has ended or was cancelled by a stop request.
   *
   * @param audioData Base64 encoded audio data.
   * @param format    'pcm' is treated as mono 16-bit samples at 16 kHz; other formats
   *                  are decoded with AudioContext.decodeAudioData.
   * @throws Rethrows any error raised while preparing or playing the clip (after logging it).
   */
  private async _playAudioInternal(audioData: string, format: 'mp3' | 'wav' | 'pcm'): Promise<void> {
    if (!this._audioContext) {
      await this.initialize();
    }
    this._logger.info(`Playing audio (format: ${format}, size: ${audioData.length} bytes base64)`);
    try {
      // Force all outgoing audio senders to use the TTS track
      const senderInjectInfo = await this._page.evaluate(async () => {
        const forceFn = (window as any).__forceTtsTrackToSenders;
        if (typeof forceFn === 'function') {
          return await forceFn();
        }
        return { replaced: 0, pcs: 0, reason: 'force-function-missing' };
      });
      // #region agent log
      const diag = senderInjectInfo?.diag || {};
      this._logger.info(
        `TTS sender injection: replaced=${senderInjectInfo?.replaced ?? 0}, pcs=${senderInjectInfo?.pcs ?? 0}, reason=${senderInjectInfo?.reason || 'n/a'} ` +
        `ttsTrack=${diag.ttsTrackId || 'n/a'}(enabled=${diag.ttsTrackEnabled},state=${diag.ttsTrackReadyState},muted=${diag.ttsTrackMuted}) ` +
        `beforeSenders=[${(diag.beforeSenderTrackIds || []).join(',')}] afterSenders=[${(diag.afterSenderTrackIds || []).join(',')}] ` +
        `afterEnabled=${diag.afterSenderTrackEnabled} afterState=${diag.afterSenderTrackReadyState} forcedEnabled=${diag.forcedEnabled || false}`,
      );
      // #endregion
      await this._forceCanvasVideoInAllFrames('tts');

      // Collect WebRTC stats BEFORE playback
      const statsBefore = await this._collectAudioSenderStats();

      // A stop may have arrived while the preparation steps above were awaiting.
      if (this._stopRequested) {
        this._logger.info('Audio playback skipped due to stop request');
        return;
      }

      await this._page.evaluate(async ({ audioData, format }) => {
        const w = window as any;
        const ctx = w.__ttsAudioContext as AudioContext;
        const streamDest = w.__ttsStreamDest as MediaStreamAudioDestinationNode;
        if (!ctx || !streamDest) {
          throw new Error('Audio context not initialized');
        }
        // Remember the stop generation so a stop issued during decoding is honored.
        const generation = Number(w.__ttsStopGeneration || 0);
        if (ctx.state === 'suspended') {
          await ctx.resume();
        }
        const binaryString = atob(audioData);
        const bytes = new Uint8Array(binaryString.length);
        for (let i = 0; i < binaryString.length; i++) {
          bytes[i] = binaryString.charCodeAt(i);
        }
        let audioBuffer: AudioBuffer;
        if (format === 'pcm') {
          // Ignore a dangling odd byte: Int16Array requires a multiple of 2 bytes.
          const sampleCount = Math.floor(bytes.byteLength / 2);
          if (sampleCount === 0) {
            // createBuffer rejects a zero-length buffer; nothing to play.
            return;
          }
          const pcmData = new Int16Array(bytes.buffer, 0, sampleCount);
          audioBuffer = ctx.createBuffer(1, sampleCount, 16000);
          const channelData = audioBuffer.getChannelData(0);
          for (let i = 0; i < sampleCount; i++) {
            channelData[i] = pcmData[i] / 32768;
          }
        } else {
          audioBuffer = await ctx.decodeAudioData(bytes.buffer.slice(0));
        }
        if (Number(w.__ttsStopGeneration || 0) !== generation) {
          // Stop was requested while decoding; do not start the clip.
          return;
        }
        const source = ctx.createBufferSource();
        source.buffer = audioBuffer;
        source.connect(streamDest);
        return new Promise<void>((resolve) => {
          if (!w.__ttsActivePlaybacks) {
            w.__ttsActivePlaybacks = new Set<() => void>();
          }
          const active = w.__ttsActivePlaybacks as Set<() => void>;
          let finished = false;
          // Runs on natural end AND on cancellation; resolves exactly once.
          const finish = () => {
            if (finished) return;
            finished = true;
            active.delete(finish);
            source.onended = null;
            try {
              source.stop();
            } catch {
              // already stopped
            }
            try {
              source.disconnect();
            } catch {
              // already disconnected
            }
            resolve();
          };
          active.add(finish);
          source.onended = finish;
          source.start(0);
        });
      }, { audioData, format });

      // Collect WebRTC stats AFTER playback
      // #region agent log
      const statsAfter = await this._collectAudioSenderStats();
      this._logger.info(
        `[Voice] WebRTC stats: before(bytes=${statsBefore.bytesSent},pkts=${statsBefore.packetsSent}) ` +
        `after(bytes=${statsAfter.bytesSent},pkts=${statsAfter.packetsSent}) ` +
        `delta(bytes=${statsAfter.bytesSent - statsBefore.bytesSent},pkts=${statsAfter.packetsSent - statsBefore.packetsSent}) ` +
        `senders=${JSON.stringify(statsAfter.senderInfo)}`,
      );
      // #endregion
      this._logger.info('Audio playback completed');
    } catch (error) {
      this._logger.error('Error playing audio:', error);
      throw error;
    }
  }

  /**
   * Stop any currently playing audio.
   * The current clip is cancelled and the context is left suspended; the next
   * clip resumes it. Queued clips are NOT discarded (use stopAllAudio for that).
   */
  async stopAudio(): Promise<void> {
    try {
      await this._haltPlaybackInPage(null);
    } catch {
      // Ignore errors
    }
  }

  /**
   * Clean up audio resources: drops queued clips, cancels the running clip,
   * tears down the avatar canvas/track and closes the audio context in every frame.
   * Frames that cannot be evaluated (cross-origin, closed) are skipped.
   */
  async cleanup(): Promise<void> {
    // Prevent the queue loop from starting another clip while resources are torn down.
    this._stopRequested = true;
    this._audioQueue = [];
    try {
      for (const frame of this._page.frames()) {
        try {
          await frame.evaluate(() => {
            const w = window as any;
            // Release any clip still waiting for its `ended` event so the
            // Node-side playback promise cannot hang on a closed context.
            w.__ttsStopGeneration = Number(w.__ttsStopGeneration || 0) + 1;
            const active = w.__ttsActivePlaybacks as Set<() => void> | undefined;
            if (active) {
              for (const cancel of Array.from(active)) {
                cancel();
              }
            }
            if (w.__botAvatarDrawInterval) {
              clearInterval(w.__botAvatarDrawInterval);
              w.__botAvatarDrawInterval = null;
            }
            if (w.__botAvatarVideoTrack) {
              try {
                w.__botAvatarVideoTrack.stop();
              } catch {
                // ignore
              }
              w.__botAvatarVideoTrack = null;
            }
            if (w.__botAvatarCanvas && w.__botAvatarCanvas.remove) {
              w.__botAvatarCanvas.remove();
              w.__botAvatarCanvas = null;
            }
            w.__botAvatarStreamStarted = false;
            const actx = w.__ttsAudioContext as AudioContext | undefined;
            // close() rejects on an already-closed context, so guard against it.
            if (actx && actx.state !== 'closed') {
              actx.close().catch(() => undefined);
            }
          });
        } catch {
          // cross-origin or closed frame
        }
      }
    } catch {
      // Page might be closed
    }
    this._audioContext = false;
  }
}