service-teams-browser-bot/src/bot/mediaGetUserMediaPatch.ts
2026-05-12 19:16:07 +02:00

573 lines
20 KiB
TypeScript

/**
* Injected in the browser: wraps getUserMedia, TTS destination, optional canvas
* video. Must be a single self-contained function for Playwright serialization.
* Re-calling this on the same document re-patches gUM and reuses the saved
* Chromium getUserMedia + AudioContext when present (Teams can replace
* navigator.mediaDevices.getUserMedia after a document/iframe refresh).
*/
/** Options handed to the in-page media patch installer. */
export type MediaGetUserMediaPatchOptions = {
  /**
   * When true, getUserMedia video requests are answered with a track captured
   * from an avatar canvas instead of the real camera track.
   */
  useCanvasVideo: boolean;

  /** Text drawn in the middle of the avatar when no custom media is shown. */
  displayLabel: string;

  /** Hex/CSS color of the static avatar background (default: light blue). */
  avatarBgColor?: string;

  /** Hex/CSS color of the centered display label (default: dark blue). */
  avatarTextColor?: string;

  /** Base64-encoded image/video data for a custom bot avatar. */
  avatarMediaData?: string;

  /** MIME type of the avatar media (e.g. image/png, video/mp4). */
  avatarMediaType?: string;
};
/**
 * Installs the bot's media hooks into the current browser document.
 *
 * The function is serialized and evaluated inside the page, so it must stay a
 * single self-contained function: no imports and no module-level helpers.
 * All state lives on `window`, which lets a repeated call on the same document
 * reuse what an earlier call created:
 *
 * - `__gumChromium`: the browser's own getUserMedia, saved on first install.
 * - Logging wrappers around RTCPeerConnection / RTCRtpSender prototype
 *   methods (installed once, guarded by `__poweronRtcPatched`).
 * - `__ttsAudioContext`, `__ttsStreamDest`, `__ttsAudioStream`: the audio
 *   destination whose track is handed out instead of the microphone.
 * - `__startBotAvatarStream()`: (re)builds the avatar canvas + captured track.
 * - `__forceVideoTrackToSenders()` / `__forceTtsTrackToSenders()`: push the
 *   avatar / TTS track onto the senders of `__audioCapturePeerConnections`
 *   and return a diagnostics object.
 * - `navigator.mediaDevices.getUserMedia` (and legacy `navigator.getUserMedia`)
 *   replaced by a wrapper that substitutes those tracks.
 *
 * @param opts Avatar appearance and whether video is served from the canvas.
 * @returns Nothing; returns early if no TTS destination node is available.
 */
export const poweronMediaPatchInstall = (opts: MediaGetUserMediaPatchOptions) => {
  'use strict';
  const { useCanvasVideo, displayLabel } = opts;
  // `||` rather than `??` on purpose: an empty string is not a usable value here.
  const avatarBgColor = opts.avatarBgColor || '#a8d4f0';
  const avatarTextColor = opts.avatarTextColor || '#1a3552';
  const avatarMediaData = opts.avatarMediaData || '';
  const avatarMediaType = opts.avatarMediaType || '';
  const w: any = window as any;

  // Keep the browser's original getUserMedia across re-installs so the wrapper
  // never ends up wrapping itself.
  if (!w.__gumChromium) {
    w.__gumChromium = (navigator.mediaDevices as any).getUserMedia.bind(navigator.mediaDevices);
  }

  // Registry of avatar-track clones created by this patch. A clone gets a new
  // id, so "is this sender already sending our avatar?" cannot be answered by
  // comparing ids with the source track; we remember the source id instead.
  // `forced` marks clones that only a sender references (safe to stop later).
  if (!w.__poweronAvatarClones) {
    w.__poweronAvatarClones = new WeakMap();
  }
  const avatarClones: WeakMap<object, { sourceId: string; forced: boolean }> = w.__poweronAvatarClones;
  // TTS clones that were pushed onto senders by __forceTtsTrackToSenders.
  if (!w.__poweronForcedTtsClones) {
    w.__poweronForcedTtsClones = new WeakSet();
  }
  const forcedTtsClones: WeakSet<object> = w.__poweronForcedTtsClones;

  /** Stops a track, ignoring missing tracks and any error from stop(). */
  const _stopQuietly = (track: any) => {
    try {
      track?.stop?.();
    } catch {
      // ignore
    }
  };

  // Patch RTCPeerConnection.prototype methods once per realm to observe Teams'
  // track placement + SDP negotiation. We DO NOT modify any tracks here; gUM
  // already returns the canvas video track to Teams, so the right track is
  // placed on the sender automatically. We only OBSERVE so we can diagnose
  // what Teams does (or fails to do).
  if (!w.__poweronRtcPatched && w.RTCPeerConnection) {
    w.__poweronRtcPatched = true;
    const RTCProto: any = w.RTCPeerConnection.prototype;
    const _origAddTrack = RTCProto.addTrack;
    const _origAddTransceiver = RTCProto.addTransceiver;
    const _origRemoveTrack = RTCProto.removeTrack;
    const _origReplaceTrackProto = w.RTCRtpSender?.prototype?.replaceTrack;
    const _origSetLocalDescription = RTCProto.setLocalDescription;
    const _origSetRemoteDescription = RTCProto.setRemoteDescription;

    RTCProto.addTrack = function (this: any, track: MediaStreamTrack, ...streams: MediaStream[]) {
      try {
        // eslint-disable-next-line no-console
        console.log(
          '[AudioPlayback] pc.addTrack kind=' + (track && track.kind)
            + ' id=' + (track && track.id)
            + ' enabled=' + (track && track.enabled),
        );
      } catch {
        // ignore
      }
      const sender = _origAddTrack.call(this, track, ...streams);
      try {
        // Remember every video sender together with the id of the track it
        // was created with.
        if (useCanvasVideo && track && track.kind === 'video') {
          const list = (w.__poweronVideoSenders = w.__poweronVideoSenders || []);
          list.push({ sender, originalTrackId: track.id });
        }
      } catch {
        // ignore
      }
      return sender;
    };

    RTCProto.addTransceiver = function (this: any, trackOrKind: any, init?: any) {
      try {
        const k = typeof trackOrKind === 'string' ? trackOrKind : trackOrKind?.kind;
        // eslint-disable-next-line no-console
        console.log(
          '[AudioPlayback] pc.addTransceiver kind=' + k
            + ' direction=' + (init && init.direction),
        );
      } catch {
        // ignore
      }
      return _origAddTransceiver.call(this, trackOrKind, init);
    };

    if (_origRemoveTrack) {
      RTCProto.removeTrack = function (this: any, sender: any) {
        try {
          // eslint-disable-next-line no-console
          console.log(
            '[AudioPlayback] pc.removeTrack senderTrackKind='
              + (sender && sender.track && sender.track.kind)
              + ' senderTrackId=' + (sender && sender.track && sender.track.id),
          );
        } catch {
          // ignore
        }
        return _origRemoveTrack.call(this, sender);
      };
    }

    if (_origReplaceTrackProto) {
      w.RTCRtpSender.prototype.replaceTrack = function (this: any, track: any) {
        try {
          // eslint-disable-next-line no-console
          console.log(
            '[AudioPlayback] sender.replaceTrack(by=teams?) currentKind='
              + (this.track && this.track.kind)
              + ' newKind=' + (track && track.kind)
              + ' newId=' + (track && track.id),
          );
        } catch {
          // ignore
        }
        return _origReplaceTrackProto.call(this, track);
      };
    }

    /** Logs the m-lines, direction attributes and mids of an SDP blob. */
    const _logSdp = (label: string, sdp?: string) => {
      try {
        if (!sdp) {
          // eslint-disable-next-line no-console
          console.log('[AudioPlayback] ' + label + ' sdp=<none>');
          return;
        }
        const interesting: string[] = [];
        let curM = '';
        for (const ln of sdp.split(/\r?\n/)) {
          if (ln.startsWith('m=')) {
            // Media kind is the first token of the m-line; take all of it so
            // "application" is not cut down to "appli".
            curM = ln.slice(2).split(' ')[0];
            interesting.push('M:' + ln);
          } else if (
            ln.startsWith('a=sendrecv')
            || ln.startsWith('a=sendonly')
            || ln.startsWith('a=recvonly')
            || ln.startsWith('a=inactive')
            || ln.startsWith('a=mid:')
          ) {
            interesting.push(curM + ':' + ln);
          }
        }
        // eslint-disable-next-line no-console
        console.log('[AudioPlayback] ' + label + ' ' + interesting.join(' | '));
      } catch {
        // ignore
      }
    };

    // Extra arguments are forwarded untouched so the legacy
    // (description, successCallback, failureCallback) overloads keep working.
    RTCProto.setLocalDescription = function (this: any, desc?: any, ...legacyCallbacks: any[]) {
      try {
        // No description / no type means the browser creates it implicitly.
        const t = desc && desc.type ? desc.type : 'auto';
        _logSdp('setLocalDescription type=' + t, desc && desc.sdp);
      } catch {
        // ignore
      }
      return _origSetLocalDescription.call(this, desc, ...legacyCallbacks);
    };

    RTCProto.setRemoteDescription = function (this: any, desc?: any, ...legacyCallbacks: any[]) {
      try {
        const t = desc && desc.type;
        _logSdp('setRemoteDescription type=' + t, desc && desc.sdp);
      } catch {
        // ignore
      }
      return _origSetRemoteDescription.call(this, desc, ...legacyCallbacks);
    };
  }

  // One AudioContext + MediaStream destination per document; its audio track
  // is what getUserMedia hands out in place of the microphone.
  if (!w.__ttsStreamDest) {
    const AudioContextClass = w.AudioContext || w.webkitAudioContext;
    const ctx: AudioContext = new AudioContextClass();
    const createdDest: MediaStreamAudioDestinationNode = ctx.createMediaStreamDestination();
    w.__ttsAudioContext = ctx;
    w.__ttsStreamDest = createdDest;
    w.__ttsAudioStream = createdDest.stream;
  }
  const streamDest = w.__ttsStreamDest as MediaStreamAudioDestinationNode;
  if (!streamDest) {
    return;
  }

  const _hasCustomMedia = !!(avatarMediaData && avatarMediaType);
  const _isCustomVideo = _hasCustomMedia && avatarMediaType.startsWith('video/');
  // Video avatars are redrawn at 15 fps, static ones at 2 fps.
  const _fps = _isCustomVideo ? 15 : 2;

  /**
   * Builds the avatar canvas and captures a video track from it. A no-op when
   * the previous build is still usable (track live, canvas still in the DOM);
   * otherwise the previous build is torn down and replaced.
   */
  w.__startBotAvatarStream = () => {
    if (
      w.__botAvatarStreamStarted
      && w.__botAvatarVideoTrack
      && w.__botAvatarVideoTrack.readyState === 'live'
      && w.__botAvatarCanvas
      && w.__botAvatarCanvas.isConnected
    ) {
      return;
    }
    if (w.__botAvatarDrawInterval) {
      clearInterval(w.__botAvatarDrawInterval);
      w.__botAvatarDrawInterval = null;
    }
    _stopQuietly(w.__botAvatarVideoTrack);
    // Remove what the previous build left in the DOM; otherwise every rebuild
    // stacks another canvas (and another hidden <video> holding the data URL).
    try {
      w.__botAvatarMediaElement?.pause?.();
    } catch {
      // ignore
    }
    for (const stale of [w.__botAvatarCanvas, w.__botAvatarMediaElement]) {
      try {
        stale?.remove?.();
      } catch {
        // ignore
      }
    }
    w.__botAvatarMediaElement = null;

    w.__botAvatarStreamStarted = true;
    w.__botAvatarDisplayLabel = displayLabel;
    const canvas = document.createElement('canvas');
    canvas.width = _hasCustomMedia ? 1280 : 640;
    canvas.height = _hasCustomMedia ? 720 : 360;
    canvas.setAttribute('data-poweron-avatar', '1');
    // Render at a real size so the compositor produces frames in headless mode.
    // captureStream() in headless Chromium can stall when the canvas is 0/invisible.
    canvas.style.cssText =
      'position:fixed;left:0;top:0;width:160px;height:90px;z-index:2147483646;opacity:0.99;pointer-events:none;background:#000;';
    (document.body || document.documentElement).appendChild(canvas);
    w.__botAvatarCanvas = canvas;
    const c2d = canvas.getContext('2d');

    /** Solid background with the display label centered on it. */
    const _drawFallback = () => {
      if (!c2d) return;
      const wPx = canvas.width;
      const hPx = canvas.height;
      c2d.fillStyle = avatarBgColor;
      c2d.fillRect(0, 0, wPx, hPx);
      c2d.fillStyle = avatarTextColor;
      c2d.font = 'bold 28px system-ui, "Segoe UI", sans-serif';
      c2d.textAlign = 'center';
      c2d.textBaseline = 'middle';
      // String(... ?? '') so a missing label draws nothing instead of throwing.
      const line = String((w.__botAvatarDisplayLabel || displayLabel) ?? '').slice(0, 72);
      c2d.fillText(line, wPx / 2, hPx / 2);
    };

    // Custom media is only drawn once it has loaded / started playing; until
    // then (or if it never does) the label fallback is shown.
    let _mediaReady = false;
    let _mediaElement: any = null;
    if (_hasCustomMedia) {
      const dataUrl = 'data:' + avatarMediaType + ';base64,' + avatarMediaData;
      if (_isCustomVideo) {
        const video = document.createElement('video');
        video.addEventListener('playing', () => { _mediaReady = true; }, { once: true });
        video.src = dataUrl;
        video.loop = true;
        video.muted = true;
        video.playsInline = true;
        video.style.display = 'none';
        (document.body || document.documentElement).appendChild(video);
        video.play().catch(() => { /* autoplay blocked — fall back to static */ });
        _mediaElement = video;
      } else {
        const img = new Image();
        // Attach the handler before assigning src so the load cannot be missed.
        img.onload = () => { _mediaReady = true; };
        img.src = dataUrl;
        _mediaElement = img;
      }
      w.__botAvatarMediaElement = _mediaElement;
    }

    /** Draws the custom media letterboxed on the background, else the fallback. */
    const draw = () => {
      if (!c2d) return;
      if (_mediaReady && _mediaElement) {
        try {
          const cW = canvas.width;
          const cH = canvas.height;
          const srcW = _isCustomVideo
            ? (_mediaElement as HTMLVideoElement).videoWidth || cW
            : (_mediaElement as HTMLImageElement).naturalWidth || cW;
          const srcH = _isCustomVideo
            ? (_mediaElement as HTMLVideoElement).videoHeight || cH
            : (_mediaElement as HTMLImageElement).naturalHeight || cH;
          // Fit inside the canvas while preserving the aspect ratio.
          const scale = Math.min(cW / srcW, cH / srcH);
          const dW = srcW * scale;
          const dH = srcH * scale;
          const dX = (cW - dW) / 2;
          const dY = (cH - dH) / 2;
          c2d.fillStyle = avatarBgColor;
          c2d.fillRect(0, 0, cW, cH);
          c2d.drawImage(_mediaElement, dX, dY, dW, dH);
          return;
        } catch {
          // corrupted frame — fall through to fallback
        }
      }
      _drawFallback();
    };
    draw();

    // Capture at fps for compositor-driven frames AND also push manual frames
    // via requestFrame() each tick for headless reliability.
    const cap = (canvas as any).captureStream(_fps) as MediaStream;
    w.__botAvatarVideoTrack = cap.getVideoTracks()[0];
    w.__botAvatarStreamObj = cap;
    if (w.__botAvatarVideoTrack) {
      w.__botAvatarVideoTrack.enabled = true;
      try {
        w.__botAvatarVideoTrack.contentHint = _isCustomVideo ? 'motion' : 'detail';
      } catch {
        // ignore
      }
    }
    const _tickAndPush = () => {
      try {
        draw();
      } catch {
        // ignore
      }
      try {
        const tr: any = w.__botAvatarVideoTrack;
        if (tr && typeof tr.requestFrame === 'function') {
          tr.requestFrame();
        }
      } catch {
        // ignore
      }
    };
    w.__botAvatarDrawInterval = window.setInterval(_tickAndPush, 1000 / _fps);
    // eslint-disable-next-line no-console
    console.log(
      '[AudioPlayback] canvas avatar stream (re)built, videoTrack=',
      w.__botAvatarVideoTrack ? w.__botAvatarVideoTrack.id : 'none',
    );
  };

  /**
   * Puts a clone of the avatar track on every negotiated video sender of the
   * peer connections in `__audioCapturePeerConnections`, skipping senders that
   * already carry a live clone of the current avatar track.
   *
   * @returns Counters plus per-transceiver direction and outbound-rtp stats.
   */
  w.__forceVideoTrackToSenders = async () => {
    if (!useCanvasVideo) {
      return { replaced: 0, pcs: 0, reason: 'canvas-video-off' };
    }
    w.__startBotAvatarStream();
    const src: MediaStreamTrack | undefined = w.__botAvatarVideoTrack;
    if (!src) {
      return { replaced: 0, pcs: 0, reason: 'no-avatar-track' };
    }
    const pcs: RTCPeerConnection[] = (w.__audioCapturePeerConnections || []) as RTCPeerConnection[];
    let replaced = 0;
    // Nothing is ever added here; kept so the result shape stays unchanged.
    const added = 0;
    let videoTransceivers = 0;
    let videoSendersWithTrack = 0;
    let videoSendersWithoutTrack = 0;
    let totalTransceivers = 0;
    const directionsBefore: string[] = [];
    const directionsAfter: string[] = [];
    for (const pc of pcs) {
      const transceivers = (pc as any).getTransceivers?.() || [];
      totalTransceivers += transceivers.length;
      // Snapshot signaling/connection state for diagnostics.
      try {
        // eslint-disable-next-line no-console
        console.log(
          '[AudioPlayback] pc state sig=' + (pc as any).signalingState
            + ' conn=' + (pc as any).connectionState
            + ' ice=' + (pc as any).iceConnectionState,
        );
      } catch {
        // ignore
      }
      for (const t of transceivers) {
        const sender = t.sender;
        if (!sender) {
          continue;
        }
        const senderKind = sender.track?.kind;
        const receiverKind = t.receiver?.track?.kind;
        const txKind = (t as any).kind || senderKind || receiverKind || null;
        if (txKind !== 'video') {
          continue;
        }
        videoTransceivers++;
        directionsBefore.push(t.direction);
        if (sender.track) {
          videoSendersWithTrack++;
        } else {
          videoSendersWithoutTrack++;
        }
        // Only replace the track if Teams has fully negotiated the video sender.
        // Touching it before currentDirection is set can abort the in-flight
        // SDP renegotiation and leave the stream stuck.
        const cd = (t as any).currentDirection;
        if (!cd || cd === 'inactive') {
          directionsAfter.push('skip(cd=' + (cd || 'null') + ')');
          continue;
        }
        // Senders carry clones, whose ids differ from src.id, so consult the
        // clone registry in addition to the direct id comparison.
        const current: any = sender.track;
        const meta = current ? avatarClones.get(current) : undefined;
        const alreadyOurs =
          !!current
          && current.readyState !== 'ended'
          && (current.id === src.id || (!!meta && meta.sourceId === src.id));
        if (alreadyOurs) {
          directionsAfter.push('keep(' + t.direction + ')');
          continue;
        }
        let fresh: MediaStreamTrack | undefined;
        try {
          fresh = src.clone();
          // eslint-disable-next-line no-await-in-loop
          await sender.replaceTrack(fresh);
          avatarClones.set(fresh, { sourceId: src.id, forced: true });
          replaced++;
          // A clone we pushed earlier is referenced by nobody else: release it.
          if (meta && meta.forced) {
            _stopQuietly(current);
          }
          const tr = sender.track;
          if (tr && !tr.enabled) {
            tr.enabled = true;
          }
          directionsAfter.push(t.direction);
        } catch (err: any) {
          // The clone never made it onto the sender; do not leave it running.
          _stopQuietly(fresh);
          directionsAfter.push('err:' + String(err && err.message ? err.message : err).slice(0, 32));
        }
      }
    }
    // Read outbound stats unconditionally so we can see what RTP streams exist.
    const videoStats: any[] = [];
    const currentDirections: string[] = [];
    for (const pc of pcs) {
      try {
        const transceivers = (pc as any).getTransceivers?.() || [];
        for (const t of transceivers) {
          const sender = t.sender;
          const txKind =
            (t as any).kind
            || sender?.track?.kind
            || t.receiver?.track?.kind
            || null;
          if (txKind === 'video') {
            currentDirections.push(`d=${t.direction}/cd=${(t as any).currentDirection || 'n/a'}`);
          }
          if (!sender) {
            continue;
          }
          // eslint-disable-next-line no-await-in-loop
          const stats = await sender.getStats();
          stats.forEach((r: any) => {
            if (r.type === 'outbound-rtp') {
              videoStats.push({
                kind: r.kind || r.mediaType || 'unknown',
                bytes: r.bytesSent || 0,
                packets: r.packetsSent || 0,
                framesEncoded: r.framesEncoded || 0,
                framesSent: r.framesSent || 0,
                fps: r.framesPerSecond || 0,
                w: r.frameWidth || 0,
                h: r.frameHeight || 0,
              });
            }
          });
        }
      } catch {
        // ignore
      }
    }
    return {
      replaced,
      added,
      pcs: pcs.length,
      reason: 'ok',
      videoTransceivers,
      videoSendersWithTrack,
      videoSendersWithoutTrack,
      totalTransceivers,
      directionsBefore,
      directionsAfter,
      currentDirections,
      videoStats,
      trackId: src.id,
      trackEnabled: src.enabled,
      trackReady: src.readyState,
      trackMuted: src.muted,
    };
  };

  /**
   * getUserMedia replacement. Always calls the saved browser implementation
   * first, then swaps audio for a clone of the TTS track and (when canvas
   * video is on) video for a clone of the avatar track. Real tracks that are
   * swapped out are stopped.
   */
  const _wrappedGUM = async (constraints?: MediaStreamConstraints) => {
    const wantAudio = !!(constraints && constraints.audio);
    const wantVideo = !!(constraints && constraints.video);
    // eslint-disable-next-line no-console
    console.log('[AudioPlayback] gUM call audio=' + wantAudio + ' video=' + wantVideo);
    // eslint-disable-next-line no-restricted-globals
    const realStream = await w.__gumChromium(constraints);
    const _addTtsAudio = (target: MediaStream) => {
      streamDest.stream.getAudioTracks().forEach((t: MediaStreamTrack) => target.addTrack(t.clone()));
    };
    if (useCanvasVideo && wantVideo) {
      w.__startBotAvatarStream();
      const vt: MediaStreamTrack | undefined = w.__botAvatarVideoTrack;
      if (!vt) {
        return realStream;
      }
      const vClone = vt.clone();
      // Registered as not sender-exclusive: the caller holds this clone too.
      avatarClones.set(vClone, { sourceId: vt.id, forced: false });
      const substitute = new MediaStream();
      if (wantAudio) {
        _addTtsAudio(substitute);
      }
      substitute.addTrack(vClone);
      try {
        realStream.getTracks().forEach((t: any) => t.stop());
      } catch {
        // ignore
      }
      if (wantAudio) {
        // eslint-disable-next-line no-console
        console.log(
          '[AudioPlayback] getUserMedia (canvas+tts): a=' + substitute.getAudioTracks().length
            + ' v=' + substitute.getVideoTracks().length,
        );
      }
      return substitute;
    }
    if (wantAudio) {
      const combinedStream = new MediaStream();
      _addTtsAudio(combinedStream);
      realStream.getVideoTracks().forEach((t: any) => combinedStream.addTrack(t));
      // The real audio tracks are not returned to anyone; release the device.
      try {
        realStream.getAudioTracks().forEach((t: any) => t.stop());
      } catch {
        // ignore
      }
      // eslint-disable-next-line no-console
      console.log(
        '[AudioPlayback] gUM audio: a=' + combinedStream.getAudioTracks().length
          + ' v=' + combinedStream.getVideoTracks().length,
      );
      return combinedStream;
    }
    return realStream;
  };

  try {
    Object.defineProperty(navigator.mediaDevices, 'getUserMedia', {
      configurable: true,
      enumerable: true,
      writable: true,
      value: _wrappedGUM,
    });
  } catch {
    (navigator.mediaDevices as any).getUserMedia = _wrappedGUM;
  }
  // Some libraries cache navigator.getUserMedia (legacy)
  try {
    (navigator as any).getUserMedia = (constraints: MediaStreamConstraints, ok: any, err: any) => {
      _wrappedGUM(constraints).then(ok, err);
    };
  } catch {
    // ignore
  }

  /**
   * Replaces the track of every audio sender on the peer connections in
   * `__audioCapturePeerConnections` with a fresh clone of the TTS track.
   *
   * @returns Replacement count plus a `diag` object describing the tracks.
   */
  w.__forceTtsTrackToSenders = async () => {
    const pcs: RTCPeerConnection[] = (w.__audioCapturePeerConnections || []) as RTCPeerConnection[];
    const ttsTrack = streamDest.stream.getAudioTracks()?.[0];
    if (!ttsTrack) {
      return { replaced: 0, pcs: pcs?.length || 0, reason: 'no-tts-track' };
    }
    const diag: Record<string, any> = {
      ttsTrackId: ttsTrack.id,
      ttsTrackEnabled: ttsTrack.enabled,
      ttsTrackReadyState: ttsTrack.readyState,
      ttsTrackMuted: ttsTrack.muted,
      beforeSenderTrackIds: [] as string[],
      afterSenderTrackIds: [] as string[],
    };
    let replaced = 0;
    for (const pc of pcs) {
      try {
        const senders = pc.getSenders?.() || [];
        for (const sender of senders) {
          if (sender?.track?.kind !== 'audio') {
            continue;
          }
          const previous = sender.track;
          diag.beforeSenderTrackIds.push(previous.id);
          const freshClone = ttsTrack.clone();
          try {
            // eslint-disable-next-line no-await-in-loop
            await sender.replaceTrack(freshClone);
          } catch (replaceErr) {
            // The clone never reached the sender; stop it before reporting.
            _stopQuietly(freshClone);
            throw replaceErr;
          }
          forcedTtsClones.add(freshClone);
          // Only stop clones this function pushed earlier; tracks handed to
          // the page through getUserMedia may still be referenced there.
          if (forcedTtsClones.has(previous)) {
            _stopQuietly(previous);
          }
          replaced++;
          const afterTrack = sender.track;
          diag.afterSenderTrackIds.push(afterTrack ? afterTrack.id : 'null');
          diag.afterSenderTrackEnabled = afterTrack ? afterTrack.enabled : undefined;
          diag.afterSenderTrackReadyState = afterTrack ? afterTrack.readyState : undefined;
          diag.originalTrackState = ttsTrack.readyState;
          if (afterTrack && !afterTrack.enabled) {
            afterTrack.enabled = true;
            diag.forcedEnabled = true;
          }
        }
      } catch (err: any) {
        diag.error = String(err && err.message ? err.message : err);
      }
    }
    return { replaced, pcs: pcs?.length || 0, reason: 'ok', diag };
  };
};