service-teams-browser-bot/src/bot/mediaGetUserMediaPatch.ts
2026-04-25 17:00:32 +02:00

530 lines
18 KiB
TypeScript

/**
* File overview (describes `poweronMediaPatchInstall` below).
*
* Injected in the browser: wraps getUserMedia, TTS destination, optional canvas
* video. Must be a single self-contained function for Playwright serialization.
* Re-calling this on the same document re-patches gUM and reuses the saved
* Chromium getUserMedia + AudioContext when present (Teams can replace
* navigator.mediaDevices.getUserMedia after a document/iframe refresh).
*/
/**
* Options passed to `poweronMediaPatchInstall`.
*/
export type MediaGetUserMediaPatchOptions = {
/**
* When true, getUserMedia video requests are answered with a clone of the
* canvas avatar track, and `__forceVideoTrackToSenders` is allowed to place
* that track on video senders. When false the video path is left alone.
*/
useCanvasVideo: boolean;
/**
* Text drawn in the centre of the avatar canvas (only the first 72 characters
* are rendered).
*/
displayLabel: string;
};
export const poweronMediaPatchInstall = (opts: MediaGetUserMediaPatchOptions) => {
'use strict';
const { useCanvasVideo, displayLabel } = opts;
const w: any = window as any;
// Capture the getUserMedia implementation that is present the first time the
// patch runs. Later re-installs find the saved reference and keep delegating
// to it instead of wrapping a previously installed wrapper.
if (!w.__gumChromium) {
  const nativeDevices: any = navigator.mediaDevices;
  w.__gumChromium = nativeDevices.getUserMedia.bind(nativeDevices);
}
// Patch RTCPeerConnection.prototype methods once per realm to observe Teams'
// track placement + SDP negotiation. We DO NOT modify any tracks here; gUM
// already returns the canvas video track to Teams, so the right track is
// placed on the sender automatically. We only OBSERVE so we can diagnose
// what Teams does (or fails to do).
//
// Every wrapper logs inside its own try/catch so a logging failure can never
// break the real WebRTC call, and then delegates to the saved original.
if (!w.__poweronRtcPatched && (window as any).RTCPeerConnection) {
  // Set the flag first so a re-install on the same realm never double-wraps.
  w.__poweronRtcPatched = true;
  const RTCProto: any = (window as any).RTCPeerConnection.prototype;
  const _origAddTrack = RTCProto.addTrack;
  const _origAddTransceiver = RTCProto.addTransceiver;
  const _origRemoveTrack = RTCProto.removeTrack;
  const _origReplaceTrackProto = (window as any).RTCRtpSender?.prototype?.replaceTrack;
  const _origSetLocalDescription = RTCProto.setLocalDescription;
  const _origSetRemoteDescription = RTCProto.setRemoteDescription;

  // Logs the added track and, in canvas mode, remembers every video sender in
  // w.__poweronVideoSenders together with the id of the track it was created for.
  RTCProto.addTrack = function (this: any, track: MediaStreamTrack, ...streams: MediaStream[]) {
    try {
      // eslint-disable-next-line no-console
      console.log(
        '[AudioPlayback] pc.addTrack kind=' + (track && track.kind)
          + ' id=' + (track && track.id)
          + ' enabled=' + (track && track.enabled),
      );
    } catch {
      // ignore
    }
    const sender = _origAddTrack.call(this, track, ...streams);
    try {
      if (useCanvasVideo && track && track.kind === 'video') {
        const list = (w.__poweronVideoSenders = w.__poweronVideoSenders || []);
        list.push({ sender, originalTrackId: track.id });
      }
    } catch {
      // ignore
    }
    return sender;
  };

  // Logs the media kind (string argument or track.kind) and requested direction.
  RTCProto.addTransceiver = function (this: any, trackOrKind: any, init?: any) {
    try {
      const k = typeof trackOrKind === 'string' ? trackOrKind : trackOrKind?.kind;
      // eslint-disable-next-line no-console
      console.log(
        '[AudioPlayback] pc.addTransceiver kind=' + k
          + ' direction=' + (init && init.direction),
      );
    } catch {
      // ignore
    }
    return _origAddTransceiver.call(this, trackOrKind, init);
  };

  if (_origRemoveTrack) {
    RTCProto.removeTrack = function (this: any, sender: any) {
      try {
        // eslint-disable-next-line no-console
        console.log(
          '[AudioPlayback] pc.removeTrack senderTrackKind='
            + (sender && sender.track && sender.track.kind)
            + ' senderTrackId=' + (sender && sender.track && sender.track.id),
        );
      } catch {
        // ignore
      }
      return _origRemoveTrack.call(this, sender);
    };
  }

  if (_origReplaceTrackProto) {
    (window as any).RTCRtpSender.prototype.replaceTrack = function (this: any, track: any) {
      try {
        // eslint-disable-next-line no-console
        console.log(
          '[AudioPlayback] sender.replaceTrack(by=teams?) currentKind='
            + (this.track && this.track.kind)
            + ' newKind=' + (track && track.kind)
            + ' newId=' + (track && track.id),
        );
      } catch {
        // ignore
      }
      return _origReplaceTrackProto.call(this, track);
    };
  }

  // Condenses an SDP blob to its m= lines plus the direction and a=mid
  // attributes, each prefixed with the first five characters of the media
  // section it belongs to, and logs the result on one line.
  const _logSdp = (label: string, sdp?: string) => {
    try {
      if (!sdp) {
        // eslint-disable-next-line no-console
        console.log('[AudioPlayback] ' + label + ' sdp=<none>');
        return;
      }
      const lines = sdp.split(/\r?\n/);
      const interesting: string[] = [];
      let curM = '';
      for (const ln of lines) {
        if (ln.startsWith('m=')) {
          curM = ln.slice(2, 7);
          interesting.push('M:' + ln);
        } else if (
          ln.startsWith('a=sendrecv')
          || ln.startsWith('a=sendonly')
          || ln.startsWith('a=recvonly')
          || ln.startsWith('a=inactive')
        ) {
          interesting.push(curM + ':' + ln);
        } else if (ln.startsWith('a=mid:')) {
          interesting.push(curM + ':' + ln);
        }
      }
      // eslint-disable-next-line no-console
      console.log('[AudioPlayback] ' + label + ' ' + interesting.join(' | '));
    } catch {
      // ignore
    }
  };

  // All arguments are forwarded untouched so the legacy
  // (description, successCallback, failureCallback) call form keeps working.
  RTCProto.setLocalDescription = function (this: any, ...args: any[]) {
    const desc = args[0];
    try {
      // A missing description or a description without a type is an implicit
      // ("auto") setLocalDescription. The previous expression evaluated to
      // undefined in that case because `||` binds tighter than `?:`.
      const t = (desc && desc.type) || 'auto';
      _logSdp('setLocalDescription type=' + t, desc && desc.sdp);
    } catch {
      // ignore
    }
    return _origSetLocalDescription.apply(this, args);
  };

  RTCProto.setRemoteDescription = function (this: any, ...args: any[]) {
    const desc = args[0];
    try {
      const t = desc && desc.type;
      _logSdp('setRemoteDescription type=' + t, desc && desc.sdp);
    } catch {
      // ignore
    }
    return _origSetRemoteDescription.apply(this, args);
  };
}
// Build the TTS audio graph only once per window: an AudioContext plus a
// MediaStream destination node whose stream carries the synthesized speech.
// The three handles are parked on window so a re-install reuses them.
if (!w.__ttsStreamDest) {
  const ContextCtor = (window as any).AudioContext || (window as any).webkitAudioContext;
  const audioCtx: AudioContext = new ContextCtor();
  const ttsDestination: MediaStreamAudioDestinationNode = audioCtx.createMediaStreamDestination();
  Object.assign(w, {
    __ttsAudioContext: audioCtx,
    __ttsStreamDest: ttsDestination,
    __ttsAudioStream: ttsDestination.stream,
  });
}
// Everything installed below needs the destination node; bail out without it.
const streamDest = w.__ttsStreamDest as MediaStreamAudioDestinationNode;
if (!streamDest) {
  return;
}
// Target frame rate for both the canvas capture and the redraw timer.
const _fps = 15;
/**
* (Re)builds the canvas-backed avatar video stream.
*
* No-op while the current stream is healthy (started flag set, track still
* 'live', canvas still attached to the document). Otherwise the previous
* timer, track and canvas are torn down and a fresh 640x360 canvas, capture
* stream and redraw timer are created. Results are published on window:
* __botAvatarCanvas, __botAvatarVideoTrack, __botAvatarStreamObj,
* __botAvatarDrawInterval, __botAvatarDisplayLabel, __botAvatarStreamStarted.
*/
w.__startBotAvatarStream = () => {
  if (
    w.__botAvatarStreamStarted
    && w.__botAvatarVideoTrack
    && w.__botAvatarVideoTrack.readyState === 'live'
    && w.__botAvatarCanvas
    && w.__botAvatarCanvas.isConnected
  ) {
    return;
  }
  // --- tear down whatever is left of the previous build ---
  if (w.__botAvatarDrawInterval) {
    clearInterval(w.__botAvatarDrawInterval);
    w.__botAvatarDrawInterval = null;
  }
  try {
    w.__botAvatarVideoTrack?.stop?.();
  } catch {
    // ignore
  }
  try {
    // Drop the stale canvas as well; otherwise every rebuild leaves another
    // fixed-position overlay element behind in the document.
    w.__botAvatarCanvas?.remove?.();
  } catch {
    // ignore
  }
  // --- build the new canvas ---
  w.__botAvatarStreamStarted = true;
  w.__botAvatarDisplayLabel = displayLabel;
  const canvas = document.createElement('canvas');
  canvas.width = 640;
  canvas.height = 360;
  canvas.setAttribute('data-poweron-avatar', '1');
  // Render at a real size so the compositor produces frames in headless mode.
  // captureStream() in headless Chromium can stall when the canvas is 0/invisible.
  canvas.style.cssText =
    'position:fixed;left:0;top:0;width:160px;height:90px;z-index:2147483646;opacity:0.99;pointer-events:none;background:#000;';
  (document.body || document.documentElement).appendChild(canvas);
  w.__botAvatarCanvas = canvas;
  const c2d = canvas.getContext('2d');
  // Animation phase; only drives the pulsing top/bottom bars.
  let t = 0;
  // Paints one frame: background gradient, border, "PORTA" tag, the display
  // label (read from window on every frame so it can be changed live), the
  // "poweron" caption and the pulsing bars.
  const draw = () => {
    if (!c2d) {
      return;
    }
    t += 0.04;
    const wPx = canvas.width;
    const hPx = canvas.height;
    c2d.fillStyle = '#061525';
    c2d.fillRect(0, 0, wPx, hPx);
    const g = c2d.createLinearGradient(0, 0, wPx, hPx);
    g.addColorStop(0, '#1a4f8c');
    g.addColorStop(0.5, '#0c305a');
    g.addColorStop(1, '#132e6e');
    c2d.fillStyle = g;
    c2d.fillRect(0, 0, wPx, hPx);
    c2d.strokeStyle = 'rgba(255, 200, 80, 0.95)';
    c2d.lineWidth = 3;
    c2d.strokeRect(6, 6, wPx - 12, hPx - 12);
    c2d.fillStyle = 'rgba(255, 220, 120, 0.95)';
    c2d.font = '600 13px system-ui, "Segoe UI", sans-serif';
    c2d.textAlign = 'left';
    c2d.textBaseline = 'top';
    c2d.fillText('PORTA', 14, 10);
    c2d.textAlign = 'center';
    c2d.textBaseline = 'middle';
    c2d.fillStyle = '#ffffff';
    c2d.font = 'bold 28px system-ui, "Segoe UI", sans-serif';
    const line = (w.__botAvatarDisplayLabel || displayLabel).toString().slice(0, 72);
    c2d.fillText(line, wPx / 2, hPx / 2 - 6);
    c2d.fillStyle = 'rgba(255,255,255,0.78)';
    c2d.font = '14px system-ui, "Segoe UI", sans-serif';
    c2d.fillText('poweron', wPx / 2, hPx / 2 + 26);
    const pulse = 0.75 + 0.25 * Math.sin(t);
    c2d.fillStyle = 'rgba(120, 200, 255, ' + 0.15 * pulse + ')';
    c2d.fillRect(0, 0, wPx, 6);
    c2d.fillRect(0, hPx - 6, wPx, 6);
  };
  // Paint once before capturing so the stream starts with a real frame.
  draw();
  // Capture at fps for compositor-driven frames AND also push manual frames
  // via requestFrame() each tick for headless reliability.
  const cap = (canvas as any).captureStream(_fps) as MediaStream;
  w.__botAvatarVideoTrack = cap.getVideoTracks()[0];
  w.__botAvatarStreamObj = cap;
  if (w.__botAvatarVideoTrack) {
    w.__botAvatarVideoTrack.enabled = true;
    try {
      w.__botAvatarVideoTrack.contentHint = 'motion';
    } catch {
      // ignore
    }
  }
  // Timer tick: repaint, then ask the capture track for a frame when the
  // track exposes requestFrame(). Both steps are individually best-effort.
  const _tickAndPush = () => {
    try {
      draw();
    } catch {
      // ignore
    }
    try {
      const tr: any = w.__botAvatarVideoTrack;
      if (tr && typeof tr.requestFrame === 'function') {
        tr.requestFrame();
      }
    } catch {
      // ignore
    }
  };
  w.__botAvatarDrawInterval = window.setInterval(_tickAndPush, 1000 / _fps);
  // eslint-disable-next-line no-console
  console.log(
    '[AudioPlayback] canvas avatar stream (re)built, videoTrack=',
    w.__botAvatarVideoTrack ? w.__botAvatarVideoTrack.id : 'none',
  );
};
/**
* Places the canvas avatar track on every negotiated video sender of the peer
* connections listed in w.__audioCapturePeerConnections and returns a
* diagnostics object.
*
* Returns early with reason 'canvas-video-off' when canvas video is disabled
* and 'no-avatar-track' when the avatar stream could not provide a track.
* Otherwise the result has reason 'ok' plus counters, per-transceiver
* direction snapshots, outbound-rtp stats for every sender, and the state of
* the avatar source track. `added` is always 0 (this function never adds
* senders); it is kept so the result shape stays stable.
*/
w.__forceVideoTrackToSenders = async () => {
  if (!useCanvasVideo) {
    return { replaced: 0, pcs: 0, reason: 'canvas-video-off' };
  }
  w.__startBotAvatarStream();
  const src: MediaStreamTrack | undefined = w.__botAvatarVideoTrack;
  if (!src) {
    return { replaced: 0, pcs: 0, reason: 'no-avatar-track' };
  }
  const pcs: RTCPeerConnection[] = (w.__audioCapturePeerConnections || []) as RTCPeerConnection[];
  // Clones installed by this function, mapped to the id of the avatar track
  // they were cloned from. A clone never shares its source's id, so comparing
  // sender.track.id with src.id alone can never recognise our own clones.
  const placedClones: WeakMap<MediaStreamTrack, string> =
    (w.__poweronAvatarSenderClones = w.__poweronAvatarSenderClones || new WeakMap());
  // Media kind of a transceiver, taken from whichever side exposes it.
  const kindOf = (t: any): string | null =>
    t.kind || t.sender?.track?.kind || t.receiver?.track?.kind || null;
  let replaced = 0;
  const added = 0;
  let videoTransceivers = 0;
  let videoSendersWithTrack = 0;
  let videoSendersWithoutTrack = 0;
  let totalTransceivers = 0;
  const directionsBefore: string[] = [];
  const directionsAfter: string[] = [];
  for (const pc of pcs) {
    const transceivers = (pc as any).getTransceivers?.() || [];
    totalTransceivers += transceivers.length;
    // Snapshot signaling/connection state for diagnostics.
    try {
      // eslint-disable-next-line no-console
      console.log(
        '[AudioPlayback] pc state sig=' + (pc as any).signalingState
          + ' conn=' + (pc as any).connectionState
          + ' ice=' + (pc as any).iceConnectionState,
      );
    } catch {
      // ignore
    }
    for (const t of transceivers) {
      const sender = t.sender;
      if (!sender || kindOf(t) !== 'video') {
        continue;
      }
      videoTransceivers++;
      directionsBefore.push(t.direction);
      const current: MediaStreamTrack | null = sender.track;
      if (current) {
        videoSendersWithTrack++;
      } else {
        videoSendersWithoutTrack++;
      }
      // Only replace the track if Teams has fully negotiated the video sender.
      // Touching it before currentDirection is set can abort the in-flight
      // SDP renegotiation and leave the stream stuck.
      const cd = (t as any).currentDirection;
      if (!cd || cd === 'inactive') {
        directionsAfter.push('skip(cd=' + (cd || 'null') + ')');
        continue;
      }
      // "Ours" = the avatar track itself, or a still-live clone of the
      // current avatar track that an earlier call installed.
      const alreadyOurs =
        !!current
        && (current.id === src.id
          || (placedClones.get(current) === src.id && current.readyState === 'live'));
      if (alreadyOurs) {
        directionsAfter.push('keep(' + t.direction + ')');
        continue;
      }
      let fresh: MediaStreamTrack | undefined;
      try {
        fresh = src.clone();
        // eslint-disable-next-line no-await-in-loop
        await sender.replaceTrack(fresh);
        placedClones.set(fresh, src.id);
        // A stale clone that we installed earlier is referenced by nobody
        // else; stop it so it does not stay live forever. Tracks that came
        // from Teams are left alone.
        if (current && placedClones.has(current)) {
          try {
            current.stop();
          } catch {
            // ignore
          }
        }
        replaced++;
        const tr = sender.track;
        if (tr && !tr.enabled) {
          tr.enabled = true;
        }
        directionsAfter.push(t.direction);
      } catch (err: any) {
        // The clone never made it onto the sender: release it.
        if (fresh && sender.track !== fresh) {
          try {
            fresh.stop();
          } catch {
            // ignore
          }
        }
        directionsAfter.push('err:' + String(err && err.message ? err.message : err).slice(0, 32));
      }
    }
  }
  // Read outbound stats unconditionally so we can see what RTP streams exist.
  const videoStats: any[] = [];
  const currentDirections: string[] = [];
  for (const pc of pcs) {
    try {
      const transceivers = (pc as any).getTransceivers?.() || [];
      for (const t of transceivers) {
        const sender = t.sender;
        if (kindOf(t) === 'video') {
          currentDirections.push(`d=${t.direction}/cd=${(t as any).currentDirection || 'n/a'}`);
        }
        if (!sender) {
          continue;
        }
        // eslint-disable-next-line no-await-in-loop
        const stats = await sender.getStats();
        stats.forEach((r: any) => {
          if (r.type === 'outbound-rtp') {
            videoStats.push({
              kind: r.kind || r.mediaType || 'unknown',
              bytes: r.bytesSent || 0,
              packets: r.packetsSent || 0,
              framesEncoded: r.framesEncoded || 0,
              framesSent: r.framesSent || 0,
              fps: r.framesPerSecond || 0,
              w: r.frameWidth || 0,
              h: r.frameHeight || 0,
            });
          }
        });
      }
    } catch {
      // ignore
    }
  }
  return {
    replaced,
    added,
    pcs: pcs.length,
    reason: 'ok',
    videoTransceivers,
    videoSendersWithTrack,
    videoSendersWithoutTrack,
    totalTransceivers,
    directionsBefore,
    directionsAfter,
    currentDirections,
    videoStats,
    trackId: src.id,
    trackEnabled: src.enabled,
    trackReady: src.readyState,
    trackMuted: src.muted,
  };
};
/**
* Replacement for navigator.mediaDevices.getUserMedia.
*
* The saved original getUserMedia is always called first, so its rejections
* propagate to the caller unchanged. The returned stream then depends on what
* was requested:
*  - canvas mode + video requested: a new stream with a clone of the avatar
*    track (plus clones of the TTS audio tracks when audio was requested);
*    every track of the real stream is stopped. If no avatar track exists the
*    real stream is returned untouched.
*  - audio requested otherwise: a new stream with clones of the TTS audio
*    tracks and the real video tracks; the real audio tracks are stopped
*    because nothing references them afterwards.
*  - anything else: the real stream.
*/
const _wrappedGUM = async (constraints?: MediaStreamConstraints) => {
  const wantAudio = !!(constraints && constraints.audio);
  const wantVideo = !!(constraints && constraints.video);
  // eslint-disable-next-line no-console
  console.log('[AudioPlayback] gUM call audio=' + wantAudio + ' video=' + wantVideo);
  // eslint-disable-next-line no-restricted-globals
  const realStream = await w.__gumChromium(constraints);
  // Best-effort release of capture tracks we are not handing to the caller.
  const stopQuietly = (tracks: MediaStreamTrack[]) => {
    for (const track of tracks) {
      try {
        track.stop();
      } catch {
        // ignore
      }
    }
  };
  // Adds a fresh clone of every TTS audio track to the target stream.
  const addTtsAudio = (target: MediaStream) => {
    streamDest.stream.getAudioTracks().forEach((t: MediaStreamTrack) => target.addTrack(t.clone()));
  };

  if (useCanvasVideo && wantVideo) {
    w.__startBotAvatarStream();
    const vt: MediaStreamTrack | undefined = w.__botAvatarVideoTrack;
    if (!vt) {
      return realStream;
    }
    const synthetic = new MediaStream();
    if (wantAudio) {
      addTtsAudio(synthetic);
    }
    synthetic.addTrack(vt.clone());
    stopQuietly(realStream.getTracks());
    if (wantAudio) {
      // eslint-disable-next-line no-console
      console.log(
        '[AudioPlayback] getUserMedia (canvas+tts): a=' + synthetic.getAudioTracks().length
          + ' v=' + synthetic.getVideoTracks().length,
      );
    }
    return synthetic;
  }

  if (wantAudio) {
    const combinedStream = new MediaStream();
    addTtsAudio(combinedStream);
    realStream.getVideoTracks().forEach((t: MediaStreamTrack) => combinedStream.addTrack(t));
    // The real microphone tracks are replaced by the TTS clones and are not
    // returned to anyone; stop them so the capture does not stay open.
    stopQuietly(realStream.getAudioTracks());
    // eslint-disable-next-line no-console
    console.log(
      '[AudioPlayback] gUM audio: a=' + combinedStream.getAudioTracks().length
        + ' v=' + combinedStream.getVideoTracks().length,
    );
    return combinedStream;
  }
  return realStream;
};
// Install the wrapper as navigator.mediaDevices.getUserMedia. A property
// definition is attempted first; if that throws, fall back to plain assignment.
const gumDescriptor: PropertyDescriptor = {
  value: _wrappedGUM,
  writable: true,
  enumerable: true,
  configurable: true,
};
try {
  Object.defineProperty(navigator.mediaDevices, 'getUserMedia', gumDescriptor);
} catch {
  (navigator.mediaDevices as any).getUserMedia = _wrappedGUM;
}
// Some libraries cache navigator.getUserMedia (legacy)
// The callback-style entry point is routed through the same wrapper.
try {
  (navigator as any).getUserMedia = (
    legacyConstraints: MediaStreamConstraints,
    onSuccess: any,
    onFailure: any,
  ) => {
    _wrappedGUM(legacyConstraints).then(onSuccess, onFailure);
  };
} catch {
  // ignore
}
/**
* Puts a fresh clone of the TTS audio track on every audio sender of the peer
* connections listed in w.__audioCapturePeerConnections.
*
* Returns { replaced, pcs, reason, diag }. reason is 'no-tts-track' (without
* diag) when the TTS destination has no audio track, otherwise 'ok'. diag
* records the TTS track state, the sender track ids before/after, the state
* of the last replaced sender track, and the message of the last error.
*
* A sender whose replaceTrack fails is recorded in diag.error and skipped;
* the remaining senders are still processed.
*/
w.__forceTtsTrackToSenders = async () => {
  const pcs: RTCPeerConnection[] = (w.__audioCapturePeerConnections || []) as RTCPeerConnection[];
  const ttsTrack = streamDest.stream.getAudioTracks()?.[0];
  if (!ttsTrack) {
    return { replaced: 0, pcs: pcs.length, reason: 'no-tts-track' };
  }
  const diag: Record<string, any> = {
    ttsTrackId: ttsTrack.id,
    ttsTrackEnabled: ttsTrack.enabled,
    ttsTrackReadyState: ttsTrack.readyState,
    ttsTrackMuted: ttsTrack.muted,
    beforeSenderTrackIds: [] as string[],
    afterSenderTrackIds: [] as string[],
  };
  // Clones this function has installed. Only these are safe to stop once they
  // are replaced again; tracks handed to Teams by getUserMedia are never touched.
  const placedClones: WeakSet<MediaStreamTrack> =
    (w.__poweronTtsSenderClones = w.__poweronTtsSenderClones || new WeakSet());
  const describeError = (err: any) => String(err && err.message ? err.message : err);
  let replaced = 0;
  for (const pc of pcs) {
    let senders: RTCRtpSender[] = [];
    try {
      senders = pc.getSenders?.() || [];
    } catch (err: any) {
      diag.error = describeError(err);
      continue;
    }
    for (const sender of senders) {
      const previous = sender?.track;
      if (!previous || previous.kind !== 'audio') {
        continue;
      }
      diag.beforeSenderTrackIds.push(previous.id);
      let freshClone: MediaStreamTrack | undefined;
      try {
        freshClone = ttsTrack.clone();
        // eslint-disable-next-line no-await-in-loop
        await sender.replaceTrack(freshClone);
      } catch (err: any) {
        diag.error = describeError(err);
        // The clone never reached the sender: release it instead of leaking it.
        try {
          freshClone?.stop();
        } catch {
          // ignore
        }
        continue;
      }
      placedClones.add(freshClone);
      // Release the clone installed by an earlier call; without this every
      // invocation leaves one more live track behind.
      if (placedClones.has(previous)) {
        try {
          previous.stop();
        } catch {
          // ignore
        }
      }
      replaced++;
      const afterTrack = sender.track;
      diag.afterSenderTrackIds.push(afterTrack ? afterTrack.id : 'null');
      diag.afterSenderTrackEnabled = afterTrack ? afterTrack.enabled : undefined;
      diag.afterSenderTrackReadyState = afterTrack ? afterTrack.readyState : undefined;
      diag.originalTrackState = ttsTrack.readyState;
      if (afterTrack && !afterTrack.enabled) {
        afterTrack.enabled = true;
        diag.forcedEnabled = true;
      }
    }
  }
  return { replaced, pcs: pcs.length, reason: 'ok', diag };
};
};