This commit is contained in:
ValueOn AG 2026-04-25 01:13:30 +02:00
parent 89e6d442ab
commit 2293ba9552
8 changed files with 1447 additions and 241 deletions

1
package-lock.json generated
View file

@ -7,6 +7,7 @@
"": { "": {
"name": "service-teams-browser-bot", "name": "service-teams-browser-bot",
"version": "1.0.0", "version": "1.0.0",
"hasInstallScript": true,
"dependencies": { "dependencies": {
"dotenv": "^16.4.1", "dotenv": "^16.4.1",
"express": "^4.18.2", "express": "^4.18.2",

View file

@ -154,9 +154,9 @@ export class AudioCaptureProcedure {
async injectCaptureOverride(): Promise<void> { async injectCaptureOverride(): Promise<void> {
if (this._injected) return; if (this._injected) return;
this._logger.info('[AudioCapture] Injecting RTCPeerConnection wrapper...'); this._logger.info('[AudioCapture] Injecting RTCPeerConnection wrapper (all frames)...');
await this._page.addInitScript((workletCode: string) => { await this._page.context().addInitScript((workletCode: string) => {
(window as any).__audioCaptureChunks = [] as any[]; (window as any).__audioCaptureChunks = [] as any[];
(window as any).__audioCaptureProcessors = {} as Record<string, any>; (window as any).__audioCaptureProcessors = {} as Record<string, any>;
(window as any).__audioCaptureContexts = {} as Record<string, AudioContext>; (window as any).__audioCaptureContexts = {} as Record<string, AudioContext>;

View file

@ -1,5 +1,6 @@
import { Page } from 'playwright'; import { Page } from 'playwright';
import { Logger } from 'winston'; import { Logger } from 'winston';
import { poweronMediaPatchInstall } from './mediaGetUserMediaPatch';
/** /**
* Handles audio playback in the Teams meeting. * Handles audio playback in the Teams meeting.
@ -11,139 +12,113 @@ import { Logger } from 'winston';
* - When Teams calls getUserMedia, the wrapper: * - When Teams calls getUserMedia, the wrapper:
* 1. Calls the REAL getUserMedia (which returns Chromium's fake device stream) * 1. Calls the REAL getUserMedia (which returns Chromium's fake device stream)
* 2. Replaces the audio track with one from our MediaStreamDestination * 2. Replaces the audio track with one from our MediaStreamDestination
* 3. Returns the modified stream (our audio + Chromium's fake video) * 3. Returns the modified stream; optional canvas video track instead of fake video
* - When TTS audio is played, it's piped into the MediaStreamDestination, * - When TTS audio is played, it's piped into the MediaStreamDestination,
* and Teams sends it via WebRTC to other meeting participants. * and Teams sends it via WebRTC to other meeting participants.
*/ */
/** Optional settings accepted by the AudioProcedure constructor. */
export type AudioProcedureOptions = {
/**
 * When truthy, the injected media patch is asked to use a canvas-based video
 * track. The constructor coerces this to a boolean, so omitting it means off.
 */
useCanvasVideo?: boolean;
/** Shown in the center of the canvas (e.g. bot display name) */
displayLabel?: string;
};
export class AudioProcedure { export class AudioProcedure {
private _page: Page; private _page: Page;
private _logger: Logger; private _logger: Logger;
private _useCanvasVideo: boolean;
private _displayLabel: string;
private _audioContext: boolean = false; private _audioContext: boolean = false;
private _initScriptInjected: boolean = false; private _initScriptInjected: boolean = false;
private _audioQueue: Array<{ audioData: string; format: 'mp3' | 'wav' | 'pcm' }> = []; private _audioQueue: Array<{ audioData: string; format: 'mp3' | 'wav' | 'pcm' }> = [];
private _isPlaying: boolean = false; private _isPlaying: boolean = false;
private _stopRequested: boolean = false; private _stopRequested: boolean = false;
constructor(page: Page, logger: Logger) { constructor(page: Page, logger: Logger, options?: AudioProcedureOptions) {
this._page = page; this._page = page;
this._logger = logger; this._logger = logger;
this._useCanvasVideo = !!options?.useCanvasVideo;
this._displayLabel = (options?.displayLabel || 'Bot').trim() || 'Bot';
} }
/** /**
* Inject the getUserMedia wrapper BEFORE any page navigation. * Inject the getUserMedia wrapper BEFORE any page navigation.
* This MUST be called before navigating to Teams. * This MUST be called before navigating to Teams.
* Uses page.addInitScript so it runs in every new document context. * Uses browserContext.addInitScript so the hook runs in the main page and
* in embedded frames (Teams often runs media/WebRTC in an iframe; page-only
* injection would miss getUserMedia and you would only see the fake device).
*/ */
async injectAudioOverride(): Promise<void> { async injectAudioOverride(): Promise<void> {
if (this._initScriptInjected) { if (this._initScriptInjected) {
return; return;
} }
this._logger.info('Injecting audio getUserMedia override...'); this._logger.info(
`Injecting audio getUserMedia override (canvasVideo=${this._useCanvasVideo}, label="${this._displayLabel}")...`,
);
await this._page.addInitScript(() => { await this._page.context().addInitScript(poweronMediaPatchInstall, {
// Create a shared AudioContext and MediaStreamDestination for TTS injection useCanvasVideo: this._useCanvasVideo,
const AudioContextClass = window.AudioContext || (window as any).webkitAudioContext; displayLabel: this._displayLabel,
const ctx = new AudioContextClass();
const streamDest = ctx.createMediaStreamDestination();
// Store globally for later TTS injection
(window as any).__ttsAudioContext = ctx;
(window as any).__ttsStreamDest = streamDest;
(window as any).__ttsAudioStream = streamDest.stream;
// Wrap getUserMedia to replace audio tracks with our TTS-injectable stream
const originalGetUserMedia = navigator.mediaDevices.getUserMedia.bind(navigator.mediaDevices);
navigator.mediaDevices.getUserMedia = async (constraints?: MediaStreamConstraints) => {
// Get the real stream (from Chromium's fake devices)
const realStream = await originalGetUserMedia(constraints);
if (constraints && constraints.audio) {
// Build a new stream: our TTS audio track + their video tracks
const combinedStream = new MediaStream();
// Clone the TTS track so Teams can't kill the original via track.stop()
streamDest.stream.getAudioTracks().forEach(t => combinedStream.addTrack(t.clone()));
// Keep the real video tracks (from fake camera)
realStream.getVideoTracks().forEach(t => combinedStream.addTrack(t));
// Diagnostic signal for production logs: confirms override really feeds Teams.
try {
const audioTracks = combinedStream.getAudioTracks();
const videoTracks = combinedStream.getVideoTracks();
console.log(
`[AudioPlayback] getUserMedia override active: audioTracks=${audioTracks.length}, videoTracks=${videoTracks.length}, audioLabel="${audioTracks[0]?.label || 'n/a'}"`,
);
} catch {
// ignore
}
return combinedStream;
}
// No audio requested - return the real stream as-is
return realStream;
};
// Force all RTCPeerConnection audio senders to use our TTS track.
// This ensures Teams actually sends our audio even if getUserMedia
// override happened in a different context or was renegotiated.
(window as any).__forceTtsTrackToSenders = async () => {
const pcs = ((window as any).__audioCapturePeerConnections || []) as RTCPeerConnection[];
const ttsTrack = streamDest.stream.getAudioTracks()?.[0];
if (!ttsTrack) return { replaced: 0, pcs: pcs?.length || 0, reason: 'no-tts-track' };
// #region agent log
const diag: Record<string, any> = {
ttsTrackId: ttsTrack.id,
ttsTrackEnabled: ttsTrack.enabled,
ttsTrackReadyState: ttsTrack.readyState,
ttsTrackMuted: ttsTrack.muted,
beforeSenderTrackIds: [] as string[],
afterSenderTrackIds: [] as string[],
};
// #endregion
let replaced = 0;
for (const pc of pcs) {
try {
const senders = pc.getSenders?.() || [];
for (const sender of senders) {
if (sender?.track?.kind === 'audio') {
// #region agent log
diag.beforeSenderTrackIds.push(sender.track.id);
// #endregion
const freshClone = ttsTrack.clone();
await sender.replaceTrack(freshClone);
replaced++;
// #region agent log
const afterTrack = sender.track;
diag.afterSenderTrackIds.push(afterTrack?.id || 'null');
diag.afterSenderTrackEnabled = afterTrack?.enabled;
diag.afterSenderTrackReadyState = afterTrack?.readyState;
diag.originalTrackState = ttsTrack.readyState;
if (afterTrack && !afterTrack.enabled) {
afterTrack.enabled = true;
diag.forcedEnabled = true;
}
// #endregion
}
}
} catch (err: any) {
// #region agent log
diag.error = String(err?.message || err);
// #endregion
}
}
return { replaced, pcs: pcs?.length || 0, reason: 'ok', diag };
};
}); });
this._initScriptInjected = true; this._initScriptInjected = true;
this._logger.info('Audio getUserMedia override injected'); this._logger.info('Audio getUserMedia override injected');
} }
/**
 * Evaluates the media patch again inside each frame currently attached to the
 * page, then pushes the canvas video track to the senders once more.
 *
 * Init scripts alone are not enough: Teams may swap the document of an iframe
 * or overwrite getUserMedia after the init script has already run.
 * A frame whose evaluation fails is logged and skipped; it never aborts the
 * pass over the remaining frames.
 */
async reinstallMediaPatchInAllFrames(): Promise<void> {
  const patchOptions = {
    useCanvasVideo: this._useCanvasVideo,
    displayLabel: this._displayLabel,
  };
  const attachedFrames = this._page.frames();
  for (const target of attachedFrames) {
    try {
      await target.evaluate(poweronMediaPatchInstall, patchOptions);
    } catch (reason) {
      // Evaluation failed for this frame; record it and move on.
      this._logger.info(`[mediaPatch] frame skipped: ${reason}`);
    }
  }
  await this._forceCanvasVideoInAllFrames('reinstall');
}
/**
 * Replace outbound video in every frame. Teams may run WebRTC in a subframe;
 * only touching the main window leaves Chromium's default fake (green) video.
 *
 * For each frame this starts the avatar stream (if the in-page hook exists) and
 * calls the in-page `__forceVideoTrackToSenders` hook, then logs one combined
 * summary line for all frames. Does nothing when canvas video is disabled.
 *
 * @param phase Free-form tag included in the log line (e.g. 'init', 'tts', 'reinstall').
 */
private async _forceCanvasVideoInAllFrames(phase: string): Promise<void> {
  if (!this._useCanvasVideo) {
    return;
  }
  const parts: string[] = [];
  for (const frame of this._page.frames()) {
    try {
      const r = await frame.evaluate(async () => {
        const w = window as any;
        // Both hooks are optional: a frame that never received the patch returns undefined.
        w.__startBotAvatarStream?.();
        return w.__forceVideoTrackToSenders?.();
      });
      const shortUrl = (() => {
        try {
          return frame.url().substring(0, 100);
        } catch {
          return '(no-url)';
        }
      })();
      const rr: any = r || {};
      // Parenthesised so that trim() applies to the whole summary. Previously it
      // bound only to the last template literal and left a trailing space
      // whenever `reason` was empty.
      const summary = (
        `[${shortUrl}] r=${rr.replaced ?? 0} add=${rr.added ?? 0} pcs=${rr.pcs ?? 0} `
        + `tx=${rr.totalTransceivers ?? 0} vidTx=${rr.videoTransceivers ?? 0} `
        + `vidWith=${rr.videoSendersWithTrack ?? 0} vidNoTrack=${rr.videoSendersWithoutTrack ?? 0} `
        + `dirB=[${(rr.directionsBefore || []).join(',')}] dirA=[${(rr.directionsAfter || []).join(',')}] `
        + `${rr.reason || ''}`
      ).trim();
      parts.push(summary);
    } catch (e: any) {
      // Evaluation failed for this frame; keep a short marker in the summary.
      parts.push(`err=${String(e?.message || e).slice(0, 64)}`);
    }
  }
  this._logger.info(`Canvas video ${phase}: ${parts.join(' | ')}`);
}
/** /**
* Initialize the audio context in the browser for TTS playback. * Initialize the audio context in the browser for TTS playback.
* Must be called after joining the meeting (user gesture context). * Must be called after joining the meeting (user gesture context).
@ -175,6 +150,10 @@ export class AudioProcedure {
} }
}); });
if (this._useCanvasVideo) {
await this._forceCanvasVideoInAllFrames('init');
}
this._audioContext = true; this._audioContext = true;
this._logger.info('Audio context initialized'); this._logger.info('Audio context initialized');
} }
@ -279,6 +258,10 @@ export class AudioProcedure {
); );
// #endregion // #endregion
if (this._useCanvasVideo) {
await this._forceCanvasVideoInAllFrames('tts');
}
// Collect WebRTC stats BEFORE playback // Collect WebRTC stats BEFORE playback
// #region agent log // #region agent log
const statsBefore = await this._page.evaluate(async () => { const statsBefore = await this._page.evaluate(async () => {
@ -405,12 +388,36 @@ export class AudioProcedure {
*/ */
async cleanup(): Promise<void> { async cleanup(): Promise<void> {
try { try {
await this._page.evaluate(() => { for (const frame of this._page.frames()) {
const ctx = (window as any).__ttsAudioContext as AudioContext; try {
if (ctx) { await frame.evaluate(() => {
ctx.close(); const w = window as any;
if (w.__botAvatarDrawInterval) {
clearInterval(w.__botAvatarDrawInterval);
w.__botAvatarDrawInterval = null;
}
if (w.__botAvatarVideoTrack) {
try {
w.__botAvatarVideoTrack.stop();
} catch {
// ignore
}
w.__botAvatarVideoTrack = null;
}
if (w.__botAvatarCanvas && w.__botAvatarCanvas.remove) {
w.__botAvatarCanvas.remove();
w.__botAvatarCanvas = null;
}
w.__botAvatarStreamStarted = false;
const actx = w.__ttsAudioContext as AudioContext;
if (actx) {
actx.close();
}
});
} catch {
// cross-origin or closed frame
} }
}); }
} catch { } catch {
// Page might be closed // Page might be closed
} }

View file

@ -19,6 +19,69 @@ export class BackgroundProcedure {
this._logger = logger; this._logger = logger;
} }
/**
 * Open background effects and select "no" virtual background (camera only).
 * Teams can show a flat green/gray placeholder when a background effect is
 * on even when the feed is a fake or canvas source.
 *
 * Strategy: try a list of known "none" selectors (English and German labels,
 * data-tid), then fall back to the first tile of the effects gallery.
 * Once the panel has been opened it is always dismissed again, including
 * when a click or wait throws, so a failure cannot leave the panel covering
 * the pre-join controls.
 *
 * @returns true when a control was clicked, false when the panel could not be
 *          opened, no control was found, or an error occurred (errors are logged,
 *          never thrown).
 */
async trySelectNoVirtualBackground(): Promise<boolean> {
  let panelOpened = false;
  try {
    const opened = await this._openBackgroundEffectsPanel();
    if (!opened) {
      return false;
    }
    panelOpened = true;
    await this._page.waitForTimeout(500);

    const noEffectSelectors: string[] = [
      'button[aria-label*="None" i]',
      'button[aria-label*="Kein" i]',
      'button[aria-label*="ohne" i]',
      'button[aria-label*="off" i][aria-label*="background" i]',
      'button[aria-label*="Hintergrund entfernen" i]',
      '[data-tid="background-item-none"]',
      'button[role="tab"][name="None" i]',
    ];
    for (const sel of noEffectSelectors) {
      const btn = await this._page.$(sel);
      if (!btn) {
        continue;
      }
      await btn.click();
      this._logger.info(`Selected no background effect: ${sel}`);
      await this._page.waitForTimeout(500);
      return true;
    }

    // First gallery tile (often "none" or blur off) in many Teams builds
    const tile = await this._page.$(
      '[data-tid="background-image"], [class*="background-item"], li[role="listitem"] button',
    );
    if (tile) {
      await tile.click();
      this._logger.info('Clicked first background effects tile (often no effect)');
      await this._page.waitForTimeout(400);
      return true;
    }

    this._logger.warn('Could not find "no background" control');
    return false;
  } catch (e) {
    this._logger.warn(`trySelectNoVirtualBackground: ${e}`);
    return false;
  } finally {
    // Runs on success, "not found" and error paths alike; skipped only when
    // the panel never opened, so no stray Escape is sent in that case.
    if (panelOpened) {
      await this._dismissPanelIfOpen();
    }
  }
}
/**
 * Best-effort close of an open panel: sends Escape to the page and then
 * waits 200 ms. Any failure is swallowed on purpose.
 */
private async _dismissPanelIfOpen(): Promise<void> {
  const page = this._page;
  try {
    await page.keyboard.press('Escape');
    await page.waitForTimeout(200);
  } catch {
    // Nothing to recover here; the caller only wants the panel gone if possible.
  }
}
/** /**
* Set a virtual background from a URL on the Teams pre-join screen. * Set a virtual background from a URL on the Teams pre-join screen.
* *

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,385 @@
/** Options handed to {@link poweronMediaPatchInstall} when it is evaluated in a frame. */
export type MediaGetUserMediaPatchOptions = {
  /** When true, video tracks handed to Teams are replaced by a canvas-drawn avatar track. */
  useCanvasVideo: boolean;
  /** Text drawn in the center of the avatar canvas (truncated to 72 characters). */
  displayLabel: string;
};

/**
 * Injected in the browser: wraps getUserMedia, TTS destination, optional canvas
 * video. Must be a single self-contained function for Playwright serialization
 * (no references to module-level helpers; everything lives inside the body).
 * Re-calling this on the same document re-patches gUM and reuses the saved
 * Chromium getUserMedia + AudioContext when present (Teams can replace
 * navigator.mediaDevices.getUserMedia after a document/iframe refresh).
 *
 * Globals written on `window`:
 * - `__gumChromium`: the getUserMedia found on first install, bound to mediaDevices
 * - `__poweronRtcPatched`: set once RTCPeerConnection.prototype has been patched
 * - `__ttsAudioContext`, `__ttsStreamDest`, `__ttsAudioStream`: TTS audio sink
 * - `__startBotAvatarStream`, `__forceVideoTrackToSenders`, `__forceTtsTrackToSenders`
 * - `__botAvatar*`: canvas, draw interval, video track and label of the avatar
 *
 * Globals read: `__audioCapturePeerConnections` (list of peer connections;
 * presumably filled by the audio-capture RTCPeerConnection wrapper; verify there).
 *
 * In a frame that has no `navigator.mediaDevices.getUserMedia` or no
 * AudioContext implementation, the function returns without patching
 * getUserMedia instead of throwing.
 *
 * @param opts Canvas-video switch and avatar label.
 */
export const poweronMediaPatchInstall = (opts: MediaGetUserMediaPatchOptions) => {
  'use strict';
  const { useCanvasVideo, displayLabel } = opts;
  const w: any = window as any;

  // The script is injected into every frame; some have no media APIs at all.
  const mediaDevices = navigator.mediaDevices as any;
  if (!mediaDevices || typeof mediaDevices.getUserMedia !== 'function') {
    return;
  }

  // Save the first getUserMedia we see; later installs keep using it so that
  // re-patching never wraps our own wrapper.
  if (!w.__gumChromium) {
    w.__gumChromium = mediaDevices.getUserMedia.bind(mediaDevices);
  }

  // Patch RTCPeerConnection.prototype methods once per realm to observe + react to Teams' track placement.
  if (!w.__poweronRtcPatched && w.RTCPeerConnection) {
    w.__poweronRtcPatched = true;
    const RTCProto: any = w.RTCPeerConnection.prototype;
    const _origAddTrack = RTCProto.addTrack;
    const _origAddTransceiver = RTCProto.addTransceiver;

    RTCProto.addTrack = function (track: MediaStreamTrack, ...streams: MediaStream[]) {
      try {
        // eslint-disable-next-line no-console
        console.log(
          '[AudioPlayback] pc.addTrack kind=' + (track && track.kind)
          + ' id=' + (track && track.id)
          + ' enabled=' + (track && track.enabled),
        );
      } catch {
        // ignore
      }
      let useTrack: MediaStreamTrack = track;
      try {
        if (useCanvasVideo && track && track.kind === 'video') {
          if (typeof w.__startBotAvatarStream === 'function') {
            w.__startBotAvatarStream();
          }
          const av: MediaStreamTrack | undefined = w.__botAvatarVideoTrack;
          if (av && av.readyState === 'live') {
            // The incoming video track is discarded in favour of an avatar clone.
            try {
              track.stop();
            } catch {
              // ignore
            }
            useTrack = av.clone();
            // eslint-disable-next-line no-console
            console.log('[AudioPlayback] pc.addTrack swapped video -> avatar id=' + useTrack.id);
          }
        }
      } catch {
        // ignore
      }
      return _origAddTrack.call(this, useTrack, ...streams);
    };

    RTCProto.addTransceiver = function (trackOrKind: any, init?: any) {
      try {
        const k = typeof trackOrKind === 'string' ? trackOrKind : trackOrKind?.kind;
        // eslint-disable-next-line no-console
        console.log(
          '[AudioPlayback] pc.addTransceiver kind=' + k
          + ' direction=' + (init && init.direction),
        );
      } catch {
        // ignore
      }
      return _origAddTransceiver.call(this, trackOrKind, init);
    };
  }

  // Create the TTS sink once per document; later installs reuse it.
  if (!w.__ttsStreamDest) {
    const AudioContextClass = w.AudioContext || w.webkitAudioContext;
    if (!AudioContextClass) {
      return;
    }
    const ctx: AudioContext = new AudioContextClass();
    const newDest: MediaStreamAudioDestinationNode = ctx.createMediaStreamDestination();
    w.__ttsAudioContext = ctx;
    w.__ttsStreamDest = newDest;
    w.__ttsAudioStream = newDest.stream;
  }
  const streamDest = w.__ttsStreamDest as MediaStreamAudioDestinationNode;
  if (!streamDest) {
    return;
  }

  const _fps = 15;

  // (Re)builds the avatar canvas + capture track unless a live one already exists.
  w.__startBotAvatarStream = () => {
    if (
      w.__botAvatarStreamStarted
      && w.__botAvatarVideoTrack
      && w.__botAvatarVideoTrack.readyState === 'live'
      && w.__botAvatarCanvas
      && w.__botAvatarCanvas.isConnected
    ) {
      return;
    }
    if (w.__botAvatarDrawInterval) {
      clearInterval(w.__botAvatarDrawInterval);
      w.__botAvatarDrawInterval = null;
    }
    try {
      w.__botAvatarVideoTrack?.stop?.();
    } catch {
      // ignore
    }
    // Drop the previous canvas so rebuilds do not pile up orphaned elements.
    try {
      w.__botAvatarCanvas?.remove?.();
    } catch {
      // ignore
    }
    w.__botAvatarStreamStarted = true;
    w.__botAvatarDisplayLabel = displayLabel;

    const canvas = document.createElement('canvas');
    canvas.width = 640;
    canvas.height = 360;
    canvas.setAttribute('data-poweron-avatar', '1');
    canvas.style.cssText =
      'position:fixed;right:0;bottom:0;width:4px;height:4px;z-index:2147483646;opacity:1;pointer-events:none;';
    (document.body || document.documentElement).appendChild(canvas);
    w.__botAvatarCanvas = canvas;

    const c2d = canvas.getContext('2d');
    let t = 0;
    const draw = () => {
      if (!c2d) {
        return;
      }
      t += 0.04;
      const wPx = canvas.width;
      const hPx = canvas.height;
      c2d.fillStyle = '#061525';
      c2d.fillRect(0, 0, wPx, hPx);
      const g = c2d.createLinearGradient(0, 0, wPx, hPx);
      g.addColorStop(0, '#1a4f8c');
      g.addColorStop(0.5, '#0c305a');
      g.addColorStop(1, '#132e6e');
      c2d.fillStyle = g;
      c2d.fillRect(0, 0, wPx, hPx);
      c2d.strokeStyle = 'rgba(255, 200, 80, 0.95)';
      c2d.lineWidth = 3;
      c2d.strokeRect(6, 6, wPx - 12, hPx - 12);
      c2d.fillStyle = 'rgba(255, 220, 120, 0.95)';
      c2d.font = '600 13px system-ui, "Segoe UI", sans-serif';
      c2d.textAlign = 'left';
      c2d.textBaseline = 'top';
      c2d.fillText('PORTA', 14, 10);
      c2d.textAlign = 'center';
      c2d.textBaseline = 'middle';
      c2d.fillStyle = '#ffffff';
      c2d.font = 'bold 28px system-ui, "Segoe UI", sans-serif';
      const line = (w.__botAvatarDisplayLabel || displayLabel).toString().slice(0, 72);
      c2d.fillText(line, wPx / 2, hPx / 2 - 6);
      c2d.fillStyle = 'rgba(255,255,255,0.78)';
      c2d.font = '14px system-ui, "Segoe UI", sans-serif';
      c2d.fillText('poweron', wPx / 2, hPx / 2 + 26);
      // Pulsing top/bottom bars: the picture changes on every draw.
      const pulse = 0.75 + 0.25 * Math.sin(t);
      c2d.fillStyle = 'rgba(120, 200, 255, ' + 0.15 * pulse + ')';
      c2d.fillRect(0, 0, wPx, 6);
      c2d.fillRect(0, hPx - 6, wPx, 6);
    };
    draw();
    w.__botAvatarDrawInterval = window.setInterval(draw, 1000 / _fps);

    const cap = canvas.captureStream(_fps);
    w.__botAvatarVideoTrack = cap.getVideoTracks()[0];
    if (w.__botAvatarVideoTrack) {
      w.__botAvatarVideoTrack.enabled = true;
      try {
        w.__botAvatarVideoTrack.contentHint = 'motion';
      } catch {
        // ignore
      }
    }
    // eslint-disable-next-line no-console
    console.log(
      '[AudioPlayback] canvas avatar stream (re)built, videoTrack=',
      w.__botAvatarVideoTrack ? w.__botAvatarVideoTrack.id : 'none',
    );
  };

  // Puts an avatar clone on every video sender of the known peer connections and
  // returns counters for diagnostics.
  w.__forceVideoTrackToSenders = async () => {
    if (!useCanvasVideo) {
      return { replaced: 0, pcs: 0, reason: 'canvas-video-off' };
    }
    w.__startBotAvatarStream();
    const src: MediaStreamTrack | undefined = w.__botAvatarVideoTrack;
    if (!src) {
      return { replaced: 0, pcs: 0, reason: 'no-avatar-track' };
    }
    const pcs: RTCPeerConnection[] = (w.__audioCapturePeerConnections || []) as RTCPeerConnection[];
    let replaced = 0;
    let added = 0;
    let videoTransceivers = 0;
    let videoSendersWithTrack = 0;
    let videoSendersWithoutTrack = 0;
    let totalTransceivers = 0;
    const directionsBefore: string[] = [];
    const directionsAfter: string[] = [];

    for (const pc of pcs) {
      const transceivers = (pc as any).getTransceivers?.() || [];
      totalTransceivers += transceivers.length;
      let pcHasVideoSender = false;

      for (const tx of transceivers) {
        const sender = tx.sender;
        if (!sender) {
          continue;
        }
        const senderKind = sender.track?.kind;
        const receiverKind = tx.receiver?.track?.kind;
        const txKind = (tx as any).kind || senderKind || receiverKind || null;
        if (txKind !== 'video') {
          continue;
        }
        videoTransceivers++;
        pcHasVideoSender = true;
        directionsBefore.push(tx.direction);
        if (sender.track) {
          videoSendersWithTrack++;
        } else {
          videoSendersWithoutTrack++;
        }

        const clone = src.clone();
        let attached = false;
        try {
          // eslint-disable-next-line no-await-in-loop
          await sender.replaceTrack(clone);
          attached = true;
          replaced++;
          const tr = sender.track;
          if (tr && !tr.enabled) {
            tr.enabled = true;
          }
          if (tx.direction === 'inactive' || tx.direction === 'recvonly') {
            try {
              tx.direction = 'sendrecv';
            } catch {
              // ignore
            }
          }
          directionsAfter.push(tx.direction);
        } catch (err: any) {
          // A clone that never reached the sender would otherwise stay live forever.
          if (!attached) {
            try {
              clone.stop();
            } catch {
              // ignore
            }
          }
          directionsAfter.push('err:' + String(err && err.message ? err.message : err).slice(0, 32));
        }
      }

      if (!pcHasVideoSender) {
        // No video transceiver yet: add one. A single clone is wrapped in its own
        // MediaStream; no extra capture stream or second clone is created per call.
        const clone = src.clone();
        try {
          const newSender = (pc as any).addTrack(clone, new MediaStream([clone]));
          if (newSender) {
            added++;
          }
        } catch (err) {
          try {
            clone.stop();
          } catch {
            // ignore
          }
          directionsAfter.push('addTrack-err:' + String((err as any)?.message || err).slice(0, 32));
        }
      }
    }

    return {
      replaced,
      added,
      pcs: pcs.length,
      reason: 'ok',
      videoTransceivers,
      videoSendersWithTrack,
      videoSendersWithoutTrack,
      totalTransceivers,
      directionsBefore,
      directionsAfter,
    };
  };

  const _wrappedGUM = async (constraints?: MediaStreamConstraints) => {
    // eslint-disable-next-line no-console
    console.log(
      '[AudioPlayback] gUM call audio=' + !!(constraints && constraints.audio)
      + ' video=' + !!(constraints && constraints.video),
    );
    // eslint-disable-next-line no-restricted-globals
    const realStream = await w.__gumChromium(constraints);
    const wantAudio = !!(constraints && constraints.audio);
    const wantVideo = !!(constraints && constraints.video);

    // Resolve the avatar track, tolerating failures: a broken canvas must not
    // reject getUserMedia, and must not bypass the TTS audio injection below.
    let avatarTrack: MediaStreamTrack | undefined;
    if (useCanvasVideo && wantVideo) {
      try {
        w.__startBotAvatarStream();
        avatarTrack = w.__botAvatarVideoTrack || undefined;
      } catch {
        avatarTrack = undefined;
      }
    }

    if (avatarTrack) {
      const vClone = avatarTrack.clone();
      if (wantAudio) {
        const combinedStream = new MediaStream();
        // Clones are handed out so a stop() on the returned track leaves the TTS source alive.
        streamDest.stream.getAudioTracks().forEach((t: MediaStreamTrack) => combinedStream.addTrack(t.clone()));
        combinedStream.addTrack(vClone);
        try {
          realStream.getTracks().forEach((t: MediaStreamTrack) => t.stop());
        } catch {
          // ignore
        }
        // eslint-disable-next-line no-console
        console.log(
          '[AudioPlayback] getUserMedia (canvas+tts): a=' + combinedStream.getAudioTracks().length
          + ' v=' + combinedStream.getVideoTracks().length,
        );
        return combinedStream;
      }
      const videoOnly = new MediaStream();
      videoOnly.addTrack(vClone);
      try {
        realStream.getTracks().forEach((t: MediaStreamTrack) => t.stop());
      } catch {
        // ignore
      }
      return videoOnly;
    }

    // Also reached when canvas video was requested but no avatar track exists:
    // audio still comes from the TTS destination, video stays the browser's own.
    if (wantAudio) {
      const combinedStream = new MediaStream();
      streamDest.stream.getAudioTracks().forEach((t: MediaStreamTrack) => combinedStream.addTrack(t.clone()));
      realStream.getVideoTracks().forEach((t: MediaStreamTrack) => combinedStream.addTrack(t));
      // eslint-disable-next-line no-console
      console.log(
        '[AudioPlayback] gUM audio: a=' + combinedStream.getAudioTracks().length
        + ' v=' + combinedStream.getVideoTracks().length,
      );
      return combinedStream;
    }
    return realStream;
  };

  try {
    Object.defineProperty(mediaDevices, 'getUserMedia', {
      configurable: true,
      enumerable: true,
      writable: true,
      value: _wrappedGUM,
    });
  } catch {
    mediaDevices.getUserMedia = _wrappedGUM;
  }

  // Some libraries cache navigator.getUserMedia (legacy)
  try {
    (navigator as any).getUserMedia = (constraints: MediaStreamConstraints, ok: any, err: any) => {
      _wrappedGUM(constraints).then(ok, err);
    };
  } catch {
    // ignore
  }

  // Puts a fresh clone of the TTS track on every audio sender that currently has a track.
  w.__forceTtsTrackToSenders = async () => {
    const pcs: RTCPeerConnection[] = (w.__audioCapturePeerConnections || []) as RTCPeerConnection[];
    const ttsTrack = streamDest.stream.getAudioTracks()?.[0];
    if (!ttsTrack) {
      return { replaced: 0, pcs: pcs?.length || 0, reason: 'no-tts-track' };
    }
    const diag: Record<string, any> = {
      ttsTrackId: ttsTrack.id,
      ttsTrackEnabled: ttsTrack.enabled,
      ttsTrackReadyState: ttsTrack.readyState,
      ttsTrackMuted: ttsTrack.muted,
      beforeSenderTrackIds: [] as string[],
      afterSenderTrackIds: [] as string[],
    };
    let replaced = 0;
    for (const pc of pcs) {
      try {
        const senders = pc.getSenders?.() || [];
        for (const sender of senders) {
          if (sender?.track?.kind === 'audio') {
            diag.beforeSenderTrackIds.push(sender.track.id);
            const freshClone = ttsTrack.clone();
            // eslint-disable-next-line no-await-in-loop
            await sender.replaceTrack(freshClone);
            replaced++;
            const afterTrack = sender.track;
            diag.afterSenderTrackIds.push(afterTrack ? afterTrack.id : 'null');
            diag.afterSenderTrackEnabled = afterTrack ? afterTrack.enabled : undefined;
            diag.afterSenderTrackReadyState = afterTrack ? afterTrack.readyState : undefined;
            diag.originalTrackState = ttsTrack.readyState;
            if (afterTrack && !afterTrack.enabled) {
              afterTrack.enabled = true;
              diag.forcedEnabled = true;
            }
          }
        }
      } catch (err: any) {
        // Only the last error is kept; remaining peer connections are still processed.
        diag.error = String(err && err.message ? err.message : err);
      }
    }
    return { replaced, pcs: pcs?.length || 0, reason: 'ok', diag };
  };
};

View file

@ -15,10 +15,11 @@ import { AudioCaptureProcedure } from './audioCaptureProcedure';
import { ChatProcedure, ChatMessageEntry } from './chatProcedure'; import { ChatProcedure, ChatMessageEntry } from './chatProcedure';
import { AuthProcedure, MfaChallenge } from './authProcedure'; import { AuthProcedure, MfaChallenge } from './authProcedure';
import { TeamsActionsService } from './teamsActionsService'; import { TeamsActionsService } from './teamsActionsService';
import { BackgroundProcedure } from './backgroundProcedure';
import { isValidMeetingUrl, getMeetingLaunchUrl, resolveLaunchUrl } from './meetingUrlParser'; import { isValidMeetingUrl, getMeetingLaunchUrl, resolveLaunchUrl } from './meetingUrlParser';
// Camera / fake video injection is disabled for now to focus on stability. // Optional: canvas "avatar" video (config.botUseCanvasVideo) replaces the Chromium
// The Y4M fake video file was causing browser crashes when audio started flowing. // fake test pattern when the camera is on. Y4M file injection remains disabled.
export interface OrchestratorCallbacks { export interface OrchestratorCallbacks {
onStateChange: (state: BotState, message?: string) => void; onStateChange: (state: BotState, message?: string) => void;
@ -76,6 +77,11 @@ export class BotOrchestrator {
private _chatQueueProcessing: boolean = false; private _chatQueueProcessing: boolean = false;
private _mfaResolver: ((response: { action: string; code?: string }) => void) | null = null; private _mfaResolver: ((response: { action: string; code?: string }) => void) | null = null;
/** Debounce Teams iframe navigations (media runs in a child frame) */
private _frameNavMediaRebindTimer: ReturnType<typeof setTimeout> | null = null;
/** Re-apply gUM + video senders for a few seconds after join */
private _canvasRebindTimer: ReturnType<typeof setInterval> | null = null;
constructor( constructor(
sessionId: string, sessionId: string,
meetingUrl: string, meetingUrl: string,
@ -205,6 +211,11 @@ export class BotOrchestrator {
// Ensure microphone is ON (required for voice playback) // Ensure microphone is ON (required for voice playback)
await this._ensureMicOn(); await this._ensureMicOn();
if (config.botUseCanvasVideo) {
await this._ensureCameraOn();
const bg = new BackgroundProcedure(this._page!, this._logger);
void bg.trySelectNoVirtualBackground();
}
// STEP 2: Enter bot name and click "Join now" // STEP 2: Enter bot name and click "Join now"
await this._takeScreenshot('anon-step2-before-join', this._isDebugMode); await this._takeScreenshot('anon-step2-before-join', this._isDebugMode);
@ -234,6 +245,10 @@ export class BotOrchestrator {
// Initialize audio playback // Initialize audio playback
await this._audioProcedure!.initialize(); await this._audioProcedure!.initialize();
if (config.botUseCanvasVideo) {
await this._ensureCameraOnInMeeting();
this._startCanvasRebindAfterJoin();
}
// Enable transcript capture (captions or audio based on transferMode) // Enable transcript capture (captions or audio based on transferMode)
await this._enableTranscriptCapture(); await this._enableTranscriptCapture();
@ -414,6 +429,11 @@ export class BotOrchestrator {
// Ensure microphone is ON before joining (required for voice playback) // Ensure microphone is ON before joining (required for voice playback)
await this._ensureMicOn(); await this._ensureMicOn();
if (config.botUseCanvasVideo) {
await this._ensureCameraOn();
const bg = new BackgroundProcedure(this._page!, this._logger);
void bg.trySelectNoVirtualBackground();
}
// STEP 5: Poll for "Join now" on the pre-join screen // STEP 5: Poll for "Join now" on the pre-join screen
await this._takeScreenshot('step5-before-join-now', this._isDebugMode); await this._takeScreenshot('step5-before-join-now', this._isDebugMode);
@ -436,11 +456,37 @@ export class BotOrchestrator {
this._startKeepAlive(); this._startKeepAlive();
await this._audioProcedure!.initialize(); await this._audioProcedure!.initialize();
if (config.botUseCanvasVideo) {
await this._ensureCameraOnInMeeting();
this._startCanvasRebindAfterJoin();
}
await this._enableTranscriptCapture(); await this._enableTranscriptCapture();
await this._enableChat(); await this._enableChat();
await this._sendJoinGreeting(); await this._sendJoinGreeting();
} }
/**
 * Start a short-lived timer that re-applies the media patch and canvas video
 * to all frames after joining (35 ticks, 400 ms apart). Any timer already
 * running is stopped first. Does nothing when canvas video is disabled or the
 * audio procedure has not been created.
 *
 * A tick is skipped while the previous re-install is still running, so slow
 * frame evaluations cannot pile up. Skipped ticks still count towards the
 * limit, so the overall window stays the same.
 */
private _startCanvasRebindAfterJoin(): void {
  this._stopCanvasRebindAfterJoin();
  if (!config.botUseCanvasVideo || !this._audioProcedure) {
    return;
  }
  const tickMs = 400;
  const maxTicks = 35;
  let ticks = 0;
  let inFlight = false;
  this._canvasRebindTimer = setInterval(() => {
    ticks += 1;
    if (ticks >= maxTicks) {
      this._stopCanvasRebindAfterJoin();
    }
    const audio = this._audioProcedure;
    if (inFlight || !audio) {
      return;
    }
    inFlight = true;
    void audio.reinstallMediaPatchInAllFrames().then(
      () => {
        inFlight = false;
      },
      (err: unknown) => {
        // Log instead of dropping the rejection, and keep later ticks working.
        inFlight = false;
        this._logger.warn(`Canvas rebind failed: ${err}`);
      },
    );
  }, tickMs);
}
/** Cancel the post-join canvas rebind timer, if one is running, and clear its handle. */
private _stopCanvasRebindAfterJoin(): void {
  const activeTimer = this._canvasRebindTimer;
  if (!activeTimer) {
    return;
  }
  clearInterval(activeTimer);
  this._canvasRebindTimer = null;
}
/** /**
* Ensure the camera is turned on in the pre-join screen. * Ensure the camera is turned on in the pre-join screen.
* When camera is on, Teams shows the profile/background image. * When camera is on, Teams shows the profile/background image.
@ -888,6 +934,12 @@ export class BotOrchestrator {
this._isShuttingDown = true; this._isShuttingDown = true;
this._logger.info('Stopping bot...'); this._logger.info('Stopping bot...');
if (this._frameNavMediaRebindTimer) {
clearTimeout(this._frameNavMediaRebindTimer);
this._frameNavMediaRebindTimer = null;
}
this._stopCanvasRebindAfterJoin();
// Stop keepalive first // Stop keepalive first
this._stopKeepAlive(); this._stopKeepAlive();
@ -1077,7 +1129,10 @@ export class BotOrchestrator {
}, },
this._options.language this._options.language
); );
this._audioProcedure = new AudioProcedure(this._page, this._logger); this._audioProcedure = new AudioProcedure(this._page, this._logger, {
useCanvasVideo: config.botUseCanvasVideo,
displayLabel: this._botName,
});
this._teamsActions = new TeamsActionsService(this._page, this._logger); this._teamsActions = new TeamsActionsService(this._page, this._logger);
this._chatProcedure = new ChatProcedure( this._chatProcedure = new ChatProcedure(
this._page, this._page,
@ -1100,6 +1155,19 @@ export class BotOrchestrator {
// Aggressive hybrid mode: always capture meeting audio as transcript source. // Aggressive hybrid mode: always capture meeting audio as transcript source.
await this._audioCaptureProcedure!.injectCaptureOverride(); await this._audioCaptureProcedure!.injectCaptureOverride();
// Re-install the media patch after frame navigations. Every navigation
// restarts a 600 ms timer, so a burst of navigations results in a single
// reinstall once things have been quiet for 600 ms.
// NOTE(review): presumably a navigated frame loses the in-page patch — confirm.
this._page.on('framenavigated', () => {
  // Only relevant when the canvas video replacement is enabled and the
  // audio procedure has been created.
  if (!config.botUseCanvasVideo || !this._audioProcedure) {
    return;
  }
  if (this._frameNavMediaRebindTimer) {
    clearTimeout(this._frameNavMediaRebindTimer);
  }
  this._frameNavMediaRebindTimer = setTimeout(() => {
    this._frameNavMediaRebindTimer = null;
    // Fire-and-forget, but never let a failed reinstall surface as an
    // unhandled rejection (e.g. when the page is already closing).
    void (async () => {
      try {
        await this._audioProcedure?.reinstallMediaPatchInAllFrames();
      } catch (error) {
        this._logger.warn('Media patch reinstall after frame navigation failed:', error);
      }
    })();
  }, 600);
});
// Handle page errors // Handle page errors
this._page.on('pageerror', (error) => { this._page.on('pageerror', (error) => {
this._logger.error('Page error:', error); this._logger.error('Page error:', error);
@ -1134,6 +1202,7 @@ export class BotOrchestrator {
* Close the browser. * Close the browser.
*/ */
private async _closeBrowser(): Promise<void> { private async _closeBrowser(): Promise<void> {
this._stopCanvasRebindAfterJoin();
try { try {
if (this._page) { if (this._page) {
await this._page.close(); await this._page.close();
@ -1282,43 +1351,31 @@ export class BotOrchestrator {
} }
/** /**
* Send a greeting message in the meeting chat AND via voice after joining. * Signal "bot has joined the meeting" to the Gateway. The Gateway owns
* Uses the bot's display name and the configured language. * greeting generation: it produces a localised greeting via the AI
* Voice greeting confirms that the audio pipeline (TTS -> mic) is working. * service in the configured language + persona, then dispatches it back
* to this bot via the regular `sendChatMessage` command (chat) and the
* `playAudio` pipeline (voice). NO hardcoded greeting strings or
* language branches live in the bot — the bot is purely a transport.
*
* We still wait briefly so the chat panel + input have settled in the
* Teams DOM before the Gateway-driven `sendChatMessage` arrives.
*/ */
private async _sendJoinGreeting(): Promise<void> { private async _sendJoinGreeting(): Promise<void> {
try { try {
const firstName = this._botName.split(' ')[0] || this._botName; this._logger.info('Requesting join greeting from Gateway');
const lang = (this._options.language || 'de-DE').toLowerCase();
let greeting: string;
if (lang.startsWith('de')) {
greeting = `Hallo, hier ist ${firstName}. Ich bin bereit.`;
} else if (lang.startsWith('fr')) {
greeting = `Bonjour, c'est ${firstName}. Je suis prête.`;
} else if (lang.startsWith('it')) {
greeting = `Ciao, sono ${firstName}. Sono pronta.`;
} else {
greeting = `Hello, this is ${firstName}. I'm ready.`;
}
this._logger.info(`Sending join greeting (chat + voice): ${greeting}`);
// Brief delay so chat input is ready after panel open (Teams DOM can lag)
await new Promise((r) => setTimeout(r, 800)); await new Promise((r) => setTimeout(r, 800));
// Chat greeting (queued; retries if input not found)
await this.sendChatMessageToMeeting(greeting);
// Voice greeting — ask Gateway to generate TTS and send back playAudio
this._sendToGateway({ this._sendToGateway({
type: 'voiceGreeting', type: 'requestGreeting',
sessionId: this._sessionId, sessionId: this._sessionId,
text: greeting, // Hint the Gateway about display name + language; Gateway already
language: this._options.language || 'de-DE', // has the canonical config but passing them here keeps the contract
// self-contained and avoids a DB lookup just for greeting text.
botName: this._botName,
language: this._options.language || '',
}); });
} catch (error) { } catch (error) {
this._logger.warn('Could not send join greeting:', error); this._logger.warn('Could not request join greeting:', error);
} }
} }
@ -1505,15 +1562,21 @@ export class BotOrchestrator {
fs.writeFileSync(filepath, buffer); fs.writeFileSync(filepath, buffer);
this._logger.info(`Screenshot saved: ${filepath}`); this._logger.info(`Screenshot saved: ${filepath}`);
// Also log as base64 for Azure logs (truncated for readability) // Optional: also stream the PNG as base64 chunks into the log. Nobody
const base64 = buffer.toString('base64'); // parses these chunks back into images — they exist purely so that
this._logger.info(`SCREENSHOT_BASE64_START:${name}`); // cloud deployments without disk access (e.g. Azure App Service) can
// Log in chunks to avoid log line limits // recover screenshots from log search. The UI loads screenshots via
const chunkSize = 50000; // the REST proxy, NOT from these log lines, so we keep this OFF by
for (let i = 0; i < base64.length; i += chunkSize) { // default to avoid spamming the bot log with ~200 KB blobs per shot.
this._logger.info(`SCREENSHOT_CHUNK:${base64.substring(i, i + chunkSize)}`); if (config.screenshotLogBase64) {
const base64 = buffer.toString('base64');
this._logger.info(`SCREENSHOT_BASE64_START:${name}`);
const chunkSize = 50000;
for (let i = 0; i < base64.length; i += chunkSize) {
this._logger.info(`SCREENSHOT_CHUNK:${base64.substring(i, i + chunkSize)}`);
}
this._logger.info(`SCREENSHOT_BASE64_END:${name}`);
} }
this._logger.info(`SCREENSHOT_BASE64_END:${name}`);
} catch (error) { } catch (error) {
this._logger.error('Error taking screenshot:', error); this._logger.error('Error taking screenshot:', error);
} }

View file

@ -14,6 +14,11 @@ export const config = {
// Bot // Bot
botName: process.env.BOT_NAME || 'PowerOn AI', botName: process.env.BOT_NAME || 'PowerOn AI',
botHeadless: process.env.BOT_HEADLESS !== 'false', botHeadless: process.env.BOT_HEADLESS !== 'false',
/**
* Replace Chromium's fake test-pattern video with a canvas stream (gradient + label).
* Enabled by default; set BOT_USE_CANVAS_VIDEO=false (e.g. in production) to disable it
* if you need camera off / profile tile only.
*/
botUseCanvasVideo: process.env.BOT_USE_CANVAS_VIDEO !== 'false',
// Logging // Logging
logLevel: process.env.LOG_LEVEL || 'info', logLevel: process.env.LOG_LEVEL || 'info',
@ -22,6 +27,12 @@ export const config = {
// Screenshots // Screenshots
screenshotDir: process.env.SCREENSHOT_DIR || './output/screenshots', screenshotDir: process.env.SCREENSHOT_DIR || './output/screenshots',
screenshotOnError: process.env.SCREENSHOT_ON_ERROR === 'true', screenshotOnError: process.env.SCREENSHOT_ON_ERROR === 'true',
// Stream screenshot bytes as base64 chunks into the bot log. Only useful in
// cloud deployments (e.g. Azure App Service) where the screenshot files on
// disk are not reachable. Locally the UI loads them via the REST proxy
// (/api/teamsbot/{instanceId}/screenshots/{file}), so this just bloats the
// log. Default OFF.
screenshotLogBase64: process.env.SCREENSHOT_LOG_BASE64 === 'true',
// Timeouts (in milliseconds) // Timeouts (in milliseconds)
timeouts: { timeouts: {