diff --git a/package-lock.json b/package-lock.json index 27280e6..f1052e2 100644 --- a/package-lock.json +++ b/package-lock.json @@ -7,6 +7,7 @@ "": { "name": "service-teams-browser-bot", "version": "1.0.0", + "hasInstallScript": true, "dependencies": { "dotenv": "^16.4.1", "express": "^4.18.2", diff --git a/src/bot/audioCaptureProcedure.ts b/src/bot/audioCaptureProcedure.ts index 3e9a670..c140fc9 100644 --- a/src/bot/audioCaptureProcedure.ts +++ b/src/bot/audioCaptureProcedure.ts @@ -154,9 +154,9 @@ export class AudioCaptureProcedure { async injectCaptureOverride(): Promise { if (this._injected) return; - this._logger.info('[AudioCapture] Injecting RTCPeerConnection wrapper...'); + this._logger.info('[AudioCapture] Injecting RTCPeerConnection wrapper (all frames)...'); - await this._page.addInitScript((workletCode: string) => { + await this._page.context().addInitScript((workletCode: string) => { (window as any).__audioCaptureChunks = [] as any[]; (window as any).__audioCaptureProcessors = {} as Record; (window as any).__audioCaptureContexts = {} as Record; diff --git a/src/bot/audioProcedure.ts b/src/bot/audioProcedure.ts index b301482..73bb827 100644 --- a/src/bot/audioProcedure.ts +++ b/src/bot/audioProcedure.ts @@ -1,5 +1,6 @@ import { Page } from 'playwright'; import { Logger } from 'winston'; +import { poweronMediaPatchInstall } from './mediaGetUserMediaPatch'; /** * Handles audio playback in the Teams meeting. @@ -11,139 +12,113 @@ import { Logger } from 'winston'; * - When Teams calls getUserMedia, the wrapper: * 1. Calls the REAL getUserMedia (which returns Chromium's fake device stream) * 2. Replaces the audio track with one from our MediaStreamDestination - * 3. Returns the modified stream (our audio + Chromium's fake video) + * 3. Returns the modified stream; optional canvas video track instead of fake video * - When TTS audio is played, it's piped into the MediaStreamDestination, * and Teams sends it via WebRTC to other meeting participants. */ +export type AudioProcedureOptions = { + useCanvasVideo?: boolean; + /** Shown in the center of the canvas (e.g. bot display name) */ + displayLabel?: string; +}; + export class AudioProcedure { private _page: Page; private _logger: Logger; + private _useCanvasVideo: boolean; + private _displayLabel: string; private _audioContext: boolean = false; private _initScriptInjected: boolean = false; private _audioQueue: Array<{ audioData: string; format: 'mp3' | 'wav' | 'pcm' }> = []; private _isPlaying: boolean = false; private _stopRequested: boolean = false; - constructor(page: Page, logger: Logger) { + constructor(page: Page, logger: Logger, options?: AudioProcedureOptions) { this._page = page; this._logger = logger; + this._useCanvasVideo = !!options?.useCanvasVideo; + this._displayLabel = (options?.displayLabel || 'Bot').trim() || 'Bot'; } /** * Inject the getUserMedia wrapper BEFORE any page navigation. * This MUST be called before navigating to Teams. - * Uses page.addInitScript so it runs in every new document context. + * Uses browserContext.addInitScript so the hook runs in the main page and + * in embedded frames (Teams often runs media/WebRTC in an iframe; page-only + * injection would miss getUserMedia and you would only see the fake device). */ async injectAudioOverride(): Promise { if (this._initScriptInjected) { return; } - this._logger.info('Injecting audio getUserMedia override...'); + this._logger.info( + `Injecting audio getUserMedia override (canvasVideo=${this._useCanvasVideo}, label="${this._displayLabel}")...`, + ); - await this._page.addInitScript(() => { - // Create a shared AudioContext and MediaStreamDestination for TTS injection - const AudioContextClass = window.AudioContext || (window as any).webkitAudioContext; - const ctx = new AudioContextClass(); - const streamDest = ctx.createMediaStreamDestination(); - - // Store globally for later TTS injection - (window as any).__ttsAudioContext = ctx; - (window as any).__ttsStreamDest = streamDest; - (window as any).__ttsAudioStream = streamDest.stream; - - // Wrap getUserMedia to replace audio tracks with our TTS-injectable stream - const originalGetUserMedia = navigator.mediaDevices.getUserMedia.bind(navigator.mediaDevices); - navigator.mediaDevices.getUserMedia = async (constraints?: MediaStreamConstraints) => { - // Get the real stream (from Chromium's fake devices) - const realStream = await originalGetUserMedia(constraints); - - if (constraints && constraints.audio) { - // Build a new stream: our TTS audio track + their video tracks - const combinedStream = new MediaStream(); - - // Clone the TTS track so Teams can't kill the original via track.stop() - streamDest.stream.getAudioTracks().forEach(t => combinedStream.addTrack(t.clone())); - - // Keep the real video tracks (from fake camera) - realStream.getVideoTracks().forEach(t => combinedStream.addTrack(t)); - - // Diagnostic signal for production logs: confirms override really feeds Teams. - try { - const audioTracks = combinedStream.getAudioTracks(); - const videoTracks = combinedStream.getVideoTracks(); - console.log( - `[AudioPlayback] getUserMedia override active: audioTracks=${audioTracks.length}, videoTracks=${videoTracks.length}, audioLabel="${audioTracks[0]?.label || 'n/a'}"`, - ); - } catch { - // ignore - } - - return combinedStream; - } - - // No audio requested - return the real stream as-is - return realStream; - }; - - // Force all RTCPeerConnection audio senders to use our TTS track. - // This ensures Teams actually sends our audio even if getUserMedia - // override happened in a different context or was renegotiated. - (window as any).__forceTtsTrackToSenders = async () => { - const pcs = ((window as any).__audioCapturePeerConnections || []) as RTCPeerConnection[]; - const ttsTrack = streamDest.stream.getAudioTracks()?.[0]; - if (!ttsTrack) return { replaced: 0, pcs: pcs?.length || 0, reason: 'no-tts-track' }; - - // #region agent log - const diag: Record = { - ttsTrackId: ttsTrack.id, - ttsTrackEnabled: ttsTrack.enabled, - ttsTrackReadyState: ttsTrack.readyState, - ttsTrackMuted: ttsTrack.muted, - beforeSenderTrackIds: [] as string[], - afterSenderTrackIds: [] as string[], - }; - // #endregion - - let replaced = 0; - for (const pc of pcs) { - try { - const senders = pc.getSenders?.() || []; - for (const sender of senders) { - if (sender?.track?.kind === 'audio') { - // #region agent log - diag.beforeSenderTrackIds.push(sender.track.id); - // #endregion - const freshClone = ttsTrack.clone(); - await sender.replaceTrack(freshClone); - replaced++; - // #region agent log - const afterTrack = sender.track; - diag.afterSenderTrackIds.push(afterTrack?.id || 'null'); - diag.afterSenderTrackEnabled = afterTrack?.enabled; - diag.afterSenderTrackReadyState = afterTrack?.readyState; - diag.originalTrackState = ttsTrack.readyState; - if (afterTrack && !afterTrack.enabled) { - afterTrack.enabled = true; - diag.forcedEnabled = true; - } - // #endregion - } - } - } catch (err: any) { - // #region agent log - diag.error = String(err?.message || err); - // #endregion - } - } - return { replaced, pcs: pcs?.length || 0, reason: 'ok', diag }; - }; + await this._page.context().addInitScript(poweronMediaPatchInstall, { + useCanvasVideo: this._useCanvasVideo, + displayLabel: this._displayLabel, }); this._initScriptInjected = true; this._logger.info('Audio getUserMedia override injected'); } + /** + * Re-run the media patch in every frame. Needed when Teams replaces the document + * in an iframe (addInitScript runs too early) or overwrites getUserMedia. + */ + async reinstallMediaPatchInAllFrames(): Promise { + const payload = { useCanvasVideo: this._useCanvasVideo, displayLabel: this._displayLabel }; + for (const frame of this._page.frames()) { + try { + await frame.evaluate(poweronMediaPatchInstall, payload); + } catch (e) { + this._logger.info(`[mediaPatch] frame skipped: ${e}`); + } + } + await this._forceCanvasVideoInAllFrames('reinstall'); + } + + /** + * Replace outbound video in every frame. Teams may run WebRTC in a subframe; + * only touching the main window leaves Chromium's default fake (green) video. + */ + private async _forceCanvasVideoInAllFrames(phase: string): Promise { + if (!this._useCanvasVideo) { + return; + } + const parts: string[] = []; + for (const frame of this._page.frames()) { + try { + const r = await frame.evaluate(async () => { + const w = window as any; + w.__startBotAvatarStream?.(); + return w.__forceVideoTrackToSenders?.(); + }); + const shortUrl = (() => { + try { + return frame.url().substring(0, 100); + } catch { + return '(no-url)'; + } + })(); + const rr: any = r || {}; + parts.push( + `[${shortUrl}] r=${rr.replaced ?? 0} add=${rr.added ?? 0} pcs=${rr.pcs ?? 0} ` + + `tx=${rr.totalTransceivers ?? 0} vidTx=${rr.videoTransceivers ?? 0} ` + + `vidWith=${rr.videoSendersWithTrack ?? 0} vidNoTrack=${rr.videoSendersWithoutTrack ?? 0} ` + + `dirB=[${(rr.directionsBefore || []).join(',')}] dirA=[${(rr.directionsAfter || []).join(',')}] ` + + `${rr.reason || ''}`.trim(), + ); + } catch (e: any) { + parts.push(`err=${String(e?.message || e).slice(0, 64)}`); + } + } + this._logger.info(`Canvas video ${phase}: ${parts.join(' | ')}`); + } + /** * Initialize the audio context in the browser for TTS playback. * Must be called after joining the meeting (user gesture context). @@ -175,6 +150,10 @@ export class AudioProcedure { } }); + if (this._useCanvasVideo) { + await this._forceCanvasVideoInAllFrames('init'); + } + this._audioContext = true; this._logger.info('Audio context initialized'); } @@ -279,6 +258,10 @@ export class AudioProcedure { ); // #endregion + if (this._useCanvasVideo) { + await this._forceCanvasVideoInAllFrames('tts'); + } + // Collect WebRTC stats BEFORE playback // #region agent log const statsBefore = await this._page.evaluate(async () => { @@ -405,12 +388,36 @@ export class AudioProcedure { */ async cleanup(): Promise { try { - await this._page.evaluate(() => { - const ctx = (window as any).__ttsAudioContext as AudioContext; - if (ctx) { - ctx.close(); + for (const frame of this._page.frames()) { + try { + await frame.evaluate(() => { + const w = window as any; + if (w.__botAvatarDrawInterval) { + clearInterval(w.__botAvatarDrawInterval); + w.__botAvatarDrawInterval = null; + } + if (w.__botAvatarVideoTrack) { + try { + w.__botAvatarVideoTrack.stop(); + } catch { + // ignore + } + w.__botAvatarVideoTrack = null; + } + if (w.__botAvatarCanvas && w.__botAvatarCanvas.remove) { + w.__botAvatarCanvas.remove(); + w.__botAvatarCanvas = null; + } + w.__botAvatarStreamStarted = false; + const actx = w.__ttsAudioContext as AudioContext; + if (actx) { + actx.close(); + } + }); + } catch { + // cross-origin or closed frame } - }); + } } catch { // Page might be closed } diff --git a/src/bot/backgroundProcedure.ts b/src/bot/backgroundProcedure.ts index c100dc9..f3d94a1 100644 --- a/src/bot/backgroundProcedure.ts +++ b/src/bot/backgroundProcedure.ts @@ -19,6 +19,69 @@ export class BackgroundProcedure { this._logger = logger; } + /** + * Open background effects and select "no" virtual background (camera only). + * Teams can show a flat green/gray placeholder when a background effect is + * on even when the feed is a fake or canvas source. + */ + async trySelectNoVirtualBackground(): Promise { + try { + const opened = await this._openBackgroundEffectsPanel(); + if (!opened) { + return false; + } + await this._page.waitForTimeout(500); + + const noEffectSelectors: string[] = [ + 'button[aria-label*="None" i]', + 'button[aria-label*="Kein" i]', + 'button[aria-label*="ohne" i]', + 'button[aria-label*="off" i][aria-label*="background" i]', + 'button[aria-label*="Hintergrund entfernen" i]', + '[data-tid="background-item-none"]', + 'button[role="tab"][name="None" i]', + ]; + for (const sel of noEffectSelectors) { + const btn = await this._page.$(sel); + if (btn) { + await btn.click(); + this._logger.info(`Selected no background effect: ${sel}`); + await this._page.waitForTimeout(500); + await this._dismissPanelIfOpen(); + return true; + } + } + + // First gallery tile (often "none" or blur off) in many Teams builds + const tile = await this._page.$( + '[data-tid="background-image"], [class*="background-item"], li[role="listitem"] button', + ); + if (tile) { + await tile.click(); + this._logger.info('Clicked first background effects tile (often no effect)'); + await this._page.waitForTimeout(400); + await this._dismissPanelIfOpen(); + return true; + } + + await this._dismissPanelIfOpen(); + this._logger.warn('Could not find "no background" control'); + return false; + } catch (e) { + this._logger.warn(`trySelectNoVirtualBackground: ${e}`); + return false; + } + } + + private async _dismissPanelIfOpen(): Promise { + try { + await this._page.keyboard.press('Escape'); + await this._page.waitForTimeout(200); + } catch { + // ignore + } + } + /** * Set a virtual background from a URL on the Teams pre-join screen. * diff --git a/src/bot/chatProcedure.ts b/src/bot/chatProcedure.ts index 7368964..d5d92a8 100644 --- a/src/bot/chatProcedure.ts +++ b/src/bot/chatProcedure.ts @@ -47,9 +47,16 @@ export class ChatProcedure { /** * Open the chat panel and start monitoring messages. + * + * Diagnostics are dumped UPFRONT (not just on failure) so even when the + * very first toggle attempt succeeds we still have a record of which + * chat-related controls Teams shipped on this build — important because + * the anonymous / compact in-meeting layout uses different ids than the + * authenticated Teams meeting layout. */ async enableChatMonitoring(): Promise { this._logger.info('Enabling chat monitoring...'); + await this._dumpChatButtonDiagnostics(); await this._openChatPanel(); await this._page.waitForTimeout(2000); @@ -66,24 +73,48 @@ export class ChatProcedure { /** * Check if the chat panel is currently visible by probing for known * UI elements (chat input, message list, or aria-pressed toggle). + * + * IMPORTANT — what we DO NOT accept as proof of an open chat panel: + * * a generic ``[data-tid="ckeditor"]`` / ``[role=textbox]`` somewhere in + * the page. In the anonymous / compact in-meeting layout Teams renders + * a separate compose box (e.g. for reactions / inline comments) whose + * parent has NO data-tid at all and which does NOT post into the + * meeting chat. If we treated that as "panel open" the bot would skip + * the toggle and silently lose every chat send. + * * a "Besprechungschat" / "Meeting chat" heading. Teams keeps the side- + * pane heading mounted even when the pane is ``vdi-occlusion`` / h=0. + * + * What we DO accept: + * * ``#chat-button[aria-pressed="true"]`` — explicit toggle state. + * * a known **chat-side-panel-scoped** input (selectors anchored under + * ``chat-pane-compose-message-footer`` / ``message-pane-footer``). + * * a chat **message list container** with non-trivial height. */ private async _isChatPanelOpen(): Promise { return this._page.evaluate(() => { - // 1. Chat button aria-pressed state (most reliable when available) + // 1. Chat button aria-pressed state. This is the ONLY safe short- + // circuit — Teams keeps the panel "open" semantically even when its + // layout pane is briefly collapsed (h=0, vdi-occlusion). If we were + // stricter here the periodic scan / send path would re-trigger + // _openChatPanel which CLICKS the button — and a click on an already- + // pressed button TOGGLES the panel CLOSED. So aria-pressed=true must + // short-circuit to true. const chatBtn = document.querySelector('#chat-button, button[id="chat-button"]') as HTMLElement | null; if (chatBtn?.getAttribute('aria-pressed') === 'true') return true; - // 2. Chat input / compose box visible (definitive proof the chat panel is open) + // 2. Chat input / compose box visible — but ONLY accept selectors + // that are scoped to the actual chat side-pane (footer ancestors). + // Generic [contenteditable] / [role=textbox] matches would also hit + // the compact in-meeting compose box used by anonymous / pre-join + // overlays, which is NOT the meeting chat. const inputSelectors = [ '[data-tid="ckeditor-replyConversation"]', + '[data-tid="chat-pane-compose-message-footer"] div[contenteditable="true"]', + '[data-tid="chat-pane-compose-message-footer"] div[role="textbox"]', + '[data-tid="message-pane-footer"] div[contenteditable="true"]', + '[data-tid="message-pane-footer"] div[role="textbox"]', 'div[role="textbox"][data-tid*="chat"]', 'div[role="textbox"][data-tid*="message"]', - 'div[role="textbox"][aria-label*="message" i]', - 'div[role="textbox"][aria-label*="Nachricht" i]', - '[contenteditable="true"][aria-label*="message" i]', - '[contenteditable="true"][aria-label*="Nachricht" i]', - 'div[aria-label="Type a message"]', - 'div[aria-label*="Neue Nachricht" i]', ]; for (const sel of inputSelectors) { const el = document.querySelector(sel) as HTMLElement | null; @@ -102,21 +133,24 @@ export class ChatProcedure { if (el && el.offsetHeight > 50) return true; } - // 4. "Meeting chat" / "Besprechungschat" heading visible - const headings = document.querySelectorAll('h2, h3, [role="heading"]'); - for (const h of Array.from(headings)) { - const txt = (h as HTMLElement).innerText?.toLowerCase() || ''; - if (txt.includes('meeting chat') || txt.includes('besprechungschat')) return true; - } - return false; }); } /** * Open the chat panel by clicking the chat button. - * In authenticated Teams, the chat panel may already be open (meeting loads - * from a chat thread). Clicking again would TOGGLE it closed. + * + * In authenticated Teams, the chat panel may already be open (meeting + * loads from a chat thread). Clicking again would TOGGLE it closed — + * that's why we always check ``_isChatPanelOpen()`` first. + * + * The selector list below covers BOTH layouts: + * * authenticated full Teams meeting → ``#chat-button`` etc. + * * anonymous / compact in-meeting toolbar (Teams Live / pre-join stage) + * where the toggle has no stable id and only carries + * ``data-tid``/``aria-label`` hints. We therefore include data-tid- + * based and broader role-based fallbacks so the bot does not silently + * fall back to typing into a non-chat compose box. */ private async _openChatPanel(): Promise { if (await this._isChatPanelOpen()) { @@ -126,11 +160,21 @@ export class ChatProcedure { const chatButtonSelectors = [ '#chat-button', + 'button[id="chat-button"]', + 'button[data-tid="toggle-chat"]', + 'button[data-tid*="chat" i]', + 'button[data-tid*="conversation" i]', 'button[aria-label="Chat"]', 'button[aria-label*="Chat" i]', 'button[aria-label*="Unterhaltung" i]', 'button[aria-label*="Besprechungschat" i]', 'button[aria-label*="Meeting chat" i]', + 'button[title*="Chat" i]', + 'button[title*="Besprechungschat" i]', + // role-based fallbacks for the compact / anonymous toolbar + '[role="button"][aria-label*="Chat" i]', + '[role="button"][aria-label*="Besprechungschat" i]', + '[role="menuitem"][aria-label*="Chat" i]', ]; const maxAttempts = 12; @@ -143,19 +187,20 @@ export class ChatProcedure { } let clicked = false; + const triedSelectors: string[] = []; for (const selector of chatButtonSelectors) { try { const button = await this._page.$(selector); - if (button) { - const isVisible = await button.isVisible().catch(() => false); - if (!isVisible) continue; - await button.click(); - clicked = true; - this._logger.info(`Clicked chat button: ${selector} (attempt ${attempt}/${maxAttempts})`); - break; - } - } catch { - // Continue to next selector + if (!button) continue; + const isVisible = await button.isVisible().catch(() => false); + triedSelectors.push(`${selector}=${isVisible ? 'visible' : 'hidden'}`); + if (!isVisible) continue; + await button.click(); + clicked = true; + this._logger.info(`Clicked chat button: ${selector} (attempt ${attempt}/${maxAttempts})`); + break; + } catch (err) { + triedSelectors.push(`${selector}=err:${String(err).substring(0, 40)}`); } } @@ -168,17 +213,63 @@ export class ChatProcedure { this._logger.info('Chat button clicked but panel not detected yet, waiting before next attempt'); await this._page.waitForTimeout(pollIntervalMs); } else { + // Log which selectors were tried — without this we can't tell whether + // the buttons are missing entirely or just hidden behind another layer. + this._logger.info( + `Chat button not found, retry ${attempt}/${maxAttempts}` + + (triedSelectors.length ? ` | tried: ${triedSelectors.join(', ')}` : ''), + ); + // On the very first miss dump the full button diagnostics so the + // next code change has a real selector hint to work from instead + // of guessing. After that we throttle to avoid log spam. + if (attempt === 1) { + await this._dumpChatButtonDiagnostics(); + } if (attempt < maxAttempts) { - this._logger.info(`Chat button not found, retry ${attempt}/${maxAttempts}`); await this._page.waitForTimeout(pollIntervalMs); } } } this._logger.warn('Could not open chat panel after polling - chat will not work'); + await this._dumpChatButtonDiagnostics(); return false; } + /** + * One-shot diagnostic when _openChatPanel fails: list every button in the + * page whose id / data-tid / aria-label hints at "chat" so we can update + * chatButtonSelectors when Teams ships a UI change. + */ + private async _dumpChatButtonDiagnostics(): Promise { + try { + const dump = await this._page.evaluate(() => { + const all = Array.from(document.querySelectorAll('button')) as HTMLButtonElement[]; + const candidates = all.filter((b) => { + const id = (b.id || '').toLowerCase(); + const tid = (b.getAttribute('data-tid') || '').toLowerCase(); + const aria = (b.getAttribute('aria-label') || '').toLowerCase(); + const title = (b.getAttribute('title') || '').toLowerCase(); + return [id, tid, aria, title].some((v) => + v.includes('chat') || v.includes('conversation') || v.includes('unterhalt') || v.includes('besprechung'), + ); + }); + return candidates.slice(0, 12).map((b) => ({ + id: b.id || '', + tid: b.getAttribute('data-tid') || '', + aria: b.getAttribute('aria-label') || '', + title: b.getAttribute('title') || '', + pressed: b.getAttribute('aria-pressed') || '', + h: b.offsetHeight || 0, + visible: b.offsetParent !== null, + })); + }); + this._logger.warn(`[ChatBtnDiag] candidates=${JSON.stringify(dump)}`); + } catch (e) { + this._logger.warn(`[ChatBtnDiag] failed: ${e}`); + } + } + /** * Subscribe to chat messages using MutationObserver. */ @@ -253,11 +344,19 @@ export class ChatProcedure { } function _extractChatMessage(el: HTMLElement): boolean { + // Strategy 1: explicit data-tid / class-based message wrappers (covers + // legacy AND Fluent UI v9 chat bubbles). The class[*=...] form catches + // ``fui-ChatMessage``/``fui-ChatMyMessage`` etc. without depending on + // any specific data-tid (modern Teams often ships with empty tids). const messageSelectors = [ '[data-tid="chat-message"]', - '.fui-ChatMessage', - '[data-tid*="message-body"]', + '[data-tid="chat-pane-message"]', '[data-tid*="chat-pane-message"]', + '[data-tid*="message-list-item"]', + '[data-tid*="message-body"]', + '[class*="fui-ChatMessage"]', + '[class*="fui-ChatMyMessage"]', + '[role="listitem"]', ]; let messageEl: HTMLElement | null = null; @@ -266,38 +365,60 @@ export class ChatProcedure { if (messageEl) break; } - if (messageEl) { + const _findAuthor = (root: HTMLElement | null): string => { + if (!root) return 'Unknown'; const authorSelectors = [ '[data-tid="message-author"]', '[data-tid="message-author-name"]', - '.fui-ChatMessage__author', '[data-tid*="author"]', + '[class*="ChatMessage__author"]', + '[class*="-author"]', + 'span[class*="fui-Persona__primaryText"]', ]; - let author = 'Unknown'; for (const sel of authorSelectors) { - const authorEl = messageEl.querySelector(sel) || el.querySelector(sel); - if (authorEl?.textContent) { - author = authorEl.textContent.trim(); - break; - } + const authorEl = root.querySelector(sel) || el.querySelector(sel); + const t = (authorEl as HTMLElement | null)?.textContent?.trim(); + if (t) return t; } + return 'Unknown'; + }; + const _findBody = (root: HTMLElement | null): string => { + if (!root) return ''; const bodySelectors = [ + '[data-tid="message-body-content"]', '[data-tid="message-body"]', - '.fui-ChatMessage__body', '[data-tid="chat-message-text"]', '[data-tid*="message-body"]', + '[class*="ChatMessage__body"]', + '[class*="-body"] [class*="content"]', + '[class*="messageContent"]', ]; - let text = ''; for (const sel of bodySelectors) { - const bodyEl = messageEl.querySelector(sel) || el.querySelector(sel); - if (bodyEl) { - text = (bodyEl as HTMLElement).innerText?.trim() || ''; - break; + const bodyEl = root.querySelector(sel) || el.querySelector(sel); + const t = (bodyEl as HTMLElement | null)?.innerText?.trim(); + if (t) return t; + } + return ''; + }; + + if (messageEl) { + const author = _findAuthor(messageEl); + let text = _findBody(messageEl); + + // Last resort: take innerText minus the author name & metadata so we + // at least surface something when the body wrapper changes again. + if (!text) { + const full = (messageEl.innerText || '').trim(); + if (full) { + text = full + .replace(author, '') + .replace(/\b(now|just now|gerade|jetzt)\b/gi, '') + .trim(); } } - if (text && text.length > 0) { + if (text && !_isNoise(text)) { const teamsTs = _extractTeamsTimestamp(messageEl) || _extractTeamsTimestamp(el); (window as any).__onChatMessageEvent({ speaker: author, @@ -310,19 +431,14 @@ export class ChatProcedure { } } - // Strategy 2: Structural fallback for authenticated Teams chat - // Chat messages typically have: author element + body element as children + // Strategy 2: Structural fallback for authenticated Teams chat. const fullText = el.innerText?.trim() || ''; if (!fullText || fullText.length < 2 || _isNoise(fullText)) return false; - - // Skip typing indicators, system messages const tid = el.getAttribute('data-tid') || ''; if (tid === 'typing-indicator') return false; - // Look for elements that look like user messages (have author-like + body-like children) const children = Array.from(el.children) as HTMLElement[]; if (children.length >= 2) { - // Find an element that looks like a name (short text, no data-tid with "body") for (let i = 0; i < children.length - 1; i++) { const candidateName = children[i].innerText?.trim() || ''; const candidateBody = children.slice(i + 1).map(c => c.innerText?.trim()).filter(Boolean).join(' ').trim(); @@ -331,11 +447,16 @@ export class ChatProcedure { candidateName.length > 1 && candidateName.length < 60 && candidateBody.length > 1 && !_isNoise(candidateBody) && - !candidateName.includes('meeting') && !candidateName.includes('Meeting') + !candidateName.toLowerCase().includes('meeting') ) { - // Check if this looks like a time-stamped message (not just any two children) + const childCls = (children[i].className?.toString?.() || ''); const hasTid = children[i].getAttribute('data-tid') || ''; - if (hasTid.includes('author') || hasTid.includes('name') || hasTid.includes('sender')) { + const looksLikeAuthor = + hasTid.includes('author') || + hasTid.includes('name') || + hasTid.includes('sender') || + /author|persona|sender|name/i.test(childCls); + if (looksLikeAuthor) { const teamsTs = _extractTeamsTimestamp(el); (window as any).__onChatMessageEvent({ speaker: candidateName, @@ -353,30 +474,33 @@ export class ChatProcedure { return false; } - // Teams chat containers - try multiple selectors + // Teams chat containers — prefer the most chat-specific tids first. + // IMPORTANT: never use [role="log"] alone here — Teams reuses that role + // for the captions panel, which would silently steal the observer target + // and cause "candidates=0" forever. const chatContainerSelectors = [ '[data-tid="message-pane-list"]', - '[data-tid="chat-pane"]', '[data-tid="chat-pane-list"]', + '[data-tid="chat-pane"]', + '[data-tid="message-pane-layout"]', '.ts-message-list-container', - '[role="log"]', ]; let chatContainer: Element | null = null; let matchedSelector = ''; for (const sel of chatContainerSelectors) { - chatContainer = document.querySelector(sel); - if (chatContainer) { + const el = document.querySelector(sel) as HTMLElement | null; + if (el && (el.offsetHeight > 50 || sel.includes('message-pane-layout'))) { + chatContainer = el; matchedSelector = sel; break; } } if (!chatContainer) { - const candidates = document.querySelectorAll('[data-tid*="chat"], [data-tid*="message"]'); + const candidates = document.querySelectorAll('[data-tid*="chat"], [data-tid*="message-pane"]'); for (const c of Array.from(candidates)) { const cTid = c.getAttribute('data-tid') || ''; - // Prefer larger containers, not buttons or small elements if ((c as HTMLElement).offsetHeight > 50 && c.tagName !== 'BUTTON') { chatContainer = c; matchedSelector = `[data-tid="${cTid}"]`; @@ -385,6 +509,19 @@ export class ChatProcedure { } } + // Last resort: a [role="log"] only if it's actually inside a chat-y + // ancestor (so we don't latch onto the captions panel). + if (!chatContainer) { + const logEls = Array.from(document.querySelectorAll('[role="log"]')) as HTMLElement[]; + for (const log of logEls) { + if (log.closest('[data-tid*="chat"], [data-tid*="message-pane"]')) { + chatContainer = log; + matchedSelector = '[role="log"] in chat ancestor'; + break; + } + } + } + // Use found container or fall back to document.body const target = chatContainer || document.body; @@ -482,63 +619,142 @@ export class ChatProcedure { const results: Array<{ speaker: string; text: string; timestamp: string; teamsTimestamp?: string; messageKey: string }> = []; const seenThisScan = new Set(); - // Strategy 1: known selectors + // Find a chat container — prefer chat-specific tids; never plain + // [role="log"] (captions panel reuses it), unless it has a chat + // ancestor. const containerSelectors = [ - '[data-tid="message-pane-list"]', '[data-tid="chat-pane-list"]', - '[data-tid="chat-pane"]', '[role="log"]', '.ts-message-list-container', + '[data-tid="message-pane-list"]', + '[data-tid="chat-pane-list"]', + '[data-tid="chat-pane"]', + '[data-tid="message-pane-layout"]', + '.ts-message-list-container', ]; - let container: Element | null = null; + let container: HTMLElement | null = null; + let containerSrc = ''; for (const sel of containerSelectors) { - container = document.querySelector(sel); - if (container) break; + const el = document.querySelector(sel) as HTMLElement | null; + if (el && (el.offsetHeight > 50 || sel.includes('message-pane-layout'))) { + container = el; + containerSrc = sel; + break; + } + } + if (!container) { + const logs = Array.from(document.querySelectorAll('[role="log"]')) as HTMLElement[]; + for (const l of logs) { + if (l.closest('[data-tid*="chat"], [data-tid*="message-pane"]') && l.offsetHeight > 50) { + container = l; + containerSrc = '[role="log"] in chat ancestor'; + break; + } + } } + // Modern Teams chat bubbles have NO data-tid on the wrapper — + // we match on Fluent UI v9 class prefixes and role="listitem". const messageSelectors = [ - '[data-tid="chat-message"]', '.fui-ChatMessage', - '[data-tid*="chat-pane-message"]', '[data-tid*="message-body"]', + '[data-tid="chat-message"]', + '[data-tid="chat-pane-message"]', + '[data-tid*="chat-pane-message"]', + '[data-tid*="message-list-item"]', + '[data-tid*="message-body"]', + '[class*="fui-ChatMessage"]', + '[class*="fui-ChatMyMessage"]', + '[role="listitem"]', ]; const target = container || document.body; const candidates = target.querySelectorAll(messageSelectors.join(', ')); - for (const el of Array.from(candidates) as HTMLElement[]) { - const messageEl = el.closest?.('[data-tid="chat-message"], .fui-ChatMessage') || el; - let author = 'Unknown'; - const authorSels = [ + const findAuthor = (root: HTMLElement, fallbackEl: HTMLElement): string => { + const sels = [ '[data-tid="message-author"]', '[data-tid="message-author-name"]', - '.fui-ChatMessage__author', '[data-tid*="author"]', + '[data-tid*="author"]', + '[class*="ChatMessage__author"]', '[class*="-author"]', + 'span[class*="fui-Persona__primaryText"]', ]; - for (const sel of authorSels) { - const authorEl = messageEl.querySelector(sel) || el.querySelector(sel); - if (authorEl?.textContent) { author = authorEl.textContent.trim(); break; } + for (const sel of sels) { + const a = (root.querySelector(sel) || fallbackEl.querySelector(sel)) as HTMLElement | null; + const t = a?.textContent?.trim(); + if (t) return t; } - const bodySels = [ - '[data-tid="message-body"]', '.fui-ChatMessage__body', + return 'Unknown'; + }; + const findBody = (root: HTMLElement, fallbackEl: HTMLElement): string => { + const sels = [ + '[data-tid="message-body-content"]', '[data-tid="message-body"]', '[data-tid="chat-message-text"]', '[data-tid*="message-body"]', + '[class*="ChatMessage__body"]', '[class*="messageContent"]', + '[class*="-body"] [class*="content"]', ]; - let text = ''; - for (const sel of bodySels) { - const bodyEl = messageEl.querySelector(sel) || el.querySelector(sel); - if (bodyEl) { text = (bodyEl as HTMLElement).innerText?.trim() || ''; break; } + for (const sel of sels) { + const b = (root.querySelector(sel) || fallbackEl.querySelector(sel)) as HTMLElement | null; + const t = b?.innerText?.trim(); + if (t) return t; + } + return ''; + }; + + for (const el of Array.from(candidates) as HTMLElement[]) { + const messageEl = (el.closest?.('[data-tid*="chat-message"], [data-tid*="message-list-item"], [class*="fui-ChatMessage"], [class*="fui-ChatMyMessage"]') as HTMLElement | null) || el; + const author = findAuthor(messageEl, el); + let text = findBody(messageEl, el); + if (!text) { + const full = (messageEl.innerText || '').trim(); + if (full) { + text = full + .replace(author, '') + .replace(/\b(now|just now|gerade|jetzt)\b/gi, '') + .trim(); + } } if (!text || text.length < 2 || isNoise(text)) continue; const key = `${author}::${text}`; if (known.has(key) || seenThisScan.has(key)) continue; seenThisScan.add(key); const timeEl = messageEl.querySelector('time[datetime], [data-tid*="timestamp"] time'); - const ts = timeEl?.getAttribute?.('datetime') || new Date().toISOString(); + const ts = (timeEl as HTMLElement | null)?.getAttribute?.('datetime') || new Date().toISOString(); results.push({ speaker: author, text, timestamp: ts, teamsTimestamp: ts, messageKey: key }); } - // Diagnostics (once per 20s, controlled by caller) + // Diagnostics — emit every ~20 s, AND additionally any time the + // scan finds a container but ZERO messages (so we can adapt). let diag: string | undefined; if ((window as any).__chatScanDiagCounter === undefined) (window as any).__chatScanDiagCounter = 0; (window as any).__chatScanDiagCounter++; - if ((window as any).__chatScanDiagCounter % 4 === 1) { + const periodic = (window as any).__chatScanDiagCounter % 4 === 1; + const zeroButContainer = !!container && candidates.length === 0; + if (periodic || zeroButContainer) { const info: string[] = []; - info.push(`container=${container?.tagName || 'body'}[${container?.getAttribute?.('data-tid') || ''}]`); + info.push(`container=${container?.tagName || 'body'}[${container?.getAttribute?.('data-tid') || containerSrc || ''}] h=${container?.offsetHeight || 0}`); info.push(`candidates=${candidates.length}`); - // Dump all chat-ish elements for debugging - const allChat = document.querySelectorAll('[data-tid*="chat"], [data-tid*="message"], [role="log"], .fui-Chat'); + // Modern indicator counts (helps confirm whether messages exist + // under a totally different selector set). + info.push( + `globalCounts={listitem:${document.querySelectorAll('[role="listitem"]').length},` + + `chatMessage:${document.querySelectorAll('[class*="fui-ChatMessage"]').length},` + + `chatMyMessage:${document.querySelectorAll('[class*="fui-ChatMyMessage"]').length},` + + `messageBodyTid:${document.querySelectorAll('[data-tid*="message-body"]').length},` + + `chatPaneMessageTid:${document.querySelectorAll('[data-tid*="chat-pane-message"]').length}}`, + ); + // Dump the container's first children (so we can craft selectors). + if (container) { + const kids = Array.from(container.children).slice(0, 6).map((c) => { + const e = c as HTMLElement; + const tid = e.getAttribute('data-tid') || ''; + const role = e.getAttribute('role') || ''; + const cls = (e.className?.toString?.() || '').substring(0, 60); + const h = e.offsetHeight || 0; + const childCount = e.children.length; + return `<${e.tagName} tid="${tid}" role="${role}" cls="${cls}" h=${h} kids=${childCount}>`; + }); + info.push(`containerChildren=[${kids.join(', ')}]`); + if (zeroButContainer && container.firstElementChild) { + const firstHtml = (container.firstElementChild.outerHTML || '').substring(0, 400).replace(/\s+/g, ' '); + info.push(`firstChildHtml="${firstHtml}"`); + } + } + // All chat-ish elements (legacy diagnostic, kept for context) + const allChat = document.querySelectorAll('[data-tid*="chat"], [data-tid*="message"], [role="log"], [class*="fui-Chat"]'); const tags: string[] = []; for (const c of Array.from(allChat).slice(0, 15)) { const tid = c.getAttribute('data-tid') || ''; @@ -615,13 +831,46 @@ export class ChatProcedure { /** * Send a chat message in the meeting. - * Finds the chat input (with retry), types the message, and sends it. + * + * Renders the body as Markdown -> HTML (bold/italic/code/lists/links/headers) + * and pastes it into the Teams compose box via ``document.execCommand("insertHTML")`` + * so the message appears formatted in the meeting chat. Falls back to plain + * keyboard typing if HTML insertion is rejected by Teams. */ async sendChatMessage(text: string): Promise { this._logger.info(`Sending chat message: ${text.substring(0, 60)}...`); + // IMPORTANT: do NOT call _openChatPanel() from here. The chat panel is + // opened once via enableChatMonitoring() (the toggle entry-point) and + // re-opened by the periodic scan if it ever drops. _openChatPanel clicks + // the chat button — and clicking an already-pressed button toggles the + // panel CLOSED, which silently breaks every subsequent send. + + // Pre-flight: if the chat panel is provably NOT open we must abort + // immediately. Otherwise the input-finding loop below would happily + // match a non-chat compose box (e.g. the compact in-meeting compose + // overlay used in anonymous / pre-join layouts has a generic + // [data-tid="ckeditor"] [role="textbox"] in a floating layer that + // looks like a chat input but does NOT post into the meeting chat). + // Surfacing the failure fast lets the periodic scan re-toggle the + // panel and the Gateway resend the message. + const panelOpen = await this._isChatPanelOpen(); + if (!panelOpen) { + this._logger.warn('Chat panel not open — aborting send so the periodic scan can re-toggle it'); + return false; + } + + // Note: order matters — most specific selectors first; the `chat-pane-compose-message-footer` + // ancestor lookup is needed because Teams Fluent UI v9 scopes the contenteditable inside it. + // Modern Teams meeting chat uses CKEditor 5 (`.ck-editor__editable`) and its compose root + // often has NO `data-tid` at all, so class-/aria-based fallbacks are required. const inputSelectors = [ + // Classic data-tid selectors (older Teams builds) '[data-tid="ckeditor-replyConversation"]', + '[data-tid="chat-pane-compose-message-footer"] div[contenteditable="true"]', + '[data-tid="chat-pane-compose-message-footer"] div[role="textbox"]', + '[data-tid="message-pane-footer"] div[contenteditable="true"]', + '[data-tid="message-pane-footer"] div[role="textbox"]', 'div[role="textbox"][data-tid*="chat"]', 'div[role="textbox"][data-tid*="message"]', 'div[role="textbox"][aria-label*="message" i]', @@ -629,25 +878,50 @@ export class ChatProcedure { 'div[contenteditable="true"][data-tid*="message"]', 'div[contenteditable="true"][data-tid*="chat"]', 'div[contenteditable="true"][aria-label*="message" i]', + 'div[contenteditable="true"][aria-label*="Nachricht" i]', '[aria-label*="Type a new message" i]', '[aria-label="Type a message"]', '[aria-label*="Neue Nachricht eingeben" i]', '[placeholder*="Type a message" i]', '[placeholder*="Nachricht" i]', + // Fluent UI v9 / CKEditor 5 — class-/aria-based fallbacks (no data-tid required) + 'div.ck-editor__editable[contenteditable="true"]', + 'div.ck-content[contenteditable="true"]', + 'div.ck-editor__editable', + 'div[aria-placeholder*="message" i][contenteditable="true"]', + 'div[aria-placeholder*="Nachricht" i][contenteditable="true"]', + 'div[aria-placeholder*="Type a new message" i]', + 'div[aria-label*="Compose" i][role="textbox"]', + 'div[aria-label*="Verfassen" i][role="textbox"]', + // Last-resort generic — only matches if nothing else hit 'div[contenteditable="true"][role="textbox"]', ]; + const html = ChatProcedure._markdownToHtml(text); const maxAttempts = 5; const retryDelayMs = 600; + // Single combined-selector wait: lets Playwright surface the element as + // soon as Teams mounts it (instead of busy-polling N selectors per attempt). + const combinedSelector = inputSelectors.join(', '); + try { + await this._page.waitForSelector(combinedSelector, { state: 'visible', timeout: 5000 }); + } catch { + // Fall through — the per-selector loop below logs richer diagnostics. + } + for (let attempt = 1; attempt <= maxAttempts; attempt++) { try { let input: any = null; + let matchedSelector = ''; for (const selector of inputSelectors) { input = await this._page.$(selector); if (input) { const isVisible = await input.isVisible().catch(() => false); - if (isVisible) break; + if (isVisible) { + matchedSelector = selector; + break; + } await input.dispose(); input = null; } @@ -657,10 +931,131 @@ export class ChatProcedure { await input.click(); await this._page.waitForTimeout(200); - await this._page.keyboard.type(text, { delay: 10 }); + // Snapshot the editor BEFORE writing so we can detect whether each + // stage actually deposited content (defaultPrevented alone is not + // a reliable signal in CKEditor 5). + const beforeLen = await this._page.evaluate((selector) => { + const el = document.querySelector(selector) as HTMLElement | null; + return (el?.textContent ?? '').length; + }, matchedSelector); + + // --------------------------------------------------------------- + // Stage 1: execCommand('insertHTML', html) + // Fast on legacy Teams; CKEditor 5 normally returns false. + // --------------------------------------------------------------- + let stageUsed: 'execCommand' | 'syntheticPaste' | 'plainType' = 'plainType'; + let writeOk = false; + + const execInserted = await this._page.evaluate(({ selector, html: rawHtml }) => { + const el = document.querySelector(selector) as HTMLElement | null; + if (!el) return false; + try { el.focus(); } catch { /* ignore */ } + try { + return document.execCommand('insertHTML', false, rawHtml); + } catch { + return false; + } + }, { selector: matchedSelector, html }); + + if (execInserted) { + const afterLen = await this._page.evaluate((selector) => { + const el = document.querySelector(selector) as HTMLElement | null; + return (el?.textContent ?? '').length; + }, matchedSelector); + if (afterLen > beforeLen) { + stageUsed = 'execCommand'; + writeOk = true; + this._logger.info(`Chat insert OK via execCommand insertHTML (${beforeLen} -> ${afterLen} chars)`); + } + } + + // --------------------------------------------------------------- + // Stage 2: synthetic ClipboardEvent('paste') with text/html + + // text/plain. CKEditor 5 has its own paste-handler which ingests + // the HTML as ONE atomic block (one chat post, multiple paragraphs). + // --------------------------------------------------------------- + if (!writeOk) { + await this._page.evaluate( + async ({ selector, html: rawHtml, plain }) => { + const el = document.querySelector(selector) as HTMLElement | null; + if (!el) return false; + try { el.focus(); } catch { /* ignore */ } + try { + const dt = new DataTransfer(); + dt.setData('text/html', rawHtml); + dt.setData('text/plain', plain); + const evt = new ClipboardEvent('paste', { + clipboardData: dt, + bubbles: true, + cancelable: true, + }); + el.dispatchEvent(evt); + } catch { + return false; + } + return true; + }, + { selector: matchedSelector, html, plain: text }, + ); + await this._page.waitForTimeout(120); + const afterLen = await this._page.evaluate((selector) => { + const el = document.querySelector(selector) as HTMLElement | null; + return (el?.textContent ?? '').length; + }, matchedSelector); + if (afterLen > beforeLen) { + stageUsed = 'syntheticPaste'; + writeOk = true; + this._logger.info(`Chat insert OK via synthetic ClipboardEvent (${beforeLen} -> ${afterLen} chars)`); + } + } + + // --------------------------------------------------------------- + // Stage 3: typed plain text — LAST RESORT. CRITICAL details: + // a) `keyboard.type(text)` treats every `\n` as Enter, which + // Teams maps to "send message" → splits the message into + // one chat post per line. Therefore we split on newlines + // and inject Shift+Enter (= soft break) between segments, + // then a SINGLE Enter at the very end to send. + // b) CKEditor 5 in Teams runs Markdown auto-format heuristics + // while the user types: a leading `* `, `- `, `+ `, or + // `1. ` triggers auto-list conversion which SWALLOWS the + // marker AND the first character of the item ("D" in + // "* Datenmigration" → "atenmigration"). Same trap for + // `# heading`, `**bold**`, and time-like `1:23` patterns. + // We therefore neutralise every line BEFORE typing — drop + // all Markdown markers and replace bullets with the Unicode + // bullet `\u2022` (which is NOT a CKEditor trigger). + // --------------------------------------------------------------- + if (!writeOk) { + const beforeText = text; + const editorSafe = ChatProcedure._neutraliseForPlainType(text); + this._logger.warn( + `Chat insert falling back to plain typing — neutralised markdown markers ` + + `(orig=${beforeText.length} chars, neutral=${editorSafe.length} chars, ` + + `lineBreaks=${(editorSafe.match(/\n/g) || []).length})`, + ); + const segments = editorSafe.split(/\r?\n/); + for (let i = 0; i < segments.length; i++) { + if (i > 0) { + await this._page.keyboard.down('Shift'); + await this._page.keyboard.press('Enter'); + await this._page.keyboard.up('Shift'); + // Give CKEditor a tick to settle the soft-break model + // change before we start typing again, otherwise the + // very first character can occasionally be dropped. + await this._page.waitForTimeout(50); + } + if (segments[i].length > 0) { + await this._page.keyboard.type(segments[i], { delay: 8 }); + } + } + stageUsed = 'plainType'; + writeOk = true; + } + await this._page.waitForTimeout(200); await this._page.keyboard.press('Enter'); - this._logger.info('Chat message sent'); + this._logger.info(`Chat message sent (stage=${stageUsed})`); return true; } @@ -679,9 +1074,290 @@ export class ChatProcedure { } this._logger.warn('Could not find chat input field'); + await this._dumpComposeDiagnostics(); return false; } + /** + * Dump rich diagnostics about contenteditable / textbox candidates in the + * page so we can adapt selectors when Teams ships a UI change. + */ + private async _dumpComposeDiagnostics(): Promise { + try { + const dump = await this._page.evaluate(() => { + const describe = (el: Element): string => { + const e = el as HTMLElement; + const tid = e.getAttribute('data-tid') || ''; + const role = e.getAttribute('role') || ''; + const ariaLabel = e.getAttribute('aria-label') || ''; + const ariaPlaceholder = e.getAttribute('aria-placeholder') || ''; + const ce = e.getAttribute('contenteditable') || ''; + const cls = (e.className?.toString?.() || '').substring(0, 80); + const h = e.offsetHeight || 0; + const w = e.offsetWidth || 0; + // Climb up to find the closest data-tid ancestor for context. + let parentTid = ''; + let walk: Element | null = e.parentElement; + let depth = 0; + while (walk && depth < 6) { + const t = walk.getAttribute('data-tid'); + if (t) { parentTid = t; break; } + walk = walk.parentElement; + depth++; + } + return `<${e.tagName} tid="${tid}" role="${role}" ce="${ce}" ` + + `aria-label="${ariaLabel.substring(0, 40)}" aria-ph="${ariaPlaceholder.substring(0, 40)}" ` + + `cls="${cls}" parentTid="${parentTid}" w=${w} h=${h}>`; + }; + + const editable = Array.from(document.querySelectorAll('[contenteditable="true"]')) + .slice(0, 12).map(describe); + const textboxes = Array.from(document.querySelectorAll('[role="textbox"]')) + .slice(0, 12).map(describe); + const composeFooters = Array.from(document.querySelectorAll( + '[data-tid*="footer"], [data-tid*="compose"], [data-tid*="message-pane"], .ck-editor, .ck-editor__main', + )).slice(0, 12).map(describe); + const chatBtn = document.querySelector('#chat-button') as HTMLElement | null; + const pressed = chatBtn?.getAttribute('aria-pressed') || ''; + return { + chatBtnPressed: pressed, + editable, + textboxes, + composeFooters, + }; + }); + this._logger.warn( + `[ComposeDiag] chatBtnPressed=${dump.chatBtnPressed} | ` + + `editable=${JSON.stringify(dump.editable)} | ` + + `textboxes=${JSON.stringify(dump.textboxes)} | ` + + `composeFooters=${JSON.stringify(dump.composeFooters)}`, + ); + } catch (e) { + this._logger.warn(`[ComposeDiag] failed to collect diagnostics: ${e}`); + } + } + + /** + * Minimal Markdown -> HTML converter for chat messages. + * + * Handles the subset our agent answers commonly produce: paragraphs, headers, + * bold/italic, inline code, fenced code, ordered/unordered lists and links. + * The output is plain HTML (no scripts/styles) intended for pasting into the + * Teams compose box via ``document.execCommand("insertHTML")``. + */ + /** + * Strip / replace Markdown markers that CKEditor 5's auto-format heuristics + * react to while characters are typed. The plain-text fallback in + * sendChatMessage() goes through this — otherwise leading `* `, `- `, `+ ` + * or `1. ` triggers auto-list conversion, which swallows the marker AND + * the first character of the item ("Datenmigration" → "atenmigration"). + * + * We intentionally KEEP newlines so the caller can inject Shift+Enter + * soft breaks between lines. We do NOT keep the marker glyphs themselves + * (`#`, `*`, `_`, `~`, backtick) because CKEditor inline auto-format + * (bold/italic/code) reacts to them mid-type as well. + */ + private static _neutraliseForPlainType(input: string): string { + if (!input) return ''; + let out = input; + + out = out.replace(/```[\s\S]*?```/g, ' '); + out = out.replace(/`([^`\n]+)`/g, '$1'); + + out = out.replace(/^\s*#{1,6}\s+/gm, ''); + + out = out.replace(/^\s*[-*+]\s+/gm, '\u2022 '); + + out = out.replace(/^\s*(\d+)[\.\)]\s+/gm, '$1) '); + + out = out.replace(/\*\*([^*]+)\*\*/g, '$1'); + out = out.replace(/__([^_]+)__/g, '$1'); + out = out.replace(/(? + /-/.test(line) && !/[A-Za-z0-9]/.test(line) ? '' : line, + ); + out = out.replace(/^\s*\|?(.+?)\|?\s*$/gm, (line) => { + if (!line.includes('|')) return line; + const cells = line + .trim() + .replace(/^\|/, '') + .replace(/\|\s*$/, '') + .split('|') + .map((c) => c.trim()) + .filter((c) => c.length > 0); + return cells.length >= 2 ? cells.join(' - ') : line; + }); + + out = out.replace(/^[ \t]*[-*_]{3,}[ \t]*$/gm, ''); + + out = out.replace(/[ \t]+/g, ' '); + out = out.replace(/\n{3,}/g, '\n\n'); + + return out.trim(); + } + + private static _markdownToHtml(input: string): string { + if (!input) return ''; + + const escapeHtml = (s: string): string => + s.replace(/&/g, '&').replace(//g, '>'); + + // Inject a blank line BEFORE every Markdown table header so the block + // splitter (split on \n{2,}) puts the table into its own block. AI + // models routinely emit tables right after a sentence like "Hier ist + // die Tabelle:" without a blank line in between, which would otherwise + // glue the intro into the same block and break table detection. + const preLines = input.split('\n'); + const normalised: string[] = []; + for (let i = 0; i < preLines.length; i++) { + const cur = preLines[i]; + const next = preLines[i + 1] ?? ''; + const curHasPipe = /\|/.test(cur); + const nextIsSep = /^\s*\|?[\s\-:|]+\|?\s*$/.test(next.trim()) && /-/.test(next); + const prev = i > 0 ? preLines[i - 1] : ''; + if (curHasPipe && nextIsSep && i > 0 && prev.trim() !== '') { + normalised.push(''); + } + normalised.push(cur); + } + input = normalised.join('\n'); + + const codeBlocks: string[] = []; + let working = input.replace(/```([\s\S]*?)```/g, (_match, body: string) => { + const idx = codeBlocks.push(`
${escapeHtml(body.trim())}
`) - 1; + return `\u0000CODEBLOCK_${idx}\u0000`; + }); + + working = escapeHtml(working); + + const inlineCodes: string[] = []; + working = working.replace(/`([^`\n]+)`/g, (_m, body: string) => { + const idx = inlineCodes.push(`${body}`) - 1; + return `\u0000INLINECODE_${idx}\u0000`; + }); + + const blocks = working.split(/\n{2,}/); + const renderedBlocks: string[] = []; + + for (const rawBlock of blocks) { + const block = rawBlock.trim(); + if (!block) continue; + + const codeBlockMatch = block.match(/^\u0000CODEBLOCK_(\d+)\u0000$/); + if (codeBlockMatch) { + renderedBlocks.push(codeBlocks[Number(codeBlockMatch[1])]); + continue; + } + + const headerMatch = block.match(/^(#{1,6})\s+(.+)$/); + if (headerMatch) { + const level = headerMatch[1].length; + renderedBlocks.push(`${ChatProcedure._renderInline(headerMatch[2].trim())}`); + continue; + } + + const lines = block.split('\n'); + + // Markdown pipe-table: + // | Header 1 | Header 2 | + // |----------|----------| + // | Cell 1 | Cell 2 | + // + // Detection: at least header + separator (>= 2 lines), the SECOND line + // looks like the divider (only -, :, |, spaces) and contains at least + // one dash, and at least the first or second line uses a pipe. + if (lines.length >= 2 && /\|/.test(lines[0])) { + const sep = lines[1].trim(); + const looksLikeSep = /^\|?[\s\-:|]+\|?$/.test(sep) && /-/.test(sep); + if (looksLikeSep) { + const headerCells = ChatProcedure._splitTableRow(lines[0]); + const bodyRows = lines + .slice(2) + .filter((l) => l.trim().length > 0) + .map((l) => ChatProcedure._splitTableRow(l)); + + const thead = + `${headerCells + .map((c) => `${ChatProcedure._renderInline(c)}`) + .join('')}`; + + const tbody = bodyRows.length + ? `${bodyRows + .map( + (row) => + `${row + .map((c) => `${ChatProcedure._renderInline(c)}`) + .join('')}`, + ) + .join('')}` + : ''; + + renderedBlocks.push(`${thead}${tbody}
`); + continue; + } + } + + const isUnordered = lines.every((l) => /^\s*[-*+]\s+/.test(l)); + const isOrdered = lines.every((l) => /^\s*\d+[.)]\s+/.test(l)); + + if (isUnordered && lines.length > 0) { + const items = lines.map((l) => l.replace(/^\s*[-*+]\s+/, '').trim()); + renderedBlocks.push(`
    ${items.map((i) => `
  • ${ChatProcedure._renderInline(i)}
  • `).join('')}
`); + continue; + } + + if (isOrdered && lines.length > 0) { + const items = lines.map((l) => l.replace(/^\s*\d+[.)]\s+/, '').trim()); + renderedBlocks.push(`
    ${items.map((i) => `
  1. ${ChatProcedure._renderInline(i)}
  2. `).join('')}
`); + continue; + } + + const paragraph = ChatProcedure._renderInline(block.replace(/\n/g, '
')); + renderedBlocks.push(`

${paragraph}

`); + } + + let html = renderedBlocks.join(''); + + html = html.replace(/\u0000INLINECODE_(\d+)\u0000/g, (_m, idx: string) => inlineCodes[Number(idx)]); + html = html.replace(/\u0000CODEBLOCK_(\d+)\u0000/g, (_m, idx: string) => codeBlocks[Number(idx)]); + + return html; + } + + /** + * Render inline markdown (bold/italic/links) on already-escaped text. + */ + private static _renderInline(text: string): string { + let out = text; + out = out.replace(/\[([^\]]+)\]\(([^)\s]+)\)/g, (_m, label: string, url: string) => { + return `${label}`; + }); + out = out.replace(/\*\*([^*\n]+)\*\*/g, '$1'); + out = out.replace(/__([^_\n]+)__/g, '$1'); + out = out.replace(/(?$1'); + out = out.replace(/(?$1'); + return out; + } + + /** + * Split one Markdown table row into its cells. Drops the optional leading + * and trailing pipe and trims whitespace around every cell. Empty trailing + * cells (which appear when authors close the row with `|`) are removed. + */ + private static _splitTableRow(line: string): string[] { + const trimmed = line.trim().replace(/^\|/, '').replace(/\|\s*$/, ''); + const cells = trimmed.split('|').map((c) => c.trim()); + while (cells.length > 0 && cells[cells.length - 1] === '') cells.pop(); + return cells; + } + /** * Stop monitoring chat messages. */ diff --git a/src/bot/mediaGetUserMediaPatch.ts b/src/bot/mediaGetUserMediaPatch.ts new file mode 100644 index 0000000..cc4456b --- /dev/null +++ b/src/bot/mediaGetUserMediaPatch.ts @@ -0,0 +1,385 @@ +/** + * Injected in the browser: wraps getUserMedia, TTS destination, optional canvas + * video. Must be a single self-contained function for Playwright serialization. + * Re-calling this on the same document re-patches gUM and reuses the saved + * Chromium getUserMedia + AudioContext when present (Teams can replace + * navigator.mediaDevices.getUserMedia after a document/iframe refresh). + */ +export type MediaGetUserMediaPatchOptions = { + useCanvasVideo: boolean; + displayLabel: string; +}; + +export const poweronMediaPatchInstall = (opts: MediaGetUserMediaPatchOptions) => { + 'use strict'; + const { useCanvasVideo, displayLabel } = opts; + const w: any = window as any; + + if (!w.__gumChromium) { + w.__gumChromium = (navigator.mediaDevices as any).getUserMedia.bind(navigator.mediaDevices); + } + + // Patch RTCPeerConnection.prototype methods once per realm to observe + react to Teams' track placement. + if (!w.__poweronRtcPatched && (window as any).RTCPeerConnection) { + w.__poweronRtcPatched = true; + const RTCProto: any = (window as any).RTCPeerConnection.prototype; + const _origAddTrack = RTCProto.addTrack; + const _origAddTransceiver = RTCProto.addTransceiver; + RTCProto.addTrack = function (track: MediaStreamTrack, ...streams: MediaStream[]) { + try { + // eslint-disable-next-line no-console + console.log( + '[AudioPlayback] pc.addTrack kind=' + (track && track.kind) + + ' id=' + (track && track.id) + + ' enabled=' + (track && track.enabled), + ); + } catch { + // ignore + } + let useTrack: MediaStreamTrack = track; + try { + if (useCanvasVideo && track && track.kind === 'video') { + if (typeof w.__startBotAvatarStream === 'function') { + w.__startBotAvatarStream(); + } + const av: MediaStreamTrack | undefined = w.__botAvatarVideoTrack; + if (av && av.readyState === 'live') { + try { + track.stop(); + } catch { + // ignore + } + useTrack = av.clone(); + // eslint-disable-next-line no-console + console.log('[AudioPlayback] pc.addTrack swapped video -> avatar id=' + useTrack.id); + } + } + } catch { + // ignore + } + return _origAddTrack.call(this, useTrack, ...streams); + }; + RTCProto.addTransceiver = function (trackOrKind: any, init?: any) { + try { + const k = typeof trackOrKind === 'string' ? trackOrKind : trackOrKind?.kind; + // eslint-disable-next-line no-console + console.log( + '[AudioPlayback] pc.addTransceiver kind=' + k + + ' direction=' + (init && init.direction), + ); + } catch { + // ignore + } + return _origAddTransceiver.call(this, trackOrKind, init); + }; + } + + if (!w.__ttsStreamDest) { + const AudioContextClass = (window as any).AudioContext || (window as any).webkitAudioContext; + const ctx: AudioContext = new AudioContextClass(); + const streamDest: MediaStreamAudioDestinationNode = ctx.createMediaStreamDestination(); + w.__ttsAudioContext = ctx; + w.__ttsStreamDest = streamDest; + w.__ttsAudioStream = streamDest.stream; + } + const streamDest = w.__ttsStreamDest as MediaStreamAudioDestinationNode; + if (!streamDest) { + return; + } + + const _fps = 15; + w.__startBotAvatarStream = () => { + if ( + w.__botAvatarStreamStarted + && w.__botAvatarVideoTrack + && w.__botAvatarVideoTrack.readyState === 'live' + && w.__botAvatarCanvas + && w.__botAvatarCanvas.isConnected + ) { + return; + } + if (w.__botAvatarDrawInterval) { + clearInterval(w.__botAvatarDrawInterval); + w.__botAvatarDrawInterval = null; + } + try { + w.__botAvatarVideoTrack?.stop?.(); + } catch { + // ignore + } + w.__botAvatarStreamStarted = true; + w.__botAvatarDisplayLabel = displayLabel; + const canvas = document.createElement('canvas'); + canvas.width = 640; + canvas.height = 360; + canvas.setAttribute('data-poweron-avatar', '1'); + canvas.style.cssText = + 'position:fixed;right:0;bottom:0;width:4px;height:4px;z-index:2147483646;opacity:1;pointer-events:none;'; + (document.body || document.documentElement).appendChild(canvas); + w.__botAvatarCanvas = canvas; + const c2d = canvas.getContext('2d'); + let t = 0; + const draw = () => { + if (!c2d) { + return; + } + t += 0.04; + const wPx = canvas.width; + const hPx = canvas.height; + c2d.fillStyle = '#061525'; + c2d.fillRect(0, 0, wPx, hPx); + const g = c2d.createLinearGradient(0, 0, wPx, hPx); + g.addColorStop(0, '#1a4f8c'); + g.addColorStop(0.5, '#0c305a'); + g.addColorStop(1, '#132e6e'); + c2d.fillStyle = g; + c2d.fillRect(0, 0, wPx, hPx); + c2d.strokeStyle = 'rgba(255, 200, 80, 0.95)'; + c2d.lineWidth = 3; + c2d.strokeRect(6, 6, wPx - 12, hPx - 12); + c2d.fillStyle = 'rgba(255, 220, 120, 0.95)'; + c2d.font = '600 13px system-ui, "Segoe UI", sans-serif'; + c2d.textAlign = 'left'; + c2d.textBaseline = 'top'; + c2d.fillText('PORTA', 14, 10); + c2d.textAlign = 'center'; + c2d.textBaseline = 'middle'; + c2d.fillStyle = '#ffffff'; + c2d.font = 'bold 28px system-ui, "Segoe UI", sans-serif'; + const line = (w.__botAvatarDisplayLabel || displayLabel).toString().slice(0, 72); + c2d.fillText(line, wPx / 2, hPx / 2 - 6); + c2d.fillStyle = 'rgba(255,255,255,0.78)'; + c2d.font = '14px system-ui, "Segoe UI", sans-serif'; + c2d.fillText('poweron', wPx / 2, hPx / 2 + 26); + const pulse = 0.75 + 0.25 * Math.sin(t); + c2d.fillStyle = 'rgba(120, 200, 255, ' + 0.15 * pulse + ')'; + c2d.fillRect(0, 0, wPx, 6); + c2d.fillRect(0, hPx - 6, wPx, 6); + }; + draw(); + w.__botAvatarDrawInterval = window.setInterval(draw, 1000 / _fps); + const cap = canvas.captureStream(_fps); + w.__botAvatarVideoTrack = cap.getVideoTracks()[0]; + if (w.__botAvatarVideoTrack) { + w.__botAvatarVideoTrack.enabled = true; + try { + w.__botAvatarVideoTrack.contentHint = 'motion'; + } catch { + // ignore + } + } + // eslint-disable-next-line no-console + console.log( + '[AudioPlayback] canvas avatar stream (re)built, videoTrack=', + w.__botAvatarVideoTrack ? w.__botAvatarVideoTrack.id : 'none', + ); + }; + + w.__forceVideoTrackToSenders = async () => { + if (!useCanvasVideo) { + return { replaced: 0, pcs: 0, reason: 'canvas-video-off' }; + } + w.__startBotAvatarStream(); + const src: MediaStreamTrack | undefined = w.__botAvatarVideoTrack; + if (!src) { + return { replaced: 0, pcs: 0, reason: 'no-avatar-track' }; + } + const pcs: RTCPeerConnection[] = (w.__audioCapturePeerConnections || []) as RTCPeerConnection[]; + let replaced = 0; + let added = 0; + let videoTransceivers = 0; + let videoSendersWithTrack = 0; + let videoSendersWithoutTrack = 0; + let totalTransceivers = 0; + const directionsBefore: string[] = []; + const directionsAfter: string[] = []; + for (const pc of pcs) { + const transceivers = (pc as any).getTransceivers?.() || []; + totalTransceivers += transceivers.length; + let pcHasVideoSender = false; + for (const t of transceivers) { + const sender = t.sender; + if (!sender) { + continue; + } + const senderKind = sender.track?.kind; + const receiverKind = t.receiver?.track?.kind; + const txKind = (t as any).kind || senderKind || receiverKind || null; + if (txKind !== 'video') { + continue; + } + videoTransceivers++; + pcHasVideoSender = true; + directionsBefore.push(t.direction); + if (sender.track) { + videoSendersWithTrack++; + } else { + videoSendersWithoutTrack++; + } + try { + // eslint-disable-next-line no-await-in-loop + await sender.replaceTrack(src.clone()); + replaced++; + const tr = sender.track; + if (tr && !tr.enabled) { + tr.enabled = true; + } + if (t.direction === 'inactive' || t.direction === 'recvonly') { + try { + t.direction = 'sendrecv'; + } catch { + // ignore + } + } + directionsAfter.push(t.direction); + } catch (err: any) { + directionsAfter.push('err:' + String(err && err.message ? err.message : err).slice(0, 32)); + } + } + if (!pcHasVideoSender) { + try { + const newSender = (pc as any).addTrack(src.clone(), w.__botAvatarCanvas?.captureStream + ? w.__botAvatarCanvas.captureStream(15) + : new MediaStream([src.clone()])); + if (newSender) { + added++; + } + } catch (err) { + directionsAfter.push('addTrack-err:' + String((err as any)?.message || err).slice(0, 32)); + } + } + } + return { + replaced, + added, + pcs: pcs.length, + reason: 'ok', + videoTransceivers, + videoSendersWithTrack, + videoSendersWithoutTrack, + totalTransceivers, + directionsBefore, + directionsAfter, + }; + }; + + const _wrappedGUM = async (constraints?: MediaStreamConstraints) => { + // eslint-disable-next-line no-console + console.log( + '[AudioPlayback] gUM call audio=' + !!(constraints && constraints.audio) + + ' video=' + !!(constraints && constraints.video), + ); + // eslint-disable-next-line no-restricted-globals + const realStream = await w.__gumChromium(constraints); + const wantAudio = !!(constraints && constraints.audio); + const wantVideo = !!(constraints && constraints.video); + + if (useCanvasVideo && wantVideo) { + w.__startBotAvatarStream(); + const vt: MediaStreamTrack | undefined = w.__botAvatarVideoTrack; + if (!vt) { + return realStream; + } + const vClone = vt.clone(); + if (wantAudio) { + const combinedStream = new MediaStream(); + streamDest.stream.getAudioTracks().forEach((t: MediaStreamTrack) => combinedStream.addTrack(t.clone())); + combinedStream.addTrack(vClone); + try { + realStream.getTracks().forEach(t => t.stop()); + } catch { + // ignore + } + // eslint-disable-next-line no-console + console.log( + '[AudioPlayback] getUserMedia (canvas+tts): a=' + combinedStream.getAudioTracks().length + + ' v=' + combinedStream.getVideoTracks().length, + ); + return combinedStream; + } + const videoOnly = new MediaStream(); + videoOnly.addTrack(vClone); + try { + realStream.getTracks().forEach(t => t.stop()); + } catch { + // ignore + } + return videoOnly; + } + + if (wantAudio) { + const combinedStream = new MediaStream(); + streamDest.stream.getAudioTracks().forEach((t: MediaStreamTrack) => combinedStream.addTrack(t.clone())); + realStream.getVideoTracks().forEach(t => combinedStream.addTrack(t)); + // eslint-disable-next-line no-console + console.log( + '[AudioPlayback] gUM audio: a=' + combinedStream.getAudioTracks().length + + ' v=' + combinedStream.getVideoTracks().length, + ); + return combinedStream; + } + return realStream; + }; + + try { + Object.defineProperty(navigator.mediaDevices, 'getUserMedia', { + configurable: true, + enumerable: true, + writable: true, + value: _wrappedGUM, + }); + } catch { + (navigator.mediaDevices as any).getUserMedia = _wrappedGUM; + } + // Some libraries cache navigator.getUserMedia (legacy) + try { + (navigator as any).getUserMedia = (constraints: MediaStreamConstraints, ok: any, err: any) => { + _wrappedGUM(constraints).then(ok, err); + }; + } catch { + // ignore + } + + w.__forceTtsTrackToSenders = async () => { + const pcs: RTCPeerConnection[] = (w.__audioCapturePeerConnections || []) as RTCPeerConnection[]; + const ttsTrack = streamDest.stream.getAudioTracks()?.[0]; + if (!ttsTrack) { + return { replaced: 0, pcs: pcs?.length || 0, reason: 'no-tts-track' }; + } + const diag: Record = { + ttsTrackId: ttsTrack.id, + ttsTrackEnabled: ttsTrack.enabled, + ttsTrackReadyState: ttsTrack.readyState, + ttsTrackMuted: ttsTrack.muted, + beforeSenderTrackIds: [] as string[], + afterSenderTrackIds: [] as string[], + }; + let replaced = 0; + for (const pc of pcs) { + try { + const senders = pc.getSenders?.() || []; + for (const sender of senders) { + if (sender?.track?.kind === 'audio') { + diag.beforeSenderTrackIds.push(sender.track.id); + const freshClone = ttsTrack.clone(); + // eslint-disable-next-line no-await-in-loop + await sender.replaceTrack(freshClone); + replaced++; + const afterTrack = sender.track; + diag.afterSenderTrackIds.push(afterTrack ? afterTrack.id : 'null'); + diag.afterSenderTrackEnabled = afterTrack ? afterTrack.enabled : undefined; + diag.afterSenderTrackReadyState = afterTrack ? afterTrack.readyState : undefined; + diag.originalTrackState = ttsTrack.readyState; + if (afterTrack && !afterTrack.enabled) { + afterTrack.enabled = true; + diag.forcedEnabled = true; + } + } + } + } catch (err: any) { + diag.error = String(err && err.message ? err.message : err); + } + } + return { replaced, pcs: pcs?.length || 0, reason: 'ok', diag }; + }; +}; diff --git a/src/bot/orchestrator.ts b/src/bot/orchestrator.ts index 2f00028..624306d 100644 --- a/src/bot/orchestrator.ts +++ b/src/bot/orchestrator.ts @@ -15,10 +15,11 @@ import { AudioCaptureProcedure } from './audioCaptureProcedure'; import { ChatProcedure, ChatMessageEntry } from './chatProcedure'; import { AuthProcedure, MfaChallenge } from './authProcedure'; import { TeamsActionsService } from './teamsActionsService'; +import { BackgroundProcedure } from './backgroundProcedure'; import { isValidMeetingUrl, getMeetingLaunchUrl, resolveLaunchUrl } from './meetingUrlParser'; -// Camera / fake video injection is disabled for now to focus on stability. -// The Y4M fake video file was causing browser crashes when audio started flowing. +// Optional: canvas "avatar" video (config.botUseCanvasVideo) replaces the Chromium +// fake test pattern when the camera is on. Y4M file injection remains disabled. export interface OrchestratorCallbacks { onStateChange: (state: BotState, message?: string) => void; @@ -76,6 +77,11 @@ export class BotOrchestrator { private _chatQueueProcessing: boolean = false; private _mfaResolver: ((response: { action: string; code?: string }) => void) | null = null; + /** Debounce Teams iframe navigations (media runs in a child frame) */ + private _frameNavMediaRebindTimer: ReturnType | null = null; + /** Re-apply gUM + video senders for a few seconds after join */ + private _canvasRebindTimer: ReturnType | null = null; + constructor( sessionId: string, meetingUrl: string, @@ -205,6 +211,11 @@ export class BotOrchestrator { // Ensure microphone is ON (required for voice playback) await this._ensureMicOn(); + if (config.botUseCanvasVideo) { + await this._ensureCameraOn(); + const bg = new BackgroundProcedure(this._page!, this._logger); + void bg.trySelectNoVirtualBackground(); + } // STEP 2: Enter bot name and click "Join now" await this._takeScreenshot('anon-step2-before-join', this._isDebugMode); @@ -234,6 +245,10 @@ export class BotOrchestrator { // Initialize audio playback await this._audioProcedure!.initialize(); + if (config.botUseCanvasVideo) { + await this._ensureCameraOnInMeeting(); + this._startCanvasRebindAfterJoin(); + } // Enable transcript capture (captions or audio based on transferMode) await this._enableTranscriptCapture(); @@ -414,6 +429,11 @@ export class BotOrchestrator { // Ensure microphone is ON before joining (required for voice playback) await this._ensureMicOn(); + if (config.botUseCanvasVideo) { + await this._ensureCameraOn(); + const bg = new BackgroundProcedure(this._page!, this._logger); + void bg.trySelectNoVirtualBackground(); + } // STEP 5: Poll for "Join now" on the pre-join screen await this._takeScreenshot('step5-before-join-now', this._isDebugMode); @@ -436,11 +456,37 @@ export class BotOrchestrator { this._startKeepAlive(); await this._audioProcedure!.initialize(); + if (config.botUseCanvasVideo) { + await this._ensureCameraOnInMeeting(); + this._startCanvasRebindAfterJoin(); + } await this._enableTranscriptCapture(); await this._enableChat(); await this._sendJoinGreeting(); } + private _startCanvasRebindAfterJoin(): void { + this._stopCanvasRebindAfterJoin(); + if (!config.botUseCanvasVideo || !this._audioProcedure) { + return; + } + let n = 0; + this._canvasRebindTimer = setInterval(() => { + n += 1; + void this._audioProcedure?.reinstallMediaPatchInAllFrames(); + if (n >= 35) { + this._stopCanvasRebindAfterJoin(); + } + }, 400); + } + + private _stopCanvasRebindAfterJoin(): void { + if (this._canvasRebindTimer) { + clearInterval(this._canvasRebindTimer); + this._canvasRebindTimer = null; + } + } + /** * Ensure the camera is turned on in the pre-join screen. * When camera is on, Teams shows the profile/background image. @@ -888,6 +934,12 @@ export class BotOrchestrator { this._isShuttingDown = true; this._logger.info('Stopping bot...'); + if (this._frameNavMediaRebindTimer) { + clearTimeout(this._frameNavMediaRebindTimer); + this._frameNavMediaRebindTimer = null; + } + this._stopCanvasRebindAfterJoin(); + // Stop keepalive first this._stopKeepAlive(); @@ -1077,7 +1129,10 @@ export class BotOrchestrator { }, this._options.language ); - this._audioProcedure = new AudioProcedure(this._page, this._logger); + this._audioProcedure = new AudioProcedure(this._page, this._logger, { + useCanvasVideo: config.botUseCanvasVideo, + displayLabel: this._botName, + }); this._teamsActions = new TeamsActionsService(this._page, this._logger); this._chatProcedure = new ChatProcedure( this._page, @@ -1100,6 +1155,19 @@ export class BotOrchestrator { // Aggressive hybrid mode: always capture meeting audio as transcript source. await this._audioCaptureProcedure!.injectCaptureOverride(); + this._page.on('framenavigated', () => { + if (!config.botUseCanvasVideo || !this._audioProcedure) { + return; + } + if (this._frameNavMediaRebindTimer) { + clearTimeout(this._frameNavMediaRebindTimer); + } + this._frameNavMediaRebindTimer = setTimeout(() => { + this._frameNavMediaRebindTimer = null; + void this._audioProcedure?.reinstallMediaPatchInAllFrames(); + }, 600); + }); + // Handle page errors this._page.on('pageerror', (error) => { this._logger.error('Page error:', error); @@ -1134,6 +1202,7 @@ export class BotOrchestrator { * Close the browser. */ private async _closeBrowser(): Promise { + this._stopCanvasRebindAfterJoin(); try { if (this._page) { await this._page.close(); @@ -1282,43 +1351,31 @@ export class BotOrchestrator { } /** - * Send a greeting message in the meeting chat AND via voice after joining. - * Uses the bot's display name and the configured language. - * Voice greeting confirms that the audio pipeline (TTS -> mic) is working. + * Signal "bot has joined the meeting" to the Gateway. The Gateway owns + * greeting generation: it produces a localised greeting via the AI + * service in the configured language + persona, then dispatches it back + * to this bot via the regular `sendChatMessage` command (chat) and the + * `playAudio` pipeline (voice). NO hardcoded greeting strings or + * language branches live in the bot — the bot is purely a transport. + * + * We still wait briefly so the chat panel + input have settled in the + * Teams DOM before the Gateway-driven `sendChatMessage` arrives. */ private async _sendJoinGreeting(): Promise { try { - const firstName = this._botName.split(' ')[0] || this._botName; - const lang = (this._options.language || 'de-DE').toLowerCase(); - - let greeting: string; - if (lang.startsWith('de')) { - greeting = `Hallo, hier ist ${firstName}. Ich bin bereit.`; - } else if (lang.startsWith('fr')) { - greeting = `Bonjour, c'est ${firstName}. Je suis prête.`; - } else if (lang.startsWith('it')) { - greeting = `Ciao, sono ${firstName}. Sono pronta.`; - } else { - greeting = `Hello, this is ${firstName}. I'm ready.`; - } - - this._logger.info(`Sending join greeting (chat + voice): ${greeting}`); - - // Brief delay so chat input is ready after panel open (Teams DOM can lag) + this._logger.info('Requesting join greeting from Gateway'); await new Promise((r) => setTimeout(r, 800)); - - // Chat greeting (queued; retries if input not found) - await this.sendChatMessageToMeeting(greeting); - - // Voice greeting — ask Gateway to generate TTS and send back playAudio this._sendToGateway({ - type: 'voiceGreeting', + type: 'requestGreeting', sessionId: this._sessionId, - text: greeting, - language: this._options.language || 'de-DE', + // Hint the Gateway about display name + language; Gateway already + // has the canonical config but passing them here keeps the contract + // self-contained and avoids a DB lookup just for greeting text. + botName: this._botName, + language: this._options.language || '', }); } catch (error) { - this._logger.warn('Could not send join greeting:', error); + this._logger.warn('Could not request join greeting:', error); } } @@ -1505,15 +1562,21 @@ export class BotOrchestrator { fs.writeFileSync(filepath, buffer); this._logger.info(`Screenshot saved: ${filepath}`); - // Also log as base64 for Azure logs (truncated for readability) - const base64 = buffer.toString('base64'); - this._logger.info(`SCREENSHOT_BASE64_START:${name}`); - // Log in chunks to avoid log line limits - const chunkSize = 50000; - for (let i = 0; i < base64.length; i += chunkSize) { - this._logger.info(`SCREENSHOT_CHUNK:${base64.substring(i, i + chunkSize)}`); + // Optional: also stream the PNG as base64 chunks into the log. Nobody + // parses these chunks back into images — they exist purely so that + // cloud deployments without disk access (e.g. Azure App Service) can + // recover screenshots from log search. The UI loads screenshots via + // the REST proxy, NOT from these log lines, so we keep this OFF by + // default to avoid spamming the bot log with ~200 KB blobs per shot. + if (config.screenshotLogBase64) { + const base64 = buffer.toString('base64'); + this._logger.info(`SCREENSHOT_BASE64_START:${name}`); + const chunkSize = 50000; + for (let i = 0; i < base64.length; i += chunkSize) { + this._logger.info(`SCREENSHOT_CHUNK:${base64.substring(i, i + chunkSize)}`); + } + this._logger.info(`SCREENSHOT_BASE64_END:${name}`); } - this._logger.info(`SCREENSHOT_BASE64_END:${name}`); } catch (error) { this._logger.error('Error taking screenshot:', error); } diff --git a/src/config.ts b/src/config.ts index 5978364..5e2bcd1 100644 --- a/src/config.ts +++ b/src/config.ts @@ -14,6 +14,11 @@ export const config = { // Bot botName: process.env.BOT_NAME || 'PowerOn AI', botHeadless: process.env.BOT_HEADLESS !== 'false', + /** + * Replace Chromium's fake test-pattern video with a canvas stream (gradient + label). + * Unset in production with BOT_USE_CANVAS_VIDEO=false if you need camera off / profile tile only. + */ + botUseCanvasVideo: process.env.BOT_USE_CANVAS_VIDEO !== 'false', // Logging logLevel: process.env.LOG_LEVEL || 'info', @@ -22,6 +27,12 @@ export const config = { // Screenshots screenshotDir: process.env.SCREENSHOT_DIR || './output/screenshots', screenshotOnError: process.env.SCREENSHOT_ON_ERROR === 'true', + // Stream screenshot bytes as base64 chunks into the bot log. Only useful in + // cloud deployments (e.g. Azure App Service) where the screenshot files on + // disk are not reachable. Locally the UI loads them via the REST proxy + // (/api/teamsbot/{instanceId}/screenshots/{file}), so this just bloats the + // log. Default OFF. + screenshotLogBase64: process.env.SCREENSHOT_LOG_BASE64 === 'true', // Timeouts (in milliseconds) timeouts: {