From 80747d4aaca33601f7fbd529ecfe2ff169e012bb Mon Sep 17 00:00:00 2001
From: ValueOn AG
Date: Tue, 17 Feb 2026 22:47:48 +0100
Subject: [PATCH] feat: white Y4M fake video feed + prioritize live captions
over transcription panel
Co-authored-by: Cursor
---
src/bot/captionsProcedure.ts | 255 ++++++++++++++++++-----------------
src/bot/orchestrator.ts | 39 ++++++
2 files changed, 171 insertions(+), 123 deletions(-)
diff --git a/src/bot/captionsProcedure.ts b/src/bot/captionsProcedure.ts
index edb0913..b0799fb 100644
--- a/src/bot/captionsProcedure.ts
+++ b/src/bot/captionsProcedure.ts
@@ -107,10 +107,10 @@ export class CaptionsProcedure {
*
* Strategies in priority order:
* 1. Direct captions button (anonymous / light-meetings UI)
- * 2. "Record and transcribe" → "Start transcription" (authenticated Teams 2025+)
- * → triggers spoken-language-selection-dialog handled by _handleLanguageDialog()
+ * 2. "Language and speech" → live captions toggle (authenticated, no panel needed)
* 3. "Captions & transcripts" submenu (older authenticated Teams)
- * 4. "Language and speech" panel toggle (alternative path)
+ * 4. "Record and transcribe" → "Start transcription" (authenticated, fallback with panel)
+ * → triggers spoken-language-selection-dialog handled by _handleLanguageDialog()
* 5. Generic text / DOM scan fallback
*/
private async _clickEnableCaptions(): Promise {
@@ -137,9 +137,135 @@ export class CaptionsProcedure {
}
}
- // ── Strategy 2: "Record and transcribe" → "Start transcription" ──
- // Authenticated Teams 2025+: More → Record and transcribe → Start transcription
- // After clicking, a spoken-language-selection-dialog appears (handled later).
+ // ── Strategy 2: "Language and speech" → live captions toggle (no panel) ──
+ // Preferred for authenticated joins: enables caption overlay at bottom (same as anonymous)
+ const langSpeechSelectors = [
+ '[data-tid="LanguageSpeechMenuControl-id"]',
+ 'div[role="menuitem"]:has-text("Language and speech")',
+ 'div[role="menuitem"]:has-text("Sprache und Spracheingabe")',
+ ];
+
+ for (const selector of langSpeechSelectors) {
+ try {
+ const item = await this._page.$(selector);
+ if (item) {
+ await item.click();
+ this._logger.info(`Clicked "Language and speech": ${selector}`);
+ await this._page.waitForTimeout(2000);
+
+ const panelToggles = await this._page.evaluate(() => {
+ const switches = document.querySelectorAll(
+ 'input[role="switch"], [role="switch"], input[type="checkbox"]'
+ );
+ return Array.from(switches).map(s => ({
+ tid: s.getAttribute('data-tid') || '',
+ label: s.getAttribute('aria-label') || '',
+ checked: (s as HTMLInputElement).checked,
+ nearText: ((s.closest('div, label') as HTMLElement)?.textContent || '')
+ .trim().substring(0, 80),
+ }));
+ });
+ this._logger.info(`Panel toggles: ${JSON.stringify(panelToggles)}`);
+
+ const toggleResult = await this._page.evaluate(() => {
+ const switches = document.querySelectorAll(
+ 'input[role="switch"], [role="switch"], input[type="checkbox"]'
+ );
+ for (const sw of Array.from(switches)) {
+ const label = (sw.getAttribute('aria-label') || '').toLowerCase();
+ const tid = (sw.getAttribute('data-tid') || '').toLowerCase();
+ const parentEl = sw.closest('div, label, span') as HTMLElement;
+ const nearText = (parentEl?.textContent || '').toLowerCase();
+ const isCaptions =
+ label.includes('caption') || label.includes('untertitel') ||
+ tid.includes('caption') || tid.includes('subtitle') ||
+ nearText.includes('live caption') || nearText.includes('liveuntertitel');
+ if (isCaptions) {
+ if (!(sw as HTMLInputElement).checked) {
+ (sw as HTMLElement).click();
+ return { found: true, clicked: true, info: label || tid || nearText.substring(0, 60) };
+ }
+ return { found: true, clicked: false, info: `already on: ${label || tid}` };
+ }
+ }
+ return { found: false, clicked: false, info: '' };
+ });
+
+ this._logger.info(`Captions toggle result: ${JSON.stringify(toggleResult)}`);
+ if (toggleResult.found && toggleResult.clicked) {
+ await this._page.waitForTimeout(1500);
+ }
+ await this._page.keyboard.press('Escape');
+ if (toggleResult.found) return;
+
+ this._logger.warn('Language panel opened but no captions toggle found — trying next strategy');
+ break;
+ }
+ } catch {
+ // Continue
+ }
+ }
+
+ // ── Strategy 3: "Captions & transcripts" submenu (older Teams) ──
+ const submenuSelectors = [
+ '[data-tid="captions-and-transcripts-button"]',
+ '[role="menuitem"]:has-text("Captions & transcripts")',
+ '[role="menuitem"]:has-text("Captions and transcripts")',
+ '[role="menuitem"]:has-text("Untertitel und Transkripte")',
+ '[role="menuitem"]:has-text("Untertitel")',
+ ];
+
+ for (const selector of submenuSelectors) {
+ try {
+ const item = await this._page.$(selector);
+ if (item) {
+ await item.click();
+ this._logger.info(`Clicked captions submenu: ${selector}`);
+ await this._page.waitForTimeout(1500);
+
+ const enableSelectors = [
+ 'button:has-text("Turn on live captions")',
+ 'button:has-text("Live captions")',
+ 'button:has-text("Live-Untertitel aktivieren")',
+ '[role="menuitem"]:has-text("Turn on live captions")',
+ '[role="menuitem"]:has-text("Live captions")',
+ '[role="menuitemcheckbox"]:has-text("captions")',
+ '[data-tid="toggle-captions"]',
+ ];
+
+ for (const enableSel of enableSelectors) {
+ try {
+ const enableBtn = await this._page.$(enableSel);
+ if (enableBtn) {
+ await enableBtn.click();
+ this._logger.info(`Clicked enable captions: ${enableSel}`);
+ await this._page.waitForTimeout(1000);
+ return;
+ }
+ } catch {
+ // Continue
+ }
+ }
+
+ this._logger.info('Opened captions submenu but could not find enable button');
+ break;
+ }
+ } catch {
+ // Continue
+ }
+ }
+
+ // ── Strategy 4 (fallback): "Record and transcribe" → "Start transcription" ──
+ // Requires transcript panel to be visible for scraping. Only used if live captions failed.
+ this._logger.info('Live captions not available, trying transcription fallback...');
+
+ // Re-open More menu (previous strategies may have closed it)
+ try {
+ await this._openMoreMenu();
+ } catch {
+ this._logger.warn('Could not re-open More menu for transcription fallback');
+ }
+
const recordMenuSelectors = [
'[data-tid="RecordingMenuControl-id"]',
'div[role="menuitem"]:has-text("Record and transcribe")',
@@ -239,123 +365,6 @@ export class CaptionsProcedure {
}
}
- // ── Strategy 3: "Captions & transcripts" submenu (older Teams) ──
- const submenuSelectors = [
- '[data-tid="captions-and-transcripts-button"]',
- '[role="menuitem"]:has-text("Captions & transcripts")',
- '[role="menuitem"]:has-text("Captions and transcripts")',
- '[role="menuitem"]:has-text("Untertitel und Transkripte")',
- '[role="menuitem"]:has-text("Untertitel")',
- ];
-
- for (const selector of submenuSelectors) {
- try {
- const item = await this._page.$(selector);
- if (item) {
- await item.click();
- this._logger.info(`Clicked captions submenu: ${selector}`);
- await this._page.waitForTimeout(1500);
-
- const enableSelectors = [
- 'button:has-text("Turn on live captions")',
- 'button:has-text("Live captions")',
- 'button:has-text("Live-Untertitel aktivieren")',
- '[role="menuitem"]:has-text("Turn on live captions")',
- '[role="menuitem"]:has-text("Live captions")',
- '[role="menuitemcheckbox"]:has-text("captions")',
- '[data-tid="toggle-captions"]',
- ];
-
- for (const enableSel of enableSelectors) {
- try {
- const enableBtn = await this._page.$(enableSel);
- if (enableBtn) {
- await enableBtn.click();
- this._logger.info(`Clicked enable captions: ${enableSel}`);
- await this._page.waitForTimeout(1000);
- return;
- }
- } catch {
- // Continue
- }
- }
-
- this._logger.info('Opened captions submenu but could not find enable button');
- break;
- }
- } catch {
- // Continue
- }
- }
-
- // ── Strategy 4: "Language and speech" panel toggle ──
- const langSpeechSelectors = [
- '[data-tid="LanguageSpeechMenuControl-id"]',
- 'div[role="menuitem"]:has-text("Language and speech")',
- 'div[role="menuitem"]:has-text("Sprache und Spracheingabe")',
- ];
-
- for (const selector of langSpeechSelectors) {
- try {
- const item = await this._page.$(selector);
- if (item) {
- await item.click();
- this._logger.info(`Clicked "Language and speech": ${selector}`);
- await this._page.waitForTimeout(2000);
-
- const panelToggles = await this._page.evaluate(() => {
- const switches = document.querySelectorAll(
- 'input[role="switch"], [role="switch"], input[type="checkbox"]'
- );
- return Array.from(switches).map(s => ({
- tid: s.getAttribute('data-tid') || '',
- label: s.getAttribute('aria-label') || '',
- checked: (s as HTMLInputElement).checked,
- nearText: ((s.closest('div, label') as HTMLElement)?.textContent || '')
- .trim().substring(0, 80),
- }));
- });
- this._logger.info(`Panel toggles: ${JSON.stringify(panelToggles)}`);
-
- const toggleResult = await this._page.evaluate(() => {
- const switches = document.querySelectorAll(
- 'input[role="switch"], [role="switch"], input[type="checkbox"]'
- );
- for (const sw of Array.from(switches)) {
- const label = (sw.getAttribute('aria-label') || '').toLowerCase();
- const tid = (sw.getAttribute('data-tid') || '').toLowerCase();
- const parentEl = sw.closest('div, label, span') as HTMLElement;
- const nearText = (parentEl?.textContent || '').toLowerCase();
- const isCaptions =
- label.includes('caption') || label.includes('untertitel') ||
- tid.includes('caption') || tid.includes('subtitle') ||
- nearText.includes('live caption') || nearText.includes('liveuntertitel');
- if (isCaptions) {
- if (!(sw as HTMLInputElement).checked) {
- (sw as HTMLElement).click();
- return { found: true, clicked: true, info: label || tid || nearText.substring(0, 60) };
- }
- return { found: true, clicked: false, info: `already on: ${label || tid}` };
- }
- }
- return { found: false, clicked: false, info: '' };
- });
-
- this._logger.info(`Captions toggle result: ${JSON.stringify(toggleResult)}`);
- if (toggleResult.found && toggleResult.clicked) {
- await this._page.waitForTimeout(1500);
- }
- await this._page.keyboard.press('Escape');
- if (toggleResult.found) return;
-
- this._logger.warn('Language panel opened but no captions toggle found');
- break;
- }
- } catch {
- // Continue
- }
- }
-
// ── Strategy 5: DOM scan for anything containing "caption" / "transcri" ──
const found = await this._page.evaluate(() => {
const keywords = ['caption', 'captions', 'untertitel', 'live caption', 'transcri', 'transkri'];
diff --git a/src/bot/orchestrator.ts b/src/bot/orchestrator.ts
index 26a645b..828ae4f 100644
--- a/src/bot/orchestrator.ts
+++ b/src/bot/orchestrator.ts
@@ -3,6 +3,7 @@ import { Logger } from 'winston';
import { v4 as uuidv4 } from 'uuid';
import path from 'path';
import fs from 'fs';
+import os from 'os';
import WebSocket from 'ws';
import { config } from '../config';
@@ -16,6 +17,38 @@ import { ChatProcedure, ChatMessageEntry } from './chatProcedure';
import { AuthProcedure } from './authProcedure';
import { isValidMeetingUrl } from './meetingUrlParser';
+/**
+ * Generate a solid-white Y4M video file for use as fake camera input.
+ * Chromium loops this single frame at 30fps, so participants see a static white image.
+ * The file is cached in os.tmpdir(); a cached copy is reused only when its size matches,
+ * so a truncated file left behind by an interrupted run is regenerated instead of reused.
+ * Later this can be replaced with a custom image (avatar/background).
+ *
+ * @returns Path of the generated (or already cached) Y4M file.
+ * @throws If the file cannot be written to the temp directory.
+ */
+function _generateFakeVideoFile(): string {
+  const width = 1280;
+  const height = 720;
+  const filePath = path.join(os.tmpdir(), 'bot-video-white.y4m');
+
+  // One 4:2:0 frame: full-size Y plane, then U and V planes at half width and half height.
+  // White in YUV: Y=235, U=128, V=128
+  const uvSize = (width / 2) * (height / 2);
+  const contents = Buffer.concat([
+    Buffer.from(`YUV4MPEG2 W${width} H${height} F30:1 Ip A0:0 C420jpeg\n`),
+    Buffer.from('FRAME\n'),
+    Buffer.alloc(width * height, 235),
+    Buffer.alloc(uvSize * 2, 128),
+  ]);
+
+  // Reuse the cached file only when it is complete; a bare existence check would accept a partial file.
+  if (fs.existsSync(filePath) && fs.statSync(filePath).size === contents.length) return filePath;
+  // writeFileSync manages the descriptor itself, so nothing leaks if the write throws.
+  fs.writeFileSync(filePath, contents);
+  return filePath;
+}
+
export interface OrchestratorCallbacks {
onStateChange: (state: BotState, message?: string) => void;
onTranscript: (entry: TranscriptEntry) => void;
@@ -770,16 +803,22 @@ export class BotOrchestrator {
private async _launchBrowser(authMode: boolean = false): Promise {
this._logger.info(`Launching browser (authMode=${authMode})...`);
+ // Generate a solid white Y4M video file so participants see a clean image
+ const fakeVideoPath = _generateFakeVideoFile();
+ this._logger.info(`Fake video file: ${fakeVideoPath}`);
+
const args = authMode
? [
// Chromium Minimal: only --no-sandbox + fake media (proven to work for authenticated join)
'--no-sandbox',
'--use-fake-ui-for-media-stream',
'--use-fake-device-for-media-stream',
+ `--use-file-for-fake-video-capture=${fakeVideoPath}`,
]
: [
'--use-fake-ui-for-media-stream',
'--use-fake-device-for-media-stream',
+ `--use-file-for-fake-video-capture=${fakeVideoPath}`,
'--disable-web-security',
'--disable-features=IsolateOrigins,site-per-process',
'--autoplay-policy=no-user-gesture-required',