feat: white Y4M fake video feed + prioritize live captions over transcription panel
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
parent
4c0afa3a12
commit
80747d4aac
2 changed files with 171 additions and 123 deletions
|
|
@ -107,10 +107,10 @@ export class CaptionsProcedure {
|
||||||
*
|
*
|
||||||
* Strategies in priority order:
|
* Strategies in priority order:
|
||||||
* 1. Direct captions button (anonymous / light-meetings UI)
|
* 1. Direct captions button (anonymous / light-meetings UI)
|
||||||
* 2. "Record and transcribe" → "Start transcription" (authenticated Teams 2025+)
|
* 2. "Language and speech" → live captions toggle (authenticated, no panel needed)
|
||||||
* → triggers spoken-language-selection-dialog handled by _handleLanguageDialog()
|
|
||||||
* 3. "Captions & transcripts" submenu (older authenticated Teams)
|
* 3. "Captions & transcripts" submenu (older authenticated Teams)
|
||||||
* 4. "Language and speech" panel toggle (alternative path)
|
* 4. "Record and transcribe" → "Start transcription" (authenticated, fallback with panel)
|
||||||
|
* → triggers spoken-language-selection-dialog handled by _handleLanguageDialog()
|
||||||
* 5. Generic text / DOM scan fallback
|
* 5. Generic text / DOM scan fallback
|
||||||
*/
|
*/
|
||||||
private async _clickEnableCaptions(): Promise<void> {
|
private async _clickEnableCaptions(): Promise<void> {
|
||||||
|
|
@ -137,9 +137,135 @@ export class CaptionsProcedure {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Strategy 2: "Record and transcribe" → "Start transcription" ──
|
// ── Strategy 2: "Language and speech" → live captions toggle (no panel) ──
|
||||||
// Authenticated Teams 2025+: More → Record and transcribe → Start transcription
|
// Preferred for authenticated joins: enables caption overlay at bottom (same as anonymous)
|
||||||
// After clicking, a spoken-language-selection-dialog appears (handled later).
|
const langSpeechSelectors = [
|
||||||
|
'[data-tid="LanguageSpeechMenuControl-id"]',
|
||||||
|
'div[role="menuitem"]:has-text("Language and speech")',
|
||||||
|
'div[role="menuitem"]:has-text("Sprache und Spracheingabe")',
|
||||||
|
];
|
||||||
|
|
||||||
|
for (const selector of langSpeechSelectors) {
|
||||||
|
try {
|
||||||
|
const item = await this._page.$(selector);
|
||||||
|
if (item) {
|
||||||
|
await item.click();
|
||||||
|
this._logger.info(`Clicked "Language and speech": ${selector}`);
|
||||||
|
await this._page.waitForTimeout(2000);
|
||||||
|
|
||||||
|
const panelToggles = await this._page.evaluate(() => {
|
||||||
|
const switches = document.querySelectorAll(
|
||||||
|
'input[role="switch"], [role="switch"], input[type="checkbox"]'
|
||||||
|
);
|
||||||
|
return Array.from(switches).map(s => ({
|
||||||
|
tid: s.getAttribute('data-tid') || '',
|
||||||
|
label: s.getAttribute('aria-label') || '',
|
||||||
|
checked: (s as HTMLInputElement).checked,
|
||||||
|
nearText: ((s.closest('div, label') as HTMLElement)?.textContent || '')
|
||||||
|
.trim().substring(0, 80),
|
||||||
|
}));
|
||||||
|
});
|
||||||
|
this._logger.info(`Panel toggles: ${JSON.stringify(panelToggles)}`);
|
||||||
|
|
||||||
|
const toggleResult = await this._page.evaluate(() => {
|
||||||
|
const switches = document.querySelectorAll(
|
||||||
|
'input[role="switch"], [role="switch"], input[type="checkbox"]'
|
||||||
|
);
|
||||||
|
for (const sw of Array.from(switches)) {
|
||||||
|
const label = (sw.getAttribute('aria-label') || '').toLowerCase();
|
||||||
|
const tid = (sw.getAttribute('data-tid') || '').toLowerCase();
|
||||||
|
const parentEl = sw.closest('div, label, span') as HTMLElement;
|
||||||
|
const nearText = (parentEl?.textContent || '').toLowerCase();
|
||||||
|
const isCaptions =
|
||||||
|
label.includes('caption') || label.includes('untertitel') ||
|
||||||
|
tid.includes('caption') || tid.includes('subtitle') ||
|
||||||
|
nearText.includes('live caption') || nearText.includes('liveuntertitel');
|
||||||
|
if (isCaptions) {
|
||||||
|
if (!(sw as HTMLInputElement).checked) {
|
||||||
|
(sw as HTMLElement).click();
|
||||||
|
return { found: true, clicked: true, info: label || tid || nearText.substring(0, 60) };
|
||||||
|
}
|
||||||
|
return { found: true, clicked: false, info: `already on: ${label || tid}` };
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return { found: false, clicked: false, info: '' };
|
||||||
|
});
|
||||||
|
|
||||||
|
this._logger.info(`Captions toggle result: ${JSON.stringify(toggleResult)}`);
|
||||||
|
if (toggleResult.found && toggleResult.clicked) {
|
||||||
|
await this._page.waitForTimeout(1500);
|
||||||
|
}
|
||||||
|
await this._page.keyboard.press('Escape');
|
||||||
|
if (toggleResult.found) return;
|
||||||
|
|
||||||
|
this._logger.warn('Language panel opened but no captions toggle found — trying next strategy');
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} catch {
|
||||||
|
// Continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Strategy 3: "Captions & transcripts" submenu (older Teams) ──
|
||||||
|
const submenuSelectors = [
|
||||||
|
'[data-tid="captions-and-transcripts-button"]',
|
||||||
|
'[role="menuitem"]:has-text("Captions & transcripts")',
|
||||||
|
'[role="menuitem"]:has-text("Captions and transcripts")',
|
||||||
|
'[role="menuitem"]:has-text("Untertitel und Transkripte")',
|
||||||
|
'[role="menuitem"]:has-text("Untertitel")',
|
||||||
|
];
|
||||||
|
|
||||||
|
for (const selector of submenuSelectors) {
|
||||||
|
try {
|
||||||
|
const item = await this._page.$(selector);
|
||||||
|
if (item) {
|
||||||
|
await item.click();
|
||||||
|
this._logger.info(`Clicked captions submenu: ${selector}`);
|
||||||
|
await this._page.waitForTimeout(1500);
|
||||||
|
|
||||||
|
const enableSelectors = [
|
||||||
|
'button:has-text("Turn on live captions")',
|
||||||
|
'button:has-text("Live captions")',
|
||||||
|
'button:has-text("Live-Untertitel aktivieren")',
|
||||||
|
'[role="menuitem"]:has-text("Turn on live captions")',
|
||||||
|
'[role="menuitem"]:has-text("Live captions")',
|
||||||
|
'[role="menuitemcheckbox"]:has-text("captions")',
|
||||||
|
'[data-tid="toggle-captions"]',
|
||||||
|
];
|
||||||
|
|
||||||
|
for (const enableSel of enableSelectors) {
|
||||||
|
try {
|
||||||
|
const enableBtn = await this._page.$(enableSel);
|
||||||
|
if (enableBtn) {
|
||||||
|
await enableBtn.click();
|
||||||
|
this._logger.info(`Clicked enable captions: ${enableSel}`);
|
||||||
|
await this._page.waitForTimeout(1000);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
} catch {
|
||||||
|
// Continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
this._logger.info('Opened captions submenu but could not find enable button');
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} catch {
|
||||||
|
// Continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Strategy 4 (fallback): "Record and transcribe" → "Start transcription" ──
|
||||||
|
// Requires transcript panel to be visible for scraping. Only used if live captions failed.
|
||||||
|
this._logger.info('Live captions not available, trying transcription fallback...');
|
||||||
|
|
||||||
|
// Re-open More menu (previous strategies may have closed it)
|
||||||
|
try {
|
||||||
|
await this._openMoreMenu();
|
||||||
|
} catch {
|
||||||
|
this._logger.warn('Could not re-open More menu for transcription fallback');
|
||||||
|
}
|
||||||
|
|
||||||
const recordMenuSelectors = [
|
const recordMenuSelectors = [
|
||||||
'[data-tid="RecordingMenuControl-id"]',
|
'[data-tid="RecordingMenuControl-id"]',
|
||||||
'div[role="menuitem"]:has-text("Record and transcribe")',
|
'div[role="menuitem"]:has-text("Record and transcribe")',
|
||||||
|
|
@ -239,123 +365,6 @@ export class CaptionsProcedure {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Strategy 3: "Captions & transcripts" submenu (older Teams) ──
|
|
||||||
const submenuSelectors = [
|
|
||||||
'[data-tid="captions-and-transcripts-button"]',
|
|
||||||
'[role="menuitem"]:has-text("Captions & transcripts")',
|
|
||||||
'[role="menuitem"]:has-text("Captions and transcripts")',
|
|
||||||
'[role="menuitem"]:has-text("Untertitel und Transkripte")',
|
|
||||||
'[role="menuitem"]:has-text("Untertitel")',
|
|
||||||
];
|
|
||||||
|
|
||||||
for (const selector of submenuSelectors) {
|
|
||||||
try {
|
|
||||||
const item = await this._page.$(selector);
|
|
||||||
if (item) {
|
|
||||||
await item.click();
|
|
||||||
this._logger.info(`Clicked captions submenu: ${selector}`);
|
|
||||||
await this._page.waitForTimeout(1500);
|
|
||||||
|
|
||||||
const enableSelectors = [
|
|
||||||
'button:has-text("Turn on live captions")',
|
|
||||||
'button:has-text("Live captions")',
|
|
||||||
'button:has-text("Live-Untertitel aktivieren")',
|
|
||||||
'[role="menuitem"]:has-text("Turn on live captions")',
|
|
||||||
'[role="menuitem"]:has-text("Live captions")',
|
|
||||||
'[role="menuitemcheckbox"]:has-text("captions")',
|
|
||||||
'[data-tid="toggle-captions"]',
|
|
||||||
];
|
|
||||||
|
|
||||||
for (const enableSel of enableSelectors) {
|
|
||||||
try {
|
|
||||||
const enableBtn = await this._page.$(enableSel);
|
|
||||||
if (enableBtn) {
|
|
||||||
await enableBtn.click();
|
|
||||||
this._logger.info(`Clicked enable captions: ${enableSel}`);
|
|
||||||
await this._page.waitForTimeout(1000);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
} catch {
|
|
||||||
// Continue
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
this._logger.info('Opened captions submenu but could not find enable button');
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
} catch {
|
|
||||||
// Continue
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── Strategy 4: "Language and speech" panel toggle ──
|
|
||||||
const langSpeechSelectors = [
|
|
||||||
'[data-tid="LanguageSpeechMenuControl-id"]',
|
|
||||||
'div[role="menuitem"]:has-text("Language and speech")',
|
|
||||||
'div[role="menuitem"]:has-text("Sprache und Spracheingabe")',
|
|
||||||
];
|
|
||||||
|
|
||||||
for (const selector of langSpeechSelectors) {
|
|
||||||
try {
|
|
||||||
const item = await this._page.$(selector);
|
|
||||||
if (item) {
|
|
||||||
await item.click();
|
|
||||||
this._logger.info(`Clicked "Language and speech": ${selector}`);
|
|
||||||
await this._page.waitForTimeout(2000);
|
|
||||||
|
|
||||||
const panelToggles = await this._page.evaluate(() => {
|
|
||||||
const switches = document.querySelectorAll(
|
|
||||||
'input[role="switch"], [role="switch"], input[type="checkbox"]'
|
|
||||||
);
|
|
||||||
return Array.from(switches).map(s => ({
|
|
||||||
tid: s.getAttribute('data-tid') || '',
|
|
||||||
label: s.getAttribute('aria-label') || '',
|
|
||||||
checked: (s as HTMLInputElement).checked,
|
|
||||||
nearText: ((s.closest('div, label') as HTMLElement)?.textContent || '')
|
|
||||||
.trim().substring(0, 80),
|
|
||||||
}));
|
|
||||||
});
|
|
||||||
this._logger.info(`Panel toggles: ${JSON.stringify(panelToggles)}`);
|
|
||||||
|
|
||||||
const toggleResult = await this._page.evaluate(() => {
|
|
||||||
const switches = document.querySelectorAll(
|
|
||||||
'input[role="switch"], [role="switch"], input[type="checkbox"]'
|
|
||||||
);
|
|
||||||
for (const sw of Array.from(switches)) {
|
|
||||||
const label = (sw.getAttribute('aria-label') || '').toLowerCase();
|
|
||||||
const tid = (sw.getAttribute('data-tid') || '').toLowerCase();
|
|
||||||
const parentEl = sw.closest('div, label, span') as HTMLElement;
|
|
||||||
const nearText = (parentEl?.textContent || '').toLowerCase();
|
|
||||||
const isCaptions =
|
|
||||||
label.includes('caption') || label.includes('untertitel') ||
|
|
||||||
tid.includes('caption') || tid.includes('subtitle') ||
|
|
||||||
nearText.includes('live caption') || nearText.includes('liveuntertitel');
|
|
||||||
if (isCaptions) {
|
|
||||||
if (!(sw as HTMLInputElement).checked) {
|
|
||||||
(sw as HTMLElement).click();
|
|
||||||
return { found: true, clicked: true, info: label || tid || nearText.substring(0, 60) };
|
|
||||||
}
|
|
||||||
return { found: true, clicked: false, info: `already on: ${label || tid}` };
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return { found: false, clicked: false, info: '' };
|
|
||||||
});
|
|
||||||
|
|
||||||
this._logger.info(`Captions toggle result: ${JSON.stringify(toggleResult)}`);
|
|
||||||
if (toggleResult.found && toggleResult.clicked) {
|
|
||||||
await this._page.waitForTimeout(1500);
|
|
||||||
}
|
|
||||||
await this._page.keyboard.press('Escape');
|
|
||||||
if (toggleResult.found) return;
|
|
||||||
|
|
||||||
this._logger.warn('Language panel opened but no captions toggle found');
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
} catch {
|
|
||||||
// Continue
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── Strategy 5: DOM scan for anything containing "caption" / "transcri" ──
|
// ── Strategy 5: DOM scan for anything containing "caption" / "transcri" ──
|
||||||
const found = await this._page.evaluate(() => {
|
const found = await this._page.evaluate(() => {
|
||||||
const keywords = ['caption', 'captions', 'untertitel', 'live caption', 'transcri', 'transkri'];
|
const keywords = ['caption', 'captions', 'untertitel', 'live caption', 'transcri', 'transkri'];
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,7 @@ import { Logger } from 'winston';
|
||||||
import { v4 as uuidv4 } from 'uuid';
|
import { v4 as uuidv4 } from 'uuid';
|
||||||
import path from 'path';
|
import path from 'path';
|
||||||
import fs from 'fs';
|
import fs from 'fs';
|
||||||
|
import os from 'os';
|
||||||
import WebSocket from 'ws';
|
import WebSocket from 'ws';
|
||||||
|
|
||||||
import { config } from '../config';
|
import { config } from '../config';
|
||||||
|
|
@ -16,6 +17,38 @@ import { ChatProcedure, ChatMessageEntry } from './chatProcedure';
|
||||||
import { AuthProcedure } from './authProcedure';
|
import { AuthProcedure } from './authProcedure';
|
||||||
import { isValidMeetingUrl } from './meetingUrlParser';
|
import { isValidMeetingUrl } from './meetingUrlParser';
|
||||||
|
|
||||||
|
/**
 * Generate a solid-white Y4M video file for use as fake camera input.
 * Chromium loops this single frame at 30fps, so participants see a static white image.
 * Later this can be replaced with a custom image (avatar/background).
 *
 * The file lives at a fixed path inside the OS temp directory and is reused across
 * launches, but only when its size matches the expected byte count exactly. A
 * missing, truncated, or otherwise mismatched file is regenerated.
 *
 * @returns Absolute path of the generated (or reused) Y4M file.
 * @throws If the temp directory is not writable or the file cannot be moved into place.
 */
function _generateFakeVideoFile(): string {
  const width = 1280;
  const height = 720;
  const filePath = path.join(os.tmpdir(), 'bot-video-white.y4m');

  const header = Buffer.from(`YUV4MPEG2 W${width} H${height} F30:1 Ip A0:0 C420jpeg\n`, 'ascii');
  const frameHeader = Buffer.from('FRAME\n', 'ascii');

  // 4:2:0 layout: one full-resolution luma plane followed by two chroma planes
  // that are each subsampled by 2 horizontally and vertically.
  const lumaSize = width * height;
  const chromaSize = (width / 2) * (height / 2);
  const expectedSize = header.length + frameHeader.length + lumaSize + 2 * chromaSize;

  // Reuse an existing file only when it is a regular file of exactly the expected
  // size; a bare existence check would happily return a file left truncated by an
  // interrupted earlier run.
  try {
    const stats = fs.statSync(filePath);
    if (stats.isFile() && stats.size === expectedSize) return filePath;
  } catch {
    // File does not exist (or is not readable) — fall through and generate it.
  }

  // White in YUV: Y=235, U=128, V=128
  const contents = Buffer.concat(
    [
      header,
      frameHeader,
      Buffer.alloc(lumaSize, 235),
      Buffer.alloc(chromaSize, 128),
      Buffer.alloc(chromaSize, 128),
    ],
    expectedSize
  );

  // Write to a per-process temp file and rename it into place so that another
  // process starting at the same time never observes a partially written file.
  const tempPath = `${filePath}.${process.pid}.tmp`;
  try {
    fs.writeFileSync(tempPath, contents);
    fs.renameSync(tempPath, filePath);
  } catch (err) {
    // Best-effort cleanup of the temp file; the original error is what matters.
    try {
      fs.unlinkSync(tempPath);
    } catch {
      // Temp file was never created or is already gone.
    }
    throw err;
  }

  return filePath;
}
|
||||||
|
|
||||||
export interface OrchestratorCallbacks {
|
export interface OrchestratorCallbacks {
|
||||||
onStateChange: (state: BotState, message?: string) => void;
|
onStateChange: (state: BotState, message?: string) => void;
|
||||||
onTranscript: (entry: TranscriptEntry) => void;
|
onTranscript: (entry: TranscriptEntry) => void;
|
||||||
|
|
@ -770,16 +803,22 @@ export class BotOrchestrator {
|
||||||
private async _launchBrowser(authMode: boolean = false): Promise<void> {
|
private async _launchBrowser(authMode: boolean = false): Promise<void> {
|
||||||
this._logger.info(`Launching browser (authMode=${authMode})...`);
|
this._logger.info(`Launching browser (authMode=${authMode})...`);
|
||||||
|
|
||||||
|
// Generate a solid white Y4M video file so participants see a clean image
|
||||||
|
const fakeVideoPath = _generateFakeVideoFile();
|
||||||
|
this._logger.info(`Fake video file: ${fakeVideoPath}`);
|
||||||
|
|
||||||
const args = authMode
|
const args = authMode
|
||||||
? [
|
? [
|
||||||
// Chromium Minimal: only --no-sandbox + fake media (proven to work for authenticated join)
|
// Chromium Minimal: only --no-sandbox + fake media (proven to work for authenticated join)
|
||||||
'--no-sandbox',
|
'--no-sandbox',
|
||||||
'--use-fake-ui-for-media-stream',
|
'--use-fake-ui-for-media-stream',
|
||||||
'--use-fake-device-for-media-stream',
|
'--use-fake-device-for-media-stream',
|
||||||
|
`--use-file-for-fake-video-capture=${fakeVideoPath}`,
|
||||||
]
|
]
|
||||||
: [
|
: [
|
||||||
'--use-fake-ui-for-media-stream',
|
'--use-fake-ui-for-media-stream',
|
||||||
'--use-fake-device-for-media-stream',
|
'--use-fake-device-for-media-stream',
|
||||||
|
`--use-file-for-fake-video-capture=${fakeVideoPath}`,
|
||||||
'--disable-web-security',
|
'--disable-web-security',
|
||||||
'--disable-features=IsolateOrigins,site-per-process',
|
'--disable-features=IsolateOrigins,site-per-process',
|
||||||
'--autoplay-policy=no-user-gesture-required',
|
'--autoplay-policy=no-user-gesture-required',
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue