From 25f684eb589c6f368f66e957e1f4215556fe56f9 Mon Sep 17 00:00:00 2001
From: ValueOn AG
Date: Sat, 28 Feb 2026 00:49:11 +0100
Subject: [PATCH] AudioCapture: adaptive chunks (8s max, flush after 1s
silence)
Made-with: Cursor
---
src/bot/audioCaptureProcedure.ts | 40 +++++++++++++++++++++++++-------
1 file changed, 32 insertions(+), 8 deletions(-)
diff --git a/src/bot/audioCaptureProcedure.ts b/src/bot/audioCaptureProcedure.ts
index 008be26..8866c39 100644
--- a/src/bot/audioCaptureProcedure.ts
+++ b/src/bot/audioCaptureProcedure.ts
@@ -124,18 +124,20 @@ export class AudioCaptureProcedure {
let callbackCount = 0;
let totalNonZeroSamples = 0;
const minRmsThreshold = 0.0003;
- const samplesPerChunk = nativeRate * 4;
+ const maxSamplesPerChunk = nativeRate * 8;
const targetRate = 16000;
- // Pre-roll: keep last 500ms of discarded silent chunks so that
- // speech onsets at the tail of a silent window are preserved.
const preRollSamples = Math.ceil(nativeRate * 0.5);
+ const minFlushSamples = Math.ceil(nativeRate * 0.5);
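+ // Adaptive flush: once voiced content is followed by this many consecutive silent callbacks (intended as ~1s; actual duration depends on callback buffer size and nativeRate)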
+ const silenceFlushCallbacks = 6;
+ let hasVoicedContent = false;
+ let consecutiveSilentCallbacks = 0;
processor.onaudioprocess = (e: AudioProcessingEvent) => {
const input = e.inputBuffer.getChannelData(0);
callbackCount++;
// #region agent log
- // Count non-zero samples on EVERY callback for accurate diagnostics
let nonZeroThisCallback = 0;
for (let i = 0; i < input.length; i++) {
if (input[i] !== 0) nonZeroThisCallback++;
@@ -154,10 +156,32 @@ export class AudioCaptureProcedure {
}
// #endregion
+ // Per-callback voice activity detection
+ let cbPower = 0;
+ for (let i = 0; i < input.length; i++) {
+ cbPower += input[i] * input[i];
+ }
+ const cbRms = Math.sqrt(cbPower / Math.max(input.length, 1));
+
+ if (cbRms >= minRmsThreshold) {
+ hasVoicedContent = true;
+ consecutiveSilentCallbacks = 0;
+ } else {
+ consecutiveSilentCallbacks++;
+ }
+
chunkBuffer.push(new Float32Array(input));
samplesCollected += input.length;
- if (samplesCollected >= samplesPerChunk) {
+ // Flush: max duration reached OR voiced content followed by ~1s silence
+ const shouldFlush = (
+ samplesCollected >= maxSamplesPerChunk
+ || (hasVoicedContent
+ && consecutiveSilentCallbacks >= silenceFlushCallbacks
+ && samplesCollected > minFlushSamples)
+ );
+
+ if (shouldFlush) {
const merged = new Float32Array(samplesCollected);
let offset = 0;
for (const buf of chunkBuffer) {
@@ -171,6 +195,9 @@ export class AudioCaptureProcedure {
}
const rms = Math.sqrt(powerSum / Math.max(merged.length, 1));
+ hasVoicedContent = false;
+ consecutiveSilentCallbacks = 0;
+
if (rms < minRmsThreshold) {
skippedSilentChunks++;
if (skippedSilentChunks % 10 === 0) {
@@ -178,9 +205,6 @@ export class AudioCaptureProcedure {
`[AudioCapture] silent chunk skipped: track=${trackId}, readyState=${event.track.readyState}, muted=${event.track.muted}, enabled=${event.track.enabled}, rms=${rms.toFixed(6)}, callbacks=${callbackCount}, totalNonZero=${totalNonZeroSamples}`
);
}
- // Pre-roll: retain the tail of the silent chunk so the next
- // voiced chunk includes the speech onset that may have started
- // in the last few hundred ms of this window.
const keep = Math.min(preRollSamples, merged.length);
const preRoll = merged.slice(merged.length - keep);
chunkBuffer = [preRoll];