AudioCapture: adaptive chunks (8s max, flush after 1s silence)

Made-with: Cursor
2026-02-28 00:49:11 +01:00 · 2026-02-28 00:49:11 +01:00 · 25f684eb58
commit 25f684eb58
parent 07f7b03515
1 changed files with 32 additions and 8 deletions
--- a/src/bot/audioCaptureProcedure.ts
+++ b/src/bot/audioCaptureProcedure.ts
@ -124,18 +124,20 @@ export class AudioCaptureProcedure {
            let callbackCount = 0;
            let totalNonZeroSamples = 0;
            const minRmsThreshold = 0.0003;
-            const samplesPerChunk = nativeRate * 4;
+            const maxSamplesPerChunk = nativeRate * 8;
            const targetRate = 16000;
            // Pre-roll: keep last 500ms of discarded silent chunks so that
            // speech onsets at the tail of a silent window are preserved.
            const preRollSamples = Math.ceil(nativeRate * 0.5);
            const minFlushSamples = Math.ceil(nativeRate * 0.5);
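            // Adaptive flush: after voiced content, flush once this many consecutive callbacks are silent (intended as ~1s; actual duration depends on buffer size and nativeRate)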
            const silenceFlushCallbacks = 6;
            let hasVoicedContent = false;
            let consecutiveSilentCallbacks = 0;
            processor.onaudioprocess = (e: AudioProcessingEvent) => {
              const input = e.inputBuffer.getChannelData(0);
              callbackCount++;
              // #region agent log
              // Count non-zero samples on EVERY callback for accurate diagnostics
              let nonZeroThisCallback = 0;
              for (let i = 0; i < input.length; i++) {
                if (input[i] !== 0) nonZeroThisCallback++;
@ -154,10 +156,32 @@ export class AudioCaptureProcedure {
              }
              // #endregion
              // Per-callback voice activity detection
              let cbPower = 0;
              for (let i = 0; i < input.length; i++) {
                cbPower += input[i] * input[i];
              }
              const cbRms = Math.sqrt(cbPower / Math.max(input.length, 1));
              if (cbRms >= minRmsThreshold) {
                hasVoicedContent = true;
                consecutiveSilentCallbacks = 0;
              } else {
                consecutiveSilentCallbacks++;
              }
              chunkBuffer.push(new Float32Array(input));
              samplesCollected += input.length;
-              if (samplesCollected >= samplesPerChunk) {
+              // Flush: max duration reached OR (voiced content followed by ~1s silence AND more than minFlushSamples collected)
              const shouldFlush = (
                samplesCollected >= maxSamplesPerChunk
                || (hasVoicedContent
                    && consecutiveSilentCallbacks >= silenceFlushCallbacks
                    && samplesCollected > minFlushSamples)
              );
              if (shouldFlush) {
                const merged = new Float32Array(samplesCollected);
                let offset = 0;
                for (const buf of chunkBuffer) {
@ -171,6 +195,9 @@ export class AudioCaptureProcedure {
                }
                const rms = Math.sqrt(powerSum / Math.max(merged.length, 1));
                hasVoicedContent = false;
                consecutiveSilentCallbacks = 0;
                if (rms < minRmsThreshold) {
                  skippedSilentChunks++;
                  if (skippedSilentChunks % 10 === 0) {
@ -178,9 +205,6 @@ export class AudioCaptureProcedure {
                      `[AudioCapture] silent chunk skipped: track=${trackId}, readyState=${event.track.readyState}, muted=${event.track.muted}, enabled=${event.track.enabled}, rms=${rms.toFixed(6)}, callbacks=${callbackCount}, totalNonZero=${totalNonZeroSamples}`
                    );
                  }
                  // Pre-roll: retain the tail of the silent chunk so the next
                  // voiced chunk includes the speech onset that may have started
                  // in the last few hundred ms of this window.
                  const keep = Math.min(preRollSamples, merged.length);
                  const preRoll = merged.slice(merged.length - keep);
                  chunkBuffer = [preRoll];