diff --git a/src/bot/audioCaptureProcedure.ts b/src/bot/audioCaptureProcedure.ts index 008be26..8866c39 100644 --- a/src/bot/audioCaptureProcedure.ts +++ b/src/bot/audioCaptureProcedure.ts @@ -124,18 +124,20 @@ export class AudioCaptureProcedure { let callbackCount = 0; let totalNonZeroSamples = 0; const minRmsThreshold = 0.0003; - const samplesPerChunk = nativeRate * 4; + const maxSamplesPerChunk = nativeRate * 8; const targetRate = 16000; - // Pre-roll: keep last 500ms of discarded silent chunks so that - // speech onsets at the tail of a silent window are preserved. const preRollSamples = Math.ceil(nativeRate * 0.5); + const minFlushSamples = Math.ceil(nativeRate * 0.5); + // Adaptive flush: after ~1s silence following voiced content + const silenceFlushCallbacks = 6; + let hasVoicedContent = false; + let consecutiveSilentCallbacks = 0; processor.onaudioprocess = (e: AudioProcessingEvent) => { const input = e.inputBuffer.getChannelData(0); callbackCount++; // #region agent log - // Count non-zero samples on EVERY callback for accurate diagnostics let nonZeroThisCallback = 0; for (let i = 0; i < input.length; i++) { if (input[i] !== 0) nonZeroThisCallback++; @@ -154,10 +156,32 @@ export class AudioCaptureProcedure { } // #endregion + // Per-callback voice activity detection + let cbPower = 0; + for (let i = 0; i < input.length; i++) { + cbPower += input[i] * input[i]; + } + const cbRms = Math.sqrt(cbPower / Math.max(input.length, 1)); + + if (cbRms >= minRmsThreshold) { + hasVoicedContent = true; + consecutiveSilentCallbacks = 0; + } else { + consecutiveSilentCallbacks++; + } + chunkBuffer.push(new Float32Array(input)); samplesCollected += input.length; - if (samplesCollected >= samplesPerChunk) { + // Flush: max duration reached OR voiced content followed by ~1s silence + const shouldFlush = ( + samplesCollected >= maxSamplesPerChunk + || (hasVoicedContent + && consecutiveSilentCallbacks >= silenceFlushCallbacks + && samplesCollected > minFlushSamples) + ); + + if (shouldFlush) { const merged = new Float32Array(samplesCollected); let offset = 0; for (const buf of chunkBuffer) { @@ -171,6 +195,9 @@ export class AudioCaptureProcedure { } const rms = Math.sqrt(powerSum / Math.max(merged.length, 1)); + hasVoicedContent = false; + consecutiveSilentCallbacks = 0; + if (rms < minRmsThreshold) { skippedSilentChunks++; if (skippedSilentChunks % 10 === 0) { @@ -178,9 +205,6 @@ export class AudioCaptureProcedure { `[AudioCapture] silent chunk skipped: track=${trackId}, readyState=${event.track.readyState}, muted=${event.track.muted}, enabled=${event.track.enabled}, rms=${rms.toFixed(6)}, callbacks=${callbackCount}, totalNonZero=${totalNonZeroSamples}` ); } - // Pre-roll: retain the tail of the silent chunk so the next - // voiced chunk includes the speech onset that may have started - // in the last few hundred ms of this window. const keep = Math.min(preRollSamples, merged.length); const preRoll = merged.slice(merged.length - keep); chunkBuffer = [preRoll];