AudioCapture: adaptive chunks (8s max, flush after 1s silence)
Made-with: Cursor
This commit is contained in:
parent
07f7b03515
commit
25f684eb58
1 changed file with 32 additions and 8 deletions
|
|
@ -124,18 +124,20 @@ export class AudioCaptureProcedure {
|
||||||
let callbackCount = 0;
|
let callbackCount = 0;
|
||||||
let totalNonZeroSamples = 0;
|
let totalNonZeroSamples = 0;
|
||||||
const minRmsThreshold = 0.0003;
|
const minRmsThreshold = 0.0003;
|
||||||
const samplesPerChunk = nativeRate * 4;
|
const maxSamplesPerChunk = nativeRate * 8;
|
||||||
const targetRate = 16000;
|
const targetRate = 16000;
|
||||||
// Pre-roll: keep last 500ms of discarded silent chunks so that
|
|
||||||
// speech onsets at the tail of a silent window are preserved.
|
|
||||||
const preRollSamples = Math.ceil(nativeRate * 0.5);
|
const preRollSamples = Math.ceil(nativeRate * 0.5);
|
||||||
|
const minFlushSamples = Math.ceil(nativeRate * 0.5);
|
||||||
|
// Adaptive flush: after ~1s silence following voiced content
|
||||||
|
const silenceFlushCallbacks = 6;
|
||||||
|
let hasVoicedContent = false;
|
||||||
|
let consecutiveSilentCallbacks = 0;
|
||||||
|
|
||||||
processor.onaudioprocess = (e: AudioProcessingEvent) => {
|
processor.onaudioprocess = (e: AudioProcessingEvent) => {
|
||||||
const input = e.inputBuffer.getChannelData(0);
|
const input = e.inputBuffer.getChannelData(0);
|
||||||
callbackCount++;
|
callbackCount++;
|
||||||
|
|
||||||
// #region agent log
|
// #region agent log
|
||||||
// Count non-zero samples on EVERY callback for accurate diagnostics
|
|
||||||
let nonZeroThisCallback = 0;
|
let nonZeroThisCallback = 0;
|
||||||
for (let i = 0; i < input.length; i++) {
|
for (let i = 0; i < input.length; i++) {
|
||||||
if (input[i] !== 0) nonZeroThisCallback++;
|
if (input[i] !== 0) nonZeroThisCallback++;
|
||||||
|
|
@ -154,10 +156,32 @@ export class AudioCaptureProcedure {
|
||||||
}
|
}
|
||||||
// #endregion
|
// #endregion
|
||||||
|
|
||||||
|
// Per-callback voice activity detection
|
||||||
|
let cbPower = 0;
|
||||||
|
for (let i = 0; i < input.length; i++) {
|
||||||
|
cbPower += input[i] * input[i];
|
||||||
|
}
|
||||||
|
const cbRms = Math.sqrt(cbPower / Math.max(input.length, 1));
|
||||||
|
|
||||||
|
if (cbRms >= minRmsThreshold) {
|
||||||
|
hasVoicedContent = true;
|
||||||
|
consecutiveSilentCallbacks = 0;
|
||||||
|
} else {
|
||||||
|
consecutiveSilentCallbacks++;
|
||||||
|
}
|
||||||
|
|
||||||
chunkBuffer.push(new Float32Array(input));
|
chunkBuffer.push(new Float32Array(input));
|
||||||
samplesCollected += input.length;
|
samplesCollected += input.length;
|
||||||
|
|
||||||
if (samplesCollected >= samplesPerChunk) {
|
// Flush: max duration reached OR voiced content followed by ~1s silence
|
||||||
|
const shouldFlush = (
|
||||||
|
samplesCollected >= maxSamplesPerChunk
|
||||||
|
|| (hasVoicedContent
|
||||||
|
&& consecutiveSilentCallbacks >= silenceFlushCallbacks
|
||||||
|
&& samplesCollected > minFlushSamples)
|
||||||
|
);
|
||||||
|
|
||||||
|
if (shouldFlush) {
|
||||||
const merged = new Float32Array(samplesCollected);
|
const merged = new Float32Array(samplesCollected);
|
||||||
let offset = 0;
|
let offset = 0;
|
||||||
for (const buf of chunkBuffer) {
|
for (const buf of chunkBuffer) {
|
||||||
|
|
@ -171,6 +195,9 @@ export class AudioCaptureProcedure {
|
||||||
}
|
}
|
||||||
const rms = Math.sqrt(powerSum / Math.max(merged.length, 1));
|
const rms = Math.sqrt(powerSum / Math.max(merged.length, 1));
|
||||||
|
|
||||||
|
hasVoicedContent = false;
|
||||||
|
consecutiveSilentCallbacks = 0;
|
||||||
|
|
||||||
if (rms < minRmsThreshold) {
|
if (rms < minRmsThreshold) {
|
||||||
skippedSilentChunks++;
|
skippedSilentChunks++;
|
||||||
if (skippedSilentChunks % 10 === 0) {
|
if (skippedSilentChunks % 10 === 0) {
|
||||||
|
|
@ -178,9 +205,6 @@ export class AudioCaptureProcedure {
|
||||||
`[AudioCapture] silent chunk skipped: track=${trackId}, readyState=${event.track.readyState}, muted=${event.track.muted}, enabled=${event.track.enabled}, rms=${rms.toFixed(6)}, callbacks=${callbackCount}, totalNonZero=${totalNonZeroSamples}`
|
`[AudioCapture] silent chunk skipped: track=${trackId}, readyState=${event.track.readyState}, muted=${event.track.muted}, enabled=${event.track.enabled}, rms=${rms.toFixed(6)}, callbacks=${callbackCount}, totalNonZero=${totalNonZeroSamples}`
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
// Pre-roll: retain the tail of the silent chunk so the next
|
|
||||||
// voiced chunk includes the speech onset that may have started
|
|
||||||
// in the last few hundred ms of this window.
|
|
||||||
const keep = Math.min(preRollSamples, merged.length);
|
const keep = Math.min(preRollSamples, merged.length);
|
||||||
const preRoll = merged.slice(merged.length - keep);
|
const preRoll = merged.slice(merged.length - keep);
|
||||||
chunkBuffer = [preRoll];
|
chunkBuffer = [preRoll];
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue