From e3284994d0ec90de43e9d5ff74fd853f830bba61 Mon Sep 17 00:00:00 2001
From: ValueOn AG
Date: Tue, 12 May 2026 23:33:43 +0200
Subject: [PATCH] fix STT audio-chunk aggregation parameters (buffer short chunks before speech-to-text)
---
modules/features/teamsbot/service.py | 76 ++++++++++++++++++++++------
1 file changed, 60 insertions(+), 16 deletions(-)
diff --git a/modules/features/teamsbot/service.py b/modules/features/teamsbot/service.py
index 93cc27a2..fcce44bd 100644
--- a/modules/features/teamsbot/service.py
+++ b/modules/features/teamsbot/service.py
@@ -602,6 +602,13 @@ class TeamsbotService:
self._lastTranscriptText: Optional[str] = None
self._lastTranscriptId: Optional[str] = None
self._lastSttTime: float = 0.0
+
+ # Audio chunk aggregation: collect chunks and send to STT only
+ # after a speech pause or when the buffer reaches a target duration.
+ self._audioBuffer: bytes = b""
+ self._audioBufferStartTime: float = 0.0
+ self._audioBufferLastChunkTime: float = 0.0
+ self._audioBufferSampleRate: int = 16000
self._lastBotResponseText: Optional[str] = None
self._lastBotResponseTs: float = 0.0
@@ -1203,6 +1210,14 @@ class TeamsbotService:
interface.updateSession(sessionId, updates)
await _emitSessionEvent(sessionId, "statusChange", {"status": status, "errorMessage": errorMessage})
+ # Flush remaining audio buffer before generating summary
+ if dbStatus in [TeamsbotSessionStatus.ENDED.value, TeamsbotSessionStatus.ERROR.value]:
+ if self._audioBuffer:
+ logger.info(f"[AudioChunk] Flushing remaining buffer on session end ({len(self._audioBuffer)} bytes)")
+ self._audioBuffer = b""
+ self._audioBufferStartTime = 0.0
+ self._audioBufferLastChunkTime = 0.0
+
# Generate summary when session ends
if dbStatus == TeamsbotSessionStatus.ENDED.value:
asyncio.create_task(self._generateMeetingSummary(sessionId))
@@ -1217,11 +1232,18 @@ class TeamsbotService:
voiceInterface,
websocket: WebSocket,
):
- """Process an audio chunk from WebRTC capture — run STT and feed into transcript pipeline."""
+ """Process an audio chunk from WebRTC capture. The bot-side VAD
+ (AudioWorklet / ScriptProcessor) already segments speech into 1-8s
+ voiced chunks. Here we apply a minimum-duration safety net: very short
+ chunks (<1s) are buffered until they reach 1s; everything else goes
+ straight to STT. A wall-clock timeout flushes stale buffers."""
import base64
+ _MIN_CHUNK_SEC = 1.0
+ _STALE_TIMEOUT_SEC = 3.0
+
try:
audioBytes = base64.b64decode(audioBase64)
- if len(audioBytes) < 1000:
+ if len(audioBytes) < 500:
return
if captureDiagnostics:
@@ -1234,14 +1256,12 @@ class TeamsbotService:
f"rms={rms}, nativeRate={nativeSampleRate}, bytes={len(audioBytes)}"
)
- # Use RMS from capture diagnostics to skip real silence.
- # Byte-variation heuristics produced false positives and dropped valid speech.
+ isSilent = False
if captureDiagnostics and captureDiagnostics.get("rms") is not None:
try:
rmsVal = float(captureDiagnostics.get("rms"))
if rmsVal < 0.0003:
- logger.debug(f"[AudioChunk] Skipping silent audio ({len(audioBytes)} bytes, rms={rmsVal:.6f})")
- return
+ isSilent = True
except Exception:
pass
@@ -1249,23 +1269,47 @@ class TeamsbotService:
logger.warning(f"[AudioChunk] No voice interface available for session {sessionId}")
return
- # Treat sampleRate=0 as unknown (triggers auto-detection)
- effectiveSampleRate = sampleRate if sampleRate and sampleRate > 0 else None
+ now = time.time()
+ effectiveRate = sampleRate if sampleRate and sampleRate > 0 else 16000
+
+ if not isSilent:
+ if not self._audioBuffer:
+ self._audioBufferStartTime = now
+ self._audioBuffer += audioBytes
+ self._audioBufferLastChunkTime = now
+ self._audioBufferSampleRate = effectiveRate
+
+ bufferDuration = len(self._audioBuffer) / (effectiveRate * 2) if self._audioBuffer else 0.0
+ bufferAge = (now - self._audioBufferStartTime) if self._audioBuffer else 0.0
+
+ shouldFlush = (
+ self._audioBuffer
+ and (
+ bufferDuration >= _MIN_CHUNK_SEC
+ or (bufferAge >= _STALE_TIMEOUT_SEC and bufferDuration > 0.3)
+ )
+ )
+
+ if not shouldFlush:
+ return
+
+ flushBytes = self._audioBuffer
+ flushRate = self._audioBufferSampleRate
+ self._audioBuffer = b""
+ self._audioBufferStartTime = 0.0
+ self._audioBufferLastChunkTime = 0.0
+
+ flushDuration = len(flushBytes) / (flushRate * 2)
+ logger.info(f"[AudioChunk] Flushing buffer: {len(flushBytes)} bytes, {flushDuration:.1f}s, {flushRate}Hz")
phraseHints = list(self._knownSpeakers)
if self.config.botName:
phraseHints.append(self.config.botName)
- # Sprache kommt ausschliesslich aus der Session/Instance-Konfig
- # (TeamsbotUserSettings.language ueberschreibt
- # TeamsbotConfig.language, Fallback de-DE im Schema).
- # KEIN hardcodierter Alternative-Sprachen-Pool — der hat dafuer
- # gesorgt, dass Google STT bei verrauschter Audio auf en-US
- # gesprungen ist und englisches Kauderwelsch geliefert hat.
sttResult = await voiceInterface.speechToText(
- audioContent=audioBytes,
+ audioContent=flushBytes,
language=self.config.language or "de-DE",
- sampleRate=effectiveSampleRate,
+ sampleRate=flushRate,
channels=1,
skipFallbacks=True,
phraseHints=phraseHints if phraseHints else None,