From e3284994d0ec90de43e9d5ff74fd853f830bba61 Mon Sep 17 00:00:00 2001 From: ValueOn AG Date: Tue, 12 May 2026 23:33:43 +0200 Subject: [PATCH] fixes stt paras --- modules/features/teamsbot/service.py | 76 ++++++++++++++++++++++------ 1 file changed, 60 insertions(+), 16 deletions(-) diff --git a/modules/features/teamsbot/service.py b/modules/features/teamsbot/service.py index 93cc27a2..fcce44bd 100644 --- a/modules/features/teamsbot/service.py +++ b/modules/features/teamsbot/service.py @@ -602,6 +602,13 @@ class TeamsbotService: self._lastTranscriptText: Optional[str] = None self._lastTranscriptId: Optional[str] = None self._lastSttTime: float = 0.0 + + # Audio chunk aggregation: collect chunks and send to STT only + # after a speech pause or when the buffer reaches a target duration. + self._audioBuffer: bytes = b"" + self._audioBufferStartTime: float = 0.0 + self._audioBufferLastChunkTime: float = 0.0 + self._audioBufferSampleRate: int = 16000 self._lastBotResponseText: Optional[str] = None self._lastBotResponseTs: float = 0.0 @@ -1203,6 +1210,14 @@ class TeamsbotService: interface.updateSession(sessionId, updates) await _emitSessionEvent(sessionId, "statusChange", {"status": status, "errorMessage": errorMessage}) + # Flush remaining audio buffer before generating summary + if dbStatus in [TeamsbotSessionStatus.ENDED.value, TeamsbotSessionStatus.ERROR.value]: + if self._audioBuffer: + logger.info(f"[AudioChunk] Flushing remaining buffer on session end ({len(self._audioBuffer)} bytes)") + self._audioBuffer = b"" + self._audioBufferStartTime = 0.0 + self._audioBufferLastChunkTime = 0.0 + # Generate summary when session ends if dbStatus == TeamsbotSessionStatus.ENDED.value: asyncio.create_task(self._generateMeetingSummary(sessionId)) @@ -1217,11 +1232,18 @@ class TeamsbotService: voiceInterface, websocket: WebSocket, ): - """Process an audio chunk from WebRTC capture — run STT and feed into transcript pipeline.""" + """Process an audio chunk from WebRTC capture. The bot-side VAD + (AudioWorklet / ScriptProcessor) already segments speech into 1-8s + voiced chunks. Here we apply a minimum-duration safety net: very short + chunks (<1s) are buffered until they reach 1s; everything else goes + straight to STT. A wall-clock timeout flushes stale buffers.""" import base64 + _MIN_CHUNK_SEC = 1.0 + _STALE_TIMEOUT_SEC = 3.0 + try: audioBytes = base64.b64decode(audioBase64) - if len(audioBytes) < 1000: + if len(audioBytes) < 500: return if captureDiagnostics: @@ -1234,14 +1256,12 @@ class TeamsbotService: f"rms={rms}, nativeRate={nativeSampleRate}, bytes={len(audioBytes)}" ) - # Use RMS from capture diagnostics to skip real silence. - # Byte-variation heuristics produced false positives and dropped valid speech. + isSilent = False if captureDiagnostics and captureDiagnostics.get("rms") is not None: try: rmsVal = float(captureDiagnostics.get("rms")) if rmsVal < 0.0003: - logger.debug(f"[AudioChunk] Skipping silent audio ({len(audioBytes)} bytes, rms={rmsVal:.6f})") - return + isSilent = True except Exception: pass @@ -1249,23 +1269,47 @@ class TeamsbotService: logger.warning(f"[AudioChunk] No voice interface available for session {sessionId}") return - # Treat sampleRate=0 as unknown (triggers auto-detection) - effectiveSampleRate = sampleRate if sampleRate and sampleRate > 0 else None + now = time.time() + effectiveRate = sampleRate if sampleRate and sampleRate > 0 else 16000 + + if not isSilent: + if not self._audioBuffer: + self._audioBufferStartTime = now + self._audioBuffer += audioBytes + self._audioBufferLastChunkTime = now + self._audioBufferSampleRate = effectiveRate + + bufferDuration = len(self._audioBuffer) / (effectiveRate * 2) if self._audioBuffer else 0.0 + bufferAge = (now - self._audioBufferStartTime) if self._audioBuffer else 0.0 + + shouldFlush = ( + self._audioBuffer + and ( + bufferDuration >= _MIN_CHUNK_SEC + or (bufferAge >= _STALE_TIMEOUT_SEC and bufferDuration > 0.3) + ) + ) + + if not shouldFlush: + return + + flushBytes = self._audioBuffer + flushRate = self._audioBufferSampleRate + self._audioBuffer = b"" + self._audioBufferStartTime = 0.0 + self._audioBufferLastChunkTime = 0.0 + + flushDuration = len(flushBytes) / (flushRate * 2) + logger.info(f"[AudioChunk] Flushing buffer: {len(flushBytes)} bytes, {flushDuration:.1f}s, {flushRate}Hz") phraseHints = list(self._knownSpeakers) if self.config.botName: phraseHints.append(self.config.botName) - # Sprache kommt ausschliesslich aus der Session/Instance-Konfig - # (TeamsbotUserSettings.language ueberschreibt - # TeamsbotConfig.language, Fallback de-DE im Schema). - # KEIN hardcodierter Alternative-Sprachen-Pool — der hat dafuer - # gesorgt, dass Google STT bei verrauschter Audio auf en-US - # gesprungen ist und englisches Kauderwelsch geliefert hat. sttResult = await voiceInterface.speechToText( - audioContent=audioBytes, + audioContent=flushBytes, language=self.config.language or "de-DE", - sampleRate=effectiveSampleRate, + sampleRate=flushRate, channels=1, skipFallbacks=True, phraseHints=phraseHints if phraseHints else None,