From e3284994d0ec90de43e9d5ff74fd853f830bba61 Mon Sep 17 00:00:00 2001
From: ValueOn AG <p.motsch@valueon.ch>
Date: Tue, 12 May 2026 23:33:43 +0200
Subject: [PATCH] fixes stt paras

---
 modules/features/teamsbot/service.py | 76 ++++++++++++++++++++++------
 1 file changed, 60 insertions(+), 16 deletions(-)

diff --git a/modules/features/teamsbot/service.py b/modules/features/teamsbot/service.py
index 93cc27a2..fcce44bd 100644
--- a/modules/features/teamsbot/service.py
+++ b/modules/features/teamsbot/service.py
@@ -602,6 +602,13 @@ class TeamsbotService:
         self._lastTranscriptText: Optional[str] = None
         self._lastTranscriptId: Optional[str] = None
         self._lastSttTime: float = 0.0
+
+        # Audio chunk aggregation: collect short chunks and send them to STT
+        # once the buffer reaches a minimum duration or has gone stale.
+        self._audioBuffer: bytes = b""
+        self._audioBufferStartTime: float = 0.0
+        self._audioBufferLastChunkTime: float = 0.0
+        self._audioBufferSampleRate: int = 16000
         self._lastBotResponseText: Optional[str] = None
         self._lastBotResponseTs: float = 0.0
 
@@ -1203,6 +1210,14 @@ class TeamsbotService:
         interface.updateSession(sessionId, updates)
         await _emitSessionEvent(sessionId, "statusChange", {"status": status, "errorMessage": errorMessage})
 
+        # Discard any leftover buffered audio on session end (dropped, not transcribed)
+        if dbStatus in [TeamsbotSessionStatus.ENDED.value, TeamsbotSessionStatus.ERROR.value]:
+            if self._audioBuffer:
+                logger.info(f"[AudioChunk] Flushing remaining buffer on session end ({len(self._audioBuffer)} bytes)")
+                self._audioBuffer = b""
+                self._audioBufferStartTime = 0.0
+                self._audioBufferLastChunkTime = 0.0
+
         # Generate summary when session ends
         if dbStatus == TeamsbotSessionStatus.ENDED.value:
             asyncio.create_task(self._generateMeetingSummary(sessionId))
@@ -1217,11 +1232,18 @@ class TeamsbotService:
         voiceInterface,
         websocket: WebSocket,
     ):
-        """Process an audio chunk from WebRTC capture — run STT and feed into transcript pipeline."""
+        """Process an audio chunk from WebRTC capture. The bot-side VAD
+        (AudioWorklet / ScriptProcessor) already segments speech into 1-8s
+        voiced chunks. Here we apply a minimum-duration safety net: audio is
+        buffered until at least 1s is collected, then sent to STT. A stale
+        buffer (older than 3s, >0.3s of audio) is flushed on a later chunk."""
         import base64
+        _MIN_CHUNK_SEC = 1.0
+        _STALE_TIMEOUT_SEC = 3.0
+
         try:
             audioBytes = base64.b64decode(audioBase64)
-            if len(audioBytes) < 1000:
+            if len(audioBytes) < 500:
                 return
 
             if captureDiagnostics:
@@ -1234,14 +1256,12 @@ class TeamsbotService:
                     f"rms={rms}, nativeRate={nativeSampleRate}, bytes={len(audioBytes)}"
                 )
 
-            # Use RMS from capture diagnostics to skip real silence.
-            # Byte-variation heuristics produced false positives and dropped valid speech.
+            isSilent = False
             if captureDiagnostics and captureDiagnostics.get("rms") is not None:
                 try:
                     rmsVal = float(captureDiagnostics.get("rms"))
                     if rmsVal < 0.0003:
-                        logger.debug(f"[AudioChunk] Skipping silent audio ({len(audioBytes)} bytes, rms={rmsVal:.6f})")
-                        return
+                        isSilent = True
                 except Exception:
                     pass
 
@@ -1249,23 +1269,47 @@ class TeamsbotService:
                 logger.warning(f"[AudioChunk] No voice interface available for session {sessionId}")
                 return
 
-            # Treat sampleRate=0 as unknown (triggers auto-detection)
-            effectiveSampleRate = sampleRate if sampleRate and sampleRate > 0 else None
+            now = time.time()
+            effectiveRate = sampleRate if sampleRate and sampleRate > 0 else 16000
+
+            if not isSilent:
+                if not self._audioBuffer:
+                    self._audioBufferStartTime = now
+                self._audioBuffer += audioBytes
+                self._audioBufferLastChunkTime = now
+                self._audioBufferSampleRate = effectiveRate
+
+            bufferDuration = len(self._audioBuffer) / (effectiveRate * 2) if self._audioBuffer else 0.0
+            bufferAge = (now - self._audioBufferStartTime) if self._audioBuffer else 0.0
+
+            shouldFlush = (
+                self._audioBuffer
+                and (
+                    bufferDuration >= _MIN_CHUNK_SEC
+                    or (bufferAge >= _STALE_TIMEOUT_SEC and bufferDuration > 0.3)
+                )
+            )
+
+            if not shouldFlush:
+                return
+
+            flushBytes = self._audioBuffer
+            flushRate = self._audioBufferSampleRate
+            self._audioBuffer = b""
+            self._audioBufferStartTime = 0.0
+            self._audioBufferLastChunkTime = 0.0
+
+            flushDuration = len(flushBytes) / (flushRate * 2)
+            logger.info(f"[AudioChunk] Flushing buffer: {len(flushBytes)} bytes, {flushDuration:.1f}s, {flushRate}Hz")
 
             phraseHints = list(self._knownSpeakers)
             if self.config.botName:
                 phraseHints.append(self.config.botName)
 
-            # Sprache kommt ausschliesslich aus der Session/Instance-Konfig
-            # (TeamsbotUserSettings.language ueberschreibt
-            # TeamsbotConfig.language, Fallback de-DE im Schema).
-            # KEIN hardcodierter Alternative-Sprachen-Pool — der hat dafuer
-            # gesorgt, dass Google STT bei verrauschter Audio auf en-US
-            # gesprungen ist und englisches Kauderwelsch geliefert hat.
             sttResult = await voiceInterface.speechToText(
-                audioContent=audioBytes,
+                audioContent=flushBytes,
                 language=self.config.language or "de-DE",
-                sampleRate=effectiveSampleRate,
+                sampleRate=flushRate,
                 channels=1,
                 skipFallbacks=True,
                 phraseHints=phraseHints if phraseHints else None,