Fix STT params

This commit is contained in:
ValueOn AG 2026-05-12 23:33:43 +02:00
parent 16ab816c65
commit e3284994d0

View file

@ -602,6 +602,13 @@ class TeamsbotService:
self._lastTranscriptText: Optional[str] = None
self._lastTranscriptId: Optional[str] = None
self._lastSttTime: float = 0.0
# Audio chunk aggregation: collect chunks and send to STT only
# after a speech pause or when the buffer reaches a target duration.
self._audioBuffer: bytes = b""
self._audioBufferStartTime: float = 0.0
self._audioBufferLastChunkTime: float = 0.0
self._audioBufferSampleRate: int = 16000
self._lastBotResponseText: Optional[str] = None
self._lastBotResponseTs: float = 0.0
@ -1203,6 +1210,14 @@ class TeamsbotService:
interface.updateSession(sessionId, updates)
await _emitSessionEvent(sessionId, "statusChange", {"status": status, "errorMessage": errorMessage})
# Flush remaining audio buffer before generating summary
if dbStatus in [TeamsbotSessionStatus.ENDED.value, TeamsbotSessionStatus.ERROR.value]:
if self._audioBuffer:
logger.info(f"[AudioChunk] Flushing remaining buffer on session end ({len(self._audioBuffer)} bytes)")
self._audioBuffer = b""
self._audioBufferStartTime = 0.0
self._audioBufferLastChunkTime = 0.0
# Generate summary when session ends
if dbStatus == TeamsbotSessionStatus.ENDED.value:
asyncio.create_task(self._generateMeetingSummary(sessionId))
@ -1217,11 +1232,18 @@ class TeamsbotService:
voiceInterface,
websocket: WebSocket,
):
"""Process an audio chunk from WebRTC capture — run STT and feed into transcript pipeline."""
"""Process an audio chunk from WebRTC capture. The bot-side VAD
(AudioWorklet / ScriptProcessor) already segments speech into 1-8s
voiced chunks. Here we apply a minimum-duration safety net: very short
chunks (<1s) are buffered until they reach 1s; everything else goes
straight to STT. A wall-clock timeout flushes stale buffers."""
import base64
_MIN_CHUNK_SEC = 1.0
_STALE_TIMEOUT_SEC = 3.0
try:
audioBytes = base64.b64decode(audioBase64)
if len(audioBytes) < 1000:
if len(audioBytes) < 500:
return
if captureDiagnostics:
@ -1234,14 +1256,12 @@ class TeamsbotService:
f"rms={rms}, nativeRate={nativeSampleRate}, bytes={len(audioBytes)}"
)
# Use RMS from capture diagnostics to skip real silence.
# Byte-variation heuristics produced false positives and dropped valid speech.
isSilent = False
if captureDiagnostics and captureDiagnostics.get("rms") is not None:
try:
rmsVal = float(captureDiagnostics.get("rms"))
if rmsVal < 0.0003:
logger.debug(f"[AudioChunk] Skipping silent audio ({len(audioBytes)} bytes, rms={rmsVal:.6f})")
return
isSilent = True
except Exception:
pass
@ -1249,23 +1269,47 @@ class TeamsbotService:
logger.warning(f"[AudioChunk] No voice interface available for session {sessionId}")
return
# Treat sampleRate=0 as unknown (triggers auto-detection)
effectiveSampleRate = sampleRate if sampleRate and sampleRate > 0 else None
now = time.time()
effectiveRate = sampleRate if sampleRate and sampleRate > 0 else 16000
if not isSilent:
if not self._audioBuffer:
self._audioBufferStartTime = now
self._audioBuffer += audioBytes
self._audioBufferLastChunkTime = now
self._audioBufferSampleRate = effectiveRate
bufferDuration = len(self._audioBuffer) / (effectiveRate * 2) if self._audioBuffer else 0.0
bufferAge = (now - self._audioBufferStartTime) if self._audioBuffer else 0.0
shouldFlush = (
self._audioBuffer
and (
bufferDuration >= _MIN_CHUNK_SEC
or (bufferAge >= _STALE_TIMEOUT_SEC and bufferDuration > 0.3)
)
)
if not shouldFlush:
return
flushBytes = self._audioBuffer
flushRate = self._audioBufferSampleRate
self._audioBuffer = b""
self._audioBufferStartTime = 0.0
self._audioBufferLastChunkTime = 0.0
flushDuration = len(flushBytes) / (flushRate * 2)
logger.info(f"[AudioChunk] Flushing buffer: {len(flushBytes)} bytes, {flushDuration:.1f}s, {flushRate}Hz")
phraseHints = list(self._knownSpeakers)
if self.config.botName:
phraseHints.append(self.config.botName)
        # The language comes exclusively from the session/instance config
        # (TeamsbotUserSettings.language overrides
        # TeamsbotConfig.language; the schema falls back to de-DE).
        # NO hardcoded pool of alternative languages — that previously
        # caused Google STT to jump to en-US on noisy audio and return
        # English gibberish.
sttResult = await voiceInterface.speechToText(
audioContent=audioBytes,
audioContent=flushBytes,
language=self.config.language or "de-DE",
sampleRate=effectiveSampleRate,
sampleRate=flushRate,
channels=1,
skipFallbacks=True,
phraseHints=phraseHints if phraseHints else None,