Improve Teams bot response reliability and transcript quality.

Fix invalid bot-response timestamps in SSE payloads, reduce duplicate-response loops, and improve audio STT stability with larger capture chunks and safer silence filtering.

Made-with: Cursor
2026-02-26 21:18:06 +01:00 · 2026-02-26 21:18:06 +01:00 · fe1a97564b
commit fe1a97564b
parent 90c0850449
2 changed files with 41 additions and 5 deletions
--- a/modules/features/teamsbot/service.py
+++ b/modules/features/teamsbot/service.py
@@ -88,6 +88,8 @@ class TeamsbotService:
        self._lastTranscriptText: Optional[str] = None
        self._lastTranscriptId: Optional[str] = None
        self._recentSpeakerHints: List[Dict[str, Any]] = []
+        self._lastBotResponseText: Optional[str] = None
+        self._lastBotResponseTs: float = 0.0

    # =========================================================================
    # Session Lifecycle
@@ -417,10 +419,16 @@ class TeamsbotService:
                    f"rms={rms}, nativeRate={nativeSampleRate}, bytes={len(audioBytes)}"
                )

-            # Detect silent/all-zeros audio early to avoid expensive STT calls
-            if len(set(audioBytes[100:min(500, len(audioBytes))])) < 3:
-                logger.debug(f"[AudioChunk] Skipping silent audio ({len(audioBytes)} bytes, low byte variation)")
-                return
+            # Use RMS from capture diagnostics to skip real silence.
+            # Byte-variation heuristics produced false positives and dropped valid speech.
+            if captureDiagnostics and captureDiagnostics.get("rms") is not None:
+                try:
+                    rmsVal = float(captureDiagnostics.get("rms"))
+                    if rmsVal < 0.0015:
+                        logger.debug(f"[AudioChunk] Skipping silent audio ({len(audioBytes)} bytes, rms={rmsVal:.6f})")
+                        return
+                except Exception:
+                    pass

            if not voiceInterface:
                logger.warning(f"[AudioChunk] No voice interface available for session {sessionId}")
@@ -544,7 +552,7 @@ class TeamsbotService:
            and self._lastTranscriptText
            and self._lastTranscriptId
            and text.startswith(self._lastTranscriptText)
-            and source == "caption"  # only for captions, chat messages are always new
+            and source in ("caption", "audioCapture")
        )

        if isContinuation:
@@ -845,6 +853,25 @@ class TeamsbotService:
                else:
                    responseType = TeamsbotResponseType.CHAT

+                # Suppress duplicate responses in short windows ("repeat loop" protection).
+                normalizedResponse = (speechResult.responseText or "").strip().lower()
+                nowTs = time.time()
+                if (
+                    normalizedResponse
+                    and self._lastBotResponseText == normalizedResponse
+                    and (nowTs - self._lastBotResponseTs) < 90
+                ):
+                    logger.info(f"Session {sessionId}: Suppressing duplicate bot response within 90s window")
+                    await _emitSessionEvent(sessionId, "analysis", {
+                        "shouldRespond": False,
+                        "detectedIntent": speechResult.detectedIntent,
+                        "reasoning": "Suppressed duplicate response within 90s",
+                        "modelName": response.modelName,
+                        "processingTime": response.processingTime,
+                        "priceCHF": response.priceCHF,
+                    })
+                    return
+
                # 4a: Voice response (TTS -> Audio to bot)
                if sendVoice:
                    try:
@@ -949,6 +976,7 @@ class TeamsbotService:
                    "modelName": response.modelName,
                    "processingTime": response.processingTime,
                    "priceCHF": response.priceCHF,
+                    "timestamp": botResponseData.get("timestamp"),
                })

                # Update session response count
@@ -957,6 +985,8 @@ class TeamsbotService:
                    count = session.get("botResponseCount", 0) + 1
                    interface.updateSession(sessionId, {"botResponseCount": count})

+                self._lastBotResponseText = normalizedResponse
+                self._lastBotResponseTs = nowTs
                logger.info(f"Bot responded in session {sessionId}: intent={speechResult.detectedIntent}")

            # Step 5: Execute AI-issued commands (if any)
--- a/modules/services/serviceAi/mainServiceAi.py
+++ b/modules/services/serviceAi/mainServiceAi.py
@@ -366,6 +366,12 @@ ANTWORT-STIL (wenn du antwortest):
 - NICHT frueheres wiederholen das du schon gesagt hast
 - Max 1-2 Saetze, praezise auf den Punkt
 - Sieh dir an was du (markiert als [YOU]) bereits gesagt hast und wiederhole es NICHT
+- KEINE reinen Absichtssaetze wie "Ich werde ...", "Ich kann ...", "Gerne ...".
+  Liefere direkt den eigentlichen Inhalt in der gleichen Antwort.
+
+WENN DER USER DICH BITTET ETWAS VORZULESEN / ZUSAMMENZUFASSEN:
+- Gib IMMER sofort die Zusammenfassung aus (nicht nur ankündigen).
+- Falls Vorlesen gewuenscht ist, setze zusaetzlich ein "readAloud"-Kommando mit dem Text.

 STOP-ERKENNUNG:
 Wenn jemand dich bittet aufzuhoeren, still zu sein, zu stoppen, oder nicht mehr zu reden