From fe1a97564bfcbee836cb10c389746e7740750564 Mon Sep 17 00:00:00 2001 From: patrick-motsch Date: Thu, 26 Feb 2026 21:18:06 +0100 Subject: [PATCH] Improve Teams bot response reliability and transcript quality. Fix invalid bot-response timestamps in SSE payloads, reduce duplicate response loops, and improve audio STT stability with larger capture chunks and safer silence filtering. Made-with: Cursor --- modules/features/teamsbot/service.py | 40 ++++++++++++++++++--- modules/services/serviceAi/mainServiceAi.py | 6 ++++ 2 files changed, 41 insertions(+), 5 deletions(-) diff --git a/modules/features/teamsbot/service.py b/modules/features/teamsbot/service.py index b5136130..4eaee284 100644 --- a/modules/features/teamsbot/service.py +++ b/modules/features/teamsbot/service.py @@ -88,6 +88,8 @@ class TeamsbotService: self._lastTranscriptText: Optional[str] = None self._lastTranscriptId: Optional[str] = None self._recentSpeakerHints: List[Dict[str, Any]] = [] + self._lastBotResponseText: Optional[str] = None + self._lastBotResponseTs: float = 0.0 # ========================================================================= # Session Lifecycle @@ -417,10 +419,16 @@ class TeamsbotService: f"rms={rms}, nativeRate={nativeSampleRate}, bytes={len(audioBytes)}" ) - # Detect silent/all-zeros audio early to avoid expensive STT calls - if len(set(audioBytes[100:min(500, len(audioBytes))])) < 3: - logger.debug(f"[AudioChunk] Skipping silent audio ({len(audioBytes)} bytes, low byte variation)") - return + # Use RMS from capture diagnostics to skip real silence. + # Byte-variation heuristics produced false positives and dropped valid speech. + if captureDiagnostics and captureDiagnostics.get("rms") is not None: + try: + rmsVal = float(captureDiagnostics.get("rms")) + if rmsVal < 0.0015: + logger.debug(f"[AudioChunk] Skipping silent audio ({len(audioBytes)} bytes, rms={rmsVal:.6f})") + return + except Exception: + pass if not voiceInterface: logger.warning(f"[AudioChunk] No voice interface available for session {sessionId}") @@ -544,7 +552,7 @@ class TeamsbotService: and self._lastTranscriptText and self._lastTranscriptId and text.startswith(self._lastTranscriptText) - and source == "caption" # only for captions, chat messages are always new + and source in ("caption", "audioCapture") ) if isContinuation: @@ -845,6 +853,25 @@ class TeamsbotService: else: responseType = TeamsbotResponseType.CHAT + # Suppress duplicate responses in short windows ("repeat loop" protection). + normalizedResponse = (speechResult.responseText or "").strip().lower() + nowTs = time.time() + if ( + normalizedResponse + and self._lastBotResponseText == normalizedResponse + and (nowTs - self._lastBotResponseTs) < 90 + ): + logger.info(f"Session {sessionId}: Suppressing duplicate bot response within 90s window") + await _emitSessionEvent(sessionId, "analysis", { + "shouldRespond": False, + "detectedIntent": speechResult.detectedIntent, + "reasoning": "Suppressed duplicate response within 90s", + "modelName": response.modelName, + "processingTime": response.processingTime, + "priceCHF": response.priceCHF, + }) + return + # 4a: Voice response (TTS -> Audio to bot) if sendVoice: try: @@ -949,6 +976,7 @@ class TeamsbotService: "modelName": response.modelName, "processingTime": response.processingTime, "priceCHF": response.priceCHF, + "timestamp": botResponseData.get("timestamp"), }) # Update session response count @@ -957,6 +985,8 @@ class TeamsbotService: count = session.get("botResponseCount", 0) + 1 interface.updateSession(sessionId, {"botResponseCount": count}) + self._lastBotResponseText = normalizedResponse + self._lastBotResponseTs = nowTs logger.info(f"Bot responded in session {sessionId}: intent={speechResult.detectedIntent}") # Step 5: Execute AI-issued commands (if any) diff --git a/modules/services/serviceAi/mainServiceAi.py b/modules/services/serviceAi/mainServiceAi.py index 9ee31a79..1f7da68b 100644 --- a/modules/services/serviceAi/mainServiceAi.py +++ b/modules/services/serviceAi/mainServiceAi.py @@ -366,6 +366,12 @@ ANTWORT-STIL (wenn du antwortest): - NICHT frueheres wiederholen das du schon gesagt hast - Max 1-2 Saetze, praezise auf den Punkt - Sieh dir an was du (markiert als [YOU]) bereits gesagt hast und wiederhole es NICHT +- KEINE reinen Absichtssaetze wie "Ich werde ...", "Ich kann ...", "Gerne ...". + Liefere direkt den eigentlichen Inhalt in der gleichen Antwort. + +WENN DER USER DICH BITTET ETWAS VORZULESEN / ZUSAMMENZUFASSEN: +- Gib IMMER sofort die Zusammenfassung aus (nicht nur ankündigen). +- Falls Vorlesen gewuenscht ist, setze zusaetzlich ein "readAloud"-Kommando mit dem Text. STOP-ERKENNUNG: Wenn jemand dich bittet aufzuhoeren, still zu sein, zu stoppen, oder nicht mehr zu reden