From fe1a97564bfcbee836cb10c389746e7740750564 Mon Sep 17 00:00:00 2001
From: patrick-motsch
Date: Thu, 26 Feb 2026 21:18:06 +0100
Subject: [PATCH] Improve Teams bot response reliability and transcript
quality.
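Fix invalid bot-response timestamps in SSE payloads by including the response timestamp in the emitted event. Reduce duplicate response loops by suppressing an identical bot response that repeats within a 90-second window. Improve audio STT stability with larger capture chunks and safer silence filtering (an RMS threshold from the capture diagnostics replaces the byte-variation heuristic, which dropped valid speech). Also detect transcript continuations for audio-capture input, not only captions, and instruct the model to deliver the requested content directly instead of only announcing it.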
Made-with: Cursor
---
modules/features/teamsbot/service.py | 40 ++++++++++++++++++---
modules/services/serviceAi/mainServiceAi.py | 6 ++++
2 files changed, 41 insertions(+), 5 deletions(-)
diff --git a/modules/features/teamsbot/service.py b/modules/features/teamsbot/service.py
index b5136130..4eaee284 100644
--- a/modules/features/teamsbot/service.py
+++ b/modules/features/teamsbot/service.py
@@ -88,6 +88,8 @@ class TeamsbotService:
self._lastTranscriptText: Optional[str] = None
self._lastTranscriptId: Optional[str] = None
self._recentSpeakerHints: List[Dict[str, Any]] = []
+ self._lastBotResponseText: Optional[str] = None
+ self._lastBotResponseTs: float = 0.0
# =========================================================================
# Session Lifecycle
@@ -417,10 +419,16 @@ class TeamsbotService:
f"rms={rms}, nativeRate={nativeSampleRate}, bytes={len(audioBytes)}"
)
- # Detect silent/all-zeros audio early to avoid expensive STT calls
- if len(set(audioBytes[100:min(500, len(audioBytes))])) < 3:
- logger.debug(f"[AudioChunk] Skipping silent audio ({len(audioBytes)} bytes, low byte variation)")
- return
+ # Use RMS from capture diagnostics to skip real silence.
+ # Byte-variation heuristics produced false positives and dropped valid speech.
+ if captureDiagnostics and captureDiagnostics.get("rms") is not None:
+ try:
+ rmsVal = float(captureDiagnostics.get("rms"))
+ if rmsVal < 0.0015:
+ logger.debug(f"[AudioChunk] Skipping silent audio ({len(audioBytes)} bytes, rms={rmsVal:.6f})")
+ return
+ except Exception:
+ pass
if not voiceInterface:
logger.warning(f"[AudioChunk] No voice interface available for session {sessionId}")
@@ -544,7 +552,7 @@ class TeamsbotService:
and self._lastTranscriptText
and self._lastTranscriptId
and text.startswith(self._lastTranscriptText)
- and source == "caption" # only for captions, chat messages are always new
+ and source in ("caption", "audioCapture")
)
if isContinuation:
@@ -845,6 +853,25 @@ class TeamsbotService:
else:
responseType = TeamsbotResponseType.CHAT
+ # Suppress duplicate responses in short windows ("repeat loop" protection).
+ normalizedResponse = (speechResult.responseText or "").strip().lower()
+ nowTs = time.time()
+ if (
+ normalizedResponse
+ and self._lastBotResponseText == normalizedResponse
+ and (nowTs - self._lastBotResponseTs) < 90
+ ):
+ logger.info(f"Session {sessionId}: Suppressing duplicate bot response within 90s window")
+ await _emitSessionEvent(sessionId, "analysis", {
+ "shouldRespond": False,
+ "detectedIntent": speechResult.detectedIntent,
+ "reasoning": "Suppressed duplicate response within 90s",
+ "modelName": response.modelName,
+ "processingTime": response.processingTime,
+ "priceCHF": response.priceCHF,
+ })
+ return
+
# 4a: Voice response (TTS -> Audio to bot)
if sendVoice:
try:
@@ -949,6 +976,7 @@ class TeamsbotService:
"modelName": response.modelName,
"processingTime": response.processingTime,
"priceCHF": response.priceCHF,
+ "timestamp": botResponseData.get("timestamp"),
})
# Update session response count
@@ -957,6 +985,8 @@ class TeamsbotService:
count = session.get("botResponseCount", 0) + 1
interface.updateSession(sessionId, {"botResponseCount": count})
+ self._lastBotResponseText = normalizedResponse
+ self._lastBotResponseTs = nowTs
logger.info(f"Bot responded in session {sessionId}: intent={speechResult.detectedIntent}")
# Step 5: Execute AI-issued commands (if any)
diff --git a/modules/services/serviceAi/mainServiceAi.py b/modules/services/serviceAi/mainServiceAi.py
index 9ee31a79..1f7da68b 100644
--- a/modules/services/serviceAi/mainServiceAi.py
+++ b/modules/services/serviceAi/mainServiceAi.py
@@ -366,6 +366,12 @@ ANTWORT-STIL (wenn du antwortest):
- NICHT frueheres wiederholen das du schon gesagt hast
- Max 1-2 Saetze, praezise auf den Punkt
- Sieh dir an was du (markiert als [YOU]) bereits gesagt hast und wiederhole es NICHT
+- KEINE reinen Absichtssaetze wie "Ich werde ...", "Ich kann ...", "Gerne ...".
+ Liefere direkt den eigentlichen Inhalt in der gleichen Antwort.
+
+WENN DER USER DICH BITTET ETWAS VORZULESEN / ZUSAMMENZUFASSEN:
+- Gib IMMER sofort die Zusammenfassung aus (nicht nur ankündigen).
+- Falls Vorlesen gewuenscht ist, setze zusaetzlich ein "readAloud"-Kommando mit dem Text.
STOP-ERKENNUNG:
Wenn jemand dich bittet aufzuhoeren, still zu sein, zu stoppen, oder nicht mehr zu reden