Improve Teams bot response reliability and transcript quality.
Fix invalid bot-response timestamps in SSE payloads, reduce duplicate response loops, and improve audio STT stability with larger capture chunks and safer silence filtering.

Made-with: Cursor
This commit is contained in:
parent
90c0850449
commit
fe1a97564b
2 changed files with 41 additions and 5 deletions
|
|
@ -88,6 +88,8 @@ class TeamsbotService:
|
||||||
self._lastTranscriptText: Optional[str] = None
|
self._lastTranscriptText: Optional[str] = None
|
||||||
self._lastTranscriptId: Optional[str] = None
|
self._lastTranscriptId: Optional[str] = None
|
||||||
self._recentSpeakerHints: List[Dict[str, Any]] = []
|
self._recentSpeakerHints: List[Dict[str, Any]] = []
|
||||||
|
self._lastBotResponseText: Optional[str] = None
|
||||||
|
self._lastBotResponseTs: float = 0.0
|
||||||
|
|
||||||
# =========================================================================
|
# =========================================================================
|
||||||
# Session Lifecycle
|
# Session Lifecycle
|
||||||
|
|
@ -417,10 +419,16 @@ class TeamsbotService:
|
||||||
f"rms={rms}, nativeRate={nativeSampleRate}, bytes={len(audioBytes)}"
|
f"rms={rms}, nativeRate={nativeSampleRate}, bytes={len(audioBytes)}"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Detect silent/all-zeros audio early to avoid expensive STT calls
|
# Use RMS from capture diagnostics to skip real silence.
|
||||||
if len(set(audioBytes[100:min(500, len(audioBytes))])) < 3:
|
# Byte-variation heuristics produced false positives and dropped valid speech.
|
||||||
logger.debug(f"[AudioChunk] Skipping silent audio ({len(audioBytes)} bytes, low byte variation)")
|
if captureDiagnostics and captureDiagnostics.get("rms") is not None:
|
||||||
return
|
try:
|
||||||
|
rmsVal = float(captureDiagnostics.get("rms"))
|
||||||
|
if rmsVal < 0.0015:
|
||||||
|
logger.debug(f"[AudioChunk] Skipping silent audio ({len(audioBytes)} bytes, rms={rmsVal:.6f})")
|
||||||
|
return
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
if not voiceInterface:
|
if not voiceInterface:
|
||||||
logger.warning(f"[AudioChunk] No voice interface available for session {sessionId}")
|
logger.warning(f"[AudioChunk] No voice interface available for session {sessionId}")
|
||||||
|
|
@ -544,7 +552,7 @@ class TeamsbotService:
|
||||||
and self._lastTranscriptText
|
and self._lastTranscriptText
|
||||||
and self._lastTranscriptId
|
and self._lastTranscriptId
|
||||||
and text.startswith(self._lastTranscriptText)
|
and text.startswith(self._lastTranscriptText)
|
||||||
and source == "caption" # only for captions, chat messages are always new
|
and source in ("caption", "audioCapture")
|
||||||
)
|
)
|
||||||
|
|
||||||
if isContinuation:
|
if isContinuation:
|
||||||
|
|
@ -845,6 +853,25 @@ class TeamsbotService:
|
||||||
else:
|
else:
|
||||||
responseType = TeamsbotResponseType.CHAT
|
responseType = TeamsbotResponseType.CHAT
|
||||||
|
|
||||||
|
# Suppress duplicate responses in short windows ("repeat loop" protection).
|
||||||
|
normalizedResponse = (speechResult.responseText or "").strip().lower()
|
||||||
|
nowTs = time.time()
|
||||||
|
if (
|
||||||
|
normalizedResponse
|
||||||
|
and self._lastBotResponseText == normalizedResponse
|
||||||
|
and (nowTs - self._lastBotResponseTs) < 90
|
||||||
|
):
|
||||||
|
logger.info(f"Session {sessionId}: Suppressing duplicate bot response within 90s window")
|
||||||
|
await _emitSessionEvent(sessionId, "analysis", {
|
||||||
|
"shouldRespond": False,
|
||||||
|
"detectedIntent": speechResult.detectedIntent,
|
||||||
|
"reasoning": "Suppressed duplicate response within 90s",
|
||||||
|
"modelName": response.modelName,
|
||||||
|
"processingTime": response.processingTime,
|
||||||
|
"priceCHF": response.priceCHF,
|
||||||
|
})
|
||||||
|
return
|
||||||
|
|
||||||
# 4a: Voice response (TTS -> Audio to bot)
|
# 4a: Voice response (TTS -> Audio to bot)
|
||||||
if sendVoice:
|
if sendVoice:
|
||||||
try:
|
try:
|
||||||
|
|
@ -949,6 +976,7 @@ class TeamsbotService:
|
||||||
"modelName": response.modelName,
|
"modelName": response.modelName,
|
||||||
"processingTime": response.processingTime,
|
"processingTime": response.processingTime,
|
||||||
"priceCHF": response.priceCHF,
|
"priceCHF": response.priceCHF,
|
||||||
|
"timestamp": botResponseData.get("timestamp"),
|
||||||
})
|
})
|
||||||
|
|
||||||
# Update session response count
|
# Update session response count
|
||||||
|
|
@ -957,6 +985,8 @@ class TeamsbotService:
|
||||||
count = session.get("botResponseCount", 0) + 1
|
count = session.get("botResponseCount", 0) + 1
|
||||||
interface.updateSession(sessionId, {"botResponseCount": count})
|
interface.updateSession(sessionId, {"botResponseCount": count})
|
||||||
|
|
||||||
|
self._lastBotResponseText = normalizedResponse
|
||||||
|
self._lastBotResponseTs = nowTs
|
||||||
logger.info(f"Bot responded in session {sessionId}: intent={speechResult.detectedIntent}")
|
logger.info(f"Bot responded in session {sessionId}: intent={speechResult.detectedIntent}")
|
||||||
|
|
||||||
# Step 5: Execute AI-issued commands (if any)
|
# Step 5: Execute AI-issued commands (if any)
|
||||||
|
|
|
||||||
|
|
@ -366,6 +366,12 @@ ANTWORT-STIL (wenn du antwortest):
|
||||||
- NICHT frueheres wiederholen das du schon gesagt hast
|
- NICHT frueheres wiederholen das du schon gesagt hast
|
||||||
- Max 1-2 Saetze, praezise auf den Punkt
|
- Max 1-2 Saetze, praezise auf den Punkt
|
||||||
- Sieh dir an was du (markiert als [YOU]) bereits gesagt hast und wiederhole es NICHT
|
- Sieh dir an was du (markiert als [YOU]) bereits gesagt hast und wiederhole es NICHT
|
||||||
|
- KEINE reinen Absichtssaetze wie "Ich werde ...", "Ich kann ...", "Gerne ...".
|
||||||
|
Liefere direkt den eigentlichen Inhalt in der gleichen Antwort.
|
||||||
|
|
||||||
|
WENN DER USER DICH BITTET ETWAS VORZULESEN / ZUSAMMENZUFASSEN:
|
||||||
|
- Gib IMMER sofort die Zusammenfassung aus (nicht nur ankündigen).
|
||||||
|
- Falls Vorlesen gewuenscht ist, setze zusaetzlich ein "readAloud"-Kommando mit dem Text.
|
||||||
|
|
||||||
STOP-ERKENNUNG:
|
STOP-ERKENNUNG:
|
||||||
Wenn jemand dich bittet aufzuhoeren, still zu sein, zu stoppen, oder nicht mehr zu reden
|
Wenn jemand dich bittet aufzuhoeren, still zu sein, zu stoppen, oder nicht mehr zu reden
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue