From 681c96be8eb022a4a43ce9e785220180e4990251 Mon Sep 17 00:00:00 2001
From: patrick-motsch
Date: Fri, 27 Feb 2026 16:40:08 +0100
Subject: [PATCH] Teamsbot: prevent double AI triggers, remove caption text
from AI context
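
Caption/speakerHint text is now used only for speaker name resolution and is
no longer appended to the volatile context buffer; transcript text comes
exclusively from audio STT or chat. Address detection (bot name heard in a
caption) can still trigger AI analysis, but only over the existing
audio-based context.

A new _aiAnalysisInProgress flag makes _analyzeAndRespond drop any trigger
that arrives while an analysis is already running; the flag is reset in a
finally block so a failed analysis cannot block later ones.
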
Made-with: Cursor
---
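Not part of the commit: a minimal, self-contained sketch of the single-flight
behavior the new flag gives _analyzeAndRespond. The FakeAnalyzer and
run_analysis names are illustrative stand-ins, not code from this repo.
Assuming both trigger paths run on the same asyncio event loop, a plain bool
is race-free here because the check and the set happen with no await between
them.

import asyncio


class FakeAnalyzer:
    def __init__(self) -> None:
        self._in_progress = False  # stands in for self._aiAnalysisInProgress

    async def run_analysis(self, trigger: str) -> None:
        if self._in_progress:
            # Mirrors the new guard: a duplicate trigger is dropped, not queued.
            print(f"{trigger}: skipped, analysis already in progress")
            return
        self._in_progress = True
        try:
            await asyncio.sleep(0.1)  # stands in for the SPEECH_TEAMS AI call
            print(f"{trigger}: analyzed")
        finally:
            self._in_progress = False  # always reset, even if the call fails


async def main() -> None:
    analyzer = FakeAnalyzer()
    # An audio-based trigger and a speakerHint address detection arriving
    # almost simultaneously: only the first one reaches the AI call.
    await asyncio.gather(
        analyzer.run_analysis("audio trigger"),
        analyzer.run_analysis("speakerHint trigger"),
    )


asyncio.run(main())
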
modules/features/teamsbot/service.py | 34 ++++++++++++----------------
1 file changed, 14 insertions(+), 20 deletions(-)
diff --git a/modules/features/teamsbot/service.py b/modules/features/teamsbot/service.py
index 2408f291..01f84ae7 100644
--- a/modules/features/teamsbot/service.py
+++ b/modules/features/teamsbot/service.py
@@ -78,6 +78,7 @@ class TeamsbotService:
# State
self._lastAiCallTime: float = 0.0
+ self._aiAnalysisInProgress: bool = False
self._contextBuffer: List[Dict[str, Any]] = []
self._sessionContext: Optional[str] = None # User-provided background context
self._contextSummary: Optional[str] = None # AI-generated summary of long context
@@ -553,32 +554,19 @@ class TeamsbotService:
if not text:
return
- # Speaker hints are lightweight caption-derived signals used for
- # speaker attribution only. Caption text is NOT used as transcript
- # (transcript comes from STT/audioCapture or chat).
- # For address detection we still allow transient analysis from
- # speaker hints (without DB write), otherwise direct calls like
- # "Nyla, hörst du mich?" can be missed when audio capture is silent.
+ # Captions are used ONLY for speaker name resolution (never as transcript).
+ # Transcript text comes exclusively from audio STT or chat.
+ # Address detection (bot name in caption) still triggers AI analysis
+ # using existing audio-based context — but caption text itself is NOT
+ # added to the context buffer.
if source in ("caption", "speakerHint"):
self._registerSpeakerHint(speaker, text)
- # Do NOT emit caption text as transcript to UI; caption is for name resolution only.
if (
source == "speakerHint"
and isFinal
and self.config.responseMode != TeamsbotResponseMode.TRANSCRIBE_ONLY
):
- # Keep hint text only in volatile context (not persisted).
- self._contextBuffer.append({
- "speaker": speaker or "Unknown",
- "text": text,
- "timestamp": getUtcTimestamp(),
- "source": "speakerHint",
- })
- maxSegments = self.config.contextWindowSegments
- if len(self._contextBuffer) > maxSegments:
- self._contextBuffer = self._contextBuffer[-maxSegments:]
-
shouldTriggerFromHint = self._shouldTriggerAnalysis(text, allowPeriodic=False)
logger.debug(
f"Session {sessionId}: speakerHint shouldTriggerAnalysis={shouldTriggerFromHint}, "
@@ -586,8 +574,8 @@ class TeamsbotService:
)
if shouldTriggerFromHint:
logger.info(
- f"Session {sessionId}: Triggering AI analysis from speakerHint "
- f"(buffer: {len(self._contextBuffer)} segments)"
+ f"Session {sessionId}: Triggering AI analysis from speakerHint address detection "
+ f"(buffer: {len(self._contextBuffer)} segments, caption text NOT in buffer)"
)
await self._analyzeAndRespond(
sessionId,
@@ -832,6 +820,10 @@ class TeamsbotService:
triggerTranscript: Dict[str, Any],
):
"""Run SPEECH_TEAMS AI analysis and respond if needed."""
+ if self._aiAnalysisInProgress:
+ logger.info(f"Session {sessionId}: AI analysis already in progress, skipping duplicate trigger")
+ return
+ self._aiAnalysisInProgress = True
self._lastAiCallTime = time.time()
# Build transcript context from buffer.
@@ -1101,6 +1093,8 @@ class TeamsbotService:
except Exception as e:
logger.error(f"SPEECH_TEAMS analysis failed for session {sessionId}: {type(e).__name__}: {e}", exc_info=True)
await _emitSessionEvent(sessionId, "error", {"message": f"AI analysis failed: {type(e).__name__}: {str(e)}"})
+ finally:
+ self._aiAnalysisInProgress = False
# =========================================================================
# AI Command Execution