From 681c96be8eb022a4a43ce9e785220180e4990251 Mon Sep 17 00:00:00 2001
From: patrick-motsch
Date: Fri, 27 Feb 2026 16:40:08 +0100
Subject: [PATCH] Teamsbot: prevent double AI triggers, remove caption text from AI context

Made-with: Cursor
---
 modules/features/teamsbot/service.py | 34 ++++++++++++++--------------------
 1 file changed, 14 insertions(+), 20 deletions(-)

diff --git a/modules/features/teamsbot/service.py b/modules/features/teamsbot/service.py
index 2408f291..01f84ae7 100644
--- a/modules/features/teamsbot/service.py
+++ b/modules/features/teamsbot/service.py
@@ -78,6 +78,7 @@ class TeamsbotService:
 
         # State
         self._lastAiCallTime: float = 0.0
+        self._aiAnalysisInProgress: bool = False
         self._contextBuffer: List[Dict[str, Any]] = []
         self._sessionContext: Optional[str] = None  # User-provided background context
         self._contextSummary: Optional[str] = None  # AI-generated summary of long context
@@ -553,32 +554,19 @@ class TeamsbotService:
 
         if not text:
             return
 
-        # Speaker hints are lightweight caption-derived signals used for
-        # speaker attribution only. Caption text is NOT used as transcript
-        # (transcript comes from STT/audioCapture or chat).
-        # For address detection we still allow transient analysis from
-        # speaker hints (without DB write), otherwise direct calls like
-        # "Nyla, hörst du mich?" can be missed when audio capture is silent.
+        # Captions are used ONLY for speaker name resolution (never as transcript).
+        # Transcript text comes exclusively from audio STT or chat.
+        # Address detection (bot name in caption) still triggers AI analysis
+        # using existing audio-based context — but caption text itself is NOT
+        # added to the context buffer.
         if source in ("caption", "speakerHint"):
             self._registerSpeakerHint(speaker, text)
-            # Do NOT emit caption text as transcript to UI; caption is for name resolution only.
             if (
                 source == "speakerHint"
                 and isFinal
                 and self.config.responseMode != TeamsbotResponseMode.TRANSCRIBE_ONLY
             ):
-                # Keep hint text only in volatile context (not persisted).
-                self._contextBuffer.append({
-                    "speaker": speaker or "Unknown",
-                    "text": text,
-                    "timestamp": getUtcTimestamp(),
-                    "source": "speakerHint",
-                })
-                maxSegments = self.config.contextWindowSegments
-                if len(self._contextBuffer) > maxSegments:
-                    self._contextBuffer = self._contextBuffer[-maxSegments:]
-
                 shouldTriggerFromHint = self._shouldTriggerAnalysis(text, allowPeriodic=False)
                 logger.debug(
                     f"Session {sessionId}: speakerHint shouldTriggerAnalysis={shouldTriggerFromHint}, "
@@ -586,8 +574,8 @@ class TeamsbotService:
                 )
                 if shouldTriggerFromHint:
                     logger.info(
-                        f"Session {sessionId}: Triggering AI analysis from speakerHint "
-                        f"(buffer: {len(self._contextBuffer)} segments)"
+                        f"Session {sessionId}: Triggering AI analysis from speakerHint address detection "
+                        f"(buffer: {len(self._contextBuffer)} segments, caption text NOT in buffer)"
                     )
                     await self._analyzeAndRespond(
                         sessionId,
@@ -832,6 +820,10 @@ class TeamsbotService:
         triggerTranscript: Dict[str, Any],
     ):
         """Run SPEECH_TEAMS AI analysis and respond if needed."""
+        if self._aiAnalysisInProgress:
+            logger.info(f"Session {sessionId}: AI analysis already in progress, skipping duplicate trigger")
+            return
+        self._aiAnalysisInProgress = True
         self._lastAiCallTime = time.time()
 
         # Build transcript context from buffer.
@@ -1101,6 +1093,8 @@ class TeamsbotService:
         except Exception as e:
             logger.error(f"SPEECH_TEAMS analysis failed for session {sessionId}: {type(e).__name__}: {e}", exc_info=True)
             await _emitSessionEvent(sessionId, "error", {"message": f"AI analysis failed: {type(e).__name__}: {str(e)}"})
+        finally:
+            self._aiAnalysisInProgress = False
 
     # =========================================================================
     # AI Command Execution
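
Note on the duplicate-trigger guard: below is a minimal, self-contained sketch of the pattern this patch introduces, for illustration only. The _aiAnalysisInProgress flag and the try/finally reset mirror the change above; DemoService, the simplified analyzeAndRespond signature, and the asyncio.sleep stand-in for the AI call are hypothetical.

import asyncio


class DemoService:
    """Hypothetical stand-in for TeamsbotService; only the guard pattern is taken from the patch."""

    def __init__(self) -> None:
        self._aiAnalysisInProgress: bool = False  # same flag as added in __init__ above

    async def analyzeAndRespond(self, sessionId: str) -> None:
        # Duplicate-trigger guard: skip if an analysis is already running.
        if self._aiAnalysisInProgress:
            print(f"Session {sessionId}: AI analysis already in progress, skipping duplicate trigger")
            return
        self._aiAnalysisInProgress = True
        try:
            await asyncio.sleep(0.5)  # placeholder for the real AI analysis call
            print(f"Session {sessionId}: analysis completed")
        finally:
            # Always clear the flag (mirrors the new finally: block), even on errors,
            # so a failed analysis does not block all future triggers.
            self._aiAnalysisInProgress = False


async def main() -> None:
    service = DemoService()
    # Two overlapping triggers for the same session: only the first one runs.
    await asyncio.gather(
        service.analyzeAndRespond("demo-session"),
        service.analyzeAndRespond("demo-session"),
    )


if __name__ == "__main__":
    asyncio.run(main())

Because the flag is set before the first await, a second trigger that arrives while an analysis is still in flight returns immediately, and the finally block guarantees the flag is cleared even when the analysis raises, so the error path in the last hunk cannot leave the service permanently locked.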