From 681c96be8eb022a4a43ce9e785220180e4990251 Mon Sep 17 00:00:00 2001
From: patrick-motsch
Date: Fri, 27 Feb 2026 16:40:08 +0100
Subject: [PATCH] Teamsbot: prevent double AI triggers, remove caption text from AI context

Made-with: Cursor
---
 modules/features/teamsbot/service.py | 34 ++++++++++++++--------------------
 1 file changed, 14 insertions(+), 20 deletions(-)

diff --git a/modules/features/teamsbot/service.py b/modules/features/teamsbot/service.py
index 2408f291..01f84ae7 100644
--- a/modules/features/teamsbot/service.py
+++ b/modules/features/teamsbot/service.py
@@ -78,6 +78,7 @@ class TeamsbotService:
 
         # State
         self._lastAiCallTime: float = 0.0
+        self._aiAnalysisInProgress: bool = False
         self._contextBuffer: List[Dict[str, Any]] = []
         self._sessionContext: Optional[str] = None  # User-provided background context
         self._contextSummary: Optional[str] = None  # AI-generated summary of long context
@@ -553,32 +554,19 @@ class TeamsbotService:
 
         if not text:
             return
 
-        # Speaker hints are lightweight caption-derived signals used for
-        # speaker attribution only. Caption text is NOT used as transcript
-        # (transcript comes from STT/audioCapture or chat).
-        # For address detection we still allow transient analysis from
-        # speaker hints (without DB write), otherwise direct calls like
-        # "Nyla, hörst du mich?" can be missed when audio capture is silent.
+        # Captions are used ONLY for speaker name resolution (never as transcript).
+        # Transcript text comes exclusively from audio STT or chat.
+        # Address detection (bot name in caption) still triggers AI analysis
+        # using existing audio-based context — but caption text itself is NOT
+        # added to the context buffer.
         if source in ("caption", "speakerHint"):
             self._registerSpeakerHint(speaker, text)
-            # Do NOT emit caption text as transcript to UI; caption is for name resolution only.
             if (
                 source == "speakerHint"
                 and isFinal
                 and self.config.responseMode != TeamsbotResponseMode.TRANSCRIBE_ONLY
             ):
-                # Keep hint text only in volatile context (not persisted).
-                self._contextBuffer.append({
-                    "speaker": speaker or "Unknown",
-                    "text": text,
-                    "timestamp": getUtcTimestamp(),
-                    "source": "speakerHint",
-                })
-                maxSegments = self.config.contextWindowSegments
-                if len(self._contextBuffer) > maxSegments:
-                    self._contextBuffer = self._contextBuffer[-maxSegments:]
-
                 shouldTriggerFromHint = self._shouldTriggerAnalysis(text, allowPeriodic=False)
                 logger.debug(
                     f"Session {sessionId}: speakerHint shouldTriggerAnalysis={shouldTriggerFromHint}, "
@@ -586,8 +574,8 @@ class TeamsbotService:
                 )
                 if shouldTriggerFromHint:
                     logger.info(
-                        f"Session {sessionId}: Triggering AI analysis from speakerHint "
-                        f"(buffer: {len(self._contextBuffer)} segments)"
+                        f"Session {sessionId}: Triggering AI analysis from speakerHint address detection "
+                        f"(buffer: {len(self._contextBuffer)} segments, caption text NOT in buffer)"
                     )
                     await self._analyzeAndRespond(
                         sessionId,
@@ -832,6 +820,10 @@ class TeamsbotService:
         triggerTranscript: Dict[str, Any],
     ):
         """Run SPEECH_TEAMS AI analysis and respond if needed."""
+        if self._aiAnalysisInProgress:
+            logger.info(f"Session {sessionId}: AI analysis already in progress, skipping duplicate trigger")
+            return
+        self._aiAnalysisInProgress = True
         self._lastAiCallTime = time.time()
 
         # Build transcript context from buffer.
@@ -1101,6 +1093,8 @@ class TeamsbotService:
         except Exception as e:
             logger.error(f"SPEECH_TEAMS analysis failed for session {sessionId}: {type(e).__name__}: {e}", exc_info=True)
             await _emitSessionEvent(sessionId, "error", {"message": f"AI analysis failed: {type(e).__name__}: {str(e)}"})
+        finally:
+            self._aiAnalysisInProgress = False
 
     # =========================================================================
     # AI Command Execution
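
Note on the duplicate-trigger guard: below is a minimal, self-contained sketch of the pattern this patch introduces, for illustration only. The _aiAnalysisInProgress flag and the try/finally reset mirror the change above; DemoService, the simplified analyzeAndRespond signature, and the asyncio.sleep stand-in for the AI call are hypothetical.

import asyncio


class DemoService:
    """Hypothetical stand-in for TeamsbotService; only the guard pattern is taken from the patch."""

    def __init__(self) -> None:
        self._aiAnalysisInProgress: bool = False  # same flag as added in __init__ above

    async def analyzeAndRespond(self, sessionId: str) -> None:
        # Duplicate-trigger guard: skip if an analysis is already running.
        if self._aiAnalysisInProgress:
            print(f"Session {sessionId}: AI analysis already in progress, skipping duplicate trigger")
            return
        self._aiAnalysisInProgress = True
        try:
            await asyncio.sleep(0.5)  # placeholder for the real AI analysis call
            print(f"Session {sessionId}: analysis completed")
        finally:
            # Always clear the flag (mirrors the new finally: block), even on errors,
            # so a failed analysis does not block all future triggers.
            self._aiAnalysisInProgress = False


async def main() -> None:
    service = DemoService()
    # Two overlapping triggers for the same session: only the first one runs.
    await asyncio.gather(
        service.analyzeAndRespond("demo-session"),
        service.analyzeAndRespond("demo-session"),
    )


if __name__ == "__main__":
    asyncio.run(main())

Because the flag is set before the first await, a second trigger that arrives while an analysis is still in flight returns immediately, and the finally block guarantees the flag is cleared even when the analysis raises, so the error path in the last hunk cannot leave the service permanently locked.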