From 25854edb4dfa19023e36fca7591fc1256b07feed Mon Sep 17 00:00:00 2001
From: patrick-motsch <p.motsch@valueon.ch>
Date: Thu, 26 Feb 2026 21:41:56 +0100
Subject: [PATCH] Allow speaker-hint captions to trigger AI without transcript
 persistence.

Keep the aggressive hybrid persistence rules, but use final speaker hints as transient (non-persisted) context so the bot can react when it is addressed directly, even when audio capture is silent.

Made-with: Cursor
---
 modules/features/teamsbot/service.py | 46 +++++++++++++++++++++++++---
 1 file changed, 41 insertions(+), 5 deletions(-)

diff --git a/modules/features/teamsbot/service.py b/modules/features/teamsbot/service.py
index 4eaee284..a8469661 100644
--- a/modules/features/teamsbot/service.py
+++ b/modules/features/teamsbot/service.py
@@ -524,11 +524,47 @@ class TeamsbotService:
         if not text:
             return
 
-        # Speaker hints are lightweight caption-derived signals used only to
-        # attribute audio-stream STT to likely speakers. They are not persisted.
+        # Speaker hints are lightweight caption-derived signals used for
+        # speaker attribution. We do NOT persist caption text as transcript.
+        # However, for address detection we still allow transient analysis from
+        # speaker hints (without DB write), otherwise direct calls like
+        # "Nyla, hörst du mich?" can be missed when audio capture is silent.
         if source in ("caption", "speakerHint"):
             self._registerSpeakerHint(speaker, text)
-        if source in ("caption", "speakerHint"):
+
+            if (
+                source == "speakerHint"
+                and isFinal
+                and self.config.responseMode != TeamsbotResponseMode.TRANSCRIBE_ONLY
+            ):
+                # Keep hint text only in volatile context (not persisted).
+                self._contextBuffer.append({
+                    "speaker": speaker or "Unknown",
+                    "text": text,
+                    "timestamp": getUtcTimestamp(),
+                    "source": "speakerHint",
+                })
+                maxSegments = self.config.contextWindowSegments
+                if len(self._contextBuffer) > maxSegments:
+                    self._contextBuffer = self._contextBuffer[-maxSegments:]
+
+                shouldTriggerFromHint = self._shouldTriggerAnalysis(text, allowPeriodic=False)
+                logger.debug(
+                    f"Session {sessionId}: speakerHint shouldTriggerAnalysis={shouldTriggerFromHint}, "
+                    f"bufferSize={len(self._contextBuffer)}"
+                )
+                if shouldTriggerFromHint:
+                    logger.info(
+                        f"Session {sessionId}: Triggering AI analysis from speakerHint "
+                        f"(buffer: {len(self._contextBuffer)} segments)"
+                    )
+                    await self._analyzeAndRespond(
+                        sessionId,
+                        interface,
+                        voiceInterface,
+                        websocket,
+                        {"id": None, "speaker": speaker, "text": text, "source": source},
+                    )
             return
 
         # Filter out the bot's own speech entirely — captions of the bot's
@@ -667,7 +703,7 @@ class TeamsbotService:
         
         return False
 
-    def _shouldTriggerAnalysis(self, transcriptText: str) -> bool:
+    def _shouldTriggerAnalysis(self, transcriptText: str, allowPeriodic: bool = True) -> bool:
         """
         Decide whether to trigger AI analysis based on the latest transcript.
         Triggers:
@@ -711,7 +747,7 @@ class TeamsbotService:
             return False
 
         # Periodic trigger
-        if timeSinceLastCall >= self.config.triggerIntervalSeconds:
+        if allowPeriodic and timeSinceLastCall >= self.config.triggerIntervalSeconds:
             logger.info(f"Trigger: Periodic interval ({self.config.triggerIntervalSeconds}s) elapsed ({timeSinceLastCall:.1f}s since last call)")
             return True