Teamsbot: handle chatHistory source - store context but skip AI trigger

Made-with: Cursor
2026-02-27 13:56:46 +01:00 · 2026-02-27 13:56:46 +01:00 · 275f7bbc66
commit 275f7bbc66
parent 21f77d1924
1 changed files with 58 additions and 18 deletions
--- a/modules/features/teamsbot/service.py
+++ b/modules/features/teamsbot/service.py
@@ -271,7 +271,10 @@ class TeamsbotService:
                if msgType == "transcript":
                    transcript = message.get("transcript", {})
                    source = transcript.get("source", "caption")
-                    logger.info(f"[WS] Transcript: speaker={transcript.get('speaker')}, text={transcript.get('text', '')[:60]}...")
+                    speaker = transcript.get("speaker", "Unknown")
+                    textPreview = (transcript.get("text", "") or "")[:60]
+                    # Caption/speakerHint: name resolution only; transcript comes from STT
+                    logger.info(f"[WS] Transcript (source={source}, speaker={speaker}): {textPreview}...")
                    await self._processTranscript(
                        sessionId=sessionId,
                        speaker=transcript.get("speaker", "Unknown"),
@@ -285,7 +288,12 @@ class TeamsbotService:

                elif msgType == "chatMessage":
                    chat = message.get("chat", {})
-                    logger.info(f"[WS] Chat: speaker={chat.get('speaker')}, text={chat.get('text', '')[:60]}...")
+                    isHistory = chat.get("isHistory", False)
+                    source = "chatHistory" if isHistory else "chat"
+                    logger.info(
+                        f"[WS] Chat{'[HISTORY]' if isHistory else ''}: "
+                        f"speaker={chat.get('speaker')}, text={chat.get('text', '')[:60]}..."
+                    )
                    await self._processTranscript(
                        sessionId=sessionId,
                        speaker=chat.get("speaker", "Unknown"),
@@ -294,7 +302,7 @@ class TeamsbotService:
                        interface=interface,
                        voiceInterface=voiceInterface,
                        websocket=websocket,
-                        source="chat",
+                        source=source,
                    )

                elif msgType == "status":
@@ -466,7 +474,11 @@ class TeamsbotService:
                text = sttResult["text"].strip()
                if text:
                    resolvedSpeaker = self._resolveSpeakerForAudioCapture()
-                    logger.info(f"[AudioChunk] STT result: {text[:80]}...")
+                    fromCaption = resolvedSpeaker.get("speakerResolvedFromHint", False)
+                    logger.info(
+                        f"[AudioChunk] STT result: speaker={resolvedSpeaker.get('speaker', 'Meeting Audio')} "
+                        f"(fromCaption={fromCaption}), text={text[:80]}..."
+                    )
                    await self._processTranscript(
                        sessionId=sessionId,
                        speaker=resolvedSpeaker["speaker"],
@@ -542,24 +554,14 @@ class TeamsbotService:
            return

        # Speaker hints are lightweight caption-derived signals used for
-        # speaker attribution. We do NOT persist caption text as transcript.
-        # However, for address detection we still allow transient analysis from
+        # speaker attribution only. Caption text is NOT used as transcript
+        # (transcript comes from STT/audioCapture or chat).
+        # For address detection we still allow transient analysis from
        # speaker hints (without DB write), otherwise direct calls like
        # "Nyla, hörst du mich?" can be missed when audio capture is silent.
        if source in ("caption", "speakerHint"):
            self._registerSpeakerHint(speaker, text)
-
-            # Emit caption/speakerHint to UI so user sees who spoke (audioCapture alone shows "Meeting Audio")
-            await _emitSessionEvent(sessionId, "transcript", {
-                "id": None,
-                "speaker": speaker or "Unknown",
-                "text": text,
-                "confidence": 1.0,
-                "timestamp": getIsoTimestamp(),
-                "isContinuation": False,
-                "source": source,
-                "speakerResolvedFromHint": False,
-            })
+            # Do NOT emit caption text as transcript to UI; caption is for name resolution only.

            if (
                source == "speakerHint"
@@ -596,6 +598,44 @@ class TeamsbotService:
                    )
            return

+        # Chat history: messages sent before the bot joined the meeting.
+        # Stored in DB and context (available if someone refers to chat history)
+        # but never used to trigger AI responses.
+        if source == "chatHistory":
+            transcriptData = TeamsbotTranscript(
+                sessionId=sessionId,
+                speaker=speaker,
+                text=text,
+                timestamp=getIsoTimestamp(),
+                confidence=1.0,
+                language=self.config.language,
+                isFinal=True,
+            ).model_dump()
+            createdTranscript = interface.createTranscript(transcriptData)
+
+            self._contextBuffer.append({
+                "speaker": speaker or "Unknown",
+                "text": text,
+                "timestamp": getUtcTimestamp(),
+                "source": "chatHistory",
+            })
+            maxSegments = self.config.contextWindowSegments
+            if len(self._contextBuffer) > maxSegments:
+                self._contextBuffer = self._contextBuffer[-maxSegments:]
+
+            await _emitSessionEvent(sessionId, "transcript", {
+                "id": createdTranscript.get("id"),
+                "speaker": speaker,
+                "text": text,
+                "confidence": 1.0,
+                "timestamp": getIsoTimestamp(),
+                "isContinuation": False,
+                "source": "chatHistory",
+                "isHistory": True,
+            })
+            logger.debug(f"Session {sessionId}: Chat history stored (no AI trigger): [{speaker}] {text[:60]}")
+            return
+
        # Filter out the bot's own speech entirely — captions of the bot's
        # own voice come back as garbled text (e.g. German TTS → English caption)
        # which pollutes the context buffer and confuses AI analysis.