From 275f7bbc66d452a7acaebeeaf4c1c22645c6cb8f Mon Sep 17 00:00:00 2001 From: patrick-motsch Date: Fri, 27 Feb 2026 13:56:46 +0100 Subject: [PATCH] Teamsbot: handle chatHistory source - store context but skip AI trigger Made-with: Cursor --- modules/features/teamsbot/service.py | 76 +++++++++++++++++++++------- 1 file changed, 58 insertions(+), 18 deletions(-) diff --git a/modules/features/teamsbot/service.py b/modules/features/teamsbot/service.py index bce4edb0..2408f291 100644 --- a/modules/features/teamsbot/service.py +++ b/modules/features/teamsbot/service.py @@ -271,7 +271,10 @@ class TeamsbotService: if msgType == "transcript": transcript = message.get("transcript", {}) source = transcript.get("source", "caption") - logger.info(f"[WS] Transcript: speaker={transcript.get('speaker')}, text={transcript.get('text', '')[:60]}...") + speaker = transcript.get("speaker", "Unknown") + textPreview = (transcript.get("text", "") or "")[:60] + # Caption/speakerHint: name resolution only; transcript comes from STT + logger.info(f"[WS] Transcript (source={source}, speaker={speaker}): {textPreview}...") await self._processTranscript( sessionId=sessionId, speaker=transcript.get("speaker", "Unknown"), @@ -285,7 +288,12 @@ class TeamsbotService: elif msgType == "chatMessage": chat = message.get("chat", {}) - logger.info(f"[WS] Chat: speaker={chat.get('speaker')}, text={chat.get('text', '')[:60]}...") + isHistory = chat.get("isHistory", False) + source = "chatHistory" if isHistory else "chat" + logger.info( + f"[WS] Chat{'[HISTORY]' if isHistory else ''}: " + f"speaker={chat.get('speaker')}, text={chat.get('text', '')[:60]}..." + ) await self._processTranscript( sessionId=sessionId, speaker=chat.get("speaker", "Unknown"), @@ -294,7 +302,7 @@ class TeamsbotService: interface=interface, voiceInterface=voiceInterface, websocket=websocket, - source="chat", + source=source, ) elif msgType == "status": @@ -466,7 +474,11 @@ class TeamsbotService: text = sttResult["text"].strip() if text: resolvedSpeaker = self._resolveSpeakerForAudioCapture() - logger.info(f"[AudioChunk] STT result: {text[:80]}...") + fromCaption = resolvedSpeaker.get("speakerResolvedFromHint", False) + logger.info( + f"[AudioChunk] STT result: speaker={resolvedSpeaker.get('speaker', 'Meeting Audio')} " + f"(fromCaption={fromCaption}), text={text[:80]}..." + ) await self._processTranscript( sessionId=sessionId, speaker=resolvedSpeaker["speaker"], @@ -542,24 +554,14 @@ class TeamsbotService: return # Speaker hints are lightweight caption-derived signals used for - # speaker attribution. We do NOT persist caption text as transcript. - # However, for address detection we still allow transient analysis from + # speaker attribution only. Caption text is NOT used as transcript + # (transcript comes from STT/audioCapture or chat). + # For address detection we still allow transient analysis from # speaker hints (without DB write), otherwise direct calls like # "Nyla, hörst du mich?" can be missed when audio capture is silent. if source in ("caption", "speakerHint"): self._registerSpeakerHint(speaker, text) - - # Emit caption/speakerHint to UI so user sees who spoke (audioCapture alone shows "Meeting Audio") - await _emitSessionEvent(sessionId, "transcript", { - "id": None, - "speaker": speaker or "Unknown", - "text": text, - "confidence": 1.0, - "timestamp": getIsoTimestamp(), - "isContinuation": False, - "source": source, - "speakerResolvedFromHint": False, - }) + # Do NOT emit caption text as transcript to UI; caption is for name resolution only. if ( source == "speakerHint" @@ -596,6 +598,44 @@ class TeamsbotService: ) return + # Chat history: messages sent before the bot joined the meeting. + # Stored in DB and context (available if someone refers to chat history) + # but never used to trigger AI responses. + if source == "chatHistory": + transcriptData = TeamsbotTranscript( + sessionId=sessionId, + speaker=speaker, + text=text, + timestamp=getIsoTimestamp(), + confidence=1.0, + language=self.config.language, + isFinal=True, + ).model_dump() + createdTranscript = interface.createTranscript(transcriptData) + + self._contextBuffer.append({ + "speaker": speaker or "Unknown", + "text": text, + "timestamp": getUtcTimestamp(), + "source": "chatHistory", + }) + maxSegments = self.config.contextWindowSegments + if len(self._contextBuffer) > maxSegments: + self._contextBuffer = self._contextBuffer[-maxSegments:] + + await _emitSessionEvent(sessionId, "transcript", { + "id": createdTranscript.get("id"), + "speaker": speaker, + "text": text, + "confidence": 1.0, + "timestamp": getIsoTimestamp(), + "isContinuation": False, + "source": "chatHistory", + "isHistory": True, + }) + logger.debug(f"Session {sessionId}: Chat history stored (no AI trigger): [{speaker}] {text[:60]}") + return + # Filter out the bot's own speech entirely — captions of the bot's # own voice come back as garbled text (e.g. German TTS → English caption) # which pollutes the context buffer and confuses AI analysis.