Teamsbot: handle chatHistory source - store context but skip AI trigger

Made-with: Cursor
This commit is contained in:
patrick-motsch 2026-02-27 13:56:46 +01:00
parent 21f77d1924
commit 275f7bbc66

View file

@ -271,7 +271,10 @@ class TeamsbotService:
if msgType == "transcript":
transcript = message.get("transcript", {})
source = transcript.get("source", "caption")
logger.info(f"[WS] Transcript: speaker={transcript.get('speaker')}, text={transcript.get('text', '')[:60]}...")
speaker = transcript.get("speaker", "Unknown")
textPreview = (transcript.get("text", "") or "")[:60]
# Caption/speakerHint: name resolution only; transcript comes from STT
logger.info(f"[WS] Transcript (source={source}, speaker={speaker}): {textPreview}...")
await self._processTranscript(
sessionId=sessionId,
speaker=transcript.get("speaker", "Unknown"),
@ -285,7 +288,12 @@ class TeamsbotService:
elif msgType == "chatMessage":
chat = message.get("chat", {})
logger.info(f"[WS] Chat: speaker={chat.get('speaker')}, text={chat.get('text', '')[:60]}...")
isHistory = chat.get("isHistory", False)
source = "chatHistory" if isHistory else "chat"
logger.info(
f"[WS] Chat{'[HISTORY]' if isHistory else ''}: "
f"speaker={chat.get('speaker')}, text={chat.get('text', '')[:60]}..."
)
await self._processTranscript(
sessionId=sessionId,
speaker=chat.get("speaker", "Unknown"),
@ -294,7 +302,7 @@ class TeamsbotService:
interface=interface,
voiceInterface=voiceInterface,
websocket=websocket,
source="chat",
source=source,
)
elif msgType == "status":
@ -466,7 +474,11 @@ class TeamsbotService:
text = sttResult["text"].strip()
if text:
resolvedSpeaker = self._resolveSpeakerForAudioCapture()
logger.info(f"[AudioChunk] STT result: {text[:80]}...")
fromCaption = resolvedSpeaker.get("speakerResolvedFromHint", False)
logger.info(
f"[AudioChunk] STT result: speaker={resolvedSpeaker.get('speaker', 'Meeting Audio')} "
f"(fromCaption={fromCaption}), text={text[:80]}..."
)
await self._processTranscript(
sessionId=sessionId,
speaker=resolvedSpeaker["speaker"],
@ -542,24 +554,14 @@ class TeamsbotService:
return
# Speaker hints are lightweight caption-derived signals used for
# speaker attribution. We do NOT persist caption text as transcript.
# However, for address detection we still allow transient analysis from
# speaker attribution only. Caption text is NOT used as transcript
# (transcript comes from STT/audioCapture or chat).
# For address detection we still allow transient analysis from
# speaker hints (without DB write), otherwise direct calls like
# "Nyla, hörst du mich?" can be missed when audio capture is silent.
if source in ("caption", "speakerHint"):
self._registerSpeakerHint(speaker, text)
# Emit caption/speakerHint to UI so user sees who spoke (audioCapture alone shows "Meeting Audio")
await _emitSessionEvent(sessionId, "transcript", {
"id": None,
"speaker": speaker or "Unknown",
"text": text,
"confidence": 1.0,
"timestamp": getIsoTimestamp(),
"isContinuation": False,
"source": source,
"speakerResolvedFromHint": False,
})
# Do NOT emit caption text as transcript to UI; caption is for name resolution only.
if (
source == "speakerHint"
@ -596,6 +598,44 @@ class TeamsbotService:
)
return
# Chat history: messages sent before the bot joined the meeting.
# Stored in DB and context (available if someone refers to chat history)
# but never used to trigger AI responses.
if source == "chatHistory":
transcriptData = TeamsbotTranscript(
sessionId=sessionId,
speaker=speaker,
text=text,
timestamp=getIsoTimestamp(),
confidence=1.0,
language=self.config.language,
isFinal=True,
).model_dump()
createdTranscript = interface.createTranscript(transcriptData)
self._contextBuffer.append({
"speaker": speaker or "Unknown",
"text": text,
"timestamp": getUtcTimestamp(),
"source": "chatHistory",
})
maxSegments = self.config.contextWindowSegments
if len(self._contextBuffer) > maxSegments:
self._contextBuffer = self._contextBuffer[-maxSegments:]
await _emitSessionEvent(sessionId, "transcript", {
"id": createdTranscript.get("id"),
"speaker": speaker,
"text": text,
"confidence": 1.0,
"timestamp": getIsoTimestamp(),
"isContinuation": False,
"source": "chatHistory",
"isHistory": True,
})
logger.debug(f"Session {sessionId}: Chat history stored (no AI trigger): [{speaker}] {text[:60]}")
return
# Filter out the bot's own speech entirely — captions of the bot's
# own voice come back as garbled text (e.g. German TTS → English caption)
# which pollutes the context buffer and confuses AI analysis.