From 1f529568f5a46025055c1f9d55a8fe2740a73580 Mon Sep 17 00:00:00 2001
From: patrick-motsch
Date: Sat, 28 Feb 2026 16:12:57 +0100
Subject: [PATCH] feat(teamsbot): per-request chat/voice channels,
responseTextForVoice/Chat
Made-with: Cursor
---
.../features/teamsbot/datamodelTeamsbot.py | 3 ++
modules/features/teamsbot/service.py | 54 ++++++++++++-------
modules/services/serviceAi/mainServiceAi.py | 17 +++++-
3 files changed, 54 insertions(+), 20 deletions(-)
diff --git a/modules/features/teamsbot/datamodelTeamsbot.py b/modules/features/teamsbot/datamodelTeamsbot.py
index 0d058db3..14579722 100644
--- a/modules/features/teamsbot/datamodelTeamsbot.py
+++ b/modules/features/teamsbot/datamodelTeamsbot.py
@@ -246,6 +246,9 @@ class SpeechTeamsResponse(BaseModel):
"""Structured response from the SPEECH_TEAMS AI handler."""
shouldRespond: bool = Field(description="Whether the bot should respond")
responseText: Optional[str] = Field(default=None, description="The bot's response text (only if shouldRespond=True)")
+ responseTextForVoice: Optional[str] = Field(default=None, description="Text for voice/TTS only; if absent, use responseText")
+ responseTextForChat: Optional[str] = Field(default=None, description="Text for chat only; if absent, use responseText")
+ responseChannels: Optional[List[str]] = Field(default=None, description="Per-request channels: ['voice'], ['chat'], or ['voice','chat']; overrides config if set")
reasoning: str = Field(default="", description="Reasoning for the decision (for logging/debug)")
detectedIntent: str = Field(default="none", description="Detected intent: addressed, question, proactive, stop, none")
commands: Optional[List[TeamsbotCommand]] = Field(default=None, description="Optional list of commands to execute (e.g. toggle transcript, send chat, change language)")
diff --git a/modules/features/teamsbot/service.py b/modules/features/teamsbot/service.py
index a127dff7..8ca4b595 100644
--- a/modules/features/teamsbot/service.py
+++ b/modules/features/teamsbot/service.py
@@ -986,14 +986,19 @@ class TeamsbotService:
})
return
- # Determine response channel (voice, chat, or both)
- # Extract the raw value: enum.value gives "voice", str(enum) gives "TeamsbotResponseChannel.voice"
- channelRaw = self.config.responseChannel
- channelStr = (channelRaw.value if hasattr(channelRaw, 'value') else str(channelRaw)).lower().strip()
- logger.info(f"Response channel: '{channelStr}' (raw={channelRaw!r})")
-
- sendVoice = channelStr in ("voice", "both")
- sendChat = channelStr in ("chat", "both")
+ # Determine response channel: per-request (AI) overrides config
+ channels = speechResult.responseChannels
+ if channels and isinstance(channels, list):
+ channelStr = ",".join(str(c).lower().strip() for c in channels)
+ sendVoice = "voice" in channelStr
+ sendChat = "chat" in channelStr
+ logger.info(f"Response channel (from AI): voice={sendVoice}, chat={sendChat}")
+ else:
+ channelRaw = self.config.responseChannel
+ channelStr = (channelRaw.value if hasattr(channelRaw, 'value') else str(channelRaw)).lower().strip()
+ sendVoice = channelStr in ("voice", "both")
+ sendChat = channelStr in ("chat", "both")
+ logger.info(f"Response channel (from config): '{channelStr}'")
if sendVoice and sendChat:
responseType = TeamsbotResponseType.BOTH
@@ -1003,7 +1008,13 @@ class TeamsbotService:
responseType = TeamsbotResponseType.CHAT
# Suppress duplicate responses in short windows ("repeat loop" protection).
- normalizedResponse = (speechResult.responseText or "").strip().lower()
+ canonicalText = (
+ speechResult.responseText
+ or speechResult.responseTextForVoice
+ or speechResult.responseTextForChat
+ or ""
+ )
+ normalizedResponse = (canonicalText or "").strip().lower()
nowTs = time.time()
if (
normalizedResponse
@@ -1021,8 +1032,13 @@ class TeamsbotService:
})
return
+ # Resolve text per channel (AI can send different content to voice vs chat)
+ textForVoice = speechResult.responseTextForVoice or speechResult.responseText
+ textForChat = speechResult.responseTextForChat or speechResult.responseText
+ storedText = textForChat or textForVoice or speechResult.responseText
+
# 4a: Voice response (TTS -> Audio to bot)
- if sendVoice:
+ if sendVoice and textForVoice:
try:
await _emitSessionEvent(sessionId, "ttsDeliveryStatus", {
"status": "requested",
@@ -1034,7 +1050,7 @@ class TeamsbotService:
f"Session {sessionId}: TTS requested (websocket_available={websocket is not None})"
)
ttsResult = await voiceInterface.textToSpeech(
- text=speechResult.responseText,
+ text=textForVoice,
languageCode=self.config.language,
voiceName=self.config.voiceId
)
@@ -1087,13 +1103,13 @@ class TeamsbotService:
sendChat = True # Fallback to chat if voice-only and TTS failed
# 4b: Chat response (send text message to meeting chat)
- if sendChat:
+ if sendChat and textForChat:
try:
if websocket:
await websocket.send_text(json.dumps({
"type": "sendChatMessage",
"sessionId": sessionId,
- "text": speechResult.responseText,
+ "text": textForChat,
}))
logger.info(f"Chat response sent for session {sessionId}")
except Exception as chatErr:
@@ -1102,7 +1118,7 @@ class TeamsbotService:
# 4b: Store bot response
botResponseData = TeamsbotBotResponse(
sessionId=sessionId,
- responseText=speechResult.responseText,
+ responseText=storedText,
responseType=responseType,
detectedIntent=speechResult.detectedIntent,
reasoning=speechResult.reasoning,
@@ -1118,7 +1134,7 @@ class TeamsbotService:
# 4c: Emit SSE event
await _emitSessionEvent(sessionId, "botResponse", {
"id": createdResponse.get("id"),
- "responseText": speechResult.responseText,
+ "responseText": storedText,
"responseType": responseType.value,
"detectedIntent": speechResult.detectedIntent,
"reasoning": speechResult.reasoning,
@@ -1141,7 +1157,7 @@ class TeamsbotService:
botTranscriptData = TeamsbotTranscript(
sessionId=sessionId,
speaker=self.config.botName,
- text=speechResult.responseText,
+ text=storedText,
timestamp=getIsoTimestamp(),
confidence=1.0,
language=self.config.language,
@@ -1151,7 +1167,7 @@ class TeamsbotService:
self._contextBuffer.append({
"speaker": self.config.botName,
- "text": speechResult.responseText,
+ "text": storedText,
"timestamp": getUtcTimestamp(),
"source": "botResponse",
})
@@ -1159,7 +1175,7 @@ class TeamsbotService:
await _emitSessionEvent(sessionId, "transcript", {
"id": botTranscript.get("id"),
"speaker": self.config.botName,
- "text": speechResult.responseText,
+ "text": storedText,
"confidence": 1.0,
"timestamp": getIsoTimestamp(),
"isContinuation": False,
@@ -1169,7 +1185,7 @@ class TeamsbotService:
# Reset differential writing tracker so next STT creates a new block
self._lastTranscriptSpeaker = self.config.botName
- self._lastTranscriptText = speechResult.responseText
+ self._lastTranscriptText = storedText
self._lastTranscriptId = botTranscript.get("id")
self._followUpWindowEnd = time.time() + 15.0
diff --git a/modules/services/serviceAi/mainServiceAi.py b/modules/services/serviceAi/mainServiceAi.py
index 1f7da68b..e1f6da11 100644
--- a/modules/services/serviceAi/mainServiceAi.py
+++ b/modules/services/serviceAi/mainServiceAi.py
@@ -373,6 +373,18 @@ WENN DER USER DICH BITTET ETWAS VORZULESEN / ZUSAMMENZUFASSEN:
- Gib IMMER sofort die Zusammenfassung aus (nicht nur ankündigen).
- Falls Vorlesen gewuenscht ist, setze zusaetzlich ein "readAloud"-Kommando mit dem Text.
+KANAL-AUSWAHL (Voice vs Chat) - Je nach Anfrage unterschiedlich antworten:
+- Du kannst pro Anfrage festlegen, ob deine Antwort per Voice (TTS), per Chat, oder beides erfolgt.
+- Wenn jemand sagt "schreib das in den Chat", "schreib die Zusammenfassung in den Chat", "poste das im Chat":
+ - responseChannels: ["voice", "chat"]
+ - responseTextForVoice: Kurze Bestaetigung (z.B. "Ich schreibe die Zusammenfassung jetzt in den Chat")
+ - responseTextForChat: Der eigentliche Inhalt (z.B. die vollstaendige Zusammenfassung)
+- Wenn jemand sagt "sag mir das", "lies das vor", "sprich das aus":
+ - responseChannels: ["voice"] oder ["voice","chat"] je nach Kontext
+ - responseTextForVoice: Der zu sprechende Text
+- Wenn jemand sagt "nur im Chat", "schreib nur": responseChannels: ["chat"]
+- Wenn keine Kanal-Praeferenz erkennbar: responseChannels weglassen (Config entscheidet), responseText verwenden.
+
STOP-ERKENNUNG:
Wenn jemand dich bittet aufzuhoeren, still zu sein, zu stoppen, oder nicht mehr zu reden
(in JEDER Sprache, z.B. "{botFirstName} stop", "{botFirstName} sei still", "{botFirstName} halt", "{botFirstName} be quiet",
@@ -397,7 +409,10 @@ Verwende Kommandos NUR wenn explizit darum gebeten wird (z.B. "schalte die Trans
WICHTIG: Antworte IMMER als valides JSON in exakt diesem Format:
{{
"shouldRespond": true/false,
- "responseText": "Deine Antwort hier" oder null,
+ "responseText": "Deine Antwort hier" oder null (Standard fuer beide Kanaele),
+ "responseTextForVoice": optional - Text nur fuer TTS/Voice (z.B. kurze Bestaetigung),
+ "responseTextForChat": optional - Text nur fuer Chat (z.B. lange Zusammenfassung),
+ "responseChannels": optional - ["voice"], ["chat"] oder ["voice","chat"] je nach User-Anfrage,
"reasoning": "Kurze Begruendung deiner Entscheidung",
"detectedIntent": "addressed" | "question" | "proactive" | "stop" | "none",
"commands": [] oder null