From 1f529568f5a46025055c1f9d55a8fe2740a73580 Mon Sep 17 00:00:00 2001
From: patrick-motsch
Date: Sat, 28 Feb 2026 16:12:57 +0100
Subject: [PATCH] feat(teamsbot): per-request chat/voice channels,
responseTextForVoice/Chat
Made-with: Cursor
---
.../features/teamsbot/datamodelTeamsbot.py | 3 ++
modules/features/teamsbot/service.py | 54 ++++++++++++-------
modules/services/serviceAi/mainServiceAi.py | 17 +++++-
3 files changed, 54 insertions(+), 20 deletions(-)
diff --git a/modules/features/teamsbot/datamodelTeamsbot.py b/modules/features/teamsbot/datamodelTeamsbot.py
index 0d058db3..14579722 100644
--- a/modules/features/teamsbot/datamodelTeamsbot.py
+++ b/modules/features/teamsbot/datamodelTeamsbot.py
@@ -246,6 +246,9 @@ class SpeechTeamsResponse(BaseModel):
"""Structured response from the SPEECH_TEAMS AI handler."""
shouldRespond: bool = Field(description="Whether the bot should respond")
responseText: Optional[str] = Field(default=None, description="The bot's response text (only if shouldRespond=True)")
+ responseTextForVoice: Optional[str] = Field(default=None, description="Text for voice/TTS only; if absent, use responseText")
+ responseTextForChat: Optional[str] = Field(default=None, description="Text for chat only; if absent, use responseText")
+ responseChannels: Optional[List[str]] = Field(default=None, description="Per-request channels: ['voice'], ['chat'], or ['voice','chat']; overrides config if set")
reasoning: str = Field(default="", description="Reasoning for the decision (for logging/debug)")
detectedIntent: str = Field(default="none", description="Detected intent: addressed, question, proactive, stop, none")
commands: Optional[List[TeamsbotCommand]] = Field(default=None, description="Optional list of commands to execute (e.g. toggle transcript, send chat, change language)")
diff --git a/modules/features/teamsbot/service.py b/modules/features/teamsbot/service.py
index a127dff7..8ca4b595 100644
--- a/modules/features/teamsbot/service.py
+++ b/modules/features/teamsbot/service.py
@@ -986,14 +986,19 @@ class TeamsbotService:
})
return
- # Determine response channel (voice, chat, or both)
- # Extract the raw value: enum.value gives "voice", str(enum) gives "TeamsbotResponseChannel.voice"
- channelRaw = self.config.responseChannel
- channelStr = (channelRaw.value if hasattr(channelRaw, 'value') else str(channelRaw)).lower().strip()
- logger.info(f"Response channel: '{channelStr}' (raw={channelRaw!r})")
-
- sendVoice = channelStr in ("voice", "both")
- sendChat = channelStr in ("chat", "both")
+ # Determine response channel: per-request (AI) overrides config
+ channels = speechResult.responseChannels
+ if channels and isinstance(channels, list):
+ channelStr = ",".join(str(c).lower().strip() for c in channels)
+ sendVoice = "voice" in channelStr
+ sendChat = "chat" in channelStr
+ logger.info(f"Response channel (from AI): voice={sendVoice}, chat={sendChat}")
+ else:
+ channelRaw = self.config.responseChannel
+ channelStr = (channelRaw.value if hasattr(channelRaw, 'value') else str(channelRaw)).lower().strip()
+ sendVoice = channelStr in ("voice", "both")
+ sendChat = channelStr in ("chat", "both")
+ logger.info(f"Response channel (from config): '{channelStr}'")
if sendVoice and sendChat:
responseType = TeamsbotResponseType.BOTH
@@ -1003,7 +1008,13 @@ class TeamsbotService:
responseType = TeamsbotResponseType.CHAT
# Suppress duplicate responses in short windows ("repeat loop" protection).
- normalizedResponse = (speechResult.responseText or "").strip().lower()
+ canonicalText = (
+ speechResult.responseText
+ or speechResult.responseTextForVoice
+ or speechResult.responseTextForChat
+ or ""
+ )
+ normalizedResponse = (canonicalText or "").strip().lower()
nowTs = time.time()
if (
normalizedResponse
@@ -1021,8 +1032,13 @@ class TeamsbotService:
})
return
+ # Resolve text per channel (AI can send different content to voice vs chat)
+ textForVoice = speechResult.responseTextForVoice or speechResult.responseText
+ textForChat = speechResult.responseTextForChat or speechResult.responseText
+ storedText = textForChat or textForVoice or speechResult.responseText
+
# 4a: Voice response (TTS -> Audio to bot)
- if sendVoice:
+ if sendVoice and textForVoice:
try:
await _emitSessionEvent(sessionId, "ttsDeliveryStatus", {
"status": "requested",
@@ -1034,7 +1050,7 @@ class TeamsbotService:
f"Session {sessionId}: TTS requested (websocket_available={websocket is not None})"
)
ttsResult = await voiceInterface.textToSpeech(
- text=speechResult.responseText,
+ text=textForVoice,
languageCode=self.config.language,
voiceName=self.config.voiceId
)
@@ -1087,13 +1103,13 @@ class TeamsbotService:
sendChat = True # Fallback to chat if voice-only and TTS failed
# 4b: Chat response (send text message to meeting chat)
- if sendChat:
+ if sendChat and textForChat:
try:
if websocket:
await websocket.send_text(json.dumps({
"type": "sendChatMessage",
"sessionId": sessionId,
- "text": speechResult.responseText,
+ "text": textForChat,
}))
logger.info(f"Chat response sent for session {sessionId}")
except Exception as chatErr:
@@ -1102,7 +1118,7 @@ class TeamsbotService:
# 4b: Store bot response
botResponseData = TeamsbotBotResponse(
sessionId=sessionId,
- responseText=speechResult.responseText,
+ responseText=storedText,
responseType=responseType,
detectedIntent=speechResult.detectedIntent,
reasoning=speechResult.reasoning,
@@ -1118,7 +1134,7 @@ class TeamsbotService:
# 4c: Emit SSE event
await _emitSessionEvent(sessionId, "botResponse", {
"id": createdResponse.get("id"),
- "responseText": speechResult.responseText,
+ "responseText": storedText,
"responseType": responseType.value,
"detectedIntent": speechResult.detectedIntent,
"reasoning": speechResult.reasoning,
@@ -1141,7 +1157,7 @@ class TeamsbotService:
botTranscriptData = TeamsbotTranscript(
sessionId=sessionId,
speaker=self.config.botName,
- text=speechResult.responseText,
+ text=storedText,
timestamp=getIsoTimestamp(),
confidence=1.0,
language=self.config.language,
@@ -1151,7 +1167,7 @@ class TeamsbotService:
self._contextBuffer.append({
"speaker": self.config.botName,
- "text": speechResult.responseText,
+ "text": storedText,
"timestamp": getUtcTimestamp(),
"source": "botResponse",
})
@@ -1159,7 +1175,7 @@ class TeamsbotService:
await _emitSessionEvent(sessionId, "transcript", {
"id": botTranscript.get("id"),
"speaker": self.config.botName,
- "text": speechResult.responseText,
+ "text": storedText,
"confidence": 1.0,
"timestamp": getIsoTimestamp(),
"isContinuation": False,
@@ -1169,7 +1185,7 @@ class TeamsbotService:
# Reset differential writing tracker so next STT creates a new block
self._lastTranscriptSpeaker = self.config.botName
- self._lastTranscriptText = speechResult.responseText
+ self._lastTranscriptText = storedText
self._lastTranscriptId = botTranscript.get("id")
self._followUpWindowEnd = time.time() + 15.0
diff --git a/modules/services/serviceAi/mainServiceAi.py b/modules/services/serviceAi/mainServiceAi.py
index 1f7da68b..e1f6da11 100644
--- a/modules/services/serviceAi/mainServiceAi.py
+++ b/modules/services/serviceAi/mainServiceAi.py
@@ -373,6 +373,18 @@ WENN DER USER DICH BITTET ETWAS VORZULESEN / ZUSAMMENZUFASSEN:
- Gib IMMER sofort die Zusammenfassung aus (nicht nur ankündigen).
- Falls Vorlesen gewuenscht ist, setze zusaetzlich ein "readAloud"-Kommando mit dem Text.
+KANAL-AUSWAHL (Voice vs Chat) - Je nach Anfrage unterschiedlich antworten:
+- Du kannst pro Anfrage festlegen, ob deine Antwort per Voice (TTS), per Chat, oder beides erfolgt.
+- Wenn jemand sagt "schreib das in den Chat", "schreib die Zusammenfassung in den Chat", "poste das im Chat":
+ - responseChannels: ["voice", "chat"]
+ - responseTextForVoice: Kurze Bestaetigung (z.B. "Ich schreibe die Zusammenfassung jetzt in den Chat")
+ - responseTextForChat: Der eigentliche Inhalt (z.B. die vollstaendige Zusammenfassung)
+- Wenn jemand sagt "sag mir das", "lies das vor", "sprich das aus":
+ - responseChannels: ["voice"] oder ["voice","chat"] je nach Kontext
+ - responseTextForVoice: Der zu sprechende Text
+- Wenn jemand sagt "nur im Chat", "schreib nur": responseChannels: ["chat"]
+- Wenn keine Kanal-Praeferenz erkennbar: responseChannels weglassen (Config entscheidet), responseText verwenden.
+
STOP-ERKENNUNG:
Wenn jemand dich bittet aufzuhoeren, still zu sein, zu stoppen, oder nicht mehr zu reden
(in JEDER Sprache, z.B. "{botFirstName} stop", "{botFirstName} sei still", "{botFirstName} halt", "{botFirstName} be quiet",
@@ -397,7 +409,10 @@ Verwende Kommandos NUR wenn explizit darum gebeten wird (z.B. "schalte die Trans
WICHTIG: Antworte IMMER als valides JSON in exakt diesem Format:
{{
"shouldRespond": true/false,
- "responseText": "Deine Antwort hier" oder null,
+ "responseText": "Deine Antwort hier" oder null (Standard fuer beide Kanaele),
+ "responseTextForVoice": optional - Text nur fuer TTS/Voice (z.B. kurze Bestaetigung),
+ "responseTextForChat": optional - Text nur fuer Chat (z.B. lange Zusammenfassung),
+ "responseChannels": optional - ["voice"], ["chat"] oder ["voice","chat"] je nach User-Anfrage,
"reasoning": "Kurze Begruendung deiner Entscheidung",
"detectedIntent": "addressed" | "question" | "proactive" | "stop" | "none",
"commands": [] oder null