feat(teamsbot): per-request voice/chat channels (responseChannels) with separate responseTextForVoice and responseTextForChat

Made-with: Cursor
2026-02-28 16:12:57 +01:00 · 2026-02-28 16:12:57 +01:00 · 1f529568f5
commit 1f529568f5
parent 65a8026496
3 changed files with 54 additions and 20 deletions
--- a/modules/features/teamsbot/datamodelTeamsbot.py
+++ b/modules/features/teamsbot/datamodelTeamsbot.py
@ -246,6 +246,9 @@ class SpeechTeamsResponse(BaseModel):
    """Structured response from the SPEECH_TEAMS AI handler."""
    shouldRespond: bool = Field(description="Whether the bot should respond")
    responseText: Optional[str] = Field(default=None, description="The bot's response text (only if shouldRespond=True)")
+    responseTextForVoice: Optional[str] = Field(default=None, description="Text for voice/TTS only; if absent, use responseText")
+    responseTextForChat: Optional[str] = Field(default=None, description="Text for chat only; if absent, use responseText")
+    responseChannels: Optional[List[str]] = Field(default=None, description="Per-request channels: ['voice'], ['chat'], or ['voice','chat']; overrides config if set")
    reasoning: str = Field(default="", description="Reasoning for the decision (for logging/debug)")
    detectedIntent: str = Field(default="none", description="Detected intent: addressed, question, proactive, stop, none")
    commands: Optional[List[TeamsbotCommand]] = Field(default=None, description="Optional list of commands to execute (e.g. toggle transcript, send chat, change language)")
--- a/modules/features/teamsbot/service.py
+++ b/modules/features/teamsbot/service.py
@ -986,14 +986,19 @@ class TeamsbotService:
                    })
                    return

-                # Determine response channel (voice, chat, or both)
-                # Extract the raw value: enum.value gives "voice", str(enum) gives "TeamsbotResponseChannel.voice"
+                # Determine response channel: per-request (AI) overrides config
+                channels = speechResult.responseChannels
+                if channels and isinstance(channels, list):
+                    channelStr = ",".join(str(c).lower().strip() for c in channels)
+                    sendVoice = "voice" in channelStr
+                    sendChat = "chat" in channelStr
+                    logger.info(f"Response channel (from AI): voice={sendVoice}, chat={sendChat}")
+                else:
                    channelRaw = self.config.responseChannel
                    channelStr = (channelRaw.value if hasattr(channelRaw, 'value') else str(channelRaw)).lower().strip()
-                logger.info(f"Response channel: '{channelStr}' (raw={channelRaw!r})")
-                
                    sendVoice = channelStr in ("voice", "both")
                    sendChat = channelStr in ("chat", "both")
+                    logger.info(f"Response channel (from config): '{channelStr}'")
                
                if sendVoice and sendChat:
                    responseType = TeamsbotResponseType.BOTH
@ -1003,7 +1008,13 @@ class TeamsbotService:
                    responseType = TeamsbotResponseType.CHAT

                # Suppress duplicate responses in short windows ("repeat loop" protection).
-                normalizedResponse = (speechResult.responseText or "").strip().lower()
+                canonicalText = (
+                    speechResult.responseText
+                    or speechResult.responseTextForVoice
+                    or speechResult.responseTextForChat
+                    or ""
+                )
+                normalizedResponse = (canonicalText or "").strip().lower()
                nowTs = time.time()
                if (
                    normalizedResponse
@ -1021,8 +1032,13 @@ class TeamsbotService:
                    })
                    return

+                # Resolve text per channel (AI can send different content to voice vs chat)
+                textForVoice = speechResult.responseTextForVoice or speechResult.responseText
+                textForChat = speechResult.responseTextForChat or speechResult.responseText
+                storedText = textForChat or textForVoice or speechResult.responseText
+
                # 4a: Voice response (TTS -> Audio to bot)
-                if sendVoice:
+                if sendVoice and textForVoice:
                    try:
                        await _emitSessionEvent(sessionId, "ttsDeliveryStatus", {
                            "status": "requested",
@ -1034,7 +1050,7 @@ class TeamsbotService:
                            f"Session {sessionId}: TTS requested (websocket_available={websocket is not None})"
                        )
                        ttsResult = await voiceInterface.textToSpeech(
-                            text=speechResult.responseText,
+                            text=textForVoice,
                            languageCode=self.config.language,
                            voiceName=self.config.voiceId
                        )
@ -1087,13 +1103,13 @@ class TeamsbotService:
                            sendChat = True  # Fallback to chat if voice-only and TTS failed

                # 4b: Chat response (send text message to meeting chat)
-                if sendChat:
+                if sendChat and textForChat:
                    try:
                        if websocket:
                            await websocket.send_text(json.dumps({
                                "type": "sendChatMessage",
                                "sessionId": sessionId,
-                                "text": speechResult.responseText,
+                                "text": textForChat,
                            }))
                            logger.info(f"Chat response sent for session {sessionId}")
                    except Exception as chatErr:
@ -1102,7 +1118,7 @@ class TeamsbotService:
                # 4b: Store bot response
                botResponseData = TeamsbotBotResponse(
                    sessionId=sessionId,
-                    responseText=speechResult.responseText,
+                    responseText=storedText,
                    responseType=responseType,
                    detectedIntent=speechResult.detectedIntent,
                    reasoning=speechResult.reasoning,
@ -1118,7 +1134,7 @@ class TeamsbotService:
                # 4c: Emit SSE event
                await _emitSessionEvent(sessionId, "botResponse", {
                    "id": createdResponse.get("id"),
-                    "responseText": speechResult.responseText,
+                    "responseText": storedText,
                    "responseType": responseType.value,
                    "detectedIntent": speechResult.detectedIntent,
                    "reasoning": speechResult.reasoning,
@ -1141,7 +1157,7 @@ class TeamsbotService:
                botTranscriptData = TeamsbotTranscript(
                    sessionId=sessionId,
                    speaker=self.config.botName,
-                    text=speechResult.responseText,
+                    text=storedText,
                    timestamp=getIsoTimestamp(),
                    confidence=1.0,
                    language=self.config.language,
@ -1151,7 +1167,7 @@ class TeamsbotService:

                self._contextBuffer.append({
                    "speaker": self.config.botName,
-                    "text": speechResult.responseText,
+                    "text": storedText,
                    "timestamp": getUtcTimestamp(),
                    "source": "botResponse",
                })
@ -1159,7 +1175,7 @@ class TeamsbotService:
                await _emitSessionEvent(sessionId, "transcript", {
                    "id": botTranscript.get("id"),
                    "speaker": self.config.botName,
-                    "text": speechResult.responseText,
+                    "text": storedText,
                    "confidence": 1.0,
                    "timestamp": getIsoTimestamp(),
                    "isContinuation": False,
@ -1169,7 +1185,7 @@ class TeamsbotService:

                # Reset differential writing tracker so next STT creates a new block
                self._lastTranscriptSpeaker = self.config.botName
-                self._lastTranscriptText = speechResult.responseText
+                self._lastTranscriptText = storedText
                self._lastTranscriptId = botTranscript.get("id")

                self._followUpWindowEnd = time.time() + 15.0
--- a/modules/services/serviceAi/mainServiceAi.py
+++ b/modules/services/serviceAi/mainServiceAi.py
@ -373,6 +373,18 @@ WENN DER USER DICH BITTET ETWAS VORZULESEN / ZUSAMMENZUFASSEN:
 - Gib IMMER sofort die Zusammenfassung aus (nicht nur ankündigen).
 - Falls Vorlesen gewuenscht ist, setze zusaetzlich ein "readAloud"-Kommando mit dem Text.

+KANAL-AUSWAHL (Voice vs Chat) - Je nach Anfrage unterschiedlich antworten:
+- Du kannst pro Anfrage festlegen, ob deine Antwort per Voice (TTS), per Chat, oder beides erfolgt.
+- Wenn jemand sagt "schreib das in den Chat", "schreib die Zusammenfassung in den Chat", "poste das im Chat": 
+  - responseChannels: ["voice", "chat"]
+  - responseTextForVoice: Kurze Bestaetigung (z.B. "Ich schreibe die Zusammenfassung jetzt in den Chat")
+  - responseTextForChat: Der eigentliche Inhalt (z.B. die vollstaendige Zusammenfassung)
+- Wenn jemand sagt "sag mir das", "lies das vor", "sprich das aus": 
+  - responseChannels: ["voice"] oder ["voice","chat"] je nach Kontext
+  - responseTextForVoice: Der zu sprechende Text
+- Wenn jemand sagt "nur im Chat", "schreib nur": responseChannels: ["chat"]
+- Wenn keine Kanal-Praeferenz erkennbar: responseChannels weglassen (Config entscheidet), responseText verwenden.
+
 STOP-ERKENNUNG:
 Wenn jemand dich bittet aufzuhoeren, still zu sein, zu stoppen, oder nicht mehr zu reden
 (in JEDER Sprache, z.B. "{botFirstName} stop", "{botFirstName} sei still", "{botFirstName} halt", "{botFirstName} be quiet",
@ -397,7 +409,10 @@ Verwende Kommandos NUR wenn explizit darum gebeten wird (z.B. "schalte die Trans
 WICHTIG: Antworte IMMER als valides JSON in exakt diesem Format:
 {{
    "shouldRespond": true/false,
-    "responseText": "Deine Antwort hier" oder null,
+    "responseText": "Deine Antwort hier" oder null (Standard fuer beide Kanäle),
+    "responseTextForVoice": optional - Text nur fuer TTS/Voice (z.B. kurze Bestaetigung),
+    "responseTextForChat": optional - Text nur fuer Chat (z.B. lange Zusammenfassung),
+    "responseChannels": optional - ["voice"], ["chat"] oder ["voice","chat"] je nach User-Anfrage,
    "reasoning": "Kurze Begruendung deiner Entscheidung",
    "detectedIntent": "addressed" | "question" | "proactive" | "stop" | "none",
    "commands": [] oder null