diff --git a/modules/features/teamsbot/datamodelTeamsbot.py b/modules/features/teamsbot/datamodelTeamsbot.py index 0d058db3..14579722 100644 --- a/modules/features/teamsbot/datamodelTeamsbot.py +++ b/modules/features/teamsbot/datamodelTeamsbot.py @@ -246,6 +246,9 @@ class SpeechTeamsResponse(BaseModel): """Structured response from the SPEECH_TEAMS AI handler.""" shouldRespond: bool = Field(description="Whether the bot should respond") responseText: Optional[str] = Field(default=None, description="The bot's response text (only if shouldRespond=True)") + responseTextForVoice: Optional[str] = Field(default=None, description="Text for voice/TTS only; if absent, use responseText") + responseTextForChat: Optional[str] = Field(default=None, description="Text for chat only; if absent, use responseText") + responseChannels: Optional[List[str]] = Field(default=None, description="Per-request channels: ['voice'], ['chat'], or ['voice','chat']; overrides config if set") reasoning: str = Field(default="", description="Reasoning for the decision (for logging/debug)") detectedIntent: str = Field(default="none", description="Detected intent: addressed, question, proactive, stop, none") commands: Optional[List[TeamsbotCommand]] = Field(default=None, description="Optional list of commands to execute (e.g. toggle transcript, send chat, change language)") diff --git a/modules/features/teamsbot/service.py b/modules/features/teamsbot/service.py index a127dff7..8ca4b595 100644 --- a/modules/features/teamsbot/service.py +++ b/modules/features/teamsbot/service.py @@ -986,14 +986,19 @@ class TeamsbotService: }) return - # Determine response channel (voice, chat, or both) - # Extract the raw value: enum.value gives "voice", str(enum) gives "TeamsbotResponseChannel.voice" - channelRaw = self.config.responseChannel - channelStr = (channelRaw.value if hasattr(channelRaw, 'value') else str(channelRaw)).lower().strip() - logger.info(f"Response channel: '{channelStr}' (raw={channelRaw!r})") - - sendVoice = channelStr in ("voice", "both") - sendChat = channelStr in ("chat", "both") + # Determine response channel: per-request (AI) overrides config + channels = speechResult.responseChannels + if channels and isinstance(channels, list): + channelStr = ",".join(str(c).lower().strip() for c in channels) + sendVoice = "voice" in channelStr + sendChat = "chat" in channelStr + logger.info(f"Response channel (from AI): voice={sendVoice}, chat={sendChat}") + else: + channelRaw = self.config.responseChannel + channelStr = (channelRaw.value if hasattr(channelRaw, 'value') else str(channelRaw)).lower().strip() + sendVoice = channelStr in ("voice", "both") + sendChat = channelStr in ("chat", "both") + logger.info(f"Response channel (from config): '{channelStr}'") if sendVoice and sendChat: responseType = TeamsbotResponseType.BOTH @@ -1003,7 +1008,13 @@ class TeamsbotService: responseType = TeamsbotResponseType.CHAT # Suppress duplicate responses in short windows ("repeat loop" protection). - normalizedResponse = (speechResult.responseText or "").strip().lower() + canonicalText = ( + speechResult.responseText + or speechResult.responseTextForVoice + or speechResult.responseTextForChat + or "" + ) + normalizedResponse = (canonicalText or "").strip().lower() nowTs = time.time() if ( normalizedResponse @@ -1021,8 +1032,13 @@ class TeamsbotService: }) return + # Resolve text per channel (AI can send different content to voice vs chat) + textForVoice = speechResult.responseTextForVoice or speechResult.responseText + textForChat = speechResult.responseTextForChat or speechResult.responseText + storedText = textForChat or textForVoice or speechResult.responseText + # 4a: Voice response (TTS -> Audio to bot) - if sendVoice: + if sendVoice and textForVoice: try: await _emitSessionEvent(sessionId, "ttsDeliveryStatus", { "status": "requested", @@ -1034,7 +1050,7 @@ class TeamsbotService: f"Session {sessionId}: TTS requested (websocket_available={websocket is not None})" ) ttsResult = await voiceInterface.textToSpeech( - text=speechResult.responseText, + text=textForVoice, languageCode=self.config.language, voiceName=self.config.voiceId ) @@ -1087,13 +1103,13 @@ class TeamsbotService: sendChat = True # Fallback to chat if voice-only and TTS failed # 4b: Chat response (send text message to meeting chat) - if sendChat: + if sendChat and textForChat: try: if websocket: await websocket.send_text(json.dumps({ "type": "sendChatMessage", "sessionId": sessionId, - "text": speechResult.responseText, + "text": textForChat, })) logger.info(f"Chat response sent for session {sessionId}") except Exception as chatErr: @@ -1102,7 +1118,7 @@ class TeamsbotService: # 4b: Store bot response botResponseData = TeamsbotBotResponse( sessionId=sessionId, - responseText=speechResult.responseText, + responseText=storedText, responseType=responseType, detectedIntent=speechResult.detectedIntent, reasoning=speechResult.reasoning, @@ -1118,7 +1134,7 @@ class TeamsbotService: # 4c: Emit SSE event await _emitSessionEvent(sessionId, "botResponse", { "id": createdResponse.get("id"), - "responseText": speechResult.responseText, + "responseText": storedText, "responseType": responseType.value, "detectedIntent": speechResult.detectedIntent, "reasoning": speechResult.reasoning, @@ -1141,7 +1157,7 @@ class TeamsbotService: botTranscriptData = TeamsbotTranscript( sessionId=sessionId, speaker=self.config.botName, - text=speechResult.responseText, + text=storedText, timestamp=getIsoTimestamp(), confidence=1.0, language=self.config.language, @@ -1151,7 +1167,7 @@ class TeamsbotService: self._contextBuffer.append({ "speaker": self.config.botName, - "text": speechResult.responseText, + "text": storedText, "timestamp": getUtcTimestamp(), "source": "botResponse", }) @@ -1159,7 +1175,7 @@ class TeamsbotService: await _emitSessionEvent(sessionId, "transcript", { "id": botTranscript.get("id"), "speaker": self.config.botName, - "text": speechResult.responseText, + "text": storedText, "confidence": 1.0, "timestamp": getIsoTimestamp(), "isContinuation": False, @@ -1169,7 +1185,7 @@ class TeamsbotService: # Reset differential writing tracker so next STT creates a new block self._lastTranscriptSpeaker = self.config.botName - self._lastTranscriptText = speechResult.responseText + self._lastTranscriptText = storedText self._lastTranscriptId = botTranscript.get("id") self._followUpWindowEnd = time.time() + 15.0 diff --git a/modules/services/serviceAi/mainServiceAi.py b/modules/services/serviceAi/mainServiceAi.py index 1f7da68b..e1f6da11 100644 --- a/modules/services/serviceAi/mainServiceAi.py +++ b/modules/services/serviceAi/mainServiceAi.py @@ -373,6 +373,18 @@ WENN DER USER DICH BITTET ETWAS VORZULESEN / ZUSAMMENZUFASSEN: - Gib IMMER sofort die Zusammenfassung aus (nicht nur ankündigen). - Falls Vorlesen gewuenscht ist, setze zusaetzlich ein "readAloud"-Kommando mit dem Text. +KANAL-AUSWAHL (Voice vs Chat) - Je nach Anfrage unterschiedlich antworten: +- Du kannst pro Anfrage festlegen, ob deine Antwort per Voice (TTS), per Chat, oder beides erfolgt. +- Wenn jemand sagt "schreib das in den Chat", "schreib die Zusammenfassung in den Chat", "poste das im Chat": + - responseChannels: ["voice", "chat"] + - responseTextForVoice: Kurze Bestaetigung (z.B. "Ich schreibe die Zusammenfassung jetzt in den Chat") + - responseTextForChat: Der eigentliche Inhalt (z.B. die vollstaendige Zusammenfassung) +- Wenn jemand sagt "sag mir das", "lies das vor", "sprich das aus": + - responseChannels: ["voice"] oder ["voice","chat"] je nach Kontext + - responseTextForVoice: Der zu sprechende Text +- Wenn jemand sagt "nur im Chat", "schreib nur": responseChannels: ["chat"] +- Wenn keine Kanal-Praeferenz erkennbar: responseChannels weglassen (Config entscheidet), responseText verwenden. + STOP-ERKENNUNG: Wenn jemand dich bittet aufzuhoeren, still zu sein, zu stoppen, oder nicht mehr zu reden (in JEDER Sprache, z.B. "{botFirstName} stop", "{botFirstName} sei still", "{botFirstName} halt", "{botFirstName} be quiet", @@ -397,7 +409,10 @@ Verwende Kommandos NUR wenn explizit darum gebeten wird (z.B. "schalte die Trans WICHTIG: Antworte IMMER als valides JSON in exakt diesem Format: {{ "shouldRespond": true/false, - "responseText": "Deine Antwort hier" oder null, + "responseText": "Deine Antwort hier" oder null (Standard fuer beide Kanäle), + "responseTextForVoice": optional - Text nur fuer TTS/Voice (z.B. kurze Bestaetigung), + "responseTextForChat": optional - Text nur fuer Chat (z.B. lange Zusammenfassung), + "responseChannels": optional - ["voice"], ["chat"] oder ["voice","chat"] je nach User-Anfrage, "reasoning": "Kurze Begruendung deiner Entscheidung", "detectedIntent": "addressed" | "question" | "proactive" | "stop" | "none", "commands": [] oder null