feat(teamsbot): per-request chat/voice channels, responseTextForVoice/Chat

Made-with: Cursor
This commit is contained in:
patrick-motsch 2026-02-28 16:12:57 +01:00
parent 65a8026496
commit 1f529568f5
3 changed files with 54 additions and 20 deletions

View file

@ -246,6 +246,9 @@ class SpeechTeamsResponse(BaseModel):
"""Structured response from the SPEECH_TEAMS AI handler.""" """Structured response from the SPEECH_TEAMS AI handler."""
shouldRespond: bool = Field(description="Whether the bot should respond") shouldRespond: bool = Field(description="Whether the bot should respond")
responseText: Optional[str] = Field(default=None, description="The bot's response text (only if shouldRespond=True)") responseText: Optional[str] = Field(default=None, description="The bot's response text (only if shouldRespond=True)")
responseTextForVoice: Optional[str] = Field(default=None, description="Text for voice/TTS only; if absent, use responseText")
responseTextForChat: Optional[str] = Field(default=None, description="Text for chat only; if absent, use responseText")
responseChannels: Optional[List[str]] = Field(default=None, description="Per-request channels: ['voice'], ['chat'], or ['voice','chat']; overrides config if set")
reasoning: str = Field(default="", description="Reasoning for the decision (for logging/debug)") reasoning: str = Field(default="", description="Reasoning for the decision (for logging/debug)")
detectedIntent: str = Field(default="none", description="Detected intent: addressed, question, proactive, stop, none") detectedIntent: str = Field(default="none", description="Detected intent: addressed, question, proactive, stop, none")
commands: Optional[List[TeamsbotCommand]] = Field(default=None, description="Optional list of commands to execute (e.g. toggle transcript, send chat, change language)") commands: Optional[List[TeamsbotCommand]] = Field(default=None, description="Optional list of commands to execute (e.g. toggle transcript, send chat, change language)")

View file

@ -986,14 +986,19 @@ class TeamsbotService:
}) })
return return
# Determine response channel (voice, chat, or both) # Determine response channel: per-request (AI) overrides config
# Extract the raw value: enum.value gives "voice", str(enum) gives "TeamsbotResponseChannel.voice" channels = speechResult.responseChannels
channelRaw = self.config.responseChannel if channels and isinstance(channels, list):
channelStr = (channelRaw.value if hasattr(channelRaw, 'value') else str(channelRaw)).lower().strip() channelStr = ",".join(str(c).lower().strip() for c in channels)
logger.info(f"Response channel: '{channelStr}' (raw={channelRaw!r})") sendVoice = "voice" in channelStr
sendChat = "chat" in channelStr
sendVoice = channelStr in ("voice", "both") logger.info(f"Response channel (from AI): voice={sendVoice}, chat={sendChat}")
sendChat = channelStr in ("chat", "both") else:
channelRaw = self.config.responseChannel
channelStr = (channelRaw.value if hasattr(channelRaw, 'value') else str(channelRaw)).lower().strip()
sendVoice = channelStr in ("voice", "both")
sendChat = channelStr in ("chat", "both")
logger.info(f"Response channel (from config): '{channelStr}'")
if sendVoice and sendChat: if sendVoice and sendChat:
responseType = TeamsbotResponseType.BOTH responseType = TeamsbotResponseType.BOTH
@ -1003,7 +1008,13 @@ class TeamsbotService:
responseType = TeamsbotResponseType.CHAT responseType = TeamsbotResponseType.CHAT
# Suppress duplicate responses in short windows ("repeat loop" protection). # Suppress duplicate responses in short windows ("repeat loop" protection).
normalizedResponse = (speechResult.responseText or "").strip().lower() canonicalText = (
speechResult.responseText
or speechResult.responseTextForVoice
or speechResult.responseTextForChat
or ""
)
normalizedResponse = (canonicalText or "").strip().lower()
nowTs = time.time() nowTs = time.time()
if ( if (
normalizedResponse normalizedResponse
@ -1021,8 +1032,13 @@ class TeamsbotService:
}) })
return return
# Resolve text per channel (AI can send different content to voice vs chat)
textForVoice = speechResult.responseTextForVoice or speechResult.responseText
textForChat = speechResult.responseTextForChat or speechResult.responseText
storedText = textForChat or textForVoice or speechResult.responseText
# 4a: Voice response (TTS -> Audio to bot) # 4a: Voice response (TTS -> Audio to bot)
if sendVoice: if sendVoice and textForVoice:
try: try:
await _emitSessionEvent(sessionId, "ttsDeliveryStatus", { await _emitSessionEvent(sessionId, "ttsDeliveryStatus", {
"status": "requested", "status": "requested",
@ -1034,7 +1050,7 @@ class TeamsbotService:
f"Session {sessionId}: TTS requested (websocket_available={websocket is not None})" f"Session {sessionId}: TTS requested (websocket_available={websocket is not None})"
) )
ttsResult = await voiceInterface.textToSpeech( ttsResult = await voiceInterface.textToSpeech(
text=speechResult.responseText, text=textForVoice,
languageCode=self.config.language, languageCode=self.config.language,
voiceName=self.config.voiceId voiceName=self.config.voiceId
) )
@ -1087,13 +1103,13 @@ class TeamsbotService:
sendChat = True # Fallback to chat if voice-only and TTS failed sendChat = True # Fallback to chat if voice-only and TTS failed
# 4b: Chat response (send text message to meeting chat) # 4b: Chat response (send text message to meeting chat)
if sendChat: if sendChat and textForChat:
try: try:
if websocket: if websocket:
await websocket.send_text(json.dumps({ await websocket.send_text(json.dumps({
"type": "sendChatMessage", "type": "sendChatMessage",
"sessionId": sessionId, "sessionId": sessionId,
"text": speechResult.responseText, "text": textForChat,
})) }))
logger.info(f"Chat response sent for session {sessionId}") logger.info(f"Chat response sent for session {sessionId}")
except Exception as chatErr: except Exception as chatErr:
@ -1102,7 +1118,7 @@ class TeamsbotService:
# 4b: Store bot response # 4b: Store bot response
botResponseData = TeamsbotBotResponse( botResponseData = TeamsbotBotResponse(
sessionId=sessionId, sessionId=sessionId,
responseText=speechResult.responseText, responseText=storedText,
responseType=responseType, responseType=responseType,
detectedIntent=speechResult.detectedIntent, detectedIntent=speechResult.detectedIntent,
reasoning=speechResult.reasoning, reasoning=speechResult.reasoning,
@ -1118,7 +1134,7 @@ class TeamsbotService:
# 4c: Emit SSE event # 4c: Emit SSE event
await _emitSessionEvent(sessionId, "botResponse", { await _emitSessionEvent(sessionId, "botResponse", {
"id": createdResponse.get("id"), "id": createdResponse.get("id"),
"responseText": speechResult.responseText, "responseText": storedText,
"responseType": responseType.value, "responseType": responseType.value,
"detectedIntent": speechResult.detectedIntent, "detectedIntent": speechResult.detectedIntent,
"reasoning": speechResult.reasoning, "reasoning": speechResult.reasoning,
@ -1141,7 +1157,7 @@ class TeamsbotService:
botTranscriptData = TeamsbotTranscript( botTranscriptData = TeamsbotTranscript(
sessionId=sessionId, sessionId=sessionId,
speaker=self.config.botName, speaker=self.config.botName,
text=speechResult.responseText, text=storedText,
timestamp=getIsoTimestamp(), timestamp=getIsoTimestamp(),
confidence=1.0, confidence=1.0,
language=self.config.language, language=self.config.language,
@ -1151,7 +1167,7 @@ class TeamsbotService:
self._contextBuffer.append({ self._contextBuffer.append({
"speaker": self.config.botName, "speaker": self.config.botName,
"text": speechResult.responseText, "text": storedText,
"timestamp": getUtcTimestamp(), "timestamp": getUtcTimestamp(),
"source": "botResponse", "source": "botResponse",
}) })
@ -1159,7 +1175,7 @@ class TeamsbotService:
await _emitSessionEvent(sessionId, "transcript", { await _emitSessionEvent(sessionId, "transcript", {
"id": botTranscript.get("id"), "id": botTranscript.get("id"),
"speaker": self.config.botName, "speaker": self.config.botName,
"text": speechResult.responseText, "text": storedText,
"confidence": 1.0, "confidence": 1.0,
"timestamp": getIsoTimestamp(), "timestamp": getIsoTimestamp(),
"isContinuation": False, "isContinuation": False,
@ -1169,7 +1185,7 @@ class TeamsbotService:
# Reset differential writing tracker so next STT creates a new block # Reset differential writing tracker so next STT creates a new block
self._lastTranscriptSpeaker = self.config.botName self._lastTranscriptSpeaker = self.config.botName
self._lastTranscriptText = speechResult.responseText self._lastTranscriptText = storedText
self._lastTranscriptId = botTranscript.get("id") self._lastTranscriptId = botTranscript.get("id")
self._followUpWindowEnd = time.time() + 15.0 self._followUpWindowEnd = time.time() + 15.0

View file

@ -373,6 +373,18 @@ WENN DER USER DICH BITTET ETWAS VORZULESEN / ZUSAMMENZUFASSEN:
- Gib IMMER sofort die Zusammenfassung aus (nicht nur ankündigen). - Gib IMMER sofort die Zusammenfassung aus (nicht nur ankündigen).
- Falls Vorlesen gewuenscht ist, setze zusaetzlich ein "readAloud"-Kommando mit dem Text. - Falls Vorlesen gewuenscht ist, setze zusaetzlich ein "readAloud"-Kommando mit dem Text.
KANAL-AUSWAHL (Voice vs Chat) - Je nach Anfrage unterschiedlich antworten:
- Du kannst pro Anfrage festlegen, ob deine Antwort per Voice (TTS), per Chat, oder beides erfolgt.
- Wenn jemand sagt "schreib das in den Chat", "schreib die Zusammenfassung in den Chat", "poste das im Chat":
- responseChannels: ["voice", "chat"]
- responseTextForVoice: Kurze Bestaetigung (z.B. "Ich schreibe die Zusammenfassung jetzt in den Chat")
- responseTextForChat: Der eigentliche Inhalt (z.B. die vollstaendige Zusammenfassung)
- Wenn jemand sagt "sag mir das", "lies das vor", "sprich das aus":
- responseChannels: ["voice"] oder ["voice","chat"] je nach Kontext
- responseTextForVoice: Der zu sprechende Text
- Wenn jemand sagt "nur im Chat", "schreib nur": responseChannels: ["chat"]
- Wenn keine Kanal-Praeferenz erkennbar: responseChannels weglassen (Config entscheidet), responseText verwenden.
STOP-ERKENNUNG: STOP-ERKENNUNG:
Wenn jemand dich bittet aufzuhoeren, still zu sein, zu stoppen, oder nicht mehr zu reden Wenn jemand dich bittet aufzuhoeren, still zu sein, zu stoppen, oder nicht mehr zu reden
(in JEDER Sprache, z.B. "{botFirstName} stop", "{botFirstName} sei still", "{botFirstName} halt", "{botFirstName} be quiet", (in JEDER Sprache, z.B. "{botFirstName} stop", "{botFirstName} sei still", "{botFirstName} halt", "{botFirstName} be quiet",
@ -397,7 +409,10 @@ Verwende Kommandos NUR wenn explizit darum gebeten wird (z.B. "schalte die Trans
WICHTIG: Antworte IMMER als valides JSON in exakt diesem Format: WICHTIG: Antworte IMMER als valides JSON in exakt diesem Format:
{{ {{
"shouldRespond": true/false, "shouldRespond": true/false,
"responseText": "Deine Antwort hier" oder null, "responseText": "Deine Antwort hier" oder null (Standard fuer beide Kanäle),
"responseTextForVoice": optional - Text nur fuer TTS/Voice (z.B. kurze Bestaetigung),
"responseTextForChat": optional - Text nur fuer Chat (z.B. lange Zusammenfassung),
"responseChannels": optional - ["voice"], ["chat"] oder ["voice","chat"] je nach User-Anfrage,
"reasoning": "Kurze Begruendung deiner Entscheidung", "reasoning": "Kurze Begruendung deiner Entscheidung",
"detectedIntent": "addressed" | "question" | "proactive" | "stop" | "none", "detectedIntent": "addressed" | "question" | "proactive" | "stop" | "none",
"commands": [] oder null "commands": [] oder null