feat(teamsbot): per-request chat/voice channels, responseTextForVoice/Chat

Made-with: Cursor
This commit is contained in:
patrick-motsch 2026-02-28 16:12:57 +01:00
parent 65a8026496
commit 1f529568f5
3 changed files with 54 additions and 20 deletions

View file

@ -246,6 +246,9 @@ class SpeechTeamsResponse(BaseModel):
"""Structured response from the SPEECH_TEAMS AI handler."""
shouldRespond: bool = Field(description="Whether the bot should respond")
responseText: Optional[str] = Field(default=None, description="The bot's response text (only if shouldRespond=True)")
responseTextForVoice: Optional[str] = Field(default=None, description="Text for voice/TTS only; if absent, use responseText")
responseTextForChat: Optional[str] = Field(default=None, description="Text for chat only; if absent, use responseText")
responseChannels: Optional[List[str]] = Field(default=None, description="Per-request channels: ['voice'], ['chat'], or ['voice','chat']; overrides config if set")
reasoning: str = Field(default="", description="Reasoning for the decision (for logging/debug)")
detectedIntent: str = Field(default="none", description="Detected intent: addressed, question, proactive, stop, none")
commands: Optional[List[TeamsbotCommand]] = Field(default=None, description="Optional list of commands to execute (e.g. toggle transcript, send chat, change language)")

View file

@ -986,14 +986,19 @@ class TeamsbotService:
})
return
# Determine response channel (voice, chat, or both)
# Extract the raw value: enum.value gives "voice", str(enum) gives "TeamsbotResponseChannel.voice"
channelRaw = self.config.responseChannel
channelStr = (channelRaw.value if hasattr(channelRaw, 'value') else str(channelRaw)).lower().strip()
logger.info(f"Response channel: '{channelStr}' (raw={channelRaw!r})")
sendVoice = channelStr in ("voice", "both")
sendChat = channelStr in ("chat", "both")
# Determine response channel: per-request (AI) overrides config
channels = speechResult.responseChannels
if channels and isinstance(channels, list):
channelStr = ",".join(str(c).lower().strip() for c in channels)
sendVoice = "voice" in channelStr
sendChat = "chat" in channelStr
logger.info(f"Response channel (from AI): voice={sendVoice}, chat={sendChat}")
else:
channelRaw = self.config.responseChannel
channelStr = (channelRaw.value if hasattr(channelRaw, 'value') else str(channelRaw)).lower().strip()
sendVoice = channelStr in ("voice", "both")
sendChat = channelStr in ("chat", "both")
logger.info(f"Response channel (from config): '{channelStr}'")
if sendVoice and sendChat:
responseType = TeamsbotResponseType.BOTH
@ -1003,7 +1008,13 @@ class TeamsbotService:
responseType = TeamsbotResponseType.CHAT
# Suppress duplicate responses in short windows ("repeat loop" protection).
normalizedResponse = (speechResult.responseText or "").strip().lower()
canonicalText = (
speechResult.responseText
or speechResult.responseTextForVoice
or speechResult.responseTextForChat
or ""
)
normalizedResponse = (canonicalText or "").strip().lower()
nowTs = time.time()
if (
normalizedResponse
@ -1021,8 +1032,13 @@ class TeamsbotService:
})
return
# Resolve text per channel (AI can send different content to voice vs chat)
textForVoice = speechResult.responseTextForVoice or speechResult.responseText
textForChat = speechResult.responseTextForChat or speechResult.responseText
storedText = textForChat or textForVoice or speechResult.responseText
# 4a: Voice response (TTS -> Audio to bot)
if sendVoice:
if sendVoice and textForVoice:
try:
await _emitSessionEvent(sessionId, "ttsDeliveryStatus", {
"status": "requested",
@ -1034,7 +1050,7 @@ class TeamsbotService:
f"Session {sessionId}: TTS requested (websocket_available={websocket is not None})"
)
ttsResult = await voiceInterface.textToSpeech(
text=speechResult.responseText,
text=textForVoice,
languageCode=self.config.language,
voiceName=self.config.voiceId
)
@ -1087,13 +1103,13 @@ class TeamsbotService:
sendChat = True # Fallback to chat if voice-only and TTS failed
# 4b: Chat response (send text message to meeting chat)
if sendChat:
if sendChat and textForChat:
try:
if websocket:
await websocket.send_text(json.dumps({
"type": "sendChatMessage",
"sessionId": sessionId,
"text": speechResult.responseText,
"text": textForChat,
}))
logger.info(f"Chat response sent for session {sessionId}")
except Exception as chatErr:
@ -1102,7 +1118,7 @@ class TeamsbotService:
# Store bot response (uses storedText: chat text when available, otherwise voice text)
botResponseData = TeamsbotBotResponse(
sessionId=sessionId,
responseText=speechResult.responseText,
responseText=storedText,
responseType=responseType,
detectedIntent=speechResult.detectedIntent,
reasoning=speechResult.reasoning,
@ -1118,7 +1134,7 @@ class TeamsbotService:
# 4c: Emit SSE event
await _emitSessionEvent(sessionId, "botResponse", {
"id": createdResponse.get("id"),
"responseText": speechResult.responseText,
"responseText": storedText,
"responseType": responseType.value,
"detectedIntent": speechResult.detectedIntent,
"reasoning": speechResult.reasoning,
@ -1141,7 +1157,7 @@ class TeamsbotService:
botTranscriptData = TeamsbotTranscript(
sessionId=sessionId,
speaker=self.config.botName,
text=speechResult.responseText,
text=storedText,
timestamp=getIsoTimestamp(),
confidence=1.0,
language=self.config.language,
@ -1151,7 +1167,7 @@ class TeamsbotService:
self._contextBuffer.append({
"speaker": self.config.botName,
"text": speechResult.responseText,
"text": storedText,
"timestamp": getUtcTimestamp(),
"source": "botResponse",
})
@ -1159,7 +1175,7 @@ class TeamsbotService:
await _emitSessionEvent(sessionId, "transcript", {
"id": botTranscript.get("id"),
"speaker": self.config.botName,
"text": speechResult.responseText,
"text": storedText,
"confidence": 1.0,
"timestamp": getIsoTimestamp(),
"isContinuation": False,
@ -1169,7 +1185,7 @@ class TeamsbotService:
# Reset differential writing tracker so next STT creates a new block
self._lastTranscriptSpeaker = self.config.botName
self._lastTranscriptText = speechResult.responseText
self._lastTranscriptText = storedText
self._lastTranscriptId = botTranscript.get("id")
self._followUpWindowEnd = time.time() + 15.0

View file

@ -373,6 +373,18 @@ WENN DER USER DICH BITTET ETWAS VORZULESEN / ZUSAMMENZUFASSEN:
- Gib IMMER sofort die Zusammenfassung aus (nicht nur ankündigen).
- Falls Vorlesen gewuenscht ist, setze zusaetzlich ein "readAloud"-Kommando mit dem Text.
KANAL-AUSWAHL (Voice vs Chat) - Je nach Anfrage unterschiedlich antworten:
- Du kannst pro Anfrage festlegen, ob deine Antwort per Voice (TTS), per Chat, oder beides erfolgt.
- Wenn jemand sagt "schreib das in den Chat", "schreib die Zusammenfassung in den Chat", "poste das im Chat":
- responseChannels: ["voice", "chat"]
- responseTextForVoice: Kurze Bestaetigung (z.B. "Ich schreibe die Zusammenfassung jetzt in den Chat")
- responseTextForChat: Der eigentliche Inhalt (z.B. die vollstaendige Zusammenfassung)
- Wenn jemand sagt "sag mir das", "lies das vor", "sprich das aus":
- responseChannels: ["voice"] oder ["voice","chat"] je nach Kontext
- responseTextForVoice: Der zu sprechende Text
- Wenn jemand sagt "nur im Chat", "schreib nur": responseChannels: ["chat"]
- Wenn keine Kanal-Praeferenz erkennbar: responseChannels weglassen (Config entscheidet), responseText verwenden.
STOP-ERKENNUNG:
Wenn jemand dich bittet aufzuhoeren, still zu sein, zu stoppen, oder nicht mehr zu reden
(in JEDER Sprache, z.B. "{botFirstName} stop", "{botFirstName} sei still", "{botFirstName} halt", "{botFirstName} be quiet",
@ -397,7 +409,10 @@ Verwende Kommandos NUR wenn explizit darum gebeten wird (z.B. "schalte die Trans
WICHTIG: Antworte IMMER als valides JSON in exakt diesem Format:
{{
"shouldRespond": true/false,
"responseText": "Deine Antwort hier" oder null,
"responseText": "Deine Antwort hier" oder null (Standard fuer beide Kanäle),
"responseTextForVoice": optional - Text nur fuer TTS/Voice (z.B. kurze Bestaetigung),
"responseTextForChat": optional - Text nur fuer Chat (z.B. lange Zusammenfassung),
"responseChannels": optional - ["voice"], ["chat"] oder ["voice","chat"] je nach User-Anfrage,
"reasoning": "Kurze Begruendung deiner Entscheidung",
"detectedIntent": "addressed" | "question" | "proactive" | "stop" | "none",
"commands": [] oder null