feat(teamsbot): per-request chat/voice channels, responseTextForVoice/Chat
Made-with: Cursor
This commit is contained in:
parent
65a8026496
commit
1f529568f5
3 changed files with 54 additions and 20 deletions
|
|
@ -246,6 +246,9 @@ class SpeechTeamsResponse(BaseModel):
|
||||||
"""Structured response from the SPEECH_TEAMS AI handler."""
|
"""Structured response from the SPEECH_TEAMS AI handler."""
|
||||||
shouldRespond: bool = Field(description="Whether the bot should respond")
|
shouldRespond: bool = Field(description="Whether the bot should respond")
|
||||||
responseText: Optional[str] = Field(default=None, description="The bot's response text (only if shouldRespond=True)")
|
responseText: Optional[str] = Field(default=None, description="The bot's response text (only if shouldRespond=True)")
|
||||||
|
responseTextForVoice: Optional[str] = Field(default=None, description="Text for voice/TTS only; if absent, use responseText")
|
||||||
|
responseTextForChat: Optional[str] = Field(default=None, description="Text for chat only; if absent, use responseText")
|
||||||
|
responseChannels: Optional[List[str]] = Field(default=None, description="Per-request channels: ['voice'], ['chat'], or ['voice','chat']; overrides config if set")
|
||||||
reasoning: str = Field(default="", description="Reasoning for the decision (for logging/debug)")
|
reasoning: str = Field(default="", description="Reasoning for the decision (for logging/debug)")
|
||||||
detectedIntent: str = Field(default="none", description="Detected intent: addressed, question, proactive, stop, none")
|
detectedIntent: str = Field(default="none", description="Detected intent: addressed, question, proactive, stop, none")
|
||||||
commands: Optional[List[TeamsbotCommand]] = Field(default=None, description="Optional list of commands to execute (e.g. toggle transcript, send chat, change language)")
|
commands: Optional[List[TeamsbotCommand]] = Field(default=None, description="Optional list of commands to execute (e.g. toggle transcript, send chat, change language)")
|
||||||
|
|
|
||||||
|
|
@ -986,14 +986,19 @@ class TeamsbotService:
|
||||||
})
|
})
|
||||||
return
|
return
|
||||||
|
|
||||||
# Determine response channel (voice, chat, or both)
|
# Determine response channel: per-request (AI) overrides config
|
||||||
# Extract the raw value: enum.value gives "voice", str(enum) gives "TeamsbotResponseChannel.voice"
|
channels = speechResult.responseChannels
|
||||||
channelRaw = self.config.responseChannel
|
if channels and isinstance(channels, list):
|
||||||
channelStr = (channelRaw.value if hasattr(channelRaw, 'value') else str(channelRaw)).lower().strip()
|
channelStr = ",".join(str(c).lower().strip() for c in channels)
|
||||||
logger.info(f"Response channel: '{channelStr}' (raw={channelRaw!r})")
|
sendVoice = "voice" in channelStr
|
||||||
|
sendChat = "chat" in channelStr
|
||||||
sendVoice = channelStr in ("voice", "both")
|
logger.info(f"Response channel (from AI): voice={sendVoice}, chat={sendChat}")
|
||||||
sendChat = channelStr in ("chat", "both")
|
else:
|
||||||
|
channelRaw = self.config.responseChannel
|
||||||
|
channelStr = (channelRaw.value if hasattr(channelRaw, 'value') else str(channelRaw)).lower().strip()
|
||||||
|
sendVoice = channelStr in ("voice", "both")
|
||||||
|
sendChat = channelStr in ("chat", "both")
|
||||||
|
logger.info(f"Response channel (from config): '{channelStr}'")
|
||||||
|
|
||||||
if sendVoice and sendChat:
|
if sendVoice and sendChat:
|
||||||
responseType = TeamsbotResponseType.BOTH
|
responseType = TeamsbotResponseType.BOTH
|
||||||
|
|
@ -1003,7 +1008,13 @@ class TeamsbotService:
|
||||||
responseType = TeamsbotResponseType.CHAT
|
responseType = TeamsbotResponseType.CHAT
|
||||||
|
|
||||||
# Suppress duplicate responses in short windows ("repeat loop" protection).
|
# Suppress duplicate responses in short windows ("repeat loop" protection).
|
||||||
normalizedResponse = (speechResult.responseText or "").strip().lower()
|
canonicalText = (
|
||||||
|
speechResult.responseText
|
||||||
|
or speechResult.responseTextForVoice
|
||||||
|
or speechResult.responseTextForChat
|
||||||
|
or ""
|
||||||
|
)
|
||||||
|
normalizedResponse = (canonicalText or "").strip().lower()
|
||||||
nowTs = time.time()
|
nowTs = time.time()
|
||||||
if (
|
if (
|
||||||
normalizedResponse
|
normalizedResponse
|
||||||
|
|
@ -1021,8 +1032,13 @@ class TeamsbotService:
|
||||||
})
|
})
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# Resolve text per channel (AI can send different content to voice vs chat)
|
||||||
|
textForVoice = speechResult.responseTextForVoice or speechResult.responseText
|
||||||
|
textForChat = speechResult.responseTextForChat or speechResult.responseText
|
||||||
|
storedText = textForChat or textForVoice or speechResult.responseText
|
||||||
|
|
||||||
# 4a: Voice response (TTS -> Audio to bot)
|
# 4a: Voice response (TTS -> Audio to bot)
|
||||||
if sendVoice:
|
if sendVoice and textForVoice:
|
||||||
try:
|
try:
|
||||||
await _emitSessionEvent(sessionId, "ttsDeliveryStatus", {
|
await _emitSessionEvent(sessionId, "ttsDeliveryStatus", {
|
||||||
"status": "requested",
|
"status": "requested",
|
||||||
|
|
@ -1034,7 +1050,7 @@ class TeamsbotService:
|
||||||
f"Session {sessionId}: TTS requested (websocket_available={websocket is not None})"
|
f"Session {sessionId}: TTS requested (websocket_available={websocket is not None})"
|
||||||
)
|
)
|
||||||
ttsResult = await voiceInterface.textToSpeech(
|
ttsResult = await voiceInterface.textToSpeech(
|
||||||
text=speechResult.responseText,
|
text=textForVoice,
|
||||||
languageCode=self.config.language,
|
languageCode=self.config.language,
|
||||||
voiceName=self.config.voiceId
|
voiceName=self.config.voiceId
|
||||||
)
|
)
|
||||||
|
|
@ -1087,13 +1103,13 @@ class TeamsbotService:
|
||||||
sendChat = True # Fallback to chat if voice-only and TTS failed
|
sendChat = True # Fallback to chat if voice-only and TTS failed
|
||||||
|
|
||||||
# 4b: Chat response (send text message to meeting chat)
|
# 4b: Chat response (send text message to meeting chat)
|
||||||
if sendChat:
|
if sendChat and textForChat:
|
||||||
try:
|
try:
|
||||||
if websocket:
|
if websocket:
|
||||||
await websocket.send_text(json.dumps({
|
await websocket.send_text(json.dumps({
|
||||||
"type": "sendChatMessage",
|
"type": "sendChatMessage",
|
||||||
"sessionId": sessionId,
|
"sessionId": sessionId,
|
||||||
"text": speechResult.responseText,
|
"text": textForChat,
|
||||||
}))
|
}))
|
||||||
logger.info(f"Chat response sent for session {sessionId}")
|
logger.info(f"Chat response sent for session {sessionId}")
|
||||||
except Exception as chatErr:
|
except Exception as chatErr:
|
||||||
|
|
@ -1102,7 +1118,7 @@ class TeamsbotService:
|
||||||
# 4b: Store bot response
|
# 4b: Store bot response
|
||||||
botResponseData = TeamsbotBotResponse(
|
botResponseData = TeamsbotBotResponse(
|
||||||
sessionId=sessionId,
|
sessionId=sessionId,
|
||||||
responseText=speechResult.responseText,
|
responseText=storedText,
|
||||||
responseType=responseType,
|
responseType=responseType,
|
||||||
detectedIntent=speechResult.detectedIntent,
|
detectedIntent=speechResult.detectedIntent,
|
||||||
reasoning=speechResult.reasoning,
|
reasoning=speechResult.reasoning,
|
||||||
|
|
@ -1118,7 +1134,7 @@ class TeamsbotService:
|
||||||
# 4c: Emit SSE event
|
# 4c: Emit SSE event
|
||||||
await _emitSessionEvent(sessionId, "botResponse", {
|
await _emitSessionEvent(sessionId, "botResponse", {
|
||||||
"id": createdResponse.get("id"),
|
"id": createdResponse.get("id"),
|
||||||
"responseText": speechResult.responseText,
|
"responseText": storedText,
|
||||||
"responseType": responseType.value,
|
"responseType": responseType.value,
|
||||||
"detectedIntent": speechResult.detectedIntent,
|
"detectedIntent": speechResult.detectedIntent,
|
||||||
"reasoning": speechResult.reasoning,
|
"reasoning": speechResult.reasoning,
|
||||||
|
|
@ -1141,7 +1157,7 @@ class TeamsbotService:
|
||||||
botTranscriptData = TeamsbotTranscript(
|
botTranscriptData = TeamsbotTranscript(
|
||||||
sessionId=sessionId,
|
sessionId=sessionId,
|
||||||
speaker=self.config.botName,
|
speaker=self.config.botName,
|
||||||
text=speechResult.responseText,
|
text=storedText,
|
||||||
timestamp=getIsoTimestamp(),
|
timestamp=getIsoTimestamp(),
|
||||||
confidence=1.0,
|
confidence=1.0,
|
||||||
language=self.config.language,
|
language=self.config.language,
|
||||||
|
|
@ -1151,7 +1167,7 @@ class TeamsbotService:
|
||||||
|
|
||||||
self._contextBuffer.append({
|
self._contextBuffer.append({
|
||||||
"speaker": self.config.botName,
|
"speaker": self.config.botName,
|
||||||
"text": speechResult.responseText,
|
"text": storedText,
|
||||||
"timestamp": getUtcTimestamp(),
|
"timestamp": getUtcTimestamp(),
|
||||||
"source": "botResponse",
|
"source": "botResponse",
|
||||||
})
|
})
|
||||||
|
|
@ -1159,7 +1175,7 @@ class TeamsbotService:
|
||||||
await _emitSessionEvent(sessionId, "transcript", {
|
await _emitSessionEvent(sessionId, "transcript", {
|
||||||
"id": botTranscript.get("id"),
|
"id": botTranscript.get("id"),
|
||||||
"speaker": self.config.botName,
|
"speaker": self.config.botName,
|
||||||
"text": speechResult.responseText,
|
"text": storedText,
|
||||||
"confidence": 1.0,
|
"confidence": 1.0,
|
||||||
"timestamp": getIsoTimestamp(),
|
"timestamp": getIsoTimestamp(),
|
||||||
"isContinuation": False,
|
"isContinuation": False,
|
||||||
|
|
@ -1169,7 +1185,7 @@ class TeamsbotService:
|
||||||
|
|
||||||
# Reset differential writing tracker so next STT creates a new block
|
# Reset differential writing tracker so next STT creates a new block
|
||||||
self._lastTranscriptSpeaker = self.config.botName
|
self._lastTranscriptSpeaker = self.config.botName
|
||||||
self._lastTranscriptText = speechResult.responseText
|
self._lastTranscriptText = storedText
|
||||||
self._lastTranscriptId = botTranscript.get("id")
|
self._lastTranscriptId = botTranscript.get("id")
|
||||||
|
|
||||||
self._followUpWindowEnd = time.time() + 15.0
|
self._followUpWindowEnd = time.time() + 15.0
|
||||||
|
|
|
||||||
|
|
@ -373,6 +373,18 @@ WENN DER USER DICH BITTET ETWAS VORZULESEN / ZUSAMMENZUFASSEN:
|
||||||
- Gib IMMER sofort die Zusammenfassung aus (nicht nur ankündigen).
|
- Gib IMMER sofort die Zusammenfassung aus (nicht nur ankündigen).
|
||||||
- Falls Vorlesen gewuenscht ist, setze zusaetzlich ein "readAloud"-Kommando mit dem Text.
|
- Falls Vorlesen gewuenscht ist, setze zusaetzlich ein "readAloud"-Kommando mit dem Text.
|
||||||
|
|
||||||
|
KANAL-AUSWAHL (Voice vs Chat) - Je nach Anfrage unterschiedlich antworten:
|
||||||
|
- Du kannst pro Anfrage festlegen, ob deine Antwort per Voice (TTS), per Chat, oder beides erfolgt.
|
||||||
|
- Wenn jemand sagt "schreib das in den Chat", "schreib die Zusammenfassung in den Chat", "poste das im Chat":
|
||||||
|
- responseChannels: ["voice", "chat"]
|
||||||
|
- responseTextForVoice: Kurze Bestaetigung (z.B. "Ich schreibe die Zusammenfassung jetzt in den Chat")
|
||||||
|
- responseTextForChat: Der eigentliche Inhalt (z.B. die vollstaendige Zusammenfassung)
|
||||||
|
- Wenn jemand sagt "sag mir das", "lies das vor", "sprich das aus":
|
||||||
|
- responseChannels: ["voice"] oder ["voice","chat"] je nach Kontext
|
||||||
|
- responseTextForVoice: Der zu sprechende Text
|
||||||
|
- Wenn jemand sagt "nur im Chat", "schreib nur": responseChannels: ["chat"]
|
||||||
|
- Wenn keine Kanal-Praeferenz erkennbar: responseChannels weglassen (Config entscheidet), responseText verwenden.
|
||||||
|
|
||||||
STOP-ERKENNUNG:
|
STOP-ERKENNUNG:
|
||||||
Wenn jemand dich bittet aufzuhoeren, still zu sein, zu stoppen, oder nicht mehr zu reden
|
Wenn jemand dich bittet aufzuhoeren, still zu sein, zu stoppen, oder nicht mehr zu reden
|
||||||
(in JEDER Sprache, z.B. "{botFirstName} stop", "{botFirstName} sei still", "{botFirstName} halt", "{botFirstName} be quiet",
|
(in JEDER Sprache, z.B. "{botFirstName} stop", "{botFirstName} sei still", "{botFirstName} halt", "{botFirstName} be quiet",
|
||||||
|
|
@ -397,7 +409,10 @@ Verwende Kommandos NUR wenn explizit darum gebeten wird (z.B. "schalte die Trans
|
||||||
WICHTIG: Antworte IMMER als valides JSON in exakt diesem Format:
|
WICHTIG: Antworte IMMER als valides JSON in exakt diesem Format:
|
||||||
{{
|
{{
|
||||||
"shouldRespond": true/false,
|
"shouldRespond": true/false,
|
||||||
"responseText": "Deine Antwort hier" oder null,
|
"responseText": "Deine Antwort hier" oder null (Standard fuer beide Kanäle),
|
||||||
|
"responseTextForVoice": optional - Text nur fuer TTS/Voice (z.B. kurze Bestaetigung),
|
||||||
|
"responseTextForChat": optional - Text nur fuer Chat (z.B. lange Zusammenfassung),
|
||||||
|
"responseChannels": optional - ["voice"], ["chat"] oder ["voice","chat"] je nach User-Anfrage,
|
||||||
"reasoning": "Kurze Begruendung deiner Entscheidung",
|
"reasoning": "Kurze Begruendung deiner Entscheidung",
|
||||||
"detectedIntent": "addressed" | "question" | "proactive" | "stop" | "none",
|
"detectedIntent": "addressed" | "question" | "proactive" | "stop" | "none",
|
||||||
"commands": [] oder null
|
"commands": [] oder null
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue