feat(teamsbot): per-request chat/voice channels, responseTextForVoice/Chat
Made-with: Cursor
This commit is contained in:
parent
65a8026496
commit
1f529568f5
3 changed files with 54 additions and 20 deletions
|
|
@ -246,6 +246,9 @@ class SpeechTeamsResponse(BaseModel):
|
|||
"""Structured response from the SPEECH_TEAMS AI handler."""
|
||||
shouldRespond: bool = Field(description="Whether the bot should respond")
|
||||
responseText: Optional[str] = Field(default=None, description="The bot's response text (only if shouldRespond=True)")
|
||||
responseTextForVoice: Optional[str] = Field(default=None, description="Text for voice/TTS only; if absent, use responseText")
|
||||
responseTextForChat: Optional[str] = Field(default=None, description="Text for chat only; if absent, use responseText")
|
||||
responseChannels: Optional[List[str]] = Field(default=None, description="Per-request channels: ['voice'], ['chat'], or ['voice','chat']; overrides config if set")
|
||||
reasoning: str = Field(default="", description="Reasoning for the decision (for logging/debug)")
|
||||
detectedIntent: str = Field(default="none", description="Detected intent: addressed, question, proactive, stop, none")
|
||||
commands: Optional[List[TeamsbotCommand]] = Field(default=None, description="Optional list of commands to execute (e.g. toggle transcript, send chat, change language)")
|
||||
|
|
|
|||
|
|
@ -986,14 +986,19 @@ class TeamsbotService:
|
|||
})
|
||||
return
|
||||
|
||||
# Determine response channel (voice, chat, or both)
|
||||
# Extract the raw value: enum.value gives "voice", str(enum) gives "TeamsbotResponseChannel.voice"
|
||||
# Determine response channel: per-request (AI) overrides config
|
||||
channels = speechResult.responseChannels
|
||||
if channels and isinstance(channels, list):
|
||||
channelStr = ",".join(str(c).lower().strip() for c in channels)
|
||||
sendVoice = "voice" in channelStr
|
||||
sendChat = "chat" in channelStr
|
||||
logger.info(f"Response channel (from AI): voice={sendVoice}, chat={sendChat}")
|
||||
else:
|
||||
channelRaw = self.config.responseChannel
|
||||
channelStr = (channelRaw.value if hasattr(channelRaw, 'value') else str(channelRaw)).lower().strip()
|
||||
logger.info(f"Response channel: '{channelStr}' (raw={channelRaw!r})")
|
||||
|
||||
sendVoice = channelStr in ("voice", "both")
|
||||
sendChat = channelStr in ("chat", "both")
|
||||
logger.info(f"Response channel (from config): '{channelStr}'")
|
||||
|
||||
if sendVoice and sendChat:
|
||||
responseType = TeamsbotResponseType.BOTH
|
||||
|
|
@ -1003,7 +1008,13 @@ class TeamsbotService:
|
|||
responseType = TeamsbotResponseType.CHAT
|
||||
|
||||
# Suppress duplicate responses in short windows ("repeat loop" protection).
|
||||
normalizedResponse = (speechResult.responseText or "").strip().lower()
|
||||
canonicalText = (
|
||||
speechResult.responseText
|
||||
or speechResult.responseTextForVoice
|
||||
or speechResult.responseTextForChat
|
||||
or ""
|
||||
)
|
||||
normalizedResponse = (canonicalText or "").strip().lower()
|
||||
nowTs = time.time()
|
||||
if (
|
||||
normalizedResponse
|
||||
|
|
@ -1021,8 +1032,13 @@ class TeamsbotService:
|
|||
})
|
||||
return
|
||||
|
||||
# Resolve text per channel (AI can send different content to voice vs chat)
|
||||
textForVoice = speechResult.responseTextForVoice or speechResult.responseText
|
||||
textForChat = speechResult.responseTextForChat or speechResult.responseText
|
||||
storedText = textForChat or textForVoice or speechResult.responseText
|
||||
|
||||
# 4a: Voice response (TTS -> Audio to bot)
|
||||
if sendVoice:
|
||||
if sendVoice and textForVoice:
|
||||
try:
|
||||
await _emitSessionEvent(sessionId, "ttsDeliveryStatus", {
|
||||
"status": "requested",
|
||||
|
|
@ -1034,7 +1050,7 @@ class TeamsbotService:
|
|||
f"Session {sessionId}: TTS requested (websocket_available={websocket is not None})"
|
||||
)
|
||||
ttsResult = await voiceInterface.textToSpeech(
|
||||
text=speechResult.responseText,
|
||||
text=textForVoice,
|
||||
languageCode=self.config.language,
|
||||
voiceName=self.config.voiceId
|
||||
)
|
||||
|
|
@ -1087,13 +1103,13 @@ class TeamsbotService:
|
|||
sendChat = True # Fallback to chat if voice-only and TTS failed
|
||||
|
||||
# 4b: Chat response (send text message to meeting chat)
|
||||
if sendChat:
|
||||
if sendChat and textForChat:
|
||||
try:
|
||||
if websocket:
|
||||
await websocket.send_text(json.dumps({
|
||||
"type": "sendChatMessage",
|
||||
"sessionId": sessionId,
|
||||
"text": speechResult.responseText,
|
||||
"text": textForChat,
|
||||
}))
|
||||
logger.info(f"Chat response sent for session {sessionId}")
|
||||
except Exception as chatErr:
|
||||
|
|
@ -1102,7 +1118,7 @@ class TeamsbotService:
|
|||
# 4b: Store bot response
|
||||
botResponseData = TeamsbotBotResponse(
|
||||
sessionId=sessionId,
|
||||
responseText=speechResult.responseText,
|
||||
responseText=storedText,
|
||||
responseType=responseType,
|
||||
detectedIntent=speechResult.detectedIntent,
|
||||
reasoning=speechResult.reasoning,
|
||||
|
|
@ -1118,7 +1134,7 @@ class TeamsbotService:
|
|||
# 4c: Emit SSE event
|
||||
await _emitSessionEvent(sessionId, "botResponse", {
|
||||
"id": createdResponse.get("id"),
|
||||
"responseText": speechResult.responseText,
|
||||
"responseText": storedText,
|
||||
"responseType": responseType.value,
|
||||
"detectedIntent": speechResult.detectedIntent,
|
||||
"reasoning": speechResult.reasoning,
|
||||
|
|
@ -1141,7 +1157,7 @@ class TeamsbotService:
|
|||
botTranscriptData = TeamsbotTranscript(
|
||||
sessionId=sessionId,
|
||||
speaker=self.config.botName,
|
||||
text=speechResult.responseText,
|
||||
text=storedText,
|
||||
timestamp=getIsoTimestamp(),
|
||||
confidence=1.0,
|
||||
language=self.config.language,
|
||||
|
|
@ -1151,7 +1167,7 @@ class TeamsbotService:
|
|||
|
||||
self._contextBuffer.append({
|
||||
"speaker": self.config.botName,
|
||||
"text": speechResult.responseText,
|
||||
"text": storedText,
|
||||
"timestamp": getUtcTimestamp(),
|
||||
"source": "botResponse",
|
||||
})
|
||||
|
|
@ -1159,7 +1175,7 @@ class TeamsbotService:
|
|||
await _emitSessionEvent(sessionId, "transcript", {
|
||||
"id": botTranscript.get("id"),
|
||||
"speaker": self.config.botName,
|
||||
"text": speechResult.responseText,
|
||||
"text": storedText,
|
||||
"confidence": 1.0,
|
||||
"timestamp": getIsoTimestamp(),
|
||||
"isContinuation": False,
|
||||
|
|
@ -1169,7 +1185,7 @@ class TeamsbotService:
|
|||
|
||||
# Reset differential writing tracker so next STT creates a new block
|
||||
self._lastTranscriptSpeaker = self.config.botName
|
||||
self._lastTranscriptText = speechResult.responseText
|
||||
self._lastTranscriptText = storedText
|
||||
self._lastTranscriptId = botTranscript.get("id")
|
||||
|
||||
self._followUpWindowEnd = time.time() + 15.0
|
||||
|
|
|
|||
|
|
@ -373,6 +373,18 @@ WENN DER USER DICH BITTET ETWAS VORZULESEN / ZUSAMMENZUFASSEN:
|
|||
- Gib IMMER sofort die Zusammenfassung aus (nicht nur ankündigen).
|
||||
- Falls Vorlesen gewuenscht ist, setze zusaetzlich ein "readAloud"-Kommando mit dem Text.
|
||||
|
||||
KANAL-AUSWAHL (Voice vs Chat) - Je nach Anfrage unterschiedlich antworten:
|
||||
- Du kannst pro Anfrage festlegen, ob deine Antwort per Voice (TTS), per Chat, oder beides erfolgt.
|
||||
- Wenn jemand sagt "schreib das in den Chat", "schreib die Zusammenfassung in den Chat", "poste das im Chat":
|
||||
- responseChannels: ["voice", "chat"]
|
||||
- responseTextForVoice: Kurze Bestaetigung (z.B. "Ich schreibe die Zusammenfassung jetzt in den Chat")
|
||||
- responseTextForChat: Der eigentliche Inhalt (z.B. die vollstaendige Zusammenfassung)
|
||||
- Wenn jemand sagt "sag mir das", "lies das vor", "sprich das aus":
|
||||
- responseChannels: ["voice"] oder ["voice","chat"] je nach Kontext
|
||||
- responseTextForVoice: Der zu sprechende Text
|
||||
- Wenn jemand sagt "nur im Chat", "schreib nur": responseChannels: ["chat"]
|
||||
- Wenn keine Kanal-Praeferenz erkennbar: responseChannels weglassen (Config entscheidet), responseText verwenden.
|
||||
|
||||
STOP-ERKENNUNG:
|
||||
Wenn jemand dich bittet aufzuhoeren, still zu sein, zu stoppen, oder nicht mehr zu reden
|
||||
(in JEDER Sprache, z.B. "{botFirstName} stop", "{botFirstName} sei still", "{botFirstName} halt", "{botFirstName} be quiet",
|
||||
|
|
@ -397,7 +409,10 @@ Verwende Kommandos NUR wenn explizit darum gebeten wird (z.B. "schalte die Trans
|
|||
WICHTIG: Antworte IMMER als valides JSON in exakt diesem Format:
|
||||
{{
|
||||
"shouldRespond": true/false,
|
||||
"responseText": "Deine Antwort hier" oder null,
|
||||
"responseText": "Deine Antwort hier" oder null (Standard fuer beide Kanäle),
|
||||
"responseTextForVoice": optional - Text nur fuer TTS/Voice (z.B. kurze Bestaetigung),
|
||||
"responseTextForChat": optional - Text nur fuer Chat (z.B. lange Zusammenfassung),
|
||||
"responseChannels": optional - ["voice"], ["chat"] oder ["voice","chat"] je nach User-Anfrage,
|
||||
"reasoning": "Kurze Begruendung deiner Entscheidung",
|
||||
"detectedIntent": "addressed" | "question" | "proactive" | "stop" | "none",
|
||||
"commands": [] oder null
|
||||
|
|
|
|||
Loading…
Reference in a new issue