feat(teamsbot): chat message handling, response channel (voice/chat/both), join mode (systemBot/anonymous/userAccount)

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
patrick-motsch 2026-02-16 00:07:42 +01:00
parent cc86b144ac
commit ef813a9304
3 changed files with 115 additions and 29 deletions

View file

@ -46,6 +46,20 @@ class TeamsbotResponseMode(str, Enum):
TRANSCRIBE_ONLY = "transcribeOnly" # Only transcribe, no AI responses TRANSCRIBE_ONLY = "transcribeOnly" # Only transcribe, no AI responses
class TeamsbotResponseChannel(str, Enum):
    """Channel through which the bot delivers its responses.

    Members mix in ``str``, so each member compares equal to its plain
    string value (e.g. ``TeamsbotResponseChannel.VOICE == "voice"``) and
    can be built from that value (``TeamsbotResponseChannel("voice")``).
    """

    VOICE = "voice"  # Bot responds only via voice (TTS)
    CHAT = "chat"  # Bot responds only via chat message
    BOTH = "both"  # Bot responds via voice AND chat
class TeamsbotJoinMode(str, Enum):
    """How the bot joins the meeting.

    Members mix in ``str``, so each member compares equal to its plain
    string value (e.g. ``TeamsbotJoinMode.ANONYMOUS == "anonymous"``) and
    can be built from that value (``TeamsbotJoinMode("anonymous")``).
    """

    SYSTEM_BOT = "systemBot"  # Join with system bot account (backend-managed credentials)
    ANONYMOUS = "anonymous"  # Join as anonymous guest
    USER_ACCOUNT = "userAccount"  # Join with user's own Microsoft account (OAuth)
# ============================================================================ # ============================================================================
# Database Models (stored in PostgreSQL) # Database Models (stored in PostgreSQL)
# ============================================================================ # ============================================================================
@ -106,7 +120,7 @@ class TeamsbotBotResponse(BaseModel):
# ============================================================================ # ============================================================================
class TeamsbotConfig(BaseModel): class TeamsbotConfig(BaseModel):
"""Configuration for a Teams Bot feature instance.""" """Configuration for a Teams Bot feature instance (serves as default template for new users)."""
botName: str = Field(default="AI Assistant", description="Default bot display name") botName: str = Field(default="AI Assistant", description="Default bot display name")
backgroundImageUrl: Optional[str] = Field(default=None, description="Default background image URL") backgroundImageUrl: Optional[str] = Field(default=None, description="Default background image URL")
aiSystemPrompt: str = Field( aiSystemPrompt: str = Field(
@ -114,6 +128,7 @@ class TeamsbotConfig(BaseModel):
description="Custom system prompt for the AI analysis" description="Custom system prompt for the AI analysis"
) )
responseMode: TeamsbotResponseMode = Field(default=TeamsbotResponseMode.AUTO, description="How the bot responds") responseMode: TeamsbotResponseMode = Field(default=TeamsbotResponseMode.AUTO, description="How the bot responds")
responseChannel: TeamsbotResponseChannel = Field(default=TeamsbotResponseChannel.VOICE, description="Channel for bot responses: voice, chat, or both")
language: str = Field(default="de-DE", description="Primary language for STT/TTS") language: str = Field(default="de-DE", description="Primary language for STT/TTS")
voiceId: Optional[str] = Field(default=None, description="Google TTS voice ID (e.g., de-DE-Standard-A)") voiceId: Optional[str] = Field(default=None, description="Google TTS voice ID (e.g., de-DE-Standard-A)")
browserBotUrl: Optional[str] = Field(default=None, description="URL of the Browser Bot service. Falls back to TEAMSBOT_BROWSER_BOT_URL env variable if not set per-instance.") browserBotUrl: Optional[str] = Field(default=None, description="URL of the Browser Bot service. Falls back to TEAMSBOT_BROWSER_BOT_URL env variable if not set per-instance.")
@ -141,6 +156,7 @@ class TeamsbotStartSessionRequest(BaseModel):
botName: Optional[str] = Field(default=None, description="Override bot name for this session") botName: Optional[str] = Field(default=None, description="Override bot name for this session")
backgroundImageUrl: Optional[str] = Field(default=None, description="Override background image for this session") backgroundImageUrl: Optional[str] = Field(default=None, description="Override background image for this session")
connectionId: Optional[str] = Field(default=None, description="Microsoft connection ID for Graph API access") connectionId: Optional[str] = Field(default=None, description="Microsoft connection ID for Graph API access")
joinMode: Optional[TeamsbotJoinMode] = Field(default=None, description="How the bot joins: systemBot, anonymous, or userAccount. Defaults to systemBot if credentials configured, else anonymous.")
class TeamsbotSessionResponse(BaseModel): class TeamsbotSessionResponse(BaseModel):
@ -156,6 +172,7 @@ class TeamsbotConfigUpdateRequest(BaseModel):
backgroundImageUrl: Optional[str] = None backgroundImageUrl: Optional[str] = None
aiSystemPrompt: Optional[str] = None aiSystemPrompt: Optional[str] = None
responseMode: Optional[TeamsbotResponseMode] = None responseMode: Optional[TeamsbotResponseMode] = None
responseChannel: Optional[TeamsbotResponseChannel] = None
language: Optional[str] = None language: Optional[str] = None
voiceId: Optional[str] = None voiceId: Optional[str] = None
browserBotUrl: Optional[str] = None browserBotUrl: Optional[str] = None

View file

@ -30,6 +30,7 @@ from .datamodelTeamsbot import (
TeamsbotSessionResponse, TeamsbotSessionResponse,
TeamsbotConfigUpdateRequest, TeamsbotConfigUpdateRequest,
TeamsbotConfig, TeamsbotConfig,
TeamsbotJoinMode,
) )
# Import service # Import service
@ -186,8 +187,35 @@ async def startSession(
appApiUrl = APP_CONFIG.get("APP_API_URL", "") appApiUrl = APP_CONFIG.get("APP_API_URL", "")
gatewayBaseUrl = appApiUrl.rstrip("/") if appApiUrl else str(request.base_url).rstrip("/") gatewayBaseUrl = appApiUrl.rstrip("/") if appApiUrl else str(request.base_url).rstrip("/")
# Determine effective join mode
joinMode = body.joinMode
if not joinMode:
# Default: use system bot if credentials are configured, otherwise anonymous
if config.botAccountEmail and config.botAccountPassword:
joinMode = TeamsbotJoinMode.SYSTEM_BOT
else:
joinMode = TeamsbotJoinMode.ANONYMOUS
# Resolve credentials based on join mode
effectiveEmail = None
effectivePassword = None
if joinMode == TeamsbotJoinMode.SYSTEM_BOT:
effectiveEmail = config.botAccountEmail
effectivePassword = config.botAccountPassword
elif joinMode == TeamsbotJoinMode.USER_ACCOUNT:
# TODO: Resolve OAuth token from user's Microsoft connection
logger.info(f"User account join mode requested but not yet implemented - falling back to anonymous")
joinMode = TeamsbotJoinMode.ANONYMOUS
# ANONYMOUS mode: no credentials
# Temporarily override config credentials for this session's join mode
sessionConfig = config.model_copy(update={
"botAccountEmail": effectiveEmail,
"botAccountPassword": effectivePassword,
})
# Start the bot in background (join meeting via bridge) # Start the bot in background (join meeting via bridge)
service = TeamsbotService(context.user, mandateId, instanceId, config) service = TeamsbotService(context.user, mandateId, instanceId, sessionConfig)
asyncio.create_task( asyncio.create_task(
service.joinMeeting(sessionId, cleanMeetingUrl, body.connectionId, gatewayBaseUrl) service.joinMeeting(sessionId, cleanMeetingUrl, body.connectionId, gatewayBaseUrl)
) )

View file

@ -25,6 +25,7 @@ from .datamodelTeamsbot import (
TeamsbotResponseType, TeamsbotResponseType,
TeamsbotConfig, TeamsbotConfig,
TeamsbotResponseMode, TeamsbotResponseMode,
TeamsbotResponseChannel,
SpeechTeamsResponse, SpeechTeamsResponse,
) )
from .browserBotConnector import BrowserBotConnector from .browserBotConnector import BrowserBotConnector
@ -238,6 +239,20 @@ class TeamsbotService:
websocket=websocket, websocket=websocket,
) )
elif msgType == "chatMessage":
chat = message.get("chat", {})
logger.info(f"[WS-DEBUG] Chat message received: speaker={chat.get('speaker')}, text={chat.get('text', '')[:60]}...")
await self._processTranscript(
sessionId=sessionId,
speaker=chat.get("speaker", "Unknown"),
text=chat.get("text", ""),
isFinal=True,
interface=interface,
voiceInterface=voiceInterface,
websocket=websocket,
source="chat",
)
elif msgType == "status": elif msgType == "status":
status = message.get("status") status = message.get("status")
errorMessage = message.get("message") errorMessage = message.get("message")
@ -300,8 +315,9 @@ class TeamsbotService:
interface, interface,
voiceInterface, voiceInterface,
websocket: WebSocket, websocket: WebSocket,
source: str = "caption",
): ):
"""Process a transcript segment from the browser bot's caption scraping.""" """Process a transcript segment from captions or chat messages."""
text = text.strip() text = text.strip()
if not text: if not text:
@ -325,11 +341,12 @@ class TeamsbotService:
createdTranscript = interface.createTranscript(transcriptData) createdTranscript = interface.createTranscript(transcriptData)
# Update context buffer # Update context buffer (mark source for chat messages)
self._contextBuffer.append({ self._contextBuffer.append({
"speaker": speaker or "Unknown", "speaker": speaker or "Unknown",
"text": text, "text": text,
"timestamp": getUtcTimestamp(), "timestamp": getUtcTimestamp(),
"source": source,
}) })
# Keep only last N segments # Keep only last N segments
maxSegments = self.config.contextWindowSegments maxSegments = self.config.contextWindowSegments
@ -442,13 +459,17 @@ class TeamsbotService:
self._lastAiCallTime = time.time() self._lastAiCallTime = time.time()
# Build transcript context from buffer. # Build transcript context from buffer.
# Mark bot's own utterances so the AI knows what it already said. # Mark bot's own utterances and chat messages for the AI.
contextLines = [] contextLines = []
for segment in self._contextBuffer: for segment in self._contextBuffer:
speaker = segment.get("speaker", "Unknown") speaker = segment.get("speaker", "Unknown")
text = segment.get("text", "") text = segment.get("text", "")
segSource = segment.get("source", "caption")
prefix = "Chat" if segSource == "chat" else ""
if self._isBotSpeaker(speaker): if self._isBotSpeaker(speaker):
contextLines.append(f"[YOU ({self.config.botName})]: {text}") contextLines.append(f"[YOU ({self.config.botName})]: {text}")
elif prefix:
contextLines.append(f"[{prefix}: {speaker}]: {text}")
else: else:
contextLines.append(f"[{speaker}]: {text}") contextLines.append(f"[{speaker}]: {text}")
@ -522,35 +543,55 @@ class TeamsbotService:
}) })
return return
# Auto mode: send voice + chat response # Determine response channel (voice, chat, or both)
channel = self.config.responseChannel
responseType = TeamsbotResponseType.BOTH responseType = TeamsbotResponseType.BOTH
if channel == TeamsbotResponseChannel.VOICE:
responseType = TeamsbotResponseType.AUDIO
elif channel == TeamsbotResponseChannel.CHAT:
responseType = TeamsbotResponseType.CHAT
else:
responseType = TeamsbotResponseType.BOTH
# 4a: TTS -> Audio to bridge # 4a: Voice response (TTS -> Audio to bot)
try: if channel in (TeamsbotResponseChannel.VOICE, TeamsbotResponseChannel.BOTH):
ttsResult = await voiceInterface.textToSpeech( try:
text=speechResult.responseText, ttsResult = await voiceInterface.textToSpeech(
languageCode=self.config.language, text=speechResult.responseText,
voiceName=self.config.voiceId languageCode=self.config.language,
) voiceName=self.config.voiceId
)
if ttsResult and isinstance(ttsResult, dict):
audioContent = ttsResult.get("audioContent") if ttsResult and isinstance(ttsResult, dict):
if audioContent and websocket: audioContent = ttsResult.get("audioContent")
# Send TTS audio to bot via WebSocket if audioContent and websocket:
# Bot expects: {type: "playAudio", sessionId, audio: {data, format}} await websocket.send_text(json.dumps({
"type": "playAudio",
"sessionId": sessionId,
"audio": {
"data": base64.b64encode(audioContent if isinstance(audioContent, bytes) else audioContent.encode()).decode(),
"format": "mp3",
},
}))
elif audioContent and not websocket:
logger.info(f"TTS audio generated for session {sessionId} (HTTP mode - no WebSocket for playback)")
except Exception as ttsErr:
logger.warning(f"TTS failed for session {sessionId}: {ttsErr}")
if responseType == TeamsbotResponseType.AUDIO:
responseType = TeamsbotResponseType.CHAT # Fallback to chat only
# 4b: Chat response (send text message to meeting chat)
if channel in (TeamsbotResponseChannel.CHAT, TeamsbotResponseChannel.BOTH):
try:
if websocket:
await websocket.send_text(json.dumps({ await websocket.send_text(json.dumps({
"type": "playAudio", "type": "sendChatMessage",
"sessionId": sessionId, "sessionId": sessionId,
"audio": { "text": speechResult.responseText,
"data": base64.b64encode(audioContent if isinstance(audioContent, bytes) else audioContent.encode()).decode(),
"format": "mp3",
},
})) }))
elif audioContent and not websocket: except Exception as chatErr:
logger.info(f"TTS audio generated for session {sessionId} (HTTP mode - no WebSocket for playback)") logger.warning(f"Chat message send failed for session {sessionId}: {chatErr}")
except Exception as ttsErr:
logger.warning(f"TTS failed for session {sessionId}: {ttsErr}")
responseType = TeamsbotResponseType.CHAT # Fallback to chat only
# 4b: Store bot response # 4b: Store bot response
botResponseData = TeamsbotBotResponse( botResponseData = TeamsbotBotResponse(