gateway/modules/features/teamsbot/service.py
patrick-motsch 02dedc972d fix(teamsbot): SSE rate-limit 60/min, TTS message format for bot protocol
Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-15 15:19:31 +01:00

597 lines
25 KiB
Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Teamsbot Service - Pipeline Orchestrator.
Manages the audio processing pipeline: STT -> Context Buffer -> SPEECH_TEAMS -> TTS -> Bridge.
"""
import logging
import json
import asyncio
import time
import base64
from typing import Optional, Dict, Any, List
from fastapi import WebSocket
from modules.datamodels.datamodelUam import User
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum, PriorityEnum
from modules.shared.timeUtils import getUtcTimestamp, getIsoTimestamp
from .datamodelTeamsbot import (
TeamsbotSessionStatus,
TeamsbotTranscript,
TeamsbotBotResponse,
TeamsbotResponseType,
TeamsbotConfig,
TeamsbotResponseMode,
SpeechTeamsResponse,
)
from .browserBotConnector import BrowserBotConnector
logger = logging.getLogger(__name__)
# =========================================================================
# Minimal Service Context (for AI billing in bridge callbacks)
# =========================================================================
class _ServiceContext:
    """Lightweight stand-in for a service center, used for AiService billing.

    Bridge callbacks have no full Services instance at hand, so this object
    carries only the attributes assigned below: the acting user, the mandate,
    an optional feature instance id and the fixed feature code "teamsbot".
    """

    def __init__(self, user, mandateId, featureInstanceId=None):
        # The feature code is the same for every context built in this module.
        self.featureCode = "teamsbot"
        self.featureInstanceId = featureInstanceId
        self.mandateId = mandateId
        self.user = user
# =========================================================================
# Session Event Queues (for SSE streaming to frontend)
# =========================================================================
_sessionEvents: Dict[str, asyncio.Queue] = {}


async def _emitSessionEvent(sessionId: str, eventType: str, data: Any):
    """Push one event onto the SSE queue registered for ``sessionId``.

    Nothing happens when no queue is registered for the session. Each queued
    item is a dict with the keys ``type``, ``data`` and ``timestamp``.
    """
    sessionQueue = _sessionEvents.get(sessionId)
    if not sessionQueue:
        return
    event = {"type": eventType, "data": data, "timestamp": getIsoTimestamp()}
    await sessionQueue.put(event)
class TeamsbotService:
    """
    Pipeline orchestrator for Teams Bot sessions.

    Ties together VoiceObjects (STT/TTS), AiService (SPEECH_TEAMS) and the
    bridge communication with the browser bot.
    """

    def __init__(self, currentUser: User, mandateId: str, instanceId: str, config: TeamsbotConfig):
        """Store caller identity and configuration, then create the bot connector."""
        # Caller identity and feature configuration
        self.currentUser = currentUser
        self.mandateId = mandateId
        self.instanceId = instanceId
        self.config = config

        # Connector to the browser bot service, built from the configured URL
        browserBotUrl = config._getEffectiveBrowserBotUrl()
        self.browserBotConnector = BrowserBotConnector(browserBotUrl)

        # Mutable pipeline state
        self._lastAiCallTime: float = 0.0  # time.time() of the last AI call; 0.0 = never
        self._contextBuffer: List[Dict[str, Any]] = []  # recent transcript segments
# =========================================================================
# Session Lifecycle
# =========================================================================
async def joinMeeting(
    self,
    sessionId: str,
    meetingLink: str,
    connectionId: Optional[str] = None,
    gatewayBaseUrl: str = "",
):
    """Send the join command for a meeting to the Browser Bot service.

    According to the original notes, the browser bot is expected to:
    1. Launch a headless browser
    2. Navigate to the Teams web app
    3. Join the meeting as anonymous guest
    4. Enable captions and start scraping
    5. Connect back via WebSocket to send transcripts

    A fresh SSE event queue is registered for the session before anything
    else. Failures are not raised to the caller: they are written to the
    session record (status ERROR plus errorMessage) and emitted as a
    "statusChange" event.
    """
    from . import interfaceFeatureTeamsbot as interfaceDb

    db = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId)

    # Register the SSE queue first so the status events below have a target
    _sessionEvents[sessionId] = asyncio.Queue()

    try:
        # Mark the session as JOINING and tell the frontend
        db.updateSession(sessionId, {"status": TeamsbotSessionStatus.JOINING.value})
        await _emitSessionEvent(sessionId, "statusChange", {"status": "joining"})

        sessionRecord = db.getSession(sessionId)
        if not sessionRecord:
            raise ValueError(f"Session {sessionId} not found")

        # Build the WebSocket URL the bot uses to connect back to this gateway.
        # gatewayBaseUrl comes from the route handler (derived from request.base_url).
        if gatewayBaseUrl.startswith("https"):
            scheme = "wss"
        else:
            scheme = "ws"
        hostPart = gatewayBaseUrl.replace("https://", "").replace("http://", "").rstrip("/")
        callbackUrl = f"{scheme}://{hostPart}/api/teamsbot/{self.instanceId}/bot/ws/{sessionId}"

        # Session-level values take precedence over the instance configuration
        joinArgs = dict(
            sessionId=sessionId,
            meetingUrl=meetingLink,
            botName=sessionRecord.get("botName", self.config.botName),
            instanceId=self.instanceId,
            gatewayWsUrl=callbackUrl,
            language=self.config.language,
            botAccountEmail=self.config.botAccountEmail,
            botAccountPassword=self.config.botAccountPassword,
            backgroundImageUrl=sessionRecord.get("backgroundImageUrl") or self.config.backgroundImageUrl,
        )
        outcome = await self.browserBotConnector.joinMeeting(**joinArgs)

        if not outcome.get("success"):
            failure = outcome.get("error", "Unknown error joining meeting")
            db.updateSession(sessionId, {
                "status": TeamsbotSessionStatus.ERROR.value,
                "errorMessage": failure,
            })
            await _emitSessionEvent(sessionId, "statusChange", {"status": "error", "errorMessage": failure})
            logger.error(f"Failed to deploy browser bot for session {sessionId}: {failure}")
        else:
            # Status stays JOINING; it becomes ACTIVE once the bot reports in via WS
            db.updateSession(sessionId, {
                "status": TeamsbotSessionStatus.JOINING.value,
            })
            logger.info(f"Browser bot deployment started for session {sessionId}")
    except Exception as exc:
        logger.error(f"Error joining meeting for session {sessionId}: {exc}")
        reason = str(exc)
        db.updateSession(sessionId, {
            "status": TeamsbotSessionStatus.ERROR.value,
            "errorMessage": reason,
        })
        await _emitSessionEvent(sessionId, "statusChange", {"status": "error", "errorMessage": reason})
async def leaveMeeting(self, sessionId: str):
    """Send the leave command to the Browser Bot service and close the session.

    On success the session is marked ENDED (with ``endedAt``) and a meeting
    summary is scheduled in the background. On failure the session is marked
    ERROR and an error "statusChange" event is emitted. Exceptions raised by
    the leave sequence are logged, not propagated.

    The session's SSE event queue is removed in every case, even when the
    error handling itself fails.
    """
    from . import interfaceFeatureTeamsbot as interfaceDb

    interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId)
    try:
        interface.updateSession(sessionId, {"status": TeamsbotSessionStatus.LEAVING.value})
        await _emitSessionEvent(sessionId, "statusChange", {"status": "leaving"})

        await self.browserBotConnector.leaveMeeting(sessionId)

        interface.updateSession(sessionId, {
            "status": TeamsbotSessionStatus.ENDED.value,
            "endedAt": getIsoTimestamp(),
        })
        await _emitSessionEvent(sessionId, "statusChange", {"status": "ended"})

        # Generate meeting summary in background.
        # NOTE(review): the event loop keeps only weak references to tasks;
        # consider storing the task somewhere long-lived so it cannot be
        # garbage-collected before it finishes.
        asyncio.create_task(self._generateMeetingSummary(sessionId))
        logger.info(f"Bot left meeting for session {sessionId}")
    except Exception as e:
        logger.error(f"Error leaving meeting for session {sessionId}: {e}")
        errorMsg = str(e)
        interface.updateSession(sessionId, {
            "status": TeamsbotSessionStatus.ERROR.value,
            "errorMessage": errorMsg,
            "endedAt": getIsoTimestamp(),
        })
        # Same error notification joinMeeting sends; must happen before the
        # queue is removed below or the frontend never sees it.
        await _emitSessionEvent(sessionId, "statusChange", {"status": "error", "errorMessage": errorMsg})
    finally:
        # Cleanup event queue (always, so a failing error handler cannot leak it)
        _sessionEvents.pop(sessionId, None)
# =========================================================================
# Browser Bot WebSocket Communication
# =========================================================================
async def handleBotWebSocket(self, websocket: WebSocket, sessionId: str):
    """
    Main WebSocket handler for Browser Bot communication.

    Receives:
    - transcript: Caption text scraped from Teams
    - status: Bot state changes (joined, in_lobby, left, error)
    - ping: answered with a pong
    Sends:
    - playAudio: TTS audio for the bot to play in the meeting
      (sent further down the transcript pipeline, not here)
    - pong: reply to ping

    The receive loop ends when the peer disconnects or an unexpected error
    occurs. Frames that are not text, not valid JSON, or not a JSON object
    are logged and skipped so one bad frame does not end the session.
    """
    from . import interfaceFeatureTeamsbot as interfaceDb
    from modules.interfaces.interfaceVoiceObjects import getVoiceInterface

    interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId)
    voiceInterface = getVoiceInterface(self.currentUser, self.mandateId)
    logger.info(f"[WS-DEBUG] WebSocket handler started for session {sessionId}")

    # Initialised outside the try so the final log line can always read it
    msgCount = 0
    try:
        while True:
            data = await websocket.receive()

            # receive() returns the disconnect event instead of raising;
            # calling receive() again afterwards would raise RuntimeError.
            if data.get("type") == "websocket.disconnect":
                logger.info(f"[WS-DEBUG] session={sessionId}: peer disconnected (code={data.get('code')})")
                break

            msgCount += 1
            rawText = data.get("text")
            if rawText is None:
                logger.debug(f"[WS-DEBUG] session={sessionId} msg #{msgCount}: non-text data received (keys: {list(data.keys())})")
                continue

            try:
                message = json.loads(rawText)
            except json.JSONDecodeError as jsonErr:
                logger.warning(f"[WS-DEBUG] session={sessionId} msg #{msgCount}: invalid JSON skipped ({jsonErr})")
                continue
            if not isinstance(message, dict):
                logger.warning(f"[WS-DEBUG] session={sessionId} msg #{msgCount}: non-object JSON skipped ({type(message).__name__})")
                continue

            msgType = message.get("type")
            logger.info(f"[WS-DEBUG] session={sessionId} msg #{msgCount}: type={msgType}")

            if msgType == "transcript":
                # Tolerate a missing or null payload and null text
                transcript = message.get("transcript") or {}
                transcriptText = transcript.get("text") or ""
                logger.info(f"[WS-DEBUG] Transcript received: speaker={transcript.get('speaker')}, text={transcriptText[:60]}...")
                await self._processTranscript(
                    sessionId=sessionId,
                    speaker=transcript.get("speaker") or "Unknown",
                    text=transcriptText,
                    isFinal=transcript.get("isFinal", True),
                    interface=interface,
                    voiceInterface=voiceInterface,
                    websocket=websocket,
                )
            elif msgType == "status":
                status = message.get("status")
                errorMessage = message.get("message")
                logger.info(f"[WS-DEBUG] Status received: status={status}, message={errorMessage}")
                await self._handleBotStatus(sessionId, status, errorMessage, interface)
            elif msgType == "ping":
                await websocket.send_text(json.dumps({"type": "pong"}))
    except Exception as e:
        # Disconnects are expected; check the type name too, because a
        # disconnect exception's message may not contain the word itself.
        description = f"{type(e).__name__}: {e}"
        if "disconnect" not in description.lower():
            logger.error(f"[WS-DEBUG] WebSocket error for session {sessionId}: {description}")
    logger.info(f"[WS-DEBUG] WebSocket handler ended for session {sessionId} after {msgCount} messages")
async def _handleBotStatus(
    self,
    sessionId: str,
    status: str,
    errorMessage: Optional[str],
    interface,
):
    """Handle a status update from the browser bot.

    Maps the bot status to a session status, persists it together with an
    optional error message and start/end timestamps, emits a "statusChange"
    SSE event carrying the raw bot status, and schedules the meeting summary
    when the session ends.

    ``startedAt`` is written only while the session has none, so later
    statuses that also map to ACTIVE do not move the start time.
    """
    logger.info(f"Bot status update for session {sessionId}: {status}")
    statusMap = {
        "connecting": TeamsbotSessionStatus.JOINING.value,
        "launching": TeamsbotSessionStatus.JOINING.value,
        "navigating": TeamsbotSessionStatus.JOINING.value,
        "in_lobby": TeamsbotSessionStatus.JOINING.value,
        "joined": TeamsbotSessionStatus.ACTIVE.value,
        "in_meeting": TeamsbotSessionStatus.ACTIVE.value,
        "left": TeamsbotSessionStatus.ENDED.value,
        "error": TeamsbotSessionStatus.ERROR.value,
    }
    # Statuses not listed above are treated as ACTIVE
    dbStatus = statusMap.get(status, TeamsbotSessionStatus.ACTIVE.value)

    updates = {"status": dbStatus}
    if errorMessage:
        updates["errorMessage"] = errorMessage

    if dbStatus == TeamsbotSessionStatus.ACTIVE.value:
        # "joined", "in_meeting" and unknown statuses all land here; keep
        # the first recorded start time.
        session = interface.getSession(sessionId) or {}
        if not session.get("startedAt"):
            updates["startedAt"] = getIsoTimestamp()
    elif dbStatus in (TeamsbotSessionStatus.ENDED.value, TeamsbotSessionStatus.ERROR.value):
        updates["endedAt"] = getIsoTimestamp()

    interface.updateSession(sessionId, updates)
    await _emitSessionEvent(sessionId, "statusChange", {"status": status, "errorMessage": errorMessage})

    # Generate summary when session ends
    if dbStatus == TeamsbotSessionStatus.ENDED.value:
        asyncio.create_task(self._generateMeetingSummary(sessionId))
async def _processTranscript(
    self,
    sessionId: str,
    speaker: str,
    text: str,
    isFinal: bool,
    interface,
    voiceInterface,
    websocket: WebSocket,
):
    """Process a transcript segment from the browser bot's caption scraping.

    Blank segments are ignored. Otherwise the segment is stored, appended to
    the context buffer (trimmed to ``config.contextWindowSegments``), emitted
    as a "transcript" SSE event and counted on the session. AI analysis is
    considered only for final segments, only when the response mode is not
    TRANSCRIBE_ONLY, and only when ``_shouldTriggerAnalysis`` agrees.
    """
    text = (text or "").strip()
    if not text:
        return
    # One fallback for storage, buffer and SSE so all three agree
    speaker = speaker or "Unknown"

    # Store transcript segment
    transcriptData = TeamsbotTranscript(
        sessionId=sessionId,
        speaker=speaker,
        text=text,
        timestamp=getIsoTimestamp(),
        confidence=1.0,  # Captions don't have confidence scores
        language=self.config.language,
        isFinal=isFinal,
    ).model_dump()
    createdTranscript = interface.createTranscript(transcriptData)

    # Update context buffer
    self._contextBuffer.append({
        "speaker": speaker,
        "text": text,
        "timestamp": getUtcTimestamp(),
    })

    # Keep only the last N segments. Slicing from the overflow count (not
    # [-N:]) also works for N == 0, where [-0:] would keep the whole list.
    maxSegments = self.config.contextWindowSegments
    overflow = len(self._contextBuffer) - maxSegments
    if overflow > 0:
        self._contextBuffer = self._contextBuffer[overflow:]

    # Emit SSE event for live transcript
    await _emitSessionEvent(sessionId, "transcript", {
        "id": createdTranscript.get("id"),
        "speaker": speaker,
        "text": text,
        "confidence": 1.0,
        "timestamp": getIsoTimestamp(),
    })

    # Update session transcript count (a stored None counts as 0)
    session = interface.getSession(sessionId)
    if session:
        count = (session.get("transcriptSegmentCount") or 0) + 1
        interface.updateSession(sessionId, {"transcriptSegmentCount": count})

    # Check if AI analysis should be triggered (only for final transcripts)
    if not isFinal:
        return
    if self.config.responseMode == TeamsbotResponseMode.TRANSCRIBE_ONLY:
        logger.debug(f"Session {sessionId}: responseMode=TRANSCRIBE_ONLY, skipping AI analysis")
        return

    shouldTrigger = self._shouldTriggerAnalysis(text)
    logger.debug(f"Session {sessionId}: shouldTriggerAnalysis={shouldTrigger}, bufferSize={len(self._contextBuffer)}, responseMode={self.config.responseMode}")
    if not shouldTrigger:
        return

    # SPEECH_TEAMS AI analysis
    logger.info(f"Session {sessionId}: Triggering AI analysis (buffer: {len(self._contextBuffer)} segments)")
    await self._analyzeAndRespond(sessionId, interface, voiceInterface, websocket, createdTranscript)
def _shouldTriggerAnalysis(self, transcriptText: str) -> bool:
    """
    Decide whether to trigger AI analysis based on the latest transcript.

    Rules, in order:
    - Bot name mentioned -> trigger immediately, ignoring the cooldown.
      A missing or blank bot name never matches.
    - Less than ``config.triggerCooldownSeconds`` since the last AI call
      -> no trigger.
    - At least ``config.triggerIntervalSeconds`` since the last AI call
      -> periodic trigger.
    - Otherwise -> no trigger.

    Returns:
        True when analysis should run for this transcript.
    """
    now = time.time()
    timeSinceLastCall = now - self._lastAiCallTime

    # Bot name mentioned -> immediate trigger (OVERRIDES cooldown).
    # The emptiness check matters: "" is a substring of every string, so an
    # unset bot name would otherwise fire on every transcript.
    botNameLower = (self.config.botName or "").strip().lower()
    if botNameLower and botNameLower in transcriptText.lower():
        logger.info(f"Trigger: Bot name '{self.config.botName}' detected in transcript (overrides cooldown): '{transcriptText[:60]}...'")
        return True

    # Cooldown check (only for non-name triggers)
    if timeSinceLastCall < self.config.triggerCooldownSeconds:
        logger.debug(f"Trigger: Cooldown active ({timeSinceLastCall:.1f}s < {self.config.triggerCooldownSeconds}s)")
        return False

    # Periodic trigger
    if timeSinceLastCall >= self.config.triggerIntervalSeconds:
        logger.info(f"Trigger: Periodic interval ({self.config.triggerIntervalSeconds}s) elapsed ({timeSinceLastCall:.1f}s since last call)")
        return True

    logger.debug(f"Trigger: No trigger ({timeSinceLastCall:.1f}s / {self.config.triggerIntervalSeconds}s interval)")
    return False
async def _analyzeAndRespond(
    self,
    sessionId: str,
    interface,
    voiceInterface,
    websocket: WebSocket,
    triggerTranscript: Dict[str, Any],
):
    """Run SPEECH_TEAMS AI analysis on the context buffer and respond if needed.

    Steps:
    1. Build the transcript context from the buffer and call the AI.
    2. Parse the structured answer; if direct validation fails, try the
       contents of a fenced code block; if that fails too, fall back to a
       "do not respond" result.
    3. Always emit an "analysis" SSE event.
    4. If the AI wants to respond: in MANUAL mode only emit a
       "suggestedResponse" event; otherwise synthesize speech, send it to
       the bot, store the bot response, emit "botResponse" and bump the
       session's response counter.

    The stored response type is BOTH only when audio was actually sent to
    the bot; otherwise it is CHAT. Any exception is logged and reported as
    an "error" SSE event rather than raised.
    """
    self._lastAiCallTime = time.time()

    # Build transcript context from buffer
    contextLines = [
        f"[{segment.get('speaker', 'Unknown')}]: {segment.get('text', '')}"
        for segment in self._contextBuffer
    ]
    transcriptContext = f"BOT_NAME:{self.config.botName}\n" + "\n".join(contextLines)

    # Call SPEECH_TEAMS
    try:
        from modules.services.serviceAi.mainServiceAi import AiService

        # Create minimal service context for AI billing
        serviceContext = _ServiceContext(self.currentUser, self.mandateId, self.instanceId)
        aiService = AiService(serviceCenter=serviceContext)
        await aiService.ensureAiObjectsInitialized()

        request = AiCallRequest(
            prompt=self.config.aiSystemPrompt,
            context=transcriptContext,
            options=AiCallOptions(
                operationType=OperationTypeEnum.SPEECH_TEAMS,
                priority=PriorityEnum.SPEED,
            )
        )
        response = await aiService.callAi(request)

        # Parse structured response
        try:
            speechResult = SpeechTeamsResponse.model_validate_json(response.content)
        except Exception:
            # Try to extract JSON from a fenced block in the response content
            try:
                jsonStr = response.content
                if "```json" in jsonStr:
                    jsonStr = jsonStr.split("```json")[1].split("```")[0]
                elif "```" in jsonStr:
                    jsonStr = jsonStr.split("```")[1].split("```")[0]
                speechResult = SpeechTeamsResponse.model_validate_json(jsonStr.strip())
            except Exception as parseErr:
                logger.warning(f"Failed to parse SPEECH_TEAMS response: {parseErr}")
                speechResult = SpeechTeamsResponse(
                    shouldRespond=False,
                    reasoning=f"Parse error: {str(parseErr)[:100]}",
                    detectedIntent="none"
                )

        # A None reasoning must not break the log line and with it the
        # whole response.
        reasoningPreview = (speechResult.reasoning or "")[:80]
        logger.info(
            f"SPEECH_TEAMS result: shouldRespond={speechResult.shouldRespond}, "
            f"intent={speechResult.detectedIntent}, "
            f"reasoning={reasoningPreview}..."
        )

        # Emit analysis event (always, for debug/UI)
        await _emitSessionEvent(sessionId, "analysis", {
            "shouldRespond": speechResult.shouldRespond,
            "detectedIntent": speechResult.detectedIntent,
            "reasoning": speechResult.reasoning,
            "modelName": response.modelName,
            "processingTime": response.processingTime,
            "priceCHF": response.priceCHF,
        })

        # Step 4: Respond if AI decided to
        if speechResult.shouldRespond and speechResult.responseText:
            if self.config.responseMode == TeamsbotResponseMode.MANUAL:
                # In manual mode, suggest but don't send
                await _emitSessionEvent(sessionId, "suggestedResponse", {
                    "responseText": speechResult.responseText,
                    "detectedIntent": speechResult.detectedIntent,
                    "reasoning": speechResult.reasoning,
                })
                return

            # Auto mode: send voice + chat response
            responseType = TeamsbotResponseType.BOTH
            audioSent = False

            # 4a: TTS -> Audio to bridge
            try:
                ttsResult = await voiceInterface.textToSpeech(
                    text=speechResult.responseText,
                    languageCode=self.config.language,
                    voiceName=self.config.voiceId
                )
                audioContent = ttsResult.get("audioContent") if isinstance(ttsResult, dict) else None
                if audioContent and websocket:
                    # NOTE(review): a str payload is UTF-8 encoded before
                    # base64; if the voice interface already returns base64
                    # text this double-encodes - confirm against the
                    # interface.
                    audioBytes = audioContent if isinstance(audioContent, bytes) else audioContent.encode()
                    # Send TTS audio to bot via WebSocket
                    # Bot expects: {type: "playAudio", sessionId, audio: {data, format}}
                    await websocket.send_text(json.dumps({
                        "type": "playAudio",
                        "sessionId": sessionId,
                        "audio": {
                            "data": base64.b64encode(audioBytes).decode(),
                            "format": "mp3",
                        },
                    }))
                    audioSent = True
                elif audioContent:
                    logger.info(f"TTS audio generated for session {sessionId} (HTTP mode - no WebSocket for playback)")
            except Exception as ttsErr:
                logger.warning(f"TTS failed for session {sessionId}: {ttsErr}")

            if not audioSent:
                # Fallback to chat only: TTS failed, returned no audio, or
                # there was no WebSocket to play it on
                responseType = TeamsbotResponseType.CHAT

            # 4b: Store bot response
            botResponseData = TeamsbotBotResponse(
                sessionId=sessionId,
                responseText=speechResult.responseText,
                responseType=responseType,
                detectedIntent=speechResult.detectedIntent,
                reasoning=speechResult.reasoning,
                triggeredByTranscriptId=triggerTranscript.get("id"),
                modelName=response.modelName,
                processingTime=response.processingTime,
                priceCHF=response.priceCHF,
                timestamp=getIsoTimestamp(),
            ).model_dump()
            createdResponse = interface.createBotResponse(botResponseData)

            # 4c: Emit SSE event
            await _emitSessionEvent(sessionId, "botResponse", {
                "id": createdResponse.get("id"),
                "responseText": speechResult.responseText,
                "responseType": responseType.value,
                "detectedIntent": speechResult.detectedIntent,
                "reasoning": speechResult.reasoning,
                "modelName": response.modelName,
                "processingTime": response.processingTime,
                "priceCHF": response.priceCHF,
            })

            # Update session response count (a stored None counts as 0)
            session = interface.getSession(sessionId)
            if session:
                count = (session.get("botResponseCount") or 0) + 1
                interface.updateSession(sessionId, {"botResponseCount": count})

            logger.info(f"Bot responded in session {sessionId}: intent={speechResult.detectedIntent}")
    except Exception as e:
        logger.error(f"SPEECH_TEAMS analysis failed for session {sessionId}: {type(e).__name__}: {e}", exc_info=True)
        await _emitSessionEvent(sessionId, "error", {"message": f"AI analysis failed: {type(e).__name__}: {str(e)}"})
# =========================================================================
# Meeting Summary
# =========================================================================
async def _generateMeetingSummary(self, sessionId: str):
    """Create an AI summary of a finished meeting and store it on the session.

    Does nothing when the session has fewer than five transcript segments.
    The summary is written to the session's ``summary`` field only when the
    AI call reports an error count of zero. Every exception is logged and
    swallowed.
    """
    try:
        from . import interfaceFeatureTeamsbot as interfaceDb

        db = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId)
        segments = db.getTranscripts(sessionId)

        # Not enough content for a summary
        if not segments or len(segments) < 5:
            return

        # Build the full transcript, one "[speaker]: text" line per segment
        transcriptLines = []
        for seg in segments:
            transcriptLines.append(f"[{seg.get('speaker', 'Unknown')}]: {seg.get('text', '')}")
        fullTranscript = "\n".join(transcriptLines)

        from modules.services.serviceAi.mainServiceAi import AiService

        billingContext = _ServiceContext(self.currentUser, self.mandateId, self.instanceId)
        aiService = AiService(serviceCenter=billingContext)
        await aiService.ensureAiObjectsInitialized()

        summaryOptions = AiCallOptions(
            operationType=OperationTypeEnum.DATA_ANALYSE,
            priority=PriorityEnum.BALANCED,
        )
        summaryRequest = AiCallRequest(
            prompt="Erstelle eine kurze Zusammenfassung dieses Meeting-Transkripts. Nenne die wichtigsten Punkte, Entscheidungen und offene Aktionspunkte.",
            context=fullTranscript,
            options=summaryOptions,
        )
        aiResult = await aiService.callAi(summaryRequest)

        succeeded = bool(aiResult) and aiResult.errorCount == 0
        if succeeded:
            db.updateSession(sessionId, {"summary": aiResult.content})
            logger.info(f"Meeting summary generated for session {sessionId}")
    except Exception as exc:
        logger.error(f"Failed to generate meeting summary for session {sessionId}: {exc}")