# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Teamsbot Service — WebSocket handler & audio chunk processing.

Extracted from service.py. All functions accept `service` (a TeamsbotService
instance) as the first parameter so the class can delegate to them.
"""

import logging
import json
import asyncio
import time
import base64
from typing import Optional, Dict, Any

from fastapi import WebSocket, WebSocketDisconnect

from modules.shared.timeUtils import getUtcTimestamp

logger = logging.getLogger(__name__)

# Session contexts longer than this many characters are summarized on connect.
_SESSION_CONTEXT_SUMMARIZE_CHARS = 500

# How long the operator has to answer an MFA challenge before a timeout is
# reported to the bot.
_MFA_RESPONSE_TIMEOUT_SEC = 120.0

# Audio buffering thresholds used by _processAudioChunk.
_MIN_AUDIO_BYTES = 500          # decoded chunks smaller than this are ignored
_SILENCE_RMS_THRESHOLD = 0.0003  # chunks reporting an rms below this are not buffered
_MIN_CHUNK_SEC = 1.0            # buffer is sent to STT once it holds this much audio
_STALE_TIMEOUT_SEC = 3.0        # ...or once it is this old
_STALE_MIN_DURATION_SEC = 0.3   # ...provided it holds more than this much audio

# Strong references to fire-and-forget tasks. The event loop only keeps weak
# references to tasks, so a task nobody else references may be garbage
# collected before it finishes.
_backgroundTasks: set = set()


def _spawnBackgroundTask(coro) -> "asyncio.Task":
    """Schedule `coro` as a task and keep it referenced until it completes."""
    task = asyncio.create_task(coro)
    _backgroundTasks.add(task)
    task.add_done_callback(_backgroundTasks.discard)
    return task


async def handleBotWebSocket(service, websocket: WebSocket, sessionId: str):
    """Main WebSocket handler for Browser Bot communication.

    Prepares the service for the session (session context, bot account email,
    persistent director prompts), registers it in `_activeServices`, then reads
    JSON text frames from `websocket` and dispatches on their `type` field:
    transcript, chatMessage, status, audioChunk, voiceGreeting,
    requestGreeting, ping, ttsPlaybackAck, mfaChallenge, chatSendFailed and
    mfaResolved. Unknown types are logged and ignored.

    Frames that are not text, not valid JSON, or not a JSON object are skipped
    so that one bad frame cannot end the session. Any other exception raised
    while handling a message ends the loop. On exit the service is
    unregistered, its connection attributes are cleared, a pending MFA wait
    started by this handler is cancelled, and a `botConnectionState` event with
    `connected=False` is emitted.

    Args:
        service: The TeamsbotService instance this handler acts on behalf of.
        websocket: The already-accepted connection to the browser bot.
        sessionId: Identifier of the Teamsbot session served by this socket.
    """
    from . import interfaceFeatureTeamsbot as interfaceDb
    from modules.interfaces.interfaceVoiceObjects import getVoiceInterface
    from .service import _activeServices, _emitSessionEvent
    from .serviceConversation import _processTranscript, _warmEphemeralPhrasePool

    interface = interfaceDb.getInterface(service.currentUser, service.mandateId, service.instanceId)
    voiceInterface = getVoiceInterface(service.currentUser, service.mandateId)

    session = interface.getSession(sessionId)
    if session:
        rawContext = session.get("sessionContext")
        if rawContext and len(rawContext) > _SESSION_CONTEXT_SUMMARIZE_CHARS:
            logger.info(f"Session {sessionId}: Summarizing long session context ({len(rawContext)} chars)...")
            service._sessionContext = await service._summarizeSessionContext(sessionId, rawContext)
        elif rawContext:
            service._sessionContext = rawContext
        # getattr: the attribute is only assigned above when a context exists.
        sessionContext = getattr(service, "_sessionContext", None)
        if sessionContext:
            logger.info(f"Session {sessionId}: Session context ready ({len(sessionContext)} chars)")

    try:
        systemBot = interface.getActiveSystemBot(service.mandateId)
        service._botAccountEmail = systemBot.get("email") if systemBot else None
        if service._botAccountEmail:
            logger.info(f"Session {sessionId}: Bot account email resolved: {service._botAccountEmail}")
    except Exception:
        # Best effort: the handler works without a resolved bot email.
        service._botAccountEmail = None

    service._activeSessionId = sessionId
    service._websocket = websocket
    service._voiceInterface = voiceInterface
    _activeServices[sessionId] = service

    try:
        await _emitSessionEvent(sessionId, "botConnectionState", {
            "connected": True,
            "timestamp": getUtcTimestamp(),
        })
    except Exception as emitErr:
        # Best effort: a failed UI notification must not prevent the session.
        logger.debug(f"Session {sessionId}: Could not emit botConnectionState: {emitErr}")

    try:
        service._activePersistentPrompts = interface.getActivePersistentPrompts(sessionId) or []
        if service._activePersistentPrompts:
            logger.info(
                f"Session {sessionId}: Loaded {len(service._activePersistentPrompts)} active persistent director prompt(s)"
            )
    except Exception as restoreErr:
        logger.warning(f"Session {sessionId}: Could not restore persistent director prompts: {restoreErr}")
        service._activePersistentPrompts = []

    _spawnBackgroundTask(_warmEphemeralPhrasePool(service, sessionId))

    logger.info(f"[WS] Handler started for session {sessionId}")
    msgCount = 0
    # MFA wait task started by THIS handler, so the cleanup below never touches
    # a task that belongs to another connection of the same session.
    mfaWaitTask = None
    try:
        while True:
            data = await websocket.receive()
            msgCount += 1

            if data.get("type") == "websocket.disconnect":
                # Calling receive() again after this frame raises, so stop here.
                logger.debug(
                    f"[WS] session={sessionId} msg #{msgCount}: client disconnected (code={data.get('code')})"
                )
                break

            # Some ASGI servers send both keys and set the unused one to None.
            rawText = data.get("text")
            if rawText is None:
                logger.debug(f"[WS] session={sessionId} msg #{msgCount}: non-text data (keys: {list(data.keys())})")
                continue

            try:
                message = json.loads(rawText)
            except (TypeError, ValueError) as parseErr:
                logger.warning(f"[WS] session={sessionId} msg #{msgCount}: ignoring malformed JSON: {parseErr}")
                continue
            if not isinstance(message, dict):
                logger.warning(
                    f"[WS] session={sessionId} msg #{msgCount}: ignoring non-object JSON ({type(message).__name__})"
                )
                continue

            msgType = message.get("type")

            # audioChunk and ping are too frequent to log at info level.
            if msgType not in ("audioChunk", "ping"):
                logger.info(f"[WS] session={sessionId} msg #{msgCount}: type={msgType}")

            if msgType == "transcript":
                transcript = message.get("transcript") or {}
                source = transcript.get("source", "caption")
                speaker = transcript.get("speaker", "Unknown")
                transcriptText = transcript.get("text", "")
                textPreview = (transcriptText or "")[:60]
                logger.info(f"[WS] Transcript (source={source}, speaker={speaker}): {textPreview}...")

                await _processTranscript(
                    service,
                    sessionId=sessionId,
                    speaker=speaker,
                    text=transcriptText,
                    isFinal=transcript.get("isFinal", True),
                    interface=interface,
                    voiceInterface=voiceInterface,
                    websocket=websocket,
                    source=source,
                )

            elif msgType == "chatMessage":
                chat = message.get("chat") or {}
                isHistory = chat.get("isHistory", False)
                source = "chatHistory" if isHistory else "chat"
                chatText = chat.get("text") or ""
                logger.info(
                    f"[WS] Chat{'[HISTORY]' if isHistory else ''}: "
                    f"speaker={chat.get('speaker')}, text={chatText[:60]}..."
                )

                await _processTranscript(
                    service,
                    sessionId=sessionId,
                    speaker=chat.get("speaker", "Unknown"),
                    text=chatText,
                    isFinal=True,
                    interface=interface,
                    voiceInterface=voiceInterface,
                    websocket=websocket,
                    source=source,
                )

            elif msgType == "status":
                status = message.get("status")
                errorMessage = message.get("message")
                logger.info(f"[WS] Status: status={status}, message={errorMessage}")
                await _handleBotStatus(service, sessionId, status, errorMessage, interface)

            elif msgType == "audioChunk":
                audioData = message.get("audio") or {}
                audioBase64 = audioData.get("data", "")
                sampleRate = audioData.get("sampleRate", 16000)
                captureDiagnostics = audioData.get("captureDiagnostics") or {}
                if audioBase64:
                    await _processAudioChunk(
                        service,
                        sessionId=sessionId,
                        audioBase64=audioBase64,
                        sampleRate=sampleRate,
                        captureDiagnostics=captureDiagnostics,
                        interface=interface,
                        voiceInterface=voiceInterface,
                        websocket=websocket,
                    )

            elif msgType == "voiceGreeting":
                greetingText = message.get("text") or ""
                greetingLang = message.get("language", service.config.language)
                logger.info(
                    f"[WS] Voice greeting (legacy): text={greetingText[:60]}..., language={greetingLang}"
                )
                if greetingText and voiceInterface:
                    await service._dispatchGreetingToMeeting(
                        sessionId=sessionId,
                        greetingText=greetingText,
                        greetingLang=greetingLang,
                        sendToChat=False,
                        interface=interface,
                        voiceInterface=voiceInterface,
                        websocket=websocket,
                    )

            elif msgType == "requestGreeting":
                requestedLang = (
                    message.get("language") or service.config.language or ""
                ).strip() or "en-US"
                botNameHint = (
                    message.get("botName") or service.config.botName or ""
                ).strip() or service.config.botName
                logger.info(
                    f"[WS] Greeting request from bot: language={requestedLang}, name={botNameHint}"
                )
                if voiceInterface:
                    try:
                        greetingText = await service._generateGreetingText(
                            requestedLang
                        )
                    except Exception as genErr:
                        logger.warning(
                            f"Greeting generation failed for session {sessionId}: {genErr}"
                        )
                        greetingText = ""
                    if greetingText:
                        await service._dispatchGreetingToMeeting(
                            sessionId=sessionId,
                            greetingText=greetingText,
                            greetingLang=requestedLang,
                            sendToChat=True,
                            interface=interface,
                            voiceInterface=voiceInterface,
                            websocket=websocket,
                        )
                    else:
                        logger.warning(
                            f"Session {sessionId}: Skipping greeting — AI generation produced no text"
                        )

            elif msgType == "ping":
                await websocket.send_text(json.dumps({"type": "pong"}))

            elif msgType == "ttsPlaybackAck":
                playback = message.get("playback") or {}
                status = playback.get("status", "unknown")
                ackMessage = playback.get("message") or "Bot playback status update"
                logger.info(
                    f"[WS] TTS playback ack: status={status}, format={playback.get('format')}, "
                    f"bytesBase64={playback.get('bytesBase64')}"
                )
                await _emitSessionEvent(sessionId, "ttsDeliveryStatus", {
                    "status": f"playback_{status}",
                    "hasWebSocket": True,
                    "message": ackMessage,
                    "timestamp": playback.get("timestamp") or getUtcTimestamp(),
                    "format": playback.get("format"),
                    "bytesBase64": playback.get("bytesBase64"),
                })

            elif msgType == "mfaChallenge":
                mfaData = message.get("mfa") or {}
                mfaType = mfaData.get("type", "unknown")
                displayNumber = mfaData.get("displayNumber")
                prompt = mfaData.get("prompt") or ""
                logger.info(f"[WS] MFA challenge: type={mfaType}, number={displayNumber}, prompt={prompt[:60]}")

                await _emitSessionEvent(sessionId, "mfaChallenge", {
                    "mfaType": mfaType,
                    "displayNumber": displayNumber,
                    "prompt": prompt,
                    "timestamp": getUtcTimestamp(),
                })

                from .routeFeatureTeamsbot import mfaCodeQueues, mfaWaitTasks

                # A repeated challenge supersedes the previous one; stop the
                # old waiter so it cannot forward a stale answer.
                previousTask = mfaWaitTasks.pop(sessionId, None)
                if previousTask and not previousTask.done():
                    previousTask.cancel()

                mfaQueue = asyncio.Queue()
                mfaCodeQueues[sessionId] = mfaQueue
                mfaWaitTask = asyncio.create_task(
                    _waitAndForwardMfa(sessionId, mfaQueue, websocket)
                )
                mfaWaitTasks[sessionId] = mfaWaitTask

            elif msgType == "chatSendFailed":
                errorData = message.get("error") or {}
                reason = errorData.get("reason", "unknown")
                failedText = errorData.get("text") or ""
                logger.warning(
                    f"[WS] Chat send failed for session {sessionId}: "
                    f"reason={reason}, text={failedText[:60]}"
                )
                await _emitSessionEvent(sessionId, "chatSendFailed", {
                    "reason": reason,
                    "message": errorData.get("message", "Chat message could not be sent"),
                    "text": failedText,
                    "timestamp": getUtcTimestamp(),
                })

            elif msgType == "mfaResolved":
                success = message.get("success", False)
                logger.info(f"[WS] MFA resolved: success={success}")
                from .routeFeatureTeamsbot import mfaCodeQueues, mfaWaitTasks
                task = mfaWaitTasks.pop(sessionId, None)
                if task and not task.done():
                    task.cancel()
                mfaCodeQueues.pop(sessionId, None)
                await _emitSessionEvent(sessionId, "mfaResolved", {
                    "success": success,
                    "timestamp": getUtcTimestamp(),
                })

    except WebSocketDisconnect:
        # Normal end of the connection; its message does not contain the word
        # "disconnect", so the substring check below would report it as an error.
        logger.debug(f"[WS] session={sessionId}: WebSocket disconnected")
    except Exception as e:
        if "disconnect" not in str(e).lower():
            logger.error(f"[WS] Error for session {sessionId}: {type(e).__name__}: {e}")
    finally:
        # Only unregister if a newer handler has not replaced this service.
        if _activeServices.get(sessionId) is service:
            _activeServices.pop(sessionId, None)
        service._websocket = None
        service._voiceInterface = None
        service._activeSessionId = None

        if mfaWaitTask is not None:
            # The socket is gone, so an MFA answer could no longer be forwarded.
            if not mfaWaitTask.done():
                mfaWaitTask.cancel()
            from .routeFeatureTeamsbot import mfaCodeQueues, mfaWaitTasks
            # A task cancelled before its first step never runs its own
            # cleanup, so drop the registry entries here if they are still ours.
            if mfaWaitTasks.get(sessionId) is mfaWaitTask:
                mfaWaitTasks.pop(sessionId, None)
                mfaCodeQueues.pop(sessionId, None)

        try:
            await _emitSessionEvent(sessionId, "botConnectionState", {
                "connected": False,
                "timestamp": getUtcTimestamp(),
            })
        except Exception as emitErr:
            logger.debug(f"Session {sessionId}: Could not emit botConnectionState: {emitErr}")
        logger.info(f"[WS] Handler ended for session {sessionId} after {msgCount} messages")


async def _waitAndForwardMfa(sid: str, queue: asyncio.Queue, ws: WebSocket):
    """Wait for an MFA code from the operator and forward it to the bot.

    Waits up to `_MFA_RESPONSE_TIMEOUT_SEC` for an item on `queue` and sends it
    to the bot as an `mfaResponse` message. On timeout, sends an `mfaResponse`
    with action `timeout` and emits an `mfaChallenge` event of type `timeout`.
    Cancellation is logged and swallowed. Send failures are logged rather than
    raised, because this coroutine runs as a task nobody awaits.

    Args:
        sid: Session identifier used as key in the MFA registries.
        queue: Queue on which the operator's MFA response is delivered.
        ws: WebSocket connection to the browser bot.
    """
    from .service import _emitSessionEvent
    from .routeFeatureTeamsbot import mfaCodeQueues, mfaWaitTasks

    try:
        mfaResponse = await asyncio.wait_for(queue.get(), timeout=_MFA_RESPONSE_TIMEOUT_SEC)
        logger.info(f"[WS] MFA response received for session {sid}: action={mfaResponse.get('action')}")
        await ws.send_text(json.dumps({
            "type": "mfaResponse",
            "sessionId": sid,
            "mfa": mfaResponse,
        }))
    except asyncio.TimeoutError:
        logger.warning(f"[WS] MFA response timeout for session {sid}")
        try:
            await ws.send_text(json.dumps({
                "type": "mfaResponse",
                "sessionId": sid,
                "mfa": {"action": "timeout"},
            }))
        except Exception as sendErr:
            # The bot may already be gone; the operator is still informed below.
            logger.warning(f"[WS] Could not send MFA timeout to bot for session {sid}: {sendErr}")
        await _emitSessionEvent(sid, "mfaChallenge", {
            "mfaType": "timeout",
            "prompt": "MFA-Zeitlimit ueberschritten. Bitte erneut versuchen.",
        })
    except asyncio.CancelledError:
        logger.info(f"[WS] MFA wait cancelled for session {sid} (resolved via page)")
    except Exception as forwardErr:
        logger.warning(
            f"[WS] MFA forwarding failed for session {sid}: {type(forwardErr).__name__}: {forwardErr}"
        )
    finally:
        # Queue and task are registered as a pair. Only remove them while the
        # registered queue is still ours, so a superseded waiter cannot delete
        # the entries of the challenge that replaced it.
        if mfaCodeQueues.get(sid) is queue:
            mfaCodeQueues.pop(sid, None)
            mfaWaitTasks.pop(sid, None)


async def _handleBotStatus(
    service,
    sessionId: str,
    status: str,
    errorMessage: Optional[str],
    interface,
):
    """Handle status updates from the browser bot.

    Maps the bot-reported `status` to a session status, persists it through
    `interface.updateSession` together with `errorMessage` and a start/end
    timestamp, and emits a `statusChange` event. When the session has ended or
    errored, the pending audio buffer is reset; when it has ended, meeting
    summary generation is started in the background.

    Args:
        service: The TeamsbotService instance owning the audio buffer.
        sessionId: Identifier of the session being updated.
        status: Status string reported by the bot.
        errorMessage: Optional error text reported with the status.
        interface: Database interface providing `updateSession`.
    """
    from .service import _emitSessionEvent
    from .datamodelTeamsbot import TeamsbotSessionStatus

    logger.info(f"Bot status update for session {sessionId}: {status}")

    statusMap = {
        "connecting": TeamsbotSessionStatus.JOINING.value,
        "launching": TeamsbotSessionStatus.JOINING.value,
        "navigating": TeamsbotSessionStatus.JOINING.value,
        "in_lobby": TeamsbotSessionStatus.JOINING.value,
        "joined": TeamsbotSessionStatus.ACTIVE.value,
        "in_meeting": TeamsbotSessionStatus.ACTIVE.value,
        "left": TeamsbotSessionStatus.ENDED.value,
        "error": TeamsbotSessionStatus.ERROR.value,
    }
    # NOTE(review): statuses missing from the map are stored as ACTIVE and
    # therefore also refresh startedAt below — confirm this is intended.
    dbStatus = statusMap.get(status, TeamsbotSessionStatus.ACTIVE.value)
    terminalStatuses = (
        TeamsbotSessionStatus.ENDED.value,
        TeamsbotSessionStatus.ERROR.value,
    )
    isTerminal = dbStatus in terminalStatuses

    updates = {"status": dbStatus}
    if errorMessage:
        updates["errorMessage"] = errorMessage
    if dbStatus == TeamsbotSessionStatus.ACTIVE.value:
        updates["startedAt"] = getUtcTimestamp()
    elif isTerminal:
        updates["endedAt"] = getUtcTimestamp()

    interface.updateSession(sessionId, updates)

    await _emitSessionEvent(sessionId, "statusChange", {"status": status, "errorMessage": errorMessage})

    if isTerminal:
        if service._audioBuffer:
            # NOTE(review): despite the wording, the buffered audio is dropped
            # here and not sent to speech-to-text.
            logger.info(f"[AudioChunk] Flushing remaining buffer on session end ({len(service._audioBuffer)} bytes)")
        service._audioBuffer = b""
        service._audioBufferStartTime = 0.0
        service._audioBufferLastChunkTime = 0.0

    if dbStatus == TeamsbotSessionStatus.ENDED.value:
        _spawnBackgroundTask(service._generateMeetingSummary(sessionId))


async def _processAudioChunk(
    service,
    sessionId: str,
    audioBase64: str,
    sampleRate: int,
    captureDiagnostics: Optional[Dict[str, Any]],
    interface,
    voiceInterface,
    websocket: WebSocket,
):
    """Process an audio chunk from WebRTC capture.

    Decodes the base64 chunk and appends it to the service's audio buffer
    unless the capture diagnostics mark it as silent. Once the buffer holds at
    least `_MIN_CHUNK_SEC` of audio, or is at least `_STALE_TIMEOUT_SEC` old
    and holds more than `_STALE_MIN_DURATION_SEC`, it is sent to
    speech-to-text and any recognized text is passed to `_processTranscript`
    with source `audioCapture`.

    All errors are logged and swallowed so that a failing chunk does not end
    the WebSocket session.

    Args:
        service: The TeamsbotService instance owning the audio buffer.
        sessionId: Identifier of the session the audio belongs to.
        audioBase64: Base64-encoded audio; sent to STT as mono linear16.
        sampleRate: Sample rate in Hz; falsy or non-positive values fall back
            to 16000.
        captureDiagnostics: Optional capture metadata; `rms` drives the
            silence check, the other fields are only logged.
        interface: Database interface, passed through to `_processTranscript`.
        voiceInterface: Voice interface providing `speechToText`.
        websocket: Bot connection, passed through to `_processTranscript`.
    """
    from .serviceConversation import _processTranscript

    try:
        audioBytes = base64.b64decode(audioBase64)
        if len(audioBytes) < _MIN_AUDIO_BYTES:
            return

        if captureDiagnostics:
            trackId = captureDiagnostics.get("trackId")
            readyState = captureDiagnostics.get("readyState")
            rms = captureDiagnostics.get("rms")
            nativeSampleRate = captureDiagnostics.get("nativeSampleRate")
            logger.debug(
                f"[AudioChunk] diagnostics: track={trackId}, readyState={readyState}, "
                f"rms={rms}, nativeRate={nativeSampleRate}, bytes={len(audioBytes)}"
            )

        isSilent = False
        if captureDiagnostics and captureDiagnostics.get("rms") is not None:
            try:
                isSilent = float(captureDiagnostics.get("rms")) < _SILENCE_RMS_THRESHOLD
            except (TypeError, ValueError):
                # An unparseable rms value is treated as "not silent".
                pass

        if not voiceInterface:
            logger.warning(f"[AudioChunk] No voice interface available for session {sessionId}")
            return

        now = time.time()
        effectiveRate = sampleRate if sampleRate and sampleRate > 0 else 16000

        if not isSilent:
            if not service._audioBuffer:
                service._audioBufferStartTime = now
            service._audioBuffer += audioBytes
            service._audioBufferLastChunkTime = now
            service._audioBufferSampleRate = effectiveRate

        # Silent chunks are not buffered but still reach this check, so a
        # stale buffer is flushed during a pause in speech.
        # Two bytes per sample: the buffer is sent as mono linear16.
        bufferDuration = len(service._audioBuffer) / (effectiveRate * 2) if service._audioBuffer else 0.0
        bufferAge = (now - service._audioBufferStartTime) if service._audioBuffer else 0.0

        shouldFlush = bool(service._audioBuffer) and (
            bufferDuration >= _MIN_CHUNK_SEC
            or (bufferAge >= _STALE_TIMEOUT_SEC and bufferDuration > _STALE_MIN_DURATION_SEC)
        )
        if not shouldFlush:
            return

        # Take the buffer and reset it before awaiting STT, so the same audio
        # is never sent twice.
        flushBytes = service._audioBuffer
        flushRate = service._audioBufferSampleRate
        service._audioBuffer = b""
        service._audioBufferStartTime = 0.0
        service._audioBufferLastChunkTime = 0.0

        flushDuration = len(flushBytes) / (flushRate * 2)
        logger.info(f"[AudioChunk] Flushing buffer: {len(flushBytes)} bytes, {flushDuration:.1f}s, {flushRate}Hz")

        # Known speaker names and the bot name are passed to STT as phrase hints.
        phraseHints = list(service._knownSpeakers)
        if service.config.botName:
            phraseHints.append(service.config.botName)

        sttResult = await voiceInterface.speechToText(
            audioContent=flushBytes,
            language=service.config.language or "de-DE",
            sampleRate=flushRate,
            channels=1,
            skipFallbacks=True,
            phraseHints=phraseHints if phraseHints else None,
            audioFormat="linear16",
        )

        if not (sttResult and sttResult.get("success") and sttResult.get("text")):
            return
        text = sttResult["text"].strip()
        if not text:
            return

        # Read both fields with the same defaults for the log and the call, so
        # a missing key cannot drop an already recognized transcript.
        resolvedSpeaker = service._resolveSpeakerForAudioCapture() or {}
        speakerName = resolvedSpeaker.get("speaker", "Meeting Audio")
        fromCaption = resolvedSpeaker.get("speakerResolvedFromHint", False)
        logger.info(
            f"[AudioChunk] STT result: speaker={speakerName} "
            f"(fromCaption={fromCaption}), text={text[:80]}..."
        )
        await _processTranscript(
            service,
            sessionId=sessionId,
            speaker=speakerName,
            text=text,
            isFinal=True,
            interface=interface,
            voiceInterface=voiceInterface,
            websocket=websocket,
            source="audioCapture",
            speakerResolvedFromHint=fromCaption,
        )

    except Exception as e:
        logger.error(f"[AudioChunk] STT error for session {sessionId}: {type(e).__name__}: {e}")


async def _cancelInFlightSpeech(
    service,
    sessionId: str,
    websocket: Optional[WebSocket],
    reason: str,
) -> None:
    """Hard stop everything the bot is currently doing in the meeting.

    Increments the answer generation counter, drops a pending debounced name
    trigger, cancels the running escalation and quick-ack tasks, sends a
    `stopAudio` message to the bot (when a websocket is given) and emits a
    `speechCancelled` event. Failures while notifying the bot or the UI are
    logged and do not raise.

    Args:
        service: The TeamsbotService instance whose activity is cancelled.
        sessionId: Identifier of the affected session.
        websocket: Bot connection, or None to skip the `stopAudio` message.
        reason: Reason string included in the log, message and event.
    """
    from .service import _emitSessionEvent

    service._answerGenerationCounter += 1
    gen = service._answerGenerationCounter
    logger.info(
        f"Session {sessionId}: Cancelling in-flight speech "
        f"(reason={reason}, gen={gen})"
    )

    if service._pendingNameTrigger:
        logger.info(
            f"Session {sessionId}: Dropping pending debounced name "
            f"trigger (was queued before stop)"
        )
        service._pendingNameTrigger = None

    for taskAttr in ("_currentEscalationTask", "_currentQuickAckTask"):
        task = getattr(service, taskAttr, None)
        if task is not None and not task.done():
            logger.info(
                f"Session {sessionId}: Cancelling background task "
                f"{taskAttr}"
            )
            task.cancel()

    if websocket is not None:
        try:
            await websocket.send_text(json.dumps({
                "type": "stopAudio",
                "sessionId": sessionId,
                "reason": reason,
            }))
        except Exception as stopErr:
            logger.warning(
                f"Session {sessionId}: Failed to send stopAudio to "
                f"browser bot: {stopErr}"
            )

    try:
        await _emitSessionEvent(sessionId, "speechCancelled", {
            "reason": reason,
            "generation": gen,
            "timestamp": getUtcTimestamp(),
        })
    except Exception as emitErr:
        # Best effort: the cancellation itself has already taken effect.
        logger.debug(f"Session {sessionId}: Could not emit speechCancelled: {emitErr}")