# Copyright (c) 2025 Patrick Motsch # All rights reserved. """ Teamsbot Service - Pipeline Orchestrator. Manages the audio processing pipeline: STT -> Context Buffer -> SPEECH_TEAMS -> TTS -> Bridge. """ import logging import json import asyncio import time import base64 from typing import Optional, Dict, Any, List from fastapi import WebSocket from modules.datamodels.datamodelUam import User from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum, PriorityEnum from modules.shared.timeUtils import getUtcTimestamp, getIsoTimestamp from .datamodelTeamsbot import ( TeamsbotSessionStatus, TeamsbotTranscript, TeamsbotBotResponse, TeamsbotResponseType, TeamsbotConfig, TeamsbotResponseMode, TeamsbotResponseChannel, SpeechTeamsResponse, TeamsbotCommand, ) from .browserBotConnector import BrowserBotConnector logger = logging.getLogger(__name__) # ========================================================================= # Minimal Service Context (for AI billing in bridge callbacks) # ========================================================================= class _ServiceContext: """Minimal context providing user/mandate info for AiService billing. Used by bridge callbacks where a full Services instance is not available.""" def __init__(self, user, mandateId, featureInstanceId=None): self.user = user self.mandateId = mandateId self.featureInstanceId = featureInstanceId self.featureCode = "teamsbot" # ========================================================================= # Session Event Queues (for SSE streaming to frontend) # ========================================================================= _sessionEvents: Dict[str, asyncio.Queue] = {} async def _emitSessionEvent(sessionId: str, eventType: str, data: Any): """Emit an event to the session's SSE stream. Creates the queue on-demand so events are never silently dropped.""" if sessionId not in _sessionEvents: _sessionEvents[sessionId] = asyncio.Queue() await _sessionEvents[sessionId].put({"type": eventType, "data": data, "timestamp": getIsoTimestamp()}) class TeamsbotService: """ Pipeline Orchestrator for Teams Bot sessions. Coordinates VoiceObjects (STT/TTS), AiService (SPEECH_TEAMS), and Bridge communication. """ def __init__(self, currentUser: User, mandateId: str, instanceId: str, config: TeamsbotConfig): self.currentUser = currentUser self.mandateId = mandateId self.instanceId = instanceId self.config = config self.browserBotConnector = BrowserBotConnector(config._getEffectiveBrowserBotUrl()) # State self._lastAiCallTime: float = 0.0 self._aiAnalysisInProgress: bool = False self._contextBuffer: List[Dict[str, Any]] = [] self._sessionContext: Optional[str] = None # User-provided background context self._contextSummary: Optional[str] = None # AI-generated summary of long context # Differential transcript tracking self._lastTranscriptSpeaker: Optional[str] = None self._lastTranscriptText: Optional[str] = None self._lastTranscriptId: Optional[str] = None self._lastSttTime: float = 0.0 self._lastBotResponseText: Optional[str] = None self._lastBotResponseTs: float = 0.0 # Speaker attribution: simple last-caption-speaker model self._lastCaptionSpeaker: Optional[str] = None self._unattributedTranscriptIds: List[str] = [] self._knownSpeakers: set = set() # Debounced name trigger: wait for speaker to finish before AI analysis self._pendingNameTrigger: Optional[Dict[str, Any]] = None self._followUpWindowEnd: float = 0.0 # ========================================================================= # Session Lifecycle # ========================================================================= async def joinMeeting( self, sessionId: str, meetingLink: str, connectionId: Optional[str] = None, gatewayBaseUrl: str = "", botAccountEmail: Optional[str] = None, botAccountPassword: Optional[str] = None, ): """Send join command to the Browser Bot service. The browser bot will: 1. Launch browser (headful if credentials provided, headless otherwise) 2. Navigate to Teams web app 3. Authenticate if credentials provided, otherwise join as anonymous guest 4. Enable captions/audio capture and start scraping 5. Connect back via WebSocket to send transcripts """ from . import interfaceFeatureTeamsbot as interfaceDb interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId) # Initialize SSE event queue _sessionEvents[sessionId] = asyncio.Queue() try: # Update status to JOINING interface.updateSession(sessionId, {"status": TeamsbotSessionStatus.JOINING.value}) await _emitSessionEvent(sessionId, "statusChange", {"status": "joining"}) # Send join command to browser bot session = interface.getSession(sessionId) if not session: raise ValueError(f"Session {sessionId} not found") # Build the full WebSocket URL for the bot to connect back to this gateway instance # gatewayBaseUrl is passed from the route handler (derived from request.base_url) wsScheme = "wss" if gatewayBaseUrl.startswith("https") else "ws" gatewayHost = gatewayBaseUrl.replace("https://", "").replace("http://", "").rstrip("/") fullGatewayWsUrl = f"{wsScheme}://{gatewayHost}/api/teamsbot/{self.instanceId}/bot/ws/{sessionId}" hasAuth = bool(botAccountEmail and botAccountPassword) logger.info(f"Joining meeting for session {sessionId}: auth={hasAuth}, email={botAccountEmail or 'N/A'}, transferMode={self.config.transferMode}") result = await self.browserBotConnector.joinMeeting( sessionId=sessionId, meetingUrl=meetingLink, botName=session.get("botName", self.config.botName), instanceId=self.instanceId, gatewayWsUrl=fullGatewayWsUrl, language=self.config.language, botAccountEmail=botAccountEmail, botAccountPassword=botAccountPassword, transferMode=self.config.transferMode if hasattr(self.config, 'transferMode') else "auto", debugMode=self.config.debugMode if hasattr(self.config, 'debugMode') else False, ) if result.get("success"): interface.updateSession(sessionId, { "status": TeamsbotSessionStatus.JOINING.value, # Will become ACTIVE when bot connects via WS }) logger.info(f"Browser bot deployment started for session {sessionId}") else: errorMsg = result.get("error", "Unknown error joining meeting") interface.updateSession(sessionId, { "status": TeamsbotSessionStatus.ERROR.value, "errorMessage": errorMsg, }) await _emitSessionEvent(sessionId, "statusChange", {"status": "error", "errorMessage": errorMsg}) logger.error(f"Failed to deploy browser bot for session {sessionId}: {errorMsg}") except Exception as e: logger.error(f"Error joining meeting for session {sessionId}: {e}") interface.updateSession(sessionId, { "status": TeamsbotSessionStatus.ERROR.value, "errorMessage": str(e), }) await _emitSessionEvent(sessionId, "statusChange", {"status": "error", "errorMessage": str(e)}) async def leaveMeeting(self, sessionId: str): """Send leave command to the Browser Bot service.""" from . import interfaceFeatureTeamsbot as interfaceDb interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId) try: interface.updateSession(sessionId, {"status": TeamsbotSessionStatus.LEAVING.value}) await _emitSessionEvent(sessionId, "statusChange", {"status": "leaving"}) await self.browserBotConnector.leaveMeeting(sessionId) interface.updateSession(sessionId, { "status": TeamsbotSessionStatus.ENDED.value, "endedAt": getIsoTimestamp(), }) await _emitSessionEvent(sessionId, "statusChange", {"status": "ended"}) # Generate meeting summary in background asyncio.create_task(self._generateMeetingSummary(sessionId)) logger.info(f"Bot left meeting for session {sessionId}") except Exception as e: logger.error(f"Error leaving meeting for session {sessionId}: {e}") interface.updateSession(sessionId, { "status": TeamsbotSessionStatus.ERROR.value, "errorMessage": str(e), "endedAt": getIsoTimestamp(), }) # Cleanup event queue _sessionEvents.pop(sessionId, None) # ========================================================================= # Browser Bot WebSocket Communication # ========================================================================= async def handleBotWebSocket(self, websocket: WebSocket, sessionId: str): """ Main WebSocket handler for Browser Bot communication. Receives: - transcript: Caption text scraped from Teams - status: Bot state changes (joined, in_lobby, left, error) Sends: - playAudio: TTS audio for the bot to play in the meeting """ from . import interfaceFeatureTeamsbot as interfaceDb from modules.interfaces.interfaceVoiceObjects import getVoiceInterface interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId) voiceInterface = getVoiceInterface(self.currentUser, self.mandateId) # Load session context (user-provided background knowledge) # If the context is long (>500 chars), summarize it to reduce token usage session = interface.getSession(sessionId) if session: rawContext = session.get("sessionContext") if rawContext and len(rawContext) > 500: logger.info(f"Session {sessionId}: Summarizing long session context ({len(rawContext)} chars)...") self._sessionContext = await self._summarizeSessionContext(sessionId, rawContext) elif rawContext: self._sessionContext = rawContext if self._sessionContext: logger.info(f"Session {sessionId}: Session context ready ({len(self._sessionContext)} chars)") # Resolve system bot email for speaker detection (prevents bot from triggering AI on own speech) try: systemBot = interface.getActiveSystemBot(self.mandateId) self._botAccountEmail = systemBot.get("email") if systemBot else None if self._botAccountEmail: logger.info(f"Session {sessionId}: Bot account email resolved: {self._botAccountEmail}") except Exception: self._botAccountEmail = None logger.info(f"[WS] Handler started for session {sessionId}") try: msgCount = 0 while True: data = await websocket.receive() msgCount += 1 if "text" not in data: logger.debug(f"[WS] session={sessionId} msg #{msgCount}: non-text data (keys: {list(data.keys())})") continue message = json.loads(data["text"]) msgType = message.get("type") if msgType not in ("audioChunk", "ping"): logger.info(f"[WS] session={sessionId} msg #{msgCount}: type={msgType}") if msgType == "transcript": transcript = message.get("transcript", {}) source = transcript.get("source", "caption") speaker = transcript.get("speaker", "Unknown") textPreview = (transcript.get("text", "") or "")[:60] # Caption/speakerHint: name resolution only; transcript comes from STT logger.info(f"[WS] Transcript (source={source}, speaker={speaker}): {textPreview}...") await self._processTranscript( sessionId=sessionId, speaker=transcript.get("speaker", "Unknown"), text=transcript.get("text", ""), isFinal=transcript.get("isFinal", True), interface=interface, voiceInterface=voiceInterface, websocket=websocket, source=source, ) elif msgType == "chatMessage": chat = message.get("chat", {}) isHistory = chat.get("isHistory", False) source = "chatHistory" if isHistory else "chat" logger.info( f"[WS] Chat{'[HISTORY]' if isHistory else ''}: " f"speaker={chat.get('speaker')}, text={chat.get('text', '')[:60]}..." ) await self._processTranscript( sessionId=sessionId, speaker=chat.get("speaker", "Unknown"), text=chat.get("text", ""), isFinal=True, interface=interface, voiceInterface=voiceInterface, websocket=websocket, source=source, ) elif msgType == "status": status = message.get("status") errorMessage = message.get("message") logger.info(f"[WS] Status: status={status}, message={errorMessage}") await self._handleBotStatus(sessionId, status, errorMessage, interface) elif msgType == "audioChunk": audioData = message.get("audio", {}) audioBase64 = audioData.get("data", "") sampleRate = audioData.get("sampleRate", 16000) captureDiagnostics = audioData.get("captureDiagnostics") or {} if audioBase64: await self._processAudioChunk( sessionId=sessionId, audioBase64=audioBase64, sampleRate=sampleRate, captureDiagnostics=captureDiagnostics, interface=interface, voiceInterface=voiceInterface, websocket=websocket, ) elif msgType == "voiceGreeting": greetingText = message.get("text", "") greetingLang = message.get("language", self.config.language) logger.info(f"[WS] Voice greeting: text={greetingText[:60]}..., language={greetingLang}") if greetingText and voiceInterface: try: await _emitSessionEvent(sessionId, "ttsDeliveryStatus", { "status": "requested", "hasWebSocket": True, "message": "Voice greeting TTS requested", "timestamp": getIsoTimestamp(), }) ttsResult = await voiceInterface.textToSpeech( text=greetingText, languageCode=greetingLang, voiceName=self.config.voiceId ) if ttsResult and isinstance(ttsResult, dict): audioContent = ttsResult.get("audioContent") if audioContent: await websocket.send_text(json.dumps({ "type": "playAudio", "sessionId": sessionId, "audio": { "data": base64.b64encode(audioContent if isinstance(audioContent, bytes) else audioContent.encode()).decode(), "format": "mp3", } })) logger.info(f"Voice greeting TTS sent for session {sessionId}") await _emitSessionEvent(sessionId, "ttsDeliveryStatus", { "status": "dispatched", "hasWebSocket": True, "message": "Voice greeting TTS dispatched to bot", "timestamp": getIsoTimestamp(), }) greetingTranscriptData = TeamsbotTranscript( sessionId=sessionId, speaker=self.config.botName, text=greetingText, timestamp=getIsoTimestamp(), confidence=1.0, language=greetingLang, isFinal=True, source="botResponse", ).model_dump() greetingTranscript = interface.createTranscript(greetingTranscriptData) self._contextBuffer.append({ "speaker": self.config.botName, "text": greetingText, "timestamp": getUtcTimestamp(), "source": "botResponse", }) self._lastTranscriptSpeaker = self.config.botName self._lastTranscriptText = greetingText self._lastTranscriptId = greetingTranscript.get("id") await _emitSessionEvent(sessionId, "botResponse", { "id": greetingTranscript.get("id"), "responseText": greetingText, "responseType": TeamsbotResponseType.AUDIO.value, "detectedIntent": "greeting", "reasoning": "Automatic join greeting", "timestamp": getIsoTimestamp(), }) await _emitSessionEvent(sessionId, "transcript", { "id": greetingTranscript.get("id"), "speaker": self.config.botName, "text": greetingText, "confidence": 1.0, "timestamp": getIsoTimestamp(), "isContinuation": False, "source": "botResponse", "speakerResolvedFromHint": False, }) except Exception as ttsErr: logger.warning(f"Voice greeting TTS failed for session {sessionId}: {ttsErr}") elif msgType == "ping": await websocket.send_text(json.dumps({"type": "pong"})) elif msgType == "ttsPlaybackAck": playback = message.get("playback", {}) or {} status = playback.get("status", "unknown") ackMessage = playback.get("message") or "Bot playback status update" logger.info( f"[WS] TTS playback ack: status={status}, format={playback.get('format')}, " f"bytesBase64={playback.get('bytesBase64')}" ) await _emitSessionEvent(sessionId, "ttsDeliveryStatus", { "status": f"playback_{status}", "hasWebSocket": True, "message": ackMessage, "timestamp": playback.get("timestamp") or getIsoTimestamp(), "format": playback.get("format"), "bytesBase64": playback.get("bytesBase64"), }) elif msgType == "mfaChallenge": mfaData = message.get("mfa", {}) mfaType = mfaData.get("type", "unknown") displayNumber = mfaData.get("displayNumber") prompt = mfaData.get("prompt", "") logger.info(f"[WS] MFA challenge: type={mfaType}, number={displayNumber}, prompt={prompt[:60]}") await _emitSessionEvent(sessionId, "mfaChallenge", { "mfaType": mfaType, "displayNumber": displayNumber, "prompt": prompt, "timestamp": getIsoTimestamp(), }) from .routeFeatureTeamsbot import _mfaCodeQueues, _mfaWaitTasks mfaQueue = asyncio.Queue() _mfaCodeQueues[sessionId] = mfaQueue async def _waitAndForwardMfa(sid, queue, ws): try: mfaResponse = await asyncio.wait_for(queue.get(), timeout=120.0) logger.info(f"[WS] MFA response received for session {sid}: action={mfaResponse.get('action')}") await ws.send_text(json.dumps({ "type": "mfaResponse", "sessionId": sid, "mfa": mfaResponse, })) except asyncio.TimeoutError: logger.warning(f"[WS] MFA response timeout for session {sid}") await ws.send_text(json.dumps({ "type": "mfaResponse", "sessionId": sid, "mfa": {"action": "timeout"}, })) await _emitSessionEvent(sid, "mfaChallenge", { "mfaType": "timeout", "prompt": "MFA-Zeitlimit ueberschritten. Bitte erneut versuchen.", }) except asyncio.CancelledError: logger.info(f"[WS] MFA wait cancelled for session {sid} (resolved via page)") finally: _mfaCodeQueues.pop(sid, None) _mfaWaitTasks.pop(sid, None) _mfaWaitTasks[sessionId] = asyncio.create_task( _waitAndForwardMfa(sessionId, mfaQueue, websocket) ) elif msgType == "chatSendFailed": errorData = message.get("error", {}) reason = errorData.get("reason", "unknown") failedText = errorData.get("text", "") logger.warning( f"[WS] Chat send failed for session {sessionId}: " f"reason={reason}, text={failedText[:60]}" ) await _emitSessionEvent(sessionId, "chatSendFailed", { "reason": reason, "message": errorData.get("message", "Chat message could not be sent"), "text": failedText, "timestamp": getIsoTimestamp(), }) elif msgType == "mfaResolved": success = message.get("success", False) logger.info(f"[WS] MFA resolved: success={success}") from .routeFeatureTeamsbot import _mfaCodeQueues, _mfaWaitTasks task = _mfaWaitTasks.pop(sessionId, None) if task and not task.done(): task.cancel() _mfaCodeQueues.pop(sessionId, None) await _emitSessionEvent(sessionId, "mfaResolved", { "success": success, "timestamp": getIsoTimestamp(), }) except Exception as e: if "disconnect" not in str(e).lower(): logger.error(f"[WS] Error for session {sessionId}: {type(e).__name__}: {e}") logger.info(f"[WS] Handler ended for session {sessionId} after {msgCount} messages") async def _handleBotStatus( self, sessionId: str, status: str, errorMessage: Optional[str], interface, ): """Handle status updates from the browser bot.""" logger.info(f"Bot status update for session {sessionId}: {status}") statusMap = { "connecting": TeamsbotSessionStatus.JOINING.value, "launching": TeamsbotSessionStatus.JOINING.value, "navigating": TeamsbotSessionStatus.JOINING.value, "in_lobby": TeamsbotSessionStatus.JOINING.value, "joined": TeamsbotSessionStatus.ACTIVE.value, "in_meeting": TeamsbotSessionStatus.ACTIVE.value, "left": TeamsbotSessionStatus.ENDED.value, "error": TeamsbotSessionStatus.ERROR.value, } dbStatus = statusMap.get(status, TeamsbotSessionStatus.ACTIVE.value) updates = {"status": dbStatus} if errorMessage: updates["errorMessage"] = errorMessage if dbStatus == TeamsbotSessionStatus.ACTIVE.value: updates["startedAt"] = getIsoTimestamp() elif dbStatus in [TeamsbotSessionStatus.ENDED.value, TeamsbotSessionStatus.ERROR.value]: updates["endedAt"] = getIsoTimestamp() interface.updateSession(sessionId, updates) await _emitSessionEvent(sessionId, "statusChange", {"status": status, "errorMessage": errorMessage}) # Generate summary when session ends if dbStatus == TeamsbotSessionStatus.ENDED.value: asyncio.create_task(self._generateMeetingSummary(sessionId)) async def _processAudioChunk( self, sessionId: str, audioBase64: str, sampleRate: int, captureDiagnostics: Optional[Dict[str, Any]], interface, voiceInterface, websocket: WebSocket, ): """Process an audio chunk from WebRTC capture — run STT and feed into transcript pipeline.""" import base64 try: audioBytes = base64.b64decode(audioBase64) if len(audioBytes) < 1000: return if captureDiagnostics: trackId = captureDiagnostics.get("trackId") readyState = captureDiagnostics.get("readyState") rms = captureDiagnostics.get("rms") nativeSampleRate = captureDiagnostics.get("nativeSampleRate") logger.debug( f"[AudioChunk] diagnostics: track={trackId}, readyState={readyState}, " f"rms={rms}, nativeRate={nativeSampleRate}, bytes={len(audioBytes)}" ) # Use RMS from capture diagnostics to skip real silence. # Byte-variation heuristics produced false positives and dropped valid speech. if captureDiagnostics and captureDiagnostics.get("rms") is not None: try: rmsVal = float(captureDiagnostics.get("rms")) if rmsVal < 0.0003: logger.debug(f"[AudioChunk] Skipping silent audio ({len(audioBytes)} bytes, rms={rmsVal:.6f})") return except Exception: pass if not voiceInterface: logger.warning(f"[AudioChunk] No voice interface available for session {sessionId}") return # Treat sampleRate=0 as unknown (triggers auto-detection) effectiveSampleRate = sampleRate if sampleRate and sampleRate > 0 else None phraseHints = list(self._knownSpeakers) if self.config.botName: phraseHints.append(self.config.botName) sttResult = await voiceInterface.speechToText( audioContent=audioBytes, language=self.config.language or "de-DE", sampleRate=effectiveSampleRate, channels=1, skipFallbacks=True, phraseHints=phraseHints if phraseHints else None, alternativeLanguages=["en-US"], ) if sttResult and sttResult.get("success") and sttResult.get("text"): text = sttResult["text"].strip() if text: resolvedSpeaker = self._resolveSpeakerForAudioCapture() fromCaption = resolvedSpeaker.get("speakerResolvedFromHint", False) logger.info( f"[AudioChunk] STT result: speaker={resolvedSpeaker.get('speaker', 'Meeting Audio')} " f"(fromCaption={fromCaption}), text={text[:80]}..." ) await self._processTranscript( sessionId=sessionId, speaker=resolvedSpeaker["speaker"], text=text, isFinal=True, interface=interface, voiceInterface=voiceInterface, websocket=websocket, source="audioCapture", speakerResolvedFromHint=resolvedSpeaker["speakerResolvedFromHint"], ) except Exception as e: logger.error(f"[AudioChunk] STT error for session {sessionId}: {type(e).__name__}: {e}") def _registerSpeakerHint(self, speaker: str, text: str, sessionId: str = ""): """Track current speaker from captions for STT attribution. When the first non-bot caption arrives, retroactively attributes any STT segments that were created before a speaker was known.""" if not speaker: return normalizedSpeaker = speaker.strip() if not normalizedSpeaker or self._isBotSpeaker(normalizedSpeaker): return prevSpeaker = self._lastCaptionSpeaker self._lastCaptionSpeaker = normalizedSpeaker self._knownSpeakers.add(normalizedSpeaker) if prevSpeaker is None and self._unattributedTranscriptIds: from . import interfaceFeatureTeamsbot as interfaceDb interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId) for tid in self._unattributedTranscriptIds: interface.updateTranscript(tid, {"speaker": normalizedSpeaker}) for seg in self._contextBuffer: if seg.get("speaker") == "Unknown" and seg.get("source") == "audioCapture": seg["speaker"] = normalizedSpeaker if self._lastTranscriptSpeaker == "Unknown": self._lastTranscriptSpeaker = normalizedSpeaker logger.info( f"Session {sessionId}: Retroactive speaker attribution: " f"{len(self._unattributedTranscriptIds)} segments -> {normalizedSpeaker}" ) self._unattributedTranscriptIds.clear() if self._pendingNameTrigger: self._pendingNameTrigger["lastActivity"] = time.time() def _resolveSpeakerForAudioCapture(self) -> Dict[str, Any]: """Speaker name for audio chunks — uses the last caption speaker.""" if self._lastCaptionSpeaker: return {"speaker": self._lastCaptionSpeaker, "speakerResolvedFromHint": True} return {"speaker": "Unknown", "speakerResolvedFromHint": False} async def _processTranscript( self, sessionId: str, speaker: str, text: str, isFinal: bool, interface, voiceInterface, websocket: WebSocket, source: str = "caption", speakerResolvedFromHint: Optional[bool] = None, ): """Process a transcript segment from captions or chat messages. Differential writing: When the same speaker continues (text grows incrementally as captions stream), we UPDATE the existing DB record instead of creating a cascade of near-duplicate rows. A new record is only created when the speaker changes or the text is not a continuation of the previous segment. """ text = text.strip() if not text: return # Captions are used ONLY for speaker name resolution (never as transcript). # Transcript text comes exclusively from audio STT or chat. # Address detection (bot name in caption) still triggers AI analysis # using existing audio-based context — but caption text itself is NOT # added to the context buffer. if source in ("caption", "speakerHint"): self._registerSpeakerHint(speaker, text, sessionId) if ( source == "speakerHint" and isFinal and not self._isBotSpeaker(speaker) and self.config.responseMode != TeamsbotResponseMode.TRANSCRIBE_ONLY and self._detectBotName(text) ): triggerTranscript = {"id": None, "speaker": speaker, "text": text, "source": source} isNew = self._setPendingNameTrigger(sessionId, interface, voiceInterface, websocket, triggerTranscript) if isNew: logger.info(f"Session {sessionId}: Bot name in caption, debounce trigger started") asyncio.create_task(self._checkPendingNameTrigger()) return # Chat history: messages sent before the bot joined the meeting. # Stored in DB for reference but NOT added to the AI context buffer, # because old messages (e.g. "nyla, summarize the protocol") would # be treated as current requests when AI analysis is triggered. if source == "chatHistory": transcriptData = TeamsbotTranscript( sessionId=sessionId, speaker=speaker, text=text, timestamp=getIsoTimestamp(), confidence=1.0, language=self.config.language, isFinal=True, source="chatHistory", ).model_dump() createdTranscript = interface.createTranscript(transcriptData) await _emitSessionEvent(sessionId, "transcript", { "id": createdTranscript.get("id"), "speaker": speaker, "text": text, "confidence": 1.0, "timestamp": getIsoTimestamp(), "isContinuation": False, "source": "chatHistory", "isHistory": True, }) logger.debug(f"Session {sessionId}: Chat history stored (no AI trigger): [{speaker}] {text[:60]}") return # Filter out the bot's own speech (caption/audioCapture) — garbled text # pollutes context. Chat from the bot is clean text and must appear in # the transcript for all participants. isBotSpeaker = self._isBotSpeaker(speaker) if isBotSpeaker and source != "chat": logger.debug(f"Session {sessionId}: Ignoring own bot caption from: [{speaker}] {text[:80]}...") return # Differential transcript writing: # audioCapture from same speaker → append text (merge STT chunks into one block) # Start a new block after a pause (>5s gap between STT results) sttPauseThreshold = 5.0 isMerge = ( source == "audioCapture" and self._lastTranscriptSpeaker == speaker and self._lastTranscriptText is not None and self._lastTranscriptId is not None and (time.time() - self._lastSttTime) < sttPauseThreshold ) if isMerge: mergedText = f"{self._lastTranscriptText} {text}" interface.updateTranscript(self._lastTranscriptId, { "text": mergedText, "isFinal": isFinal, }) self._lastTranscriptText = mergedText createdTranscript = {"id": self._lastTranscriptId} if self._contextBuffer and self._contextBuffer[-1].get("speaker") == speaker: self._contextBuffer[-1]["text"] = mergedText else: transcriptData = TeamsbotTranscript( sessionId=sessionId, speaker=speaker, text=text, timestamp=getIsoTimestamp(), confidence=1.0, language=self.config.language, isFinal=isFinal, source=source, ).model_dump() createdTranscript = interface.createTranscript(transcriptData) self._lastTranscriptSpeaker = speaker self._lastTranscriptText = text self._lastTranscriptId = createdTranscript.get("id") if source == "audioCapture" and speaker == "Unknown": self._unattributedTranscriptIds.append(createdTranscript.get("id")) self._contextBuffer.append({ "speaker": speaker or "Unknown", "text": text, "timestamp": getUtcTimestamp(), "source": source, }) maxSegments = self.config.contextWindowSegments if len(self._contextBuffer) > maxSegments: if not self._contextSummary and len(self._contextBuffer) > maxSegments * 1.5: asyncio.create_task(self._summarizeContextBuffer(sessionId)) self._contextBuffer = self._contextBuffer[-maxSegments:] session = interface.getSession(sessionId) if session: count = session.get("transcriptSegmentCount", 0) + 1 interface.updateSession(sessionId, {"transcriptSegmentCount": count}) if source == "audioCapture": self._lastSttTime = time.time() displayText = self._lastTranscriptText if isMerge else text await _emitSessionEvent(sessionId, "transcript", { "id": createdTranscript.get("id"), "speaker": speaker, "text": displayText, "confidence": 1.0, "timestamp": getIsoTimestamp(), "isContinuation": isMerge, "source": source, "speakerResolvedFromHint": ( speakerResolvedFromHint if speakerResolvedFromHint is not None else False ), }) if not isFinal: return if self.config.responseMode == TeamsbotResponseMode.TRANSCRIBE_ONLY: return # Bot's own chat: stored for display only, never trigger AI if source == "chat" and isBotSpeaker: return # Stop phrases: trigger immediately without debounce (root cause: 3s debounce delayed stop) if self._isStopPhrase(text): logger.info(f"Session {sessionId}: Stop phrase detected, triggering analysis immediately") await self._analyzeAndRespond(sessionId, interface, voiceInterface, websocket, createdTranscript) return # Update activity for any pending debounced trigger if self._pendingNameTrigger: self._pendingNameTrigger["lastActivity"] = time.time() # Bot name detection → debounced trigger (wait for speaker to finish) if self._detectBotName(text): isNew = self._setPendingNameTrigger(sessionId, interface, voiceInterface, websocket, createdTranscript) if isNew: asyncio.create_task(self._checkPendingNameTrigger()) return # Follow-up window: after a bot response, trigger AI for any human speech # without requiring the bot name — the AI decides via shouldRespond if ( source == "audioCapture" and not self._isBotSpeaker(speaker) and time.time() < self._followUpWindowEnd and not self._pendingNameTrigger ): isNew = self._setPendingNameTrigger(sessionId, interface, voiceInterface, websocket, createdTranscript) if isNew: logger.info(f"Session {sessionId}: Follow-up window trigger (no name needed)") asyncio.create_task(self._checkPendingNameTrigger()) return # Periodic trigger (only when no debounce pending) if not self._pendingNameTrigger: shouldTrigger = self._shouldTriggerAnalysis(text) if shouldTrigger: logger.info(f"Session {sessionId}: Periodic trigger (buffer: {len(self._contextBuffer)} segments)") await self._analyzeAndRespond(sessionId, interface, voiceInterface, websocket, createdTranscript) def _isBotSpeaker(self, speaker: str) -> bool: """Check if a transcript speaker is the bot itself. Teams captions show the bot as e.g. "BotName (Unverified)" or "Nyla Larsson" depending on auth/anonymous join. We match against: - The configured/derived bot name - The bot account display name if authenticated """ if not speaker: return False speakerLower = speaker.lower().strip() # Match against configured bot name botName = self.config.botName.lower().strip() if botName and botName in speakerLower: return True # Match against bot account email prefix (e.g. "nyla.larsson" from "nyla.larsson@poweron.swiss") botAccountEmail = getattr(self, '_botAccountEmail', None) or getattr(self.config, 'botAccountEmail', None) if botAccountEmail: emailPrefix = botAccountEmail.split("@")[0].lower().replace(".", " ") if emailPrefix in speakerLower: return True return False def _shouldTriggerAnalysis(self, transcriptText: str, allowPeriodic: bool = True) -> bool: """ Decide whether to trigger AI analysis based on the latest transcript. Bot name detection is handled separately via debounce. This method only checks periodic/cooldown triggers. """ now = time.time() timeSinceLastCall = now - self._lastAiCallTime if timeSinceLastCall < self.config.triggerCooldownSeconds: return False if allowPeriodic and timeSinceLastCall >= self.config.triggerIntervalSeconds: logger.info(f"Trigger: Periodic interval ({self.config.triggerIntervalSeconds}s) elapsed ({timeSinceLastCall:.1f}s)") return True return False def _isStopPhrase(self, text: str) -> bool: """Check if text is a stop command (stop, halt, be quiet, etc.). Triggers immediate analysis.""" if not text or len(text.strip()) < 2: return False t = text.strip().lower() words = [w.strip(".,!?:;\"'()[]") for w in t.split() if w.strip()] wordSet = set(words) stopWords = {"stop", "stopp", "halt", "ruhe", "stille", "schweig", "arrete", "quiet", "shut"} if wordSet & stopWords: return True if "sei still" in t or "be quiet" in t or "shut up" in t or "aufhoeren" in t or "aufhören" in t: return True return False def _detectBotName(self, text: str) -> bool: """Check if text contains the bot's name (exact or phonetically similar).""" botNameLower = self.config.botName.lower() textLower = text.lower() if botNameLower in textLower: return True botFirstName = botNameLower.split()[0] if " " in botNameLower else botNameLower if len(botFirstName) >= 3: for word in textLower.split(): cleanWord = word.strip(".,!?:;\"'()[]") if not cleanWord or len(cleanWord) < 3: continue if cleanWord == botFirstName: return True if cleanWord[0] == botFirstName[0] and abs(len(cleanWord) - len(botFirstName)) <= 2: common = sum(1 for c in set(botFirstName) if c in cleanWord) similarity = common / max(len(set(botFirstName)), len(set(cleanWord))) if similarity >= 0.6: return True return False def _setPendingNameTrigger(self, sessionId, interface, voiceInterface, websocket, triggerTranscript) -> bool: """Set or update a debounced name trigger. Returns True if newly set.""" if self._pendingNameTrigger: self._pendingNameTrigger["lastActivity"] = time.time() return False self._pendingNameTrigger = { "sessionId": sessionId, "interface": interface, "voiceInterface": voiceInterface, "websocket": websocket, "triggerTranscript": triggerTranscript, "detectedAt": time.time(), "lastActivity": time.time(), } return True async def _checkPendingNameTrigger(self, delaySec: float = 3.0): """Async loop: fire the pending name trigger once the speaker is quiet.""" await asyncio.sleep(delaySec) if not self._pendingNameTrigger: return now = time.time() lastActivity = self._pendingNameTrigger.get("lastActivity", 0) detectedAt = self._pendingNameTrigger.get("detectedAt", 0) quietSec = now - lastActivity totalWaitSec = now - detectedAt if quietSec >= 3.0 or totalWaitSec >= 15.0: trigger = self._pendingNameTrigger self._pendingNameTrigger = None logger.info( f"Session {trigger['sessionId']}: Debounced name trigger fires " f"(quiet={quietSec:.1f}s, totalWait={totalWaitSec:.1f}s)" ) await self._analyzeAndRespond( trigger["sessionId"], trigger["interface"], trigger["voiceInterface"], trigger["websocket"], trigger["triggerTranscript"], ) else: remaining = max(0.5, 3.0 - quietSec) asyncio.create_task(self._checkPendingNameTrigger(remaining)) async def _analyzeAndRespond( self, sessionId: str, interface, voiceInterface, websocket: WebSocket, triggerTranscript: Dict[str, Any], ): """Run SPEECH_TEAMS AI analysis and respond if needed.""" if self._aiAnalysisInProgress: logger.info(f"Session {sessionId}: AI analysis already in progress, skipping duplicate trigger") return self._aiAnalysisInProgress = True self._lastAiCallTime = time.time() # Build transcript context from buffer. # Mark bot's own utterances and chat messages for the AI. contextLines = [] for segment in self._contextBuffer: speaker = segment.get("speaker", "Unknown") text = segment.get("text", "") segSource = segment.get("source", "caption") prefix = "Chat" if segSource == "chat" else "" if self._isBotSpeaker(speaker): contextLines.append(f"[YOU ({self.config.botName})]: {text}") elif prefix: contextLines.append(f"[{prefix}: {speaker}]: {text}") else: contextLines.append(f"[{speaker}]: {text}") # Include session context if provided by the user at session start sessionContextStr = "" if self._sessionContext: sessionContextStr = f"\nSESSION_CONTEXT (background knowledge provided by the user):\n{self._sessionContext}\n" # Include summary of earlier conversation if available summaryStr = "" if self._contextSummary: summaryStr = f"\nEARLIER_CONVERSATION_SUMMARY:\n{self._contextSummary}\n" transcriptContext = f"BOT_NAME:{self.config.botName}{sessionContextStr}{summaryStr}\nRECENT_TRANSCRIPT:\n" + "\n".join(contextLines) # Call SPEECH_TEAMS try: from modules.serviceCenter.services.serviceAi.mainServiceAi import AiService # Create minimal service context for AI billing serviceContext = _ServiceContext(self.currentUser, self.mandateId, self.instanceId) aiService = AiService(serviceCenter=serviceContext) await aiService.ensureAiObjectsInitialized() request = AiCallRequest( prompt=self.config.aiSystemPrompt, context=transcriptContext, options=AiCallOptions( operationType=OperationTypeEnum.SPEECH_TEAMS, priority=PriorityEnum.SPEED, ) ) response = await aiService.callAi(request) # Parse structured response try: speechResult = SpeechTeamsResponse.model_validate_json(response.content) except Exception: # Try to extract JSON from response content try: jsonStr = response.content if "```json" in jsonStr: jsonStr = jsonStr.split("```json")[1].split("```")[0] elif "```" in jsonStr: jsonStr = jsonStr.split("```")[1].split("```")[0] speechResult = SpeechTeamsResponse.model_validate_json(jsonStr.strip()) except Exception as parseErr: logger.warning(f"Failed to parse SPEECH_TEAMS response: {parseErr}") speechResult = SpeechTeamsResponse( shouldRespond=False, reasoning=f"Parse error: {str(parseErr)[:100]}", detectedIntent="none" ) logger.info( f"SPEECH_TEAMS result: shouldRespond={speechResult.shouldRespond}, " f"intent={speechResult.detectedIntent}, " f"reasoning={speechResult.reasoning[:80]}..." ) # Emit analysis event (always, for debug/UI) await _emitSessionEvent(sessionId, "analysis", { "shouldRespond": speechResult.shouldRespond, "detectedIntent": speechResult.detectedIntent, "reasoning": speechResult.reasoning, "modelName": response.modelName, "processingTime": response.processingTime, "priceCHF": response.priceCHF, }) # Step 4a: Handle STOP intent -- stop audio immediately if speechResult.detectedIntent == "stop": logger.info(f"Session {sessionId}: AI detected STOP intent: {speechResult.reasoning}") if websocket: try: await websocket.send_text(json.dumps({ "type": "stopAudio", "sessionId": sessionId, })) except Exception as stopErr: logger.warning(f"Failed to send stop command: {stopErr}") return # Step 4b: Respond if AI decided to if speechResult.shouldRespond and speechResult.responseText: if self.config.responseMode == TeamsbotResponseMode.MANUAL: # In manual mode, suggest but don't send await _emitSessionEvent(sessionId, "suggestedResponse", { "responseText": speechResult.responseText, "detectedIntent": speechResult.detectedIntent, "reasoning": speechResult.reasoning, }) return # Determine response channel: per-request (AI) overrides config channels = speechResult.responseChannels if channels and isinstance(channels, list): channelStr = ",".join(str(c).lower().strip() for c in channels) sendVoice = "voice" in channelStr sendChat = "chat" in channelStr logger.info(f"Response channel (from AI): voice={sendVoice}, chat={sendChat}") else: channelRaw = self.config.responseChannel channelStr = (channelRaw.value if hasattr(channelRaw, 'value') else str(channelRaw)).lower().strip() sendVoice = channelStr in ("voice", "both") sendChat = channelStr in ("chat", "both") logger.info(f"Response channel (from config): '{channelStr}'") if sendVoice and sendChat: responseType = TeamsbotResponseType.BOTH elif sendVoice: responseType = TeamsbotResponseType.AUDIO else: responseType = TeamsbotResponseType.CHAT # Suppress duplicate responses in short windows ("repeat loop" protection). canonicalText = ( speechResult.responseText or speechResult.responseTextForVoice or speechResult.responseTextForChat or "" ) normalizedResponse = (canonicalText or "").strip().lower() nowTs = time.time() if ( normalizedResponse and self._lastBotResponseText == normalizedResponse and (nowTs - self._lastBotResponseTs) < 90 ): logger.info(f"Session {sessionId}: Suppressing duplicate bot response within 90s window") await _emitSessionEvent(sessionId, "analysis", { "shouldRespond": False, "detectedIntent": speechResult.detectedIntent, "reasoning": "Suppressed duplicate response within 90s", "modelName": response.modelName, "processingTime": response.processingTime, "priceCHF": response.priceCHF, }) return # Resolve text per channel (AI can send different content to voice vs chat) textForVoice = speechResult.responseTextForVoice or speechResult.responseText textForChat = speechResult.responseTextForChat or speechResult.responseText storedText = textForChat or textForVoice or speechResult.responseText # 4a: Voice response (TTS -> Audio to bot) if sendVoice and textForVoice: try: await _emitSessionEvent(sessionId, "ttsDeliveryStatus", { "status": "requested", "hasWebSocket": websocket is not None, "message": "TTS generation requested", "timestamp": getIsoTimestamp(), }) logger.info( f"Session {sessionId}: TTS requested (websocket_available={websocket is not None})" ) ttsResult = await voiceInterface.textToSpeech( text=textForVoice, languageCode=self.config.language, voiceName=self.config.voiceId ) if not ttsResult or not isinstance(ttsResult, dict): raise RuntimeError("TTS returned invalid result payload") if ttsResult.get("success") is False: raise RuntimeError(f"TTS backend error: {ttsResult.get('error', 'unknown')}") audioContent = ttsResult.get("audioContent") if not audioContent: raise RuntimeError("TTS returned no audioContent") if websocket: await websocket.send_text(json.dumps({ "type": "playAudio", "sessionId": sessionId, "audio": { "data": base64.b64encode(audioContent if isinstance(audioContent, bytes) else audioContent.encode()).decode(), "format": "mp3", }, })) logger.info(f"Session {sessionId}: TTS audio dispatched to bot") await _emitSessionEvent(sessionId, "ttsDeliveryStatus", { "status": "dispatched", "hasWebSocket": True, "message": "TTS audio dispatched to bot", "timestamp": getIsoTimestamp(), }) else: logger.warning( f"Session {sessionId}: TTS audio generated but cannot be played (bot websocket unavailable, likely fallback mode)" ) await _emitSessionEvent(sessionId, "ttsDeliveryStatus", { "status": "unavailable", "hasWebSocket": False, "message": "TTS audio generated but bot websocket unavailable", "timestamp": getIsoTimestamp(), }) except Exception as ttsErr: logger.warning(f"TTS failed for session {sessionId}: {ttsErr}") await _emitSessionEvent(sessionId, "ttsDeliveryStatus", { "status": "failed", "hasWebSocket": websocket is not None, "message": str(ttsErr), "timestamp": getIsoTimestamp(), }) if not sendChat: sendChat = True # Fallback to chat if voice-only and TTS failed # 4b: Chat response (send text message to meeting chat) if sendChat and textForChat: try: if websocket: await websocket.send_text(json.dumps({ "type": "sendChatMessage", "sessionId": sessionId, "text": textForChat, })) logger.info(f"Chat response sent for session {sessionId}") except Exception as chatErr: logger.warning(f"Chat message send failed for session {sessionId}: {chatErr}") # 4b: Store bot response botResponseData = TeamsbotBotResponse( sessionId=sessionId, responseText=storedText, responseType=responseType, detectedIntent=speechResult.detectedIntent, reasoning=speechResult.reasoning, triggeredByTranscriptId=triggerTranscript.get("id"), modelName=response.modelName, processingTime=response.processingTime, priceCHF=response.priceCHF, timestamp=getIsoTimestamp(), ).model_dump() createdResponse = interface.createBotResponse(botResponseData) # 4c: Emit SSE event await _emitSessionEvent(sessionId, "botResponse", { "id": createdResponse.get("id"), "responseText": storedText, "responseType": responseType.value, "detectedIntent": speechResult.detectedIntent, "reasoning": speechResult.reasoning, "modelName": response.modelName, "processingTime": response.processingTime, "priceCHF": response.priceCHF, "timestamp": botResponseData.get("timestamp"), }) # Update session response count session = interface.getSession(sessionId) if session: count = session.get("botResponseCount", 0) + 1 interface.updateSession(sessionId, {"botResponseCount": count}) self._lastBotResponseText = normalizedResponse self._lastBotResponseTs = nowTs # Record bot response in transcript (exactly once, regardless of channel) botTranscriptData = TeamsbotTranscript( sessionId=sessionId, speaker=self.config.botName, text=storedText, timestamp=getIsoTimestamp(), confidence=1.0, language=self.config.language, isFinal=True, ).model_dump() botTranscript = interface.createTranscript(botTranscriptData) self._contextBuffer.append({ "speaker": self.config.botName, "text": storedText, "timestamp": getUtcTimestamp(), "source": "botResponse", }) await _emitSessionEvent(sessionId, "transcript", { "id": botTranscript.get("id"), "speaker": self.config.botName, "text": storedText, "confidence": 1.0, "timestamp": getIsoTimestamp(), "isContinuation": False, "source": "botResponse", "speakerResolvedFromHint": False, }) # Reset differential writing tracker so next STT creates a new block self._lastTranscriptSpeaker = self.config.botName self._lastTranscriptText = storedText self._lastTranscriptId = botTranscript.get("id") self._followUpWindowEnd = time.time() + 15.0 logger.info(f"Bot responded in session {sessionId}: intent={speechResult.detectedIntent}, follow-up window open for 15s") # Step 5: Execute AI-issued commands (if any) if speechResult.commands: await self._executeCommands(sessionId, speechResult.commands, voiceInterface, websocket) # When AI used only commands (no responseText), emit botResponse SSE # so the UI shows the response. Extract text from sendChat commands. if speechResult.shouldRespond and not speechResult.responseText: cmdTexts = [ c.params.get("text", "") for c in speechResult.commands if c.action == "sendChat" and c.params and c.params.get("text") ] combinedText = " ".join(cmdTexts) if cmdTexts else None if combinedText: botResponseData = TeamsbotBotResponse( sessionId=sessionId, responseText=combinedText, responseType=TeamsbotResponseType.CHAT, detectedIntent=speechResult.detectedIntent, reasoning=speechResult.reasoning, triggeredByTranscriptId=triggerTranscript.get("id"), modelName=response.modelName, processingTime=response.processingTime, priceCHF=response.priceCHF, timestamp=getIsoTimestamp(), ).model_dump() createdResponse = interface.createBotResponse(botResponseData) await _emitSessionEvent(sessionId, "botResponse", { "id": createdResponse.get("id"), "responseText": combinedText, "responseType": TeamsbotResponseType.CHAT.value, "detectedIntent": speechResult.detectedIntent, "reasoning": speechResult.reasoning, "modelName": response.modelName, "processingTime": response.processingTime, "priceCHF": response.priceCHF, "timestamp": botResponseData.get("timestamp"), }) session = interface.getSession(sessionId) if session: count = session.get("botResponseCount", 0) + 1 interface.updateSession(sessionId, {"botResponseCount": count}) self._followUpWindowEnd = time.time() + 15.0 logger.info( f"Bot responded via commands in session {sessionId}: " f"intent={speechResult.detectedIntent}, follow-up window open for 15s" ) except Exception as e: logger.error(f"SPEECH_TEAMS analysis failed for session {sessionId}: {type(e).__name__}: {e}", exc_info=True) await _emitSessionEvent(sessionId, "error", {"message": f"AI analysis failed: {type(e).__name__}: {str(e)}"}) finally: self._aiAnalysisInProgress = False # ========================================================================= # AI Command Execution # ========================================================================= async def _executeCommands( self, sessionId: str, commands: List[TeamsbotCommand], voiceInterface, websocket: WebSocket, ): """Execute structured commands returned by the AI. Each command is dispatched to a dedicated handler function.""" for cmd in commands: action = cmd.action params = cmd.params or {} logger.info(f"Session {sessionId}: Executing command '{action}' with params {params}") try: if action == "toggleTranscript": await self._cmdToggleTranscript(sessionId, params, websocket) elif action == "toggleChat": await self._cmdToggleChat(sessionId, params, websocket) elif action == "sendChat": await self._cmdSendChat(sessionId, params, websocket) elif action == "readChat": await self._cmdReadChat(sessionId, params, voiceInterface, websocket) elif action == "readAloud": await self._cmdReadAloud(sessionId, params, voiceInterface, websocket) elif action == "changeLanguage": await self._cmdChangeLanguage(sessionId, params) elif action in ("toggleMic", "toggleCamera"): await self._cmdToggleMicOrCamera(sessionId, action, params, websocket) elif action == "sendMail": await self._cmdSendMail(sessionId, params) elif action == "storeDocument": await self._cmdStoreDocument(sessionId, params) else: logger.warning(f"Session {sessionId}: Unknown command '{action}'") except Exception as cmdErr: logger.warning(f"Session {sessionId}: Command '{action}' failed: {cmdErr}") async def _cmdToggleTranscript(self, sessionId: str, params: dict, websocket: WebSocket): """Caption on/off - toggle Teams live transcript capture.""" enable = params.get("enable", True) if websocket: await websocket.send_text(json.dumps({ "type": "botCommand", "sessionId": sessionId, "command": "toggleTranscript", "params": {"enable": enable}, })) async def _cmdToggleChat(self, sessionId: str, params: dict, websocket: WebSocket): """Chat on/off - enable/disable meeting chat monitoring.""" enable = params.get("enable", True) if websocket: await websocket.send_text(json.dumps({ "type": "botCommand", "sessionId": sessionId, "command": "toggleChat", "params": {"enable": enable}, })) async def _cmdSendChat(self, sessionId: str, params: dict, websocket: WebSocket): """Send a message to the meeting chat and record it in transcript/SSE.""" chatText = params.get("text", "") if not chatText: return if websocket: await websocket.send_text(json.dumps({ "type": "sendChatMessage", "sessionId": sessionId, "text": chatText, })) logger.info(f"Chat command sent for session {sessionId}") from . import interfaceFeatureTeamsbot as interfaceDb interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId) transcriptData = TeamsbotTranscript( sessionId=sessionId, speaker=self.config.botName, text=chatText, timestamp=getIsoTimestamp(), confidence=1.0, language=self.config.language, isFinal=True, source="chat", ).model_dump() createdTranscript = interface.createTranscript(transcriptData) self._contextBuffer.append({ "speaker": self.config.botName, "text": chatText, "timestamp": getUtcTimestamp(), "source": "chat", }) self._lastTranscriptSpeaker = self.config.botName self._lastTranscriptText = chatText self._lastTranscriptId = createdTranscript.get("id") self._lastBotResponseText = chatText.strip().lower() self._lastBotResponseTs = time.time() await _emitSessionEvent(sessionId, "transcript", { "id": createdTranscript.get("id"), "speaker": self.config.botName, "text": chatText, "confidence": 1.0, "timestamp": getIsoTimestamp(), "isContinuation": False, "source": "chat", "speakerResolvedFromHint": False, }) async def _cmdReadChat( self, sessionId: str, params: dict, voiceInterface, websocket: WebSocket, ): """Read chat messages (from DB) with optional fromdatetime/todatetime, then speak or send to chat.""" from . import interfaceFeatureTeamsbot as interfaceDb interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId) transcripts = interface.getTranscripts(sessionId) fromDt = params.get("fromdatetime") or params.get("fromDateTime") toDt = params.get("todatetime") or params.get("toDateTime") chatOnly = [t for t in transcripts if t.get("source") in ("chat", "chatHistory")] if fromDt: chatOnly = [t for t in chatOnly if (t.get("timestamp") or "") >= fromDt] if toDt: chatOnly = [t for t in chatOnly if (t.get("timestamp") or "") <= toDt] summary = "\n".join(f"[{t.get('speaker', '?')}]: {t.get('text', '')}" for t in chatOnly[-20:]) if not summary: summary = "Keine Chat-Nachrichten im angegebenen Zeitraum." if voiceInterface and websocket: ttsResult = await voiceInterface.textToSpeech( text=summary[:2000], languageCode=self.config.language, voiceName=self.config.voiceId, ) if ttsResult and isinstance(ttsResult, dict) and ttsResult.get("audioContent"): audioContent = ttsResult["audioContent"] await websocket.send_text(json.dumps({ "type": "playAudio", "sessionId": sessionId, "audio": { "data": base64.b64encode( audioContent if isinstance(audioContent, bytes) else audioContent.encode() ).decode(), "format": "mp3", }, })) async def _cmdReadAloud( self, sessionId: str, params: dict, voiceInterface, websocket: WebSocket, ): """Read text aloud via TTS and play in meeting.""" readText = params.get("text", "") if readText and voiceInterface: ttsResult = await voiceInterface.textToSpeech( text=readText, languageCode=self.config.language, voiceName=self.config.voiceId, ) if ttsResult and isinstance(ttsResult, dict): audioContent = ttsResult.get("audioContent") if audioContent and websocket: await websocket.send_text(json.dumps({ "type": "playAudio", "sessionId": sessionId, "audio": { "data": base64.b64encode( audioContent if isinstance(audioContent, bytes) else audioContent.encode() ).decode(), "format": "mp3", }, })) async def _cmdChangeLanguage(self, sessionId: str, params: dict): """Change bot language.""" newLang = params.get("language", "") if newLang: self.config = self.config.model_copy(update={"language": newLang}) logger.info(f"Session {sessionId}: Language changed to '{newLang}'") await _emitSessionEvent(sessionId, "languageChanged", {"language": newLang}) async def _cmdToggleMicOrCamera( self, sessionId: str, action: str, params: dict, websocket: WebSocket, ): """Toggle mic or camera in the meeting.""" if websocket: await websocket.send_text(json.dumps({ "type": "botCommand", "sessionId": sessionId, "command": action, "params": params, })) async def _cmdSendMail(self, sessionId: str, params: dict): """Send email via Service Center MessagingService.""" recipient = params.get("recipient") or params.get("to", "") subject = params.get("subject", "") message = params.get("message") or params.get("body", "") if not recipient or not subject: logger.warning(f"Session {sessionId}: sendMail requires recipient and subject") return try: from modules.serviceCenter import ServiceCenterContext, getService ctx = ServiceCenterContext( user=self.currentUser, mandate_id=self.mandateId, feature_instance_id=self.instanceId, ) messaging = getService("messaging", ctx) success = messaging.sendEmailDirect( recipient=recipient, subject=subject, message=message, userId=str(self.currentUser.id) if self.currentUser else None, ) if success: logger.info(f"Session {sessionId}: Email sent to {recipient}") else: logger.warning(f"Session {sessionId}: Email send failed for {recipient}") except Exception as e: logger.warning(f"Session {sessionId}: sendMail failed: {e}") async def _cmdStoreDocument(self, sessionId: str, params: dict): """Store document via Service Center SharepointService.""" sitePath = params.get("sitePath") or params.get("site", "") folderPath = params.get("folderPath") or params.get("folder", "") fileName = params.get("fileName", "document.txt") content = params.get("content", "") if isinstance(content, str): content = content.encode("utf-8") if not sitePath or not folderPath: logger.warning(f"Session {sessionId}: storeDocument requires sitePath and folderPath") return try: from modules.serviceCenter import ServiceCenterContext, getService ctx = ServiceCenterContext( user=self.currentUser, mandate_id=self.mandateId, feature_instance_id=self.instanceId, ) sharepoint = getService("sharepoint", ctx) if not sharepoint.setAccessTokenFromConnection(self.currentUser): logger.warning(f"Session {sessionId}: SharePoint connection not configured") return site = await sharepoint.getSiteByStandardPath(sitePath) if not site: logger.warning(f"Session {sessionId}: SharePoint site not found: {sitePath}") return result = await sharepoint.uploadFile( siteId=site["id"], folderPath=folderPath, fileName=fileName, content=content, ) if "error" in result: logger.warning(f"Session {sessionId}: storeDocument failed: {result['error']}") else: logger.info(f"Session {sessionId}: Document stored: {fileName}") except Exception as e: logger.warning(f"Session {sessionId}: storeDocument failed: {e}") # ========================================================================= # Context Summarization (for long sessions) # ========================================================================= async def _summarizeSessionContext(self, sessionId: str, rawContext: str) -> str: """Summarize a long user-provided session context to its essential points. This reduces token usage in every subsequent AI call.""" try: from modules.serviceCenter.services.serviceAi.mainServiceAi import AiService from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum, PriorityEnum serviceContext = _ServiceContext(self.currentUser, self.mandateId, self.instanceId) aiService = AiService(serviceCenter=serviceContext) await aiService.ensureAiObjectsInitialized() request = AiCallRequest( prompt=( "Fasse den folgenden Kontext auf die wesentlichen Punkte zusammen. " "Behalte alle wichtigen Fakten, Namen, Zahlen, Entscheidungen und Aktionspunkte. " "Entferne Fuelltext und Wiederholungen. " "Antworte NUR mit der Zusammenfassung, keine Erklaerungen oder Einleitungen." ), context=rawContext, options=AiCallOptions( operationType=OperationTypeEnum.DATA_ANALYSE, priority=PriorityEnum.SPEED, ) ) response = await aiService.callAi(request) if response and response.errorCount == 0 and response.content: summary = response.content.strip() logger.info(f"Session {sessionId}: Context summarized from {len(rawContext)} to {len(summary)} chars") return summary except Exception as e: logger.warning(f"Session context summarization failed for {sessionId}: {e}") # Fallback: return original (truncated if very long) return rawContext[:2000] if len(rawContext) > 2000 else rawContext async def _summarizeContextBuffer(self, sessionId: str): """Summarize the older part of the context buffer to preserve information without exceeding the context window. This runs in the background.""" try: if self._contextSummary: return # Already summarized recently # Take the older half of the buffer for summarization halfPoint = len(self._contextBuffer) // 2 oldSegments = self._contextBuffer[:halfPoint] if len(oldSegments) < 10: return # Not enough to summarize # Build text to summarize lines = [] for seg in oldSegments: speaker = seg.get("speaker", "Unknown") text = seg.get("text", "") lines.append(f"[{speaker}]: {text}") textToSummarize = "\n".join(lines) from modules.serviceCenter.services.serviceAi.mainServiceAi import AiService from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum, PriorityEnum serviceContext = _ServiceContext(self.currentUser, self.mandateId, self.instanceId) aiService = AiService(serviceCenter=serviceContext) await aiService.ensureAiObjectsInitialized() request = AiCallRequest( prompt="Fasse das folgende Meeting-Transkript in 3-5 Saetzen zusammen. Nenne die wichtigsten Themen, Entscheidungen und offene Fragen. Antworte NUR mit der Zusammenfassung, keine Erklaerungen.", context=textToSummarize, options=AiCallOptions( operationType=OperationTypeEnum.DATA_ANALYSE, priority=PriorityEnum.SPEED, ) ) response = await aiService.callAi(request) if response and response.errorCount == 0: self._contextSummary = response.content.strip() logger.info(f"Session {sessionId}: Context summarized ({len(oldSegments)} segments -> {len(self._contextSummary)} chars)") except Exception as e: logger.warning(f"Context summarization failed for session {sessionId}: {e}") # ========================================================================= # Meeting Summary # ========================================================================= async def _generateMeetingSummary(self, sessionId: str): """Generate an AI summary of the meeting after it ends.""" try: from . import interfaceFeatureTeamsbot as interfaceDb interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId) transcripts = interface.getTranscripts(sessionId) if not transcripts or len(transcripts) < 5: return # Not enough content for a summary # Build full transcript fullTranscript = "\n".join( f"[{t.get('speaker', 'Unknown')}]: {t.get('text', '')}" for t in transcripts ) from modules.serviceCenter.services.serviceAi.mainServiceAi import AiService serviceContext = _ServiceContext(self.currentUser, self.mandateId, self.instanceId) aiService = AiService(serviceCenter=serviceContext) await aiService.ensureAiObjectsInitialized() request = AiCallRequest( prompt="Erstelle eine kurze Zusammenfassung dieses Meeting-Transkripts. Nenne die wichtigsten Punkte, Entscheidungen und offene Aktionspunkte.", context=fullTranscript, options=AiCallOptions( operationType=OperationTypeEnum.DATA_ANALYSE, priority=PriorityEnum.BALANCED, ) ) response = await aiService.callAi(request) if response and response.errorCount == 0: interface.updateSession(sessionId, {"summary": response.content}) logger.info(f"Meeting summary generated for session {sessionId}") except Exception as e: logger.error(f"Failed to generate meeting summary for session {sessionId}: {e}")