# Copyright (c) 2025 Patrick Motsch # All rights reserved. """ Teamsbot Service - Pipeline Orchestrator. Manages the audio processing pipeline: STT -> Context Buffer -> SPEECH_TEAMS -> TTS -> Bridge. """ import logging import json import re import asyncio import time import base64 from typing import Optional, Dict, Any, List, Callable from fastapi import WebSocket from modules.datamodels.datamodelUam import User from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum, PriorityEnum from modules.shared.timeUtils import getUtcTimestamp, getIsoTimestamp from modules.serviceCenter import getService as _getServiceCenterService from modules.serviceCenter.context import ServiceCenterContext from .datamodelTeamsbot import ( TeamsbotSessionStatus, TeamsbotTranscript, TeamsbotBotResponse, TeamsbotResponseType, TeamsbotConfig, TeamsbotResponseMode, TeamsbotResponseChannel, TeamsbotDetectedIntent, SpeechTeamsResponse, TeamsbotCommand, TeamsbotDirectorPrompt, TeamsbotDirectorPromptStatus, TeamsbotDirectorPromptMode, DIRECTOR_PROMPT_TEXT_LIMIT, DIRECTOR_PROMPT_FILE_LIMIT, ) from .browserBotConnector import BrowserBotConnector logger = logging.getLogger(__name__) # Agent run limits for director prompts / speech escalation (meeting context). # Higher than default workspace agent: Teams research + tool chains need depth. TEAMSBOT_AGENT_MAX_ROUNDS = 8 TEAMSBOT_AGENT_MAX_COST_CHF = 0.12 # How many recent director-prompt briefings (one-shot + persistent) we keep in # session memory so SPEECH_TEAMS triggers and speech escalation can still see # the operator's attached files + analysis after the prompt itself was consumed. _RECENT_DIRECTOR_BRIEFINGS_MAX = 6 # Quick-ack ("Moment...") UX: fire a SHORT TTS the moment the bot's name is # detected so the speaker hears within ~1s that the bot reacted, instead of # waiting for the full debounce + SPEECH_TEAMS + agent pipeline (~5-30s). # Throttled per session to avoid acking every fragment of a long utterance. 
_QUICK_ACK_MIN_INTERVAL_SEC = 25.0 # Number of phrase variants we generate per kind (rotated round-robin so back- # to-back acks/notices don't sound identical). _EPHEMERAL_PHRASE_VARIANTS = 4 # Localisation INTENTS for ephemeral phrases. Each kind describes WHAT the # phrase should express; the actual wording is produced at runtime by the AI # in the bot's configured language + persona. The intent text below is the # instruction passed to the LLM (English, since it's a model directive โ€” the # OUTPUT will be in the configured spoken language). Add new ephemeral phrase # kinds here, never inline string literals at the call site. _EPHEMERAL_PHRASE_INTENTS: Dict[str, str] = { "quickAck": ( "Very short verbal acknowledgment (1 to 4 words) the assistant says " "the moment its name is recognised, BEFORE it has formulated a full " "answer. The intent is purely 'I heard you, I'm thinking' โ€” natural, " "conversational, never a complete sentence." ), "agentBusy": ( "One short sentence (max ~12 words) the assistant says BEFORE starting " "a longer research / tool-call task, so the audience knows the answer " "will take a few seconds. Polite, professional, calm." ), "agentRound": ( "One short sentence (max ~14 words) the assistant says BETWEEN rounds " "of a longer agent task to signal that work is still in progress. " "Include the placeholder tokens '{round}' and '{maxRounds}' so the " "caller can substitute the actual numbers โ€” e.g. 
'Step {round} of " "{maxRounds}, still working.'" ), } def _voiceLineLooksLikeBillingOrMeta(line: str) -> bool: """Heuristic: trailing lines that are separators or billing/usage footers.""" s = line.strip() if not s: return True lower = s.lower() if re.match(r"^[-=*_]{3,}\s*$", s): return True if re.match(r"^#{1,6}\s*(usage|billing|costs?|meta|technical|statistics)\b", lower): return True if "chf" in lower and re.search(r"\d", s): if re.search( r"\b(total|usage|cost|billing|token|spent|used|price|estimate|" r"rounds?|calls?|duration|processing\s*time|model\s*calls?)\b", lower, ): return True if "token" in lower and re.search(r"\d", s): if re.search(r"\b(total|usage|prompt|completion)\b", lower): return True pl = lower.replace(" ", "") if "progressafter" in pl and ("aicalls:" in pl or "toolcalls:" in pl): return True return False _EMOJI_PATTERN = re.compile( "[" "\U0001F300-\U0001FAFF" # symbols & pictographs, emoticons, transport, supplemental "\U00002600-\U000027BF" # misc symbols + dingbats (incl. โš™ ๐Ÿ” ๐Ÿ”Œ โœ“ โœ—) "\U0001F1E6-\U0001F1FF" # regional indicator (flags) "\U00002B00-\U00002BFF" # arrows, geometric "\U0001F900-\U0001F9FF" # supplemental symbols (incl. ๐Ÿค– ๐Ÿง ) "\U0000FE0F" # variation selector-16 (emoji presentation) "]+", flags=re.UNICODE, ) def _voiceFriendlyMeetingText(raw: str) -> str: """Sanitise a chat/markdown response so it can be SPOKEN naturally. Aggressive cleanup โ€” when a TTS engine reads raw markdown out loud the listener hears "hash hash hash Zusammenfassung pipe pipe pipe", which is unbearable in a meeting. The chat / DB / UI keep the original text; only the audio path goes through this sanitiser. What we strip: * Code fences and inline code * Markdown emphasis (**bold**, *italic*, __bold__, _italic_) * Markdown links โ†’ keep label * Headings (# .. 
######) * Markdown tables (any line with two or more pipes is dropped wholesale) * Horizontal rules (---, ***, ___ on their own line) * Bullet markers (-, *, โ€ข, ยท) and numbered list markers (1., 2)) at line start * Emojis (full Unicode pictograph ranges + variation selector) * Decorative trailing colons on bullet headings * Stray pipes left over from inline tables * Trailing billing / "maximum rounds reached" / "budget exceeded" footers Whitespace is then collapsed to single spaces. """ if not raw: return "" # Trim trailing operator/billing footers BEFORE any structural rewrite # so we don't waste effort sanitising a footer that gets dropped. low = raw.lower() if "maximum rounds reached" in low: m = re.search(r"(?is)maximum\s+rounds\s+reached", raw) if m: head = raw[: m.start()].strip() raw = head or ( "Die Abklaerung brauchte mehr Schritte als vorgesehen; Details stehen im Chat." ) if "budget exceeded" in low: m = re.search(r"(?is)budget\s+exceeded", raw) if m: head = raw[: m.start()].strip() raw = head or "Das eingestellte Kostenlimit ist erreicht; Details stehen im Chat." lines = raw.strip().split("\n") while lines and _voiceLineLooksLikeBillingOrMeta(lines[-1]): lines.pop() t = "\n".join(lines).strip() if not t: t = raw.strip() # 1) Strip code blocks (multi-line first, then inline) t = re.sub(r"```[\s\S]*?```", " ", t) t = re.sub(r"`([^`]+)`", r"\1", t) # 2) Drop markdown table rows (any line with two or more pipes) and the # separator lines they come with (|---|---|). A paragraph that just # happens to contain ONE pipe survives. 
cleanedLines: List[str] = [] for ln in t.split("\n"): stripped = ln.strip() if stripped.count("|") >= 2: continue if re.fullmatch(r"\s*\|?[\s\-:|]+\|?\s*", stripped) and "-" in stripped: continue cleanedLines.append(ln) t = "\n".join(cleanedLines) # 3) Drop horizontal rule lines (---, ***, ___, with optional spaces) t = re.sub(r"(?m)^\s*([-*_])\s*\1\s*\1[\s\1]*$", "", t) # 4) Headings: drop the leading hashes t = re.sub(r"(?m)^\s*#{1,6}\s+", "", t) # 5) Bullet markers at line start โ€” keep the content, drop the bullet t = re.sub(r"(?m)^\s*[-*โ€ขยท]\s+", "", t) # 6) Numbered list markers at line start ("1.", "2)", "3 -") t = re.sub(r"(?m)^\s*\d+[\.\)]\s+", "", t) # 7) Emphasis markers (after bullets so a "**Bold:**" heading is handled) t = re.sub(r"\*\*([^*]+)\*\*", r"\1", t) t = re.sub(r"\*([^*\n]+)\*", r"\1", t) t = re.sub(r"__([^_]+)__", r"\1", t) t = re.sub(r"(?` `{` `}` `[` `]` `(` `)` # `_` `&` `@` `$` `%` `` -- replaced with a space so word # boundaries are preserved. t = re.sub(r"[*#~^=+|\\<>{}\[\]()_&@$%`]+", " ", t) # 10e) Drop ASCII double-quote (single quotes are legitimate apostrophes # in contractions like "don't" / "geht's", so we keep U+0027). t = t.replace('"', "") # 10f) Slash between letters/digits โ€” TTS reads "slash". Replace with # " or " for readability when it separates words like "und/oder". t = re.sub(r"(?<=\w)\s*/\s*(?=\w)", " oder ", t) # Any remaining stray slash is just whitespace. t = t.replace("/", " ") # 10g) Trim multiple punctuation runs ("...!!!" โ†’ "..." / "!" / etc.) t = re.sub(r"([\.,;:!\?])\1{1,}", r"\1", t) # Remove orphan punctuation directly preceded by whitespace # (common after symbol stripping: " , ", " . "). t = re.sub(r"\s+([\.,;:!\?])", r"\1", t) # Collapse trailing colon at end of meaningful phrase to a period for # nicer cadence ("Was ist PowerOn:" โ†’ "Was ist PowerOn."). t = re.sub(r":\s*$", ".", t.rstrip()) # 10h) Collapse " :" tail of MULTI-LINE blocks the same way. 
t = re.sub(r"\s+:\s*$", ":", t, flags=re.MULTILINE) # 11) Collapse whitespace to single spaces; protect sentence breaks by # turning paragraph blanks into a period if the previous chunk # didn't already terminate. paragraphs = [p.strip() for p in re.split(r"\n\s*\n", t) if p.strip()] rebuilt: List[str] = [] for p in paragraphs: p = re.sub(r"\s+", " ", p).strip() if not p: continue if not re.search(r"[\.!\?\u2026:]\s*$", p): p = p.rstrip() + "." rebuilt.append(p) t = " ".join(rebuilt) t = re.sub(r"\s+", " ", t).strip() # If we sanitised away everything (e.g. the input was *only* a markdown # table or a wall of pictographs) return empty โ€” the caller (TTS / voice # summary) treats empty as "nothing to say", which is the safe default. # Falling back to raw markdown here would leak the very symbols we just # spent ten passes removing. return t # Google Cloud TTS rejects single sentences that exceed ~5000 bytes. The Chirp3 # voices are stricter: long, comma-heavy sentences (no terminating punctuation) # also fail with "Sentence ... is too long". We chunk well below the documented # limit AND inject sentence terminators so the synthesizer accepts every chunk. _TTS_MAX_CHUNK_CHARS = 800 def _splitTextForTts(text: str, maxChars: int = _TTS_MAX_CHUNK_CHARS) -> List[str]: """Split a long voice line into TTS-safe chunks at sentence/paragraph boundaries. The result preserves order and contains no empty strings. A single sentence longer than ``maxChars`` is hard-cut at word boundaries. 
    """
    # Fast path: nothing to say, or the whole text already fits in one chunk.
    cleaned = (text or "").strip()
    if not cleaned:
        return []
    if len(cleaned) <= maxChars:
        return [cleaned]

    # Split on sentence terminators (. ! ? ...) or newlines; if the split
    # yields nothing usable, fall back to treating the text as one sentence.
    sentencePattern = re.compile(r"(?<=[\.!\?\u2026])\s+|\n+")
    rawSentences = [s.strip() for s in sentencePattern.split(cleaned) if s and s.strip()]
    if not rawSentences:
        rawSentences = [cleaned]

    chunks: List[str] = []
    buffer = ""
    for sentence in rawSentences:
        if len(sentence) > maxChars:
            # Oversized single sentence: flush the accumulator first, then
            # hard-cut the sentence at word boundaries.
            # NOTE(review): a single "word" longer than maxChars is emitted
            # as-is (no mid-word cut) -- presumably acceptable for natural
            # language input; confirm if arbitrary unbroken strings can occur.
            if buffer:
                chunks.append(buffer.strip())
                buffer = ""
            words = sentence.split(" ")
            current = ""
            for word in words:
                candidate = (current + " " + word).strip() if current else word
                if len(candidate) > maxChars and current:
                    chunks.append(current.strip())
                    current = word
                else:
                    current = candidate
            if current:
                # Inject a terminator so strict synthesizer voices accept
                # the fragment (see module comment above _TTS_MAX_CHUNK_CHARS).
                if not re.search(r"[\.!\?\u2026]\s*$", current):
                    current = current.rstrip() + "."
                chunks.append(current.strip())
            continue
        # Normal case: greedily pack sentences into the buffer until the next
        # sentence would overflow maxChars, then emit the buffer as a chunk.
        candidate = (buffer + " " + sentence).strip() if buffer else sentence
        if len(candidate) > maxChars and buffer:
            chunks.append(buffer.strip())
            buffer = sentence
        else:
            buffer = candidate
    if buffer:
        chunks.append(buffer.strip())

    # Final pass: drop empties and guarantee every chunk ends in terminating
    # punctuation (unterminated long sentences are rejected by the TTS engine).
    finalized: List[str] = []
    for c in chunks:
        if not c:
            continue
        if not re.search(r"[\.!\?\u2026]\s*$", c):
            c = c.rstrip() + "."
        finalized.append(c)
    return finalized


async def _speakTextChunked(
    websocket: Optional[WebSocket],
    voiceInterface: Any,
    sessionId: str,
    voiceText: str,
    languageCode: str,
    voiceName: Optional[str],
    isCancelled: Optional[Callable[[], bool]] = None,
) -> Dict[str, Any]:
    """Run TTS in chunks and dispatch each ``playAudio`` over the websocket.

    Returns ``{"success": bool, "chunks": int, "played": int, "error":
    Optional[str], "cancelled": bool}``. Failure for one chunk does NOT abort
    the rest; partial playback still counts as ``success=True`` so the caller
    can decide whether to add a chat fallback for the missing parts.

    ``isCancelled`` is an optional zero-arg predicate the caller passes in to
    signal "abort the remaining chunks". It is checked BEFORE each Google TTS
    round-trip and again BEFORE each websocket send, so a stop word in the
    meeting can interrupt a multi-chunk dispatch within at most one chunk
    boundary instead of waiting for the whole answer to finish.
    """
    chunks = _splitTextForTts(voiceText)
    result: Dict[str, Any] = {"success": False, "chunks": len(chunks), "played": 0, "error": None, "cancelled": False}
    if not chunks:
        result["error"] = "no text"
        return result
    if voiceInterface is None:
        result["error"] = "no voice interface"
        return result

    lastError: Optional[str] = None
    for idx, chunk in enumerate(chunks, start=1):
        # Cancellation checkpoint #1: before the (slow) TTS round-trip.
        if isCancelled is not None and isCancelled():
            result["cancelled"] = True
            logger.info(
                f"Session {sessionId}: TTS chunk loop cancelled before chunk "
                f"{idx}/{len(chunks)} (user stop or newer answer in flight)"
            )
            break
        try:
            ttsResult = await voiceInterface.textToSpeech(
                text=chunk,
                languageCode=languageCode,
                voiceName=voiceName,
            )
        except Exception as ttsErr:  # pragma: no cover - network/runtime errors
            # Per-chunk failure policy: record and CONTINUE with the next
            # chunk (partial playback is better than silence).
            lastError = f"chunk {idx}/{len(chunks)} raised: {ttsErr}"
            logger.warning(f"Session {sessionId}: TTS {lastError}")
            continue
        # Missing "success" key is treated as success; only an explicit
        # False counts as failure.
        if not isinstance(ttsResult, dict) or ttsResult.get("success") is False:
            err = (ttsResult or {}).get("error", "unknown") if isinstance(ttsResult, dict) else "no result"
            lastError = f"chunk {idx}/{len(chunks)} failed: {err}"
            logger.warning(f"Session {sessionId}: TTS {lastError}")
            continue
        audioContent = ttsResult.get("audioContent")
        if not audioContent:
            lastError = f"chunk {idx}/{len(chunks)} returned no audioContent"
            logger.warning(f"Session {sessionId}: TTS {lastError}")
            continue
        # Transport failures, by contrast, BREAK: without a websocket no
        # later chunk can be delivered either.
        if websocket is None:
            lastError = "websocket unavailable"
            break
        # Cancellation checkpoint #2: after synthesis, before dispatch --
        # the already-synthesised audio for this chunk is deliberately
        # dropped on the floor.
        if isCancelled is not None and isCancelled():
            result["cancelled"] = True
            logger.info(
                f"Session {sessionId}: TTS chunk loop cancelled before "
                f"sending chunk {idx}/{len(chunks)} (audio dropped)"
            )
            break
        try:
            await websocket.send_text(json.dumps({
                "type": "playAudio",
                "sessionId": sessionId,
                "audio": {
                    # audioContent may arrive as bytes or str; normalise to
                    # bytes before base64 so json.dumps gets plain ASCII.
                    "data": base64.b64encode(
                        audioContent if isinstance(audioContent, bytes) else audioContent.encode()
                    ).decode(),
                    "format": "mp3",
                },
            }))
            result["played"] += 1
        except Exception as wsErr:  # pragma: no cover - websocket failures
            lastError = f"chunk {idx}/{len(chunks)} websocket send failed: {wsErr}"
            logger.warning(f"Session {sessionId}: TTS {lastError}")
            break

    # At least one chunk played counts as overall success; the caller can
    # inspect played/chunks to decide on a chat fallback for the remainder.
    result["success"] = result["played"] > 0
    if lastError:
        result["error"] = lastError
    return result


def _coercePersistedDetectedIntent(raw: Optional[str]) -> tuple:
    """Map free-form intent labels (e.g. agent:directorPrompt) to
    TeamsbotDetectedIntent for DB persistence; return (enum,
    meta_suffix_or_None for reasoning).

    Exact enum-value matches (case-insensitive) map directly; ``agent:*``
    labels are coerced to PROACTIVE with the original label (truncated to
    120 chars) preserved as metadata; anything else falls back to NONE,
    also keeping the truncated original label.
    """
    if not raw or not str(raw).strip():
        return TeamsbotDetectedIntent.NONE, None
    s = str(raw).strip().lower()
    for member in TeamsbotDetectedIntent:
        if member.value == s:
            return member, None
    if s.startswith("agent:"):
        return TeamsbotDetectedIntent.PROACTIVE, str(raw).strip()[:120]
    return TeamsbotDetectedIntent.NONE, str(raw).strip()[:120]


# Director prompts are PRIVATE operator instructions -- they must NOT be echoed
# verbatim into the meeting. The agent is asked to start its FINAL answer with
# either ``MEETING_REPLY:`` (followed by the text actually meant for the meeting)
# or ``SILENT:`` / ``INTERNAL_ONLY:`` (followed by an internal note for the
# operator UI). Anything else -> treat as silent (safe default).
_DIRECTOR_REPLY_PATTERN = re.compile(
    r"^\s*(MEETING_REPLY|MEETING|REPLY|SAY|SPEAK)\s*:\s*",
    re.IGNORECASE,
)
_DIRECTOR_SILENT_PATTERN = re.compile(
    r"^\s*(SILENT|INTERNAL(?:_ONLY)?|NOTE|NO_MEETING_OUTPUT|ACK(?:NOWLEDGE)?)\s*:\s*",
    re.IGNORECASE,
)


def _parseDirectorPromptFinal(finalText: str) -> Dict[str, Any]:
    """Parse the agent's final answer for a director prompt.

    Returns ``{"kind": "meeting"|"silent", "meetingText": str, "internalNote": str}``.
    Default is ``silent`` so unmarked replies are NOT broadcast into the meeting.
""" text = (finalText or "").strip() if not text: return {"kind": "silent", "meetingText": "", "internalNote": ""} meetingMatch = _DIRECTOR_REPLY_PATTERN.match(text) if meetingMatch: body = text[meetingMatch.end():].strip() return {"kind": "meeting", "meetingText": body, "internalNote": ""} silentMatch = _DIRECTOR_SILENT_PATTERN.match(text) if silentMatch: body = text[silentMatch.end():].strip() return {"kind": "silent", "meetingText": "", "internalNote": body} # No marker โ†’ safe default: do NOT spam the meeting with the agent's # internal reasoning. Keep the full text as an internal note for the # operator UI so nothing is lost. return {"kind": "silent", "meetingText": "", "internalNote": text} # ========================================================================= # Active Service Registry (sessionId -> running TeamsbotService instance) # # Required so HTTP endpoints (e.g. director-prompt POST) can reach the # TeamsbotService instance currently holding the live websocket + voice # interface for that session, without going through the websocket loop. 
# ========================================================================= _activeServices: Dict[str, "TeamsbotService"] = {} def getActiveService(sessionId: str) -> Optional["TeamsbotService"]: """Return the running TeamsbotService for a session, or None if not active.""" return _activeServices.get(sessionId) # ========================================================================= # AI Service Factory (for billing-aware AI calls) # ========================================================================= def createAiService(user, mandateId, featureInstanceId=None): """Create a properly wired AiService via the service center.""" ctx = ServiceCenterContext( user=user, mandate_id=mandateId, feature_instance_id=featureInstanceId, feature_code="teamsbot", ) return _getServiceCenterService("ai", ctx) # ========================================================================= # Session Event Queues (for SSE streaming to frontend) # ========================================================================= sessionEvents: Dict[str, asyncio.Queue] = {} async def _emitSessionEvent(sessionId: str, eventType: str, data: Any): """Emit an event to the session's SSE stream. Creates the queue on-demand so events are never silently dropped.""" if sessionId not in sessionEvents: sessionEvents[sessionId] = asyncio.Queue() await sessionEvents[sessionId].put({"type": eventType, "data": data, "timestamp": getIsoTimestamp()}) def _normalizeGatewayHostForBotWs(host: str) -> str: """Use IPv4 loopback for local dev WebSocket URLs passed to the Node browser-bot. Node on Windows often resolves ``localhost`` to ``::1`` first; Uvicorn bound to ``0.0.0.0`` typically accepts IPv4 only, so the bot gets ``ECONNREFUSED ::1``. 
""" h = host.strip() lower = h.lower() if lower == "localhost": return "127.0.0.1" if lower.startswith("localhost:"): return "127.0.0.1" + h[len("localhost"):] if lower.startswith("[::1]:"): return "127.0.0.1" + h.partition("]")[2] if lower in ("[::1]", "::1"): return "127.0.0.1" return h class TeamsbotService: """ Pipeline Orchestrator for Teams Bot sessions. Coordinates VoiceObjects (STT/TTS), AiService (SPEECH_TEAMS), and Bridge communication. """ def __init__(self, currentUser: User, mandateId: str, instanceId: str, config: TeamsbotConfig): self.currentUser = currentUser self.mandateId = mandateId self.instanceId = instanceId self.config = config self.browserBotConnector = BrowserBotConnector(config._getEffectiveBrowserBotUrl()) # State self._lastAiCallTime: float = 0.0 self._aiAnalysisInProgress: bool = False self._contextBuffer: List[Dict[str, Any]] = [] self._sessionContext: Optional[str] = None # User-provided background context self._contextSummary: Optional[str] = None # AI-generated summary of long context # Differential transcript tracking self._lastTranscriptSpeaker: Optional[str] = None self._lastTranscriptText: Optional[str] = None self._lastTranscriptId: Optional[str] = None self._lastSttTime: float = 0.0 self._lastBotResponseText: Optional[str] = None self._lastBotResponseTs: float = 0.0 # Speaker attribution: simple last-caption-speaker model self._lastCaptionSpeaker: Optional[str] = None self._unattributedTranscriptIds: List[str] = [] self._knownSpeakers: set = set() # Debounced name trigger: wait for speaker to finish before AI analysis self._pendingNameTrigger: Optional[Dict[str, Any]] = None self._followUpWindowEnd: float = 0.0 # Quick-ack throttle (timestamp of the last short "Moment..." ack we # spoke into the meeting). Without this guard a long sentence with # multiple name mentions would trigger several acks in a row. self._lastQuickAckTs: float = 0.0 # Session-scoped phrase pool for SHORT ephemeral utterances (quick # acks, "checking..." 
notices, per-round progress). Lazily populated # by the AI in the bot's configured language + persona โ€” no hardcoded # strings or hardcoded language branching anywhere downstream. Keyed # by the kinds defined in ``_EPHEMERAL_PHRASE_INTENTS``. # * ``self._phrasePool[kind]`` -> list of variants for that kind # * ``self._phrasePoolIdx[kind]`` -> round-robin pointer # Concurrent generation calls for the same kind are serialised by the # lock so we don't spawn duplicate AI requests on a burst. self._phrasePool: Dict[str, List[str]] = {} self._phrasePoolIdx: Dict[str, int] = {} self._phrasePoolLock: asyncio.Lock = asyncio.Lock() # Voice pipeline: a single per-session lock that serialises every TTS # dispatch into the meeting. Without it three independent code paths # (SPEECH_TEAMS direct answer, agent escalation final answer, and # operator-driven director prompt) can all reach # ``websocket.send_text({"type": "playAudio", ...})`` at the same time # and the browser bot then plays interleaved chunks โ€” i.e. "two bots # talking over each other" exactly as the operator suspects. Chat # (text) sends are NOT locked: they're cheap and can interleave fine. self._meetingTtsLock: asyncio.Lock = asyncio.Lock() # Generation counter incremented every time we begin producing a NEW # meeting answer OR every time the user issues a hard stop. Any TTS # chunk loop captures the counter value at start; before sending # each chunk to the bot it re-checks the counter and bails out if # it has moved on. This is what makes "Stopp" actually feel # instantaneous: the in-flight TTS dispatch loop drops itself the # moment the next chunk would have been sent, without waiting for # any AI round-trip or extra Google TTS call to come back. self._answerGenerationCounter: int = 0 # Tracking handles for cancellable background tasks. Keeping a # reference lets ``_cancelInFlightSpeech`` actually call # ``task.cancel()`` instead of just hoping the task notices the # generation counter has moved on. 
Cleared in the task's own # ``finally`` block. self._currentEscalationTask: Optional[asyncio.Task] = None self._currentQuickAckTask: Optional[asyncio.Task] = None # Whether an agent escalation task is in flight. Kept separate from # ``_aiAnalysisInProgress`` (which only covers the SPEECH_TEAMS phase) # so a new speech trigger that arrives WHILE the agent is still # researching does not start a parallel SPEECH_TEAMS that would then # answer at the same time as the agent. self._agentEscalationInFlight: bool = False # Live transport handles for out-of-band actions (director prompts, agent escalation). # Set in handleBotWebSocket once the bot connects; cleared on disconnect. self._activeSessionId: Optional[str] = None self._websocket: Optional[WebSocket] = None self._voiceInterface = None # Persistent director prompts kept in memory for context injection across triggers. # Loaded from DB on (re)connect; mutated by submit/delete director prompt routes. self._activePersistentPrompts: List[Dict[str, Any]] = [] # Recent director-prompt briefings (one-shot AND persistent) โ€” keeps the # operator's attached files and the agent's internal analysis available # for later SPEECH_TEAMS triggers, even after a one-shot prompt has been # consumed. Without this pool, the bot "forgets" attached docs as soon # as the director prompt finished, and answers later meeting questions # ("summarize the doc") with general babble instead of the file content. # Capped by ``_RECENT_DIRECTOR_BRIEFINGS_MAX`` to bound prompt size. 
self._recentDirectorBriefings: List[Dict[str, Any]] = [] # ========================================================================= # Session Lifecycle # ========================================================================= async def joinMeeting( self, sessionId: str, meetingLink: str, connectionId: Optional[str] = None, gatewayBaseUrl: str = "", botAccountEmail: Optional[str] = None, botAccountPassword: Optional[str] = None, ): """Send join command to the Browser Bot service. The browser bot will: 1. Launch browser (headful if credentials provided, headless otherwise) 2. Navigate to Teams web app 3. Authenticate if credentials provided, otherwise join as anonymous guest 4. Enable captions/audio capture and start scraping 5. Connect back via WebSocket to send transcripts """ from . import interfaceFeatureTeamsbot as interfaceDb interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId) # Initialize SSE event queue sessionEvents[sessionId] = asyncio.Queue() try: # Update status to JOINING interface.updateSession(sessionId, {"status": TeamsbotSessionStatus.JOINING.value}) await _emitSessionEvent(sessionId, "statusChange", {"status": "joining"}) # Send join command to browser bot session = interface.getSession(sessionId) if not session: raise ValueError(f"Session {sessionId} not found") # Build the full WebSocket URL for the bot to connect back to this gateway instance # gatewayBaseUrl is passed from the route handler (derived from request.base_url) wsScheme = "wss" if gatewayBaseUrl.startswith("https") else "ws" gatewayHost = gatewayBaseUrl.replace("https://", "").replace("http://", "").rstrip("/") gatewayHost = _normalizeGatewayHostForBotWs(gatewayHost) fullGatewayWsUrl = f"{wsScheme}://{gatewayHost}/api/teamsbot/{self.instanceId}/bot/ws/{sessionId}" hasAuth = bool(botAccountEmail and botAccountPassword) logger.info(f"Joining meeting for session {sessionId}: auth={hasAuth}, email={botAccountEmail or 'N/A'}, 
            transferMode={self.config.transferMode}")
            result = await self.browserBotConnector.joinMeeting(
                sessionId=sessionId,
                meetingUrl=meetingLink,
                botName=session.get("botName", self.config.botName),
                instanceId=self.instanceId,
                gatewayWsUrl=fullGatewayWsUrl,
                language=self.config.language,
                botAccountEmail=botAccountEmail,
                botAccountPassword=botAccountPassword,
                # hasattr guards: older persisted configs may predate these
                # fields — fall back to safe defaults instead of raising.
                transferMode=self.config.transferMode if hasattr(self.config, 'transferMode') else "auto",
                debugMode=self.config.debugMode if hasattr(self.config, 'debugMode') else False,
            )
            if result.get("success"):
                interface.updateSession(sessionId, {
                    "status": TeamsbotSessionStatus.JOINING.value,  # Will become ACTIVE when bot connects via WS
                })
                logger.info(f"Browser bot deployment started for session {sessionId}")
            else:
                errorMsg = result.get("error", "Unknown error joining meeting")
                interface.updateSession(sessionId, {
                    "status": TeamsbotSessionStatus.ERROR.value,
                    "errorMessage": errorMsg,
                })
                await _emitSessionEvent(sessionId, "statusChange", {"status": "error", "errorMessage": errorMsg})
                logger.error(f"Failed to deploy browser bot for session {sessionId}: {errorMsg}")
        except Exception as e:
            logger.error(f"Error joining meeting for session {sessionId}: {e}")
            interface.updateSession(sessionId, {
                "status": TeamsbotSessionStatus.ERROR.value,
                "errorMessage": str(e),
            })
            await _emitSessionEvent(sessionId, "statusChange", {"status": "error", "errorMessage": str(e)})

    async def leaveMeeting(self, sessionId: str) -> None:
        """Send leave command to the Browser Bot service.

        Status progression: LEAVING -> (bot leaves) -> ENDED, with the
        meeting summary generated in a background task. On failure the
        session is marked ERROR but still gets an ``endedAt`` timestamp.
        The per-session event queue is dropped in both cases.
        """
        from . import interfaceFeatureTeamsbot as interfaceDb
        interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId)
        try:
            interface.updateSession(sessionId, {"status": TeamsbotSessionStatus.LEAVING.value})
            await _emitSessionEvent(sessionId, "statusChange", {"status": "leaving"})
            await self.browserBotConnector.leaveMeeting(sessionId)
            interface.updateSession(sessionId, {
                "status": TeamsbotSessionStatus.ENDED.value,
                "endedAt": getIsoTimestamp(),
            })
            await _emitSessionEvent(sessionId, "statusChange", {"status": "ended"})
            # Generate meeting summary in background (fire-and-forget).
            asyncio.create_task(self._generateMeetingSummary(sessionId))
            logger.info(f"Bot left meeting for session {sessionId}")
        except Exception as e:
            logger.error(f"Error leaving meeting for session {sessionId}: {e}")
            interface.updateSession(sessionId, {
                "status": TeamsbotSessionStatus.ERROR.value,
                "errorMessage": str(e),
                "endedAt": getIsoTimestamp(),
            })
        # Cleanup event queue — runs on both the success and error path.
        sessionEvents.pop(sessionId, None)

    # =========================================================================
    # Browser Bot WebSocket Communication
    # =========================================================================

    async def handleBotWebSocket(self, websocket: WebSocket, sessionId: str) -> None:
        """
        Main WebSocket handler for Browser Bot communication.

        Receives (message ``type`` field): transcript, chatMessage, status,
        audioChunk, voiceGreeting (legacy), requestGreeting, ping,
        ttsPlaybackAck, mfaChallenge, chatSendFailed, mfaResolved.

        Sends:
        - playAudio: TTS audio for the bot to play in the meeting
        - pong / mfaResponse replies on the same socket

        Also registers this service instance in ``_activeServices`` so
        out-of-band callers (director prompts, agent escalation) can reach
        the live websocket; deregistered in the ``finally`` block.
        """
        from . import interfaceFeatureTeamsbot as interfaceDb
        from modules.interfaces.interfaceVoiceObjects import getVoiceInterface

        interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId)
        voiceInterface = getVoiceInterface(self.currentUser, self.mandateId)

        # Load session context (user-provided background knowledge).
        # If the context is long (>500 chars), summarize it to reduce token usage.
        session = interface.getSession(sessionId)
        if session:
            rawContext = session.get("sessionContext")
            if rawContext and len(rawContext) > 500:
                logger.info(f"Session {sessionId}: Summarizing long session context ({len(rawContext)} chars)...")
                self._sessionContext = await self._summarizeSessionContext(sessionId, rawContext)
            elif rawContext:
                self._sessionContext = rawContext
            if self._sessionContext:
                logger.info(f"Session {sessionId}: Session context ready ({len(self._sessionContext)} chars)")

        # Resolve system bot email for speaker detection (prevents bot from
        # triggering AI on its own speech). Best-effort: on any failure the
        # email is simply left unset.
        try:
            systemBot = interface.getActiveSystemBot(self.mandateId)
            self._botAccountEmail = systemBot.get("email") if systemBot else None
            if self._botAccountEmail:
                logger.info(f"Session {sessionId}: Bot account email resolved: {self._botAccountEmail}")
        except Exception:
            self._botAccountEmail = None

        # Register the live service so out-of-band callers (director prompts,
        # agent escalation) can deliver text/audio through this same websocket.
        self._activeSessionId = sessionId
        self._websocket = websocket
        self._voiceInterface = voiceInterface
        _activeServices[sessionId] = self

        # Notify the operator UI that the bot's WebSocket is now live so the
        # director-prompt panel can enable its submit button.
        try:
            await _emitSessionEvent(sessionId, "botConnectionState", {
                "connected": True,
                "timestamp": getIsoTimestamp(),
            })
        except Exception:
            pass

        # Restore active persistent director prompts from DB (survives reconnects).
        try:
            self._activePersistentPrompts = interface.getActivePersistentPrompts(sessionId) or []
            if self._activePersistentPrompts:
                logger.info(
                    f"Session {sessionId}: Loaded {len(self._activePersistentPrompts)} active persistent director prompt(s)"
                )
        except Exception as restoreErr:
            logger.warning(f"Session {sessionId}: Could not restore persistent director prompts: {restoreErr}")
            self._activePersistentPrompts = []

        # Pre-warm the ephemeral phrase pool in the background so the first
        # quick-ack ("Moment...") and interim agent notice don't have to wait
        # for the AI round-trip. Best-effort: if generation fails, the
        # corresponding ephemeral cue is silently skipped at runtime — never
        # falls back to hardcoded language strings.
        asyncio.create_task(self._warmEphemeralPhrasePool(sessionId))

        logger.info(f"[WS] Handler started for session {sessionId}")
        try:
            msgCount = 0
            while True:
                data = await websocket.receive()
                msgCount += 1
                if "text" not in data:
                    logger.debug(f"[WS] session={sessionId} msg #{msgCount}: non-text data (keys: {list(data.keys())})")
                    continue

                message = json.loads(data["text"])
                msgType = message.get("type")
                # audioChunk/ping are high-frequency — keep them out of INFO logs.
                if msgType not in ("audioChunk", "ping"):
                    logger.info(f"[WS] session={sessionId} msg #{msgCount}: type={msgType}")

                if msgType == "transcript":
                    transcript = message.get("transcript", {})
                    source = transcript.get("source", "caption")
                    speaker = transcript.get("speaker", "Unknown")
                    textPreview = (transcript.get("text", "") or "")[:60]
                    # Caption/speakerHint: name resolution only; transcript comes from STT
                    logger.info(f"[WS] Transcript (source={source}, speaker={speaker}): {textPreview}...")
                    await self._processTranscript(
                        sessionId=sessionId,
                        speaker=transcript.get("speaker", "Unknown"),
                        text=transcript.get("text", ""),
                        isFinal=transcript.get("isFinal", True),
                        interface=interface,
                        voiceInterface=voiceInterface,
                        websocket=websocket,
                        source=source,
                    )

                elif msgType == "chatMessage":
                    chat = message.get("chat", {})
                    isHistory = chat.get("isHistory", False)
                    # History messages get a distinct source so the transcript
                    # pipeline can store them without triggering AI analysis.
                    source = "chatHistory" if isHistory else "chat"
                    logger.info(
                        f"[WS] Chat{'[HISTORY]' if isHistory else ''}: "
                        f"speaker={chat.get('speaker')}, text={chat.get('text', '')[:60]}..."
                    )
                    await self._processTranscript(
                        sessionId=sessionId,
                        speaker=chat.get("speaker", "Unknown"),
                        text=chat.get("text", ""),
                        isFinal=True,
                        interface=interface,
                        voiceInterface=voiceInterface,
                        websocket=websocket,
                        source=source,
                    )

                elif msgType == "status":
                    status = message.get("status")
                    errorMessage = message.get("message")
                    logger.info(f"[WS] Status: status={status}, message={errorMessage}")
                    await self._handleBotStatus(sessionId, status, errorMessage, interface)

                elif msgType == "audioChunk":
                    audioData = message.get("audio", {})
                    audioBase64 = audioData.get("data", "")
                    sampleRate = audioData.get("sampleRate", 16000)
                    captureDiagnostics = audioData.get("captureDiagnostics") or {}
                    if audioBase64:
                        await self._processAudioChunk(
                            sessionId=sessionId,
                            audioBase64=audioBase64,
                            sampleRate=sampleRate,
                            captureDiagnostics=captureDiagnostics,
                            interface=interface,
                            voiceInterface=voiceInterface,
                            websocket=websocket,
                        )

                elif msgType == "voiceGreeting":
                    # Legacy path: older bot images send a pre-built greeting
                    # text. New bots use ``requestGreeting`` and let the
                    # Gateway own greeting generation.
                    greetingText = message.get("text", "")
                    greetingLang = message.get("language", self.config.language)
                    logger.info(
                        f"[WS] Voice greeting (legacy): text={greetingText[:60]}..., language={greetingLang}"
                    )
                    if greetingText and voiceInterface:
                        await self._dispatchGreetingToMeeting(
                            sessionId=sessionId,
                            greetingText=greetingText,
                            greetingLang=greetingLang,
                            sendToChat=False,
                            interface=interface,
                            voiceInterface=voiceInterface,
                            websocket=websocket,
                        )

                elif msgType == "requestGreeting":
                    # New path: bot just signals "I have joined" — Gateway
                    # generates the greeting text via AI in the configured
                    # language + persona, then dispatches it to BOTH the
                    # meeting chat (sendChatMessage command) and TTS. No
                    # hardcoded language strings on the bot side.
                    requestedLang = (
                        message.get("language") or self.config.language or ""
                    ).strip() or "en-US"
                    botNameHint = (
                        message.get("botName") or self.config.botName or ""
                    ).strip() or self.config.botName
                    logger.info(
                        f"[WS] Greeting request from bot: language={requestedLang}, name={botNameHint}"
                    )
                    if voiceInterface:
                        try:
                            greetingText = await self._generateGreetingText(
                                requestedLang
                            )
                        except Exception as genErr:
                            logger.warning(
                                f"Greeting generation failed for session {sessionId}: {genErr}"
                            )
                            greetingText = ""
                        if greetingText:
                            await self._dispatchGreetingToMeeting(
                                sessionId=sessionId,
                                greetingText=greetingText,
                                greetingLang=requestedLang,
                                sendToChat=True,
                                interface=interface,
                                voiceInterface=voiceInterface,
                                websocket=websocket,
                            )
                        else:
                            # Deliberately silent toward the meeting: no
                            # fallback greeting text is hardcoded here.
                            logger.warning(
                                f"Session {sessionId}: Skipping greeting โ€” AI generation produced no text"
                            )

                elif msgType == "ping":
                    await websocket.send_text(json.dumps({"type": "pong"}))

                elif msgType == "ttsPlaybackAck":
                    playback = message.get("playback", {}) or {}
                    status = playback.get("status", "unknown")
                    ackMessage = playback.get("message") or "Bot playback status update"
                    logger.info(
                        f"[WS] TTS playback ack: status={status}, format={playback.get('format')}, "
                        f"bytesBase64={playback.get('bytesBase64')}"
                    )
                    await _emitSessionEvent(sessionId, "ttsDeliveryStatus", {
                        "status": f"playback_{status}",
                        "hasWebSocket": True,
                        "message": ackMessage,
                        "timestamp": playback.get("timestamp") or getIsoTimestamp(),
                        "format": playback.get("format"),
                        "bytesBase64": playback.get("bytesBase64"),
                    })

                elif msgType == "mfaChallenge":
                    mfaData = message.get("mfa", {})
                    mfaType = mfaData.get("type", "unknown")
                    displayNumber = mfaData.get("displayNumber")
                    prompt = mfaData.get("prompt", "")
                    logger.info(f"[WS] MFA challenge: type={mfaType}, number={displayNumber}, prompt={prompt[:60]}")
                    await _emitSessionEvent(sessionId, "mfaChallenge", {
                        "mfaType": mfaType,
                        "displayNumber": displayNumber,
                        "prompt": prompt,
                        "timestamp": getIsoTimestamp(),
                    })
                    # Bridge the operator-entered MFA code (arrives via the
                    # HTTP route) back to the bot over this websocket. The
                    # waiter task times out after 120s.
                    from .routeFeatureTeamsbot import mfaCodeQueues, mfaWaitTasks
                    mfaQueue = asyncio.Queue()
                    mfaCodeQueues[sessionId] = mfaQueue

                    async def _waitAndForwardMfa(sid, queue, ws):
                        # Forward the first queued MFA response to the bot;
                        # on timeout send an explicit "timeout" action so the
                        # bot can abort its sign-in attempt.
                        try:
                            mfaResponse = await asyncio.wait_for(queue.get(), timeout=120.0)
                            logger.info(f"[WS] MFA response received for session {sid}: action={mfaResponse.get('action')}")
                            await ws.send_text(json.dumps({
                                "type": "mfaResponse",
                                "sessionId": sid,
                                "mfa": mfaResponse,
                            }))
                        except asyncio.TimeoutError:
                            logger.warning(f"[WS] MFA response timeout for session {sid}")
                            await ws.send_text(json.dumps({
                                "type": "mfaResponse",
                                "sessionId": sid,
                                "mfa": {"action": "timeout"},
                            }))
                            await _emitSessionEvent(sid, "mfaChallenge", {
                                "mfaType": "timeout",
                                "prompt": "MFA-Zeitlimit ueberschritten. Bitte erneut versuchen.",
                            })
                        except asyncio.CancelledError:
                            logger.info(f"[WS] MFA wait cancelled for session {sid} (resolved via page)")
                        finally:
                            mfaCodeQueues.pop(sid, None)
                            mfaWaitTasks.pop(sid, None)

                    mfaWaitTasks[sessionId] = asyncio.create_task(
                        _waitAndForwardMfa(sessionId, mfaQueue, websocket)
                    )

                elif msgType == "chatSendFailed":
                    errorData = message.get("error", {})
                    reason = errorData.get("reason", "unknown")
                    failedText = errorData.get("text", "")
                    logger.warning(
                        f"[WS] Chat send failed for session {sessionId}: "
                        f"reason={reason}, text={failedText[:60]}"
                    )
                    await _emitSessionEvent(sessionId, "chatSendFailed", {
                        "reason": reason,
                        "message": errorData.get("message", "Chat message could not be sent"),
                        "text": failedText,
                        "timestamp": getIsoTimestamp(),
                    })

                elif msgType == "mfaResolved":
                    # Bot resolved MFA on its own page — cancel the waiter
                    # task and drop the queue so nothing stale is forwarded.
                    success = message.get("success", False)
                    logger.info(f"[WS] MFA resolved: success={success}")
                    from .routeFeatureTeamsbot import mfaCodeQueues, mfaWaitTasks
                    task = mfaWaitTasks.pop(sessionId, None)
                    if task and not task.done():
                        task.cancel()
                    mfaCodeQueues.pop(sessionId, None)
                    await _emitSessionEvent(sessionId, "mfaResolved", {
                        "success": success,
                        "timestamp": getIsoTimestamp(),
                    })

        except Exception as e:
            # WebSocket disconnects surface here as exceptions; only log
            # errors that are not plain disconnects.
            if "disconnect" not in str(e).lower():
                logger.error(f"[WS] Error for session {sessionId}: {type(e).__name__}: {e}")
        finally:
            # Deregister only if WE are still the registered service — a
            # reconnect may already have replaced this instance.
            if _activeServices.get(sessionId) is self:
                _activeServices.pop(sessionId, None)
            self._websocket = None
            self._voiceInterface = None
            self._activeSessionId = None
            try:
                await _emitSessionEvent(sessionId, "botConnectionState", {
                    "connected": False,
                    "timestamp": getIsoTimestamp(),
                })
            except Exception:
                pass
            logger.info(f"[WS] Handler ended for session {sessionId} after {msgCount} messages")

    async def _handleBotStatus(
        self,
        sessionId: str,
        status: str,
        errorMessage: Optional[str],
        interface,
    ) -> None:
        """Handle status updates from the browser bot.

        Maps the bot's raw status strings onto DB session statuses
        (unknown statuses default to ACTIVE), persists the transition,
        notifies the UI, and kicks off summary generation when the
        session has ended.
        """
        logger.info(f"Bot status update for session {sessionId}: {status}")
        statusMap = {
            "connecting": TeamsbotSessionStatus.JOINING.value,
            "launching": TeamsbotSessionStatus.JOINING.value,
            "navigating": TeamsbotSessionStatus.JOINING.value,
            "in_lobby": TeamsbotSessionStatus.JOINING.value,
            "joined": TeamsbotSessionStatus.ACTIVE.value,
            "in_meeting": TeamsbotSessionStatus.ACTIVE.value,
            "left": TeamsbotSessionStatus.ENDED.value,
            "error": TeamsbotSessionStatus.ERROR.value,
        }
        dbStatus = statusMap.get(status, TeamsbotSessionStatus.ACTIVE.value)
        updates = {"status": dbStatus}
        if errorMessage:
            updates["errorMessage"] = errorMessage
        if dbStatus == TeamsbotSessionStatus.ACTIVE.value:
            updates["startedAt"] = getIsoTimestamp()
        elif dbStatus in [TeamsbotSessionStatus.ENDED.value, TeamsbotSessionStatus.ERROR.value]:
            updates["endedAt"] = getIsoTimestamp()
        interface.updateSession(sessionId, updates)
        await _emitSessionEvent(sessionId, "statusChange", {"status": status, "errorMessage": errorMessage})
        # Generate summary when session ends
        if dbStatus == TeamsbotSessionStatus.ENDED.value:
            asyncio.create_task(self._generateMeetingSummary(sessionId))

    async def _processAudioChunk(
        self,
        sessionId: str,
        audioBase64: str,
        sampleRate: int,
        captureDiagnostics: Optional[Dict[str, Any]],
        interface,
        voiceInterface,
        websocket: WebSocket,
    ) -> None:
        """Process an audio chunk from WebRTC capture — run STT and feed the
        result into the transcript pipeline.

        Chunks shorter than 1000 bytes and chunks whose reported RMS is
        below the silence floor are dropped without an STT call. STT errors
        are logged, never raised.
        """
        # NOTE(review): base64 is already imported at module level; this
        # local import is redundant but kept byte-for-byte.
        import base64
        try:
            audioBytes = base64.b64decode(audioBase64)
            # Too small to contain usable speech — skip cheaply.
            if len(audioBytes) < 1000:
                return
            if captureDiagnostics:
                trackId = captureDiagnostics.get("trackId")
                readyState = captureDiagnostics.get("readyState")
                rms = captureDiagnostics.get("rms")
                nativeSampleRate = captureDiagnostics.get("nativeSampleRate")
                logger.debug(
                    f"[AudioChunk] diagnostics: track={trackId}, readyState={readyState}, "
                    f"rms={rms}, nativeRate={nativeSampleRate}, bytes={len(audioBytes)}"
                )

            # Use RMS from capture diagnostics to skip real silence.
            # Byte-variation heuristics produced false positives and dropped valid speech.
            if captureDiagnostics and captureDiagnostics.get("rms") is not None:
                try:
                    rmsVal = float(captureDiagnostics.get("rms"))
                    if rmsVal < 0.0003:
                        logger.debug(f"[AudioChunk] Skipping silent audio ({len(audioBytes)} bytes, rms={rmsVal:.6f})")
                        return
                except Exception:
                    pass

            if not voiceInterface:
                logger.warning(f"[AudioChunk] No voice interface available for session {sessionId}")
                return

            # Treat sampleRate=0 as unknown (triggers auto-detection)
            effectiveSampleRate = sampleRate if sampleRate and sampleRate > 0 else None
            # Known speaker names + bot name bias the STT toward correct
            # proper-noun recognition.
            phraseHints = list(self._knownSpeakers)
            if self.config.botName:
                phraseHints.append(self.config.botName)

            sttResult = await voiceInterface.speechToText(
                audioContent=audioBytes,
                language=self.config.language or "de-DE",
                sampleRate=effectiveSampleRate,
                channels=1,
                skipFallbacks=True,
                phraseHints=phraseHints if phraseHints else None,
                alternativeLanguages=["en-US"],
            )

            if sttResult and sttResult.get("success") and sttResult.get("text"):
                text = sttResult["text"].strip()
                if text:
                    resolvedSpeaker = self._resolveSpeakerForAudioCapture()
                    fromCaption = resolvedSpeaker.get("speakerResolvedFromHint", False)
                    logger.info(
                        f"[AudioChunk] STT result: speaker={resolvedSpeaker.get('speaker', 'Meeting Audio')} "
                        f"(fromCaption={fromCaption}), text={text[:80]}..."
                    )
                    await self._processTranscript(
                        sessionId=sessionId,
                        speaker=resolvedSpeaker["speaker"],
                        text=text,
                        isFinal=True,
                        interface=interface,
                        voiceInterface=voiceInterface,
                        websocket=websocket,
                        source="audioCapture",
                        speakerResolvedFromHint=resolvedSpeaker["speakerResolvedFromHint"],
                    )
        except Exception as e:
            logger.error(f"[AudioChunk] STT error for session {sessionId}: {type(e).__name__}: {e}")

    def _registerSpeakerHint(self, speaker: str, text: str, sessionId: str = "") -> None:
        """Track current speaker from captions for STT attribution.

        When the first non-bot caption arrives, retroactively attributes any
        STT segments that were created before a speaker was known: updates
        the DB transcripts, the in-memory context buffer, and the
        last-speaker marker, then clears the unattributed-id list."""
        if not speaker:
            return
        normalizedSpeaker = speaker.strip()
        # Ignore empty names and the bot's own caption lines.
        if not normalizedSpeaker or self._isBotSpeaker(normalizedSpeaker):
            return
        prevSpeaker = self._lastCaptionSpeaker
        self._lastCaptionSpeaker = normalizedSpeaker
        self._knownSpeakers.add(normalizedSpeaker)
        if prevSpeaker is None and self._unattributedTranscriptIds:
            from . import interfaceFeatureTeamsbot as interfaceDb
            interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId)
            for tid in self._unattributedTranscriptIds:
                interface.updateTranscript(tid, {"speaker": normalizedSpeaker})
            for seg in self._contextBuffer:
                if seg.get("speaker") == "Unknown" and seg.get("source") == "audioCapture":
                    seg["speaker"] = normalizedSpeaker
            if self._lastTranscriptSpeaker == "Unknown":
                self._lastTranscriptSpeaker = normalizedSpeaker
            logger.info(
                f"Session {sessionId}: Retroactive speaker attribution: "
                f"{len(self._unattributedTranscriptIds)} segments -> {normalizedSpeaker}"
            )
            self._unattributedTranscriptIds.clear()
        # A caption means someone is speaking — keep any debounced name
        # trigger alive.
        if self._pendingNameTrigger:
            self._pendingNameTrigger["lastActivity"] = time.time()

    def _resolveSpeakerForAudioCapture(self) -> Dict[str, Any]:
        """Speaker name for audio chunks — uses the last caption speaker."""
        if self._lastCaptionSpeaker:
            return {"speaker": self._lastCaptionSpeaker, "speakerResolvedFromHint": True}
        return {"speaker": "Unknown", "speakerResolvedFromHint": False}

    async def _processTranscript(
        self,
        sessionId: str,
        speaker: str,
        text: str,
        isFinal: bool,
        interface,
        voiceInterface,
        websocket: WebSocket,
        source: str = "caption",
        speakerResolvedFromHint: Optional[bool] = None,
    ) -> None:
        """Process a transcript segment from captions or chat messages.

        Differential writing: When the same speaker continues (text grows
        incrementally as captions stream), we UPDATE the existing DB record
        instead of creating a cascade of near-duplicate rows. A new record
        is only created when the speaker changes or the text is not a
        continuation of the previous segment.

        ``source`` is one of: caption, speakerHint, chat, chatHistory,
        audioCapture — each takes a different path below.
        """
        text = text.strip()
        if not text:
            return

        # Captions are used ONLY for speaker name resolution (never as transcript).
        # Transcript text comes exclusively from audio STT or chat.
        # Address detection (bot name in caption) still triggers AI analysis
        # using existing audio-based context — but caption text itself is NOT
        # added to the context buffer.
        if source in ("caption", "speakerHint"):
            self._registerSpeakerHint(speaker, text, sessionId)
            if (
                source == "speakerHint"
                and isFinal
                and not self._isBotSpeaker(speaker)
                and self.config.responseMode != TeamsbotResponseMode.TRANSCRIBE_ONLY
                and self._detectBotName(text)
            ):
                triggerTranscript = {"id": None, "speaker": speaker, "text": text, "source": source}
                isNew = self._setPendingNameTrigger(sessionId, interface, voiceInterface, websocket, triggerTranscript)
                if isNew:
                    logger.info(f"Session {sessionId}: Bot name in caption, debounce trigger started")
                    asyncio.create_task(self._checkPendingNameTrigger())
                    # Fire a short audible "Moment..." in parallel so the
                    # speaker hears the bot react immediately, instead of
                    # waiting for debounce + SPEECH_TEAMS + agent (~5-30s).
                    self._currentQuickAckTask = asyncio.create_task(
                        self._runQuickAck(sessionId)
                    )
            return

        # Chat history: messages sent before the bot joined the meeting.
        # Stored in DB for reference but NOT added to the AI context buffer,
        # because old messages (e.g. "nyla, summarize the protocol") would
        # be treated as current requests when AI analysis is triggered.
        if source == "chatHistory":
            transcriptData = TeamsbotTranscript(
                sessionId=sessionId,
                speaker=speaker,
                text=text,
                timestamp=getIsoTimestamp(),
                confidence=1.0,
                language=self.config.language,
                isFinal=True,
                source="chatHistory",
            ).model_dump()
            createdTranscript = interface.createTranscript(transcriptData)
            await _emitSessionEvent(sessionId, "transcript", {
                "id": createdTranscript.get("id"),
                "speaker": speaker,
                "text": text,
                "confidence": 1.0,
                "timestamp": getIsoTimestamp(),
                "isContinuation": False,
                "source": "chatHistory",
                "isHistory": True,
            })
            logger.debug(f"Session {sessionId}: Chat history stored (no AI trigger): [{speaker}] {text[:60]}")
            return

        # Filter out the bot's own speech (caption/audioCapture) — garbled text
        # pollutes context. Chat from the bot is clean text and must appear in
        # the transcript for all participants.
        isBotSpeaker = self._isBotSpeaker(speaker)
        if isBotSpeaker and source != "chat":
            logger.debug(f"Session {sessionId}: Ignoring own bot caption from: [{speaker}] {text[:80]}...")
            return

        # Differential transcript writing:
        #   audioCapture from same speaker → append text (merge STT chunks into one block)
        #   Start a new block after a pause (>5s gap between STT results)
        sttPauseThreshold = 5.0
        isMerge = (
            source == "audioCapture"
            and self._lastTranscriptSpeaker == speaker
            and self._lastTranscriptText is not None
            and self._lastTranscriptId is not None
            and (time.time() - self._lastSttTime) < sttPauseThreshold
        )

        if isMerge:
            mergedText = f"{self._lastTranscriptText} {text}"
            interface.updateTranscript(self._lastTranscriptId, {
                "text": mergedText,
                "isFinal": isFinal,
            })
            self._lastTranscriptText = mergedText
            createdTranscript = {"id": self._lastTranscriptId}
            if self._contextBuffer and self._contextBuffer[-1].get("speaker") == speaker:
                self._contextBuffer[-1]["text"] = mergedText
        else:
            transcriptData = TeamsbotTranscript(
                sessionId=sessionId,
                speaker=speaker,
                text=text,
                timestamp=getIsoTimestamp(),
                confidence=1.0,
                language=self.config.language,
                isFinal=isFinal,
                source=source,
            ).model_dump()
            createdTranscript = interface.createTranscript(transcriptData)
            self._lastTranscriptSpeaker = speaker
            self._lastTranscriptText = text
            self._lastTranscriptId = createdTranscript.get("id")
            # No caption speaker seen yet — remember the id for retroactive
            # attribution in _registerSpeakerHint.
            if source == "audioCapture" and speaker == "Unknown":
                self._unattributedTranscriptIds.append(createdTranscript.get("id"))
            self._contextBuffer.append({
                "speaker": speaker or "Unknown",
                "text": text,
                "timestamp": getUtcTimestamp(),
                "source": source,
            })

        # Trim the rolling context window; kick off a background summary
        # once the buffer clearly overflows and no summary exists yet.
        maxSegments = self.config.contextWindowSegments
        if len(self._contextBuffer) > maxSegments:
            if not self._contextSummary and len(self._contextBuffer) > maxSegments * 1.5:
                asyncio.create_task(self._summarizeContextBuffer(sessionId))
            self._contextBuffer = self._contextBuffer[-maxSegments:]

        session = interface.getSession(sessionId)
        if session:
            count = session.get("transcriptSegmentCount", 0) + 1
            interface.updateSession(sessionId, {"transcriptSegmentCount": count})

        if source == "audioCapture":
            self._lastSttTime = time.time()

        # UI gets the full merged block, not just the new fragment.
        displayText = self._lastTranscriptText if isMerge else text
        await _emitSessionEvent(sessionId, "transcript", {
            "id": createdTranscript.get("id"),
            "speaker": speaker,
            "text": displayText,
            "confidence": 1.0,
            "timestamp": getIsoTimestamp(),
            "isContinuation": isMerge,
            "source": source,
            "speakerResolvedFromHint": (
                speakerResolvedFromHint if speakerResolvedFromHint is not None else False
            ),
        })

        if not isFinal:
            return
        if self.config.responseMode == TeamsbotResponseMode.TRANSCRIBE_ONLY:
            return
        # Bot's own chat: stored for display only, never trigger AI
        if source == "chat" and isBotSpeaker:
            return

        # Stop phrases: HARD STOP, no AI round-trip. We previously routed
        # this through ``_analyzeAndRespond`` which spent 1-2 seconds in
        # the speech LLM just to classify the intent, during which the
        # current TTS kept playing — and the LLM round-trip would also
        # produce yet another response that joined the queue. The new
        # path goes straight to the browser bot's audio cancel and
        # invalidates everything else in flight.
        if self._isStopPhrase(text):
            logger.info(
                f"Session {sessionId}: Stop phrase detected ('{text.strip()[:60]}'), "
                f"hard-cancelling in-flight speech immediately"
            )
            await self._cancelInFlightSpeech(
                sessionId=sessionId,
                websocket=websocket,
                reason="userStopPhrase",
            )
            return

        # Update activity for any pending debounced trigger
        if self._pendingNameTrigger:
            self._pendingNameTrigger["lastActivity"] = time.time()

        # Bot name detection → debounced trigger (wait for speaker to finish)
        if self._detectBotName(text):
            isNew = self._setPendingNameTrigger(sessionId, interface, voiceInterface, websocket, createdTranscript)
            if isNew:
                asyncio.create_task(self._checkPendingNameTrigger())
                # Audible early-feedback ack ("Moment...") in parallel — runs
                # while we still wait the debounce window and SPEECH_TEAMS
                # decides what to actually answer.
                self._currentQuickAckTask = asyncio.create_task(
                    self._runQuickAck(sessionId)
                )
            return

        # Follow-up window: after a bot response, trigger AI for any human speech
        # without requiring the bot name — the AI decides via shouldRespond
        if (
            source == "audioCapture"
            and not self._isBotSpeaker(speaker)
            and time.time() < self._followUpWindowEnd
            and not self._pendingNameTrigger
        ):
            isNew = self._setPendingNameTrigger(sessionId, interface, voiceInterface, websocket, createdTranscript)
            if isNew:
                logger.info(f"Session {sessionId}: Follow-up window trigger (no name needed)")
                asyncio.create_task(self._checkPendingNameTrigger())
            return

        # Periodic trigger (only when no debounce pending)
        if not self._pendingNameTrigger:
            shouldTrigger = self._shouldTriggerAnalysis(text)
            if shouldTrigger:
                logger.info(f"Session {sessionId}: Periodic trigger (buffer: {len(self._contextBuffer)} segments)")
                await self._analyzeAndRespond(sessionId, interface, voiceInterface, websocket, createdTranscript)

    def _isBotSpeaker(self, speaker: str) -> bool:
        """Check if a transcript speaker is the bot itself.

        Teams captions show the bot as e.g. "BotName (Unverified)" or
        "Nyla Larsson" depending on auth/anonymous join. We match against:
        - The configured/derived bot name
        - The bot account display name if authenticated
        """
        if not speaker:
            return False
        speakerLower = speaker.lower().strip()
        # Match against configured bot name (substring match covers
        # decorations like "(Unverified)").
        botName = self.config.botName.lower().strip()
        if botName and botName in speakerLower:
            return True
        # Match against bot account email prefix (e.g. "nyla.larsson" from
        # "nyla.larsson@poweron.swiss" — dots become spaces to mimic the
        # display name).
        botAccountEmail = getattr(self, '_botAccountEmail', None) or getattr(self.config, 'botAccountEmail', None)
        if botAccountEmail:
            emailPrefix = botAccountEmail.split("@")[0].lower().replace(".", " ")
            if emailPrefix in speakerLower:
                return True
        return False

    def _shouldTriggerAnalysis(self, transcriptText: str, allowPeriodic: bool = True) -> bool:
        """
        Decide whether to trigger AI analysis based on the latest transcript.

        Bot name detection is handled separately via debounce. This method
        only checks periodic/cooldown triggers: always False inside the
        cooldown window, True once the periodic interval has elapsed.
        """
        now = time.time()
        timeSinceLastCall = now - self._lastAiCallTime
        if timeSinceLastCall < self.config.triggerCooldownSeconds:
            return False
        if allowPeriodic and timeSinceLastCall >= self.config.triggerIntervalSeconds:
            logger.info(f"Trigger: Periodic interval ({self.config.triggerIntervalSeconds}s) elapsed ({timeSinceLastCall:.1f}s)")
            return True
        return False

    def _isStopPhrase(self, text: str) -> bool:
        """Check if text is an immediate-cancel command from the meeting.

        Recognised intents (any language we hear in practice):
        * Hard stop: stop / stopp / halt / ruhe / stille / arrete / quiet / shut
        * Pause / wait: warte / wait / moment / pause / hold (hold on)
        * Silence: sei still / be quiet / shut up / aufhoeren / aufhören / silence

        Hits trigger the direct stop pipeline in ``_cancelInFlightSpeech``:
        kill TTS, invalidate pending generations, clear name-trigger debounce.
        Critically: NO new AI call is fired — the user explicitly asked the
        bot to be quiet, so the worst thing we could do is generate yet
        another response on top of the one we just cancelled.
        """
        if not text or len(text.strip()) < 2:
            return False
        t = text.strip().lower()
        # Single-word match on punctuation-stripped tokens.
        words = [w.strip(".,!?:;\"'()[]") for w in t.split() if w.strip()]
        wordSet = set(words)
        stopWords = {
            # Hard-stop verbs
            "stop", "stopp", "halt", "ruhe", "stille", "schweig",
            "arrete", "quiet", "shut", "silence",
            # Pause / wait verbs (still "be quiet now" semantics)
            "warte", "wait", "moment", "pause",
        }
        if wordSet & stopWords:
            return True
        # Multi-word phrases need a substring check.
        if (
            "sei still" in t
            or "be quiet" in t
            or "shut up" in t
            or "hold on" in t
            or "aufhoeren" in t
            or "aufhรถren" in t
        ):
            return True
        return False

    def _makeAnswerCancelHook(self) -> Callable[[], bool]:
        """Capture the current ``_answerGenerationCounter`` and return a
        zero-arg predicate that returns ``True`` once a hard stop (or any
        future "supersede this answer" event) has bumped the counter.

        Pass the returned predicate as ``isCancelled`` into
        ``_speakTextChunked`` so a multi-chunk dispatch can bail out between
        chunks instead of speaking a 30-second answer to the end.
        """
        snapshot = self._answerGenerationCounter
        return lambda: self._answerGenerationCounter != snapshot

    async def _cancelInFlightSpeech(
        self,
        sessionId: str,
        websocket: Optional[WebSocket],
        reason: str,
    ) -> None:
        """Hard stop everything the bot is currently doing in the meeting.

        Pipeline (ALL synchronous from the caller's point of view, no AI
        round-trips):

        1. Bump ``_answerGenerationCounter`` so any in-flight TTS chunk
           loop, agent escalation or quick-ack drops its remaining work
           the moment it next checks the counter.
        2. Clear ``_pendingNameTrigger`` so a debounced "speaker just said
           the bot name" trigger that was queued before the stop word
           cannot wake up 3 seconds later and answer anyway.
        3. Cancel tracked background tasks (escalation, quick-ack). The
           tasks themselves swallow ``CancelledError`` in their finally
           block.
        4. Send ``{"type":"stopAudio"}`` to the browser bot — it stops
           the current playback in the AudioContext and clears its play
           queue so nothing buffered comes through afterwards.

        Deliberately does NOT generate a new response. The user just told
        the bot to be quiet; producing a "Okay, ich bin still" reply on
        top would be the exact opposite of what was asked for.
        """
        self._answerGenerationCounter += 1
        gen = self._answerGenerationCounter
        logger.info(
            f"Session {sessionId}: Cancelling in-flight speech "
            f"(reason={reason}, gen={gen})"
        )

        if self._pendingNameTrigger:
            logger.info(
                f"Session {sessionId}: Dropping pending debounced name "
                f"trigger (was queued before stop)"
            )
            self._pendingNameTrigger = None

        for taskAttr in ("_currentEscalationTask", "_currentQuickAckTask"):
            task = getattr(self, taskAttr, None)
            if task is not None and not task.done():
                logger.info(
                    f"Session {sessionId}: Cancelling background task "
                    f"{taskAttr}"
                )
                task.cancel()

        if websocket is not None:
            try:
                await websocket.send_text(json.dumps({
                    "type": "stopAudio",
                    "sessionId": sessionId,
                    "reason": reason,
                }))
            except Exception as stopErr:
                logger.warning(
                    f"Session {sessionId}: Failed to send stopAudio to "
                    f"browser bot: {stopErr}"
                )

        # Best-effort UI notification — a failed emit must not undo the stop.
        try:
            await _emitSessionEvent(sessionId, "speechCancelled", {
                "reason": reason,
                "generation": gen,
                "timestamp": getIsoTimestamp(),
            })
        except Exception:
            pass

    def _detectBotName(self, text: str) -> bool:
        """Check if text contains the bot's name (exact or phonetically similar).

        Fuzzy path: same first letter, length within 2, and >=60% character-set
        overlap with the bot's first name — catches common STT misspellings.
        """
        botNameLower = self.config.botName.lower()
        textLower = text.lower()
        if botNameLower in textLower:
            return True
        botFirstName = botNameLower.split()[0] if " " in botNameLower else botNameLower
        if len(botFirstName) >= 3:
            for word in textLower.split():
                cleanWord = word.strip(".,!?:;\"'()[]")
                if not cleanWord or len(cleanWord) < 3:
                    continue
                if cleanWord == botFirstName:
                    return True
                if cleanWord[0] == botFirstName[0] and abs(len(cleanWord) - len(botFirstName)) <= 2:
                    common = sum(1 for c in set(botFirstName) if c in cleanWord)
                    similarity = common / max(len(set(botFirstName)), len(set(cleanWord)))
                    if similarity >= 0.6:
                        return True
        return False

    def _setPendingNameTrigger(self, sessionId, interface, voiceInterface, websocket, triggerTranscript) -> bool:
        """Set or update a debounced name trigger. Returns True if newly set.

        If a trigger is already pending, only its activity timestamp is
        refreshed (extends the debounce window) and False is returned.
        """
        if self._pendingNameTrigger:
            self._pendingNameTrigger["lastActivity"] = time.time()
            return False
        self._pendingNameTrigger = {
            "sessionId": sessionId,
            "interface": interface,
            "voiceInterface": voiceInterface,
            "websocket": websocket,
            "triggerTranscript": triggerTranscript,
            "detectedAt": time.time(),
            "lastActivity": time.time(),
        }
        return True

    async def _warmEphemeralPhrasePool(self, sessionId: str) -> None:
        """Fire-and-forget background task: generate the ephemeral phrase
        pool for every kind defined in ``_EPHEMERAL_PHRASE_INTENTS`` so the
        first quick-ack / interim notice doesn't pay the AI round-trip
        latency at runtime. Failures are logged but never raised — the
        runtime selectors handle empty pools by silently skipping the cue."""
        try:
            for kind in _EPHEMERAL_PHRASE_INTENTS:
                try:
                    await self._getEphemeralPhrases(kind)
                except Exception as innerErr:
                    logger.warning(
                        f"Session {sessionId}: Phrase pool warmup failed for "
                        f"kind={kind}: {innerErr}"
                    )
        except Exception as warmErr:
            logger.warning(
                f"Session {sessionId}: Phrase pool warmup task crashed: {warmErr}"
            )

    # ---------------------------------------------------------------- Voice
    # When the bot's full answer is a long structured chat post (markdown
    # tables, bullet lists, headings, multi-paragraph) we MUST NOT read it
    # out verbatim into the meeting — even after sanitisation it sounds
    # like a wall of text and easily takes 5+ minutes. The chat keeps the
    # full answer; the audio path goes through ``_summarizeForVoice`` which
    # asks the AI for a 1-3 sentence spoken paraphrase in the configured
    # bot persona / language.
    # Threshold: anything longer than this many characters (after sanitise)
    # OR any answer whose source contains markdown structure (tables /
    # multiple bullets / multiple headings) gets condensed before TTS.
    _VOICE_DIRECT_MAX_CHARS = 600
    _VOICE_SUMMARY_MAX_CHARS = 350

    @staticmethod
    def _looksLikeStructuredText(raw: str) -> bool:
        """Heuristic: does the original answer have markdown structure that
        would be miserable to listen to verbatim? Used to trigger the AI
        summary path even when the sanitised text is short enough."""
        if not raw:
            return False
        if raw.count("|") >= 4:  # at least one markdown table row
            return True
        if raw.count("\n#") >= 1:  # at least one heading after newline
            return True
        if raw.count("\n- ") + raw.count("\n* ") + raw.count("\nโ€ข ") >= 3:
            return True  # 3+ bullets → list-like
        if re.search(r"\n\d+[\.\)]\s", raw):  # numbered list
            count = len(re.findall(r"(?m)^\s*\d+[\.\)]\s", raw))
            if count >= 3:
                return True
        return False

    async def _summarizeForVoice(
        self,
        sessionId: str,
        rawAnswer: str,
    ) -> str:
        """Return a SHORT, naturally-spoken paraphrase of ``rawAnswer`` for
        TTS playback.

        Falls back to the sanitised + truncated original if the AI call
        fails — never blocks the response. The chat / DB / UI keep the
        original ``rawAnswer`` untouched. Only the voice channel goes
        through this condensation.
        """
        if not rawAnswer or not rawAnswer.strip():
            return ""

        sanitised = _voiceFriendlyMeetingText(rawAnswer)

        # Short + unstructured → speak as-is, no AI round-trip
        if (
            len(sanitised) <= self._VOICE_DIRECT_MAX_CHARS
            and not self._looksLikeStructuredText(rawAnswer)
        ):
            return sanitised

        targetLang = (self.config.language or "de-DE").strip()
        botName = (self.config.botName or "").strip() or "the assistant"
        persona = (self.config.aiSystemPrompt or "").strip()
        personaBlock = (
            f"\n\nBOT PERSONA / TONE:\n{persona}\n" if persona else ""
        )
        prompt = (
            f"You are condensing a long written answer into a SHORT spoken "
            f"paraphrase that the assistant '{botName}' will say out loud "
            f"into a Microsoft Teams meeting. The full written answer is "
            f"already in the meeting chat โ€” your job is to summarise it for "
            f"the EAR, not the eye.\n\n"
            f"STRICT REQUIREMENTS:\n"
            f"1. Output language: BCP-47 '{targetLang}'. No other language.\n"
            f"2. 1 to 3 sentences, max ~{self._VOICE_SUMMARY_MAX_CHARS} characters total.\n"
            f"3. Natural spoken style โ€” no headings, no bullet points, no "
            f"tables, no markdown, no emojis, no enumerations like 'Erstens... "
            f"Zweitens...' unless that genuinely flows in speech.\n"
            f"4. Capture the essence and the most important conclusion. Do "
            f"NOT try to fit every detail. Listeners can read the chat for "
            f"the full version.\n"
            f"5. End by gently pointing the audience to the chat for details, "
            f"e.g. 'Details stehen im Chat.' (adapted to the target language).\n"
            f"6. Output ONLY the spoken text. No JSON, no quotes around it, "
            f"no preamble like 'Here is the summary:'.\n"
            f"{personaBlock}\n"
            f"FULL WRITTEN ANSWER (markdown-formatted, sometimes long):\n"
            f"---\n{rawAnswer.strip()[:6000]}\n---\n"
        )
        try:
            aiService = createAiService(
                self.currentUser, self.mandateId, self.instanceId
            )
            await aiService.ensureAiObjectsInitialized()
            request = AiCallRequest(
                prompt=prompt,
                context="",
                options=AiCallOptions(
                    operationType=OperationTypeEnum.DATA_ANALYSE,
                    priority=PriorityEnum.SPEED,
                ),
            )
            response = await aiService.callAi(request)
        except Exception as aiErr:
            logger.warning(
                f"Session {sessionId}: Voice summary AI call failed: {aiErr}"
            )
            return sanitised[: self._VOICE_DIRECT_MAX_CHARS]

        if not response or response.errorCount != 0 or not response.content:
            logger.warning(
                f"Session {sessionId}: Voice summary returned empty/error"
            )
            return sanitised[: self._VOICE_DIRECT_MAX_CHARS]

        spoken = response.content.strip()
        # Defensive sanitiser pass — the model usually obeys the
        # "no markdown" instruction but not always.
        spoken = _voiceFriendlyMeetingText(spoken)
        if not spoken:
            return sanitised[: self._VOICE_DIRECT_MAX_CHARS]

        logger.info(
            f"Session {sessionId}: Voice summary generated "
            f"(orig={len(rawAnswer)} chars, sanitised={len(sanitised)}, "
            f"spoken={len(spoken)})"
        )
        return spoken

    async def _pickQuickAckText(self) -> Optional[str]:
        """Return a short ack text in the bot's configured language.

        The actual phrases are AI-generated once per session (cached) and
        rotated round-robin so consecutive acks don't sound identical.
        Returns ``None`` only if AI generation completely failed and no
        fallback variant could be produced — in that case the caller
        silently skips the ack."""
        return await self._pickEphemeralPhrase("quickAck")

    async def _pickEphemeralPhrase(
        self,
        kind: str,
        substitutions: Optional[Dict[str, Any]] = None,
    ) -> Optional[str]:
        """Round-robin selector over the cached phrase pool for ``kind``.

        Lazily generates the pool on first use.
``substitutions`` is applied to the chosen phrase via ``str.format(**substitutions)`` so kinds like ``agentRound`` can render ``{round}`` / ``{maxRounds}``. Returns ``None`` if no phrases are available.""" variants = await self._getEphemeralPhrases(kind) if not variants: return None idx = self._phrasePoolIdx.get(kind, 0) % len(variants) self._phrasePoolIdx[kind] = (idx + 1) % len(variants) chosen = variants[idx] if substitutions: try: chosen = chosen.format(**substitutions) except (KeyError, IndexError, ValueError) as fmtErr: # The AI didn't include the expected placeholder โ€” return the # raw phrase rather than crash. The user still hears something # in the right language; only the numeric hint is missing. logger.debug( f"Ephemeral phrase substitution failed for kind={kind}: {fmtErr}" ) return chosen async def _getEphemeralPhrases(self, kind: str) -> List[str]: """Return the cached pool of AI-generated variants for ``kind``, generating it on first request. Subsequent calls hit the in-memory cache. Concurrent first-time callers are serialised by the pool lock so only ONE AI request is fired per kind per session.""" cached = self._phrasePool.get(kind) if cached: return cached async with self._phrasePoolLock: cached = self._phrasePool.get(kind) if cached: return cached phrases = await self._generateEphemeralPhrases( kind, _EPHEMERAL_PHRASE_VARIANTS ) if phrases: self._phrasePool[kind] = phrases return phrases async def _generateEphemeralPhrases( self, kind: str, count: int ) -> List[str]: """Ask the AI to produce ``count`` short utterances for ``kind`` in the bot's configured language and persona. 
Returns ``[]`` on any failure โ€” callers must treat empty as 'silently skip this ephemeral cue', NEVER fall back to a hardcoded localized string.""" intent = _EPHEMERAL_PHRASE_INTENTS.get(kind) if not intent: logger.warning(f"Unknown ephemeral phrase kind requested: {kind}") return [] targetLang = (self.config.language or "").strip() or "en-US" botName = (self.config.botName or "the assistant").strip() persona = (self.config.aiSystemPrompt or "").strip() # The prompt is in English on purpose โ€” these are instructions to the # LLM, not user-facing text. The OUTPUT is required to be in # ``targetLang``. We ask for a strict JSON array so parsing is robust. prompt = ( f"You are localizing short SPOKEN-LANGUAGE utterances for a " f"meeting assistant named '{botName}'.\n\n" f"Persona / style guide for the assistant:\n" f"{persona or '(no persona configured โ€” use a neutral, polite, professional tone)'}\n\n" f"Target spoken language (BCP-47 code): {targetLang}\n\n" f"Utterance intent:\n{intent}\n\n" f"Generate {count} DIFFERENT variants matching this intent, in " f"the target language. Variants should feel natural when spoken " f"aloud, not robotic. Do NOT include the assistant's name in " f"the variants.\n\n" f"Output STRICTLY a JSON array of {count} plain-text strings, " f"with no markdown fences, no commentary, no surrounding " f"quotation marks beyond the JSON syntax itself. 
Example " f"format: [\"...\", \"...\", \"...\", \"...\"]" ) try: aiService = createAiService( self.currentUser, self.mandateId, self.instanceId ) await aiService.ensureAiObjectsInitialized() request = AiCallRequest( prompt=prompt, context="", options=AiCallOptions( operationType=OperationTypeEnum.DATA_ANALYSE, priority=PriorityEnum.SPEED, ), ) response = await aiService.callAi(request) except Exception as aiErr: logger.warning( f"Ephemeral phrase generation failed (kind={kind}, lang={targetLang}): {aiErr}" ) return [] if not response or response.errorCount != 0 or not response.content: logger.warning( f"Ephemeral phrase generation returned empty/error " f"(kind={kind}, lang={targetLang})" ) return [] raw = response.content.strip() # Strip optional ```json ... ``` fences before parsing. raw = re.sub(r"^```(?:json)?\s*", "", raw) raw = re.sub(r"\s*```\s*$", "", raw) try: arr = json.loads(raw) except json.JSONDecodeError as parseErr: logger.warning( f"Ephemeral phrase generation: could not parse JSON " f"(kind={kind}, lang={targetLang}): {parseErr} " f"raw={raw[:200]}" ) return [] if not isinstance(arr, list): return [] cleaned = [ str(v).strip() for v in arr if isinstance(v, str) and str(v).strip() ] cleaned = cleaned[:count] if cleaned: logger.info( f"Ephemeral phrase pool generated (kind={kind}, " f"lang={targetLang}, count={len(cleaned)})" ) return cleaned def _shouldFireQuickAck(self) -> bool: """Centralized gate so the call sites stay short and consistent.""" now = time.time() if (now - self._lastQuickAckTs) < _QUICK_ACK_MIN_INTERVAL_SEC: return False # If we are already producing a real response, the ack would step on # the actual answer's TTS โ€” skip it. Same for an in-flight agent # escalation: the agent will deliver its own answer (and we already # spoke an interim "moment please" when it started). if self._aiAnalysisInProgress or self._agentEscalationInFlight: return False # Voice channel must be active. Chat-only mode would just spam "...". 
channelRaw = self.config.responseChannel channelStr = ( channelRaw.value if hasattr(channelRaw, "value") else str(channelRaw) ).lower().strip() if channelStr not in ("voice", "both"): return False if self.config.responseMode in ( TeamsbotResponseMode.MANUAL, TeamsbotResponseMode.TRANSCRIBE_ONLY, ): return False return True async def _runQuickAck(self, sessionId: str) -> None: """Background task: speak the short ack into the meeting via TTS. Designed to be fired as ``asyncio.create_task(self._runQuickAck(...))`` the moment the bot's name is detected โ€” does not block the regular debounced analysis pipeline. Persists nothing to the DB and emits no botResponse event; this is purely an audio cue ("Moment...") so the speaker hears within ~1s that the bot is reacting. """ websocket = self._websocket voiceInterface = self._voiceInterface if websocket is None or voiceInterface is None: return if not self._shouldFireQuickAck(): return ackText = await self._pickQuickAckText() if not ackText: return # Mark the throttle BEFORE TTS so two near-simultaneous detections # don't both fire (TTS dispatch can take a few hundred ms). 
self._lastQuickAckTs = time.time() try: await _emitSessionEvent(sessionId, "quickAck", { "text": ackText, "timestamp": getIsoTimestamp(), }) cancelHook = self._makeAnswerCancelHook() async with self._meetingTtsLock: outcome = await _speakTextChunked( websocket=websocket, voiceInterface=voiceInterface, sessionId=sessionId, voiceText=ackText, languageCode=self.config.language, voiceName=self.config.voiceId, isCancelled=cancelHook, ) if not outcome.get("success"): logger.info( f"Session {sessionId}: Quick ack TTS failed silently " f"({outcome.get('error')}) โ€” main response will still go through" ) except asyncio.CancelledError: logger.info(f"Session {sessionId}: Quick ack cancelled by stop signal") except Exception as ackErr: logger.warning(f"Session {sessionId}: Quick ack failed: {ackErr}") finally: self._currentQuickAckTask = None async def _checkPendingNameTrigger(self, delaySec: float = 3.0): """Async loop: fire the pending name trigger once the speaker is quiet.""" await asyncio.sleep(delaySec) if not self._pendingNameTrigger: return now = time.time() lastActivity = self._pendingNameTrigger.get("lastActivity", 0) detectedAt = self._pendingNameTrigger.get("detectedAt", 0) quietSec = now - lastActivity totalWaitSec = now - detectedAt if quietSec >= 3.0 or totalWaitSec >= 15.0: trigger = self._pendingNameTrigger self._pendingNameTrigger = None logger.info( f"Session {trigger['sessionId']}: Debounced name trigger fires " f"(quiet={quietSec:.1f}s, totalWait={totalWaitSec:.1f}s)" ) await self._analyzeAndRespond( trigger["sessionId"], trigger["interface"], trigger["voiceInterface"], trigger["websocket"], trigger["triggerTranscript"], ) else: remaining = max(0.5, 3.0 - quietSec) asyncio.create_task(self._checkPendingNameTrigger(remaining)) async def _analyzeAndRespond( self, sessionId: str, interface, voiceInterface, websocket: WebSocket, triggerTranscript: Dict[str, Any], ): """Run SPEECH_TEAMS AI analysis and respond if needed.""" if self._aiAnalysisInProgress: 
logger.info(f"Session {sessionId}: AI analysis already in progress, skipping duplicate trigger") return # An agent escalation from a previous trigger may still be researching # (it lives in its own task, ``_aiAnalysisInProgress`` was already # released when SPEECH_TEAMS returned). If we let a fresh SPEECH_TEAMS # run now, both pipelines would race to the meeting voice channel and # the operator would hear "two bots talking". Skip until the agent # finishes; the speaker can re-trigger by saying the bot name again # if they have a new question. if self._agentEscalationInFlight: logger.info( f"Session {sessionId}: Agent escalation still in flight โ€” " f"skipping new SPEECH_TEAMS trigger to prevent overlapping replies" ) return self._aiAnalysisInProgress = True self._lastAiCallTime = time.time() # Build transcript context from buffer. # Mark bot's own utterances and chat messages for the AI. contextLines = [] for segment in self._contextBuffer: speaker = segment.get("speaker", "Unknown") text = segment.get("text", "") segSource = segment.get("source", "caption") prefix = "Chat" if segSource == "chat" else "" if self._isBotSpeaker(speaker): contextLines.append(f"[YOU ({self.config.botName})]: {text}") elif prefix: contextLines.append(f"[{prefix}: {speaker}]: {text}") else: contextLines.append(f"[{speaker}]: {text}") # Include session context if provided by the user at session start sessionContextStr = "" if self._sessionContext: sessionContextStr = f"\nSESSION_CONTEXT (background knowledge provided by the user):\n{self._sessionContext}\n" # Include summary of earlier conversation if available summaryStr = "" if self._contextSummary: summaryStr = f"\nEARLIER_CONVERSATION_SUMMARY:\n{self._contextSummary}\n" # Persistent director prompts: private operator instructions that stay # in effect across triggers (e.g. "respond in English", "always be brief"). 
directorStr = self._buildPersistentDirectorContext() transcriptContext = f"BOT_NAME:{self.config.botName}{sessionContextStr}{summaryStr}{directorStr}\nRECENT_TRANSCRIPT:\n" + "\n".join(contextLines) # Call SPEECH_TEAMS try: aiService = createAiService(self.currentUser, self.mandateId, self.instanceId) await aiService.ensureAiObjectsInitialized() request = AiCallRequest( prompt=self.config.aiSystemPrompt, context=transcriptContext, options=AiCallOptions( operationType=OperationTypeEnum.SPEECH_TEAMS, priority=PriorityEnum.SPEED, ) ) response = await aiService.callAi(request) # Parse structured response try: speechResult = SpeechTeamsResponse.model_validate_json(response.content) except Exception: # Try to extract JSON from response content try: jsonStr = response.content if "```json" in jsonStr: jsonStr = jsonStr.split("```json")[1].split("```")[0] elif "```" in jsonStr: jsonStr = jsonStr.split("```")[1].split("```")[0] speechResult = SpeechTeamsResponse.model_validate_json(jsonStr.strip()) except Exception as parseErr: logger.warning(f"Failed to parse SPEECH_TEAMS response: {parseErr}") speechResult = SpeechTeamsResponse( shouldRespond=False, reasoning=f"Parse error: {str(parseErr)[:100]}", detectedIntent="none" ) logger.info( f"SPEECH_TEAMS result: shouldRespond={speechResult.shouldRespond}, " f"intent={speechResult.detectedIntent}, " f"reasoning={speechResult.reasoning[:80]}..." ) # Emit analysis event (always, for debug/UI) await _emitSessionEvent(sessionId, "analysis", { "shouldRespond": speechResult.shouldRespond, "detectedIntent": speechResult.detectedIntent, "reasoning": speechResult.reasoning, "modelName": response.modelName, "processingTime": response.processingTime, "priceCHF": response.priceCHF, "needsAgent": speechResult.needsAgent, "agentReason": speechResult.agentReason, }) # Hybrid routing: SPEECH_TEAMS detected a complex request that # requires the full agent (web research, mail, multi-step). 
Hand # off to the agent path; do NOT speak the SPEECH_TEAMS placeholder. if speechResult.needsAgent: # Director prompts (persistent + recent one-shot) have already # delivered files to the operator. The escalation agent MUST see # them โ€” otherwise it answers "summarize the doc" with general # babble because the SPEECH_TEAMS prompt itself never had file # access. We also forward the prior agent analysis so the # escalation can build on, not duplicate, the earlier work. briefings = self._collectActiveDirectorBriefings() briefingFileIds = self._collectDirectorFileIds() briefingBlock = "" if briefings: parts = [] for b in briefings: seg = f"- ({b.get('mode')}) {b.get('text', '')}".rstrip() if b.get("fileIds"): seg += f"\n attachedFileIds: {', '.join(b['fileIds'])}" if b.get("note"): note = b["note"] seg += ( "\n priorAgentAnalysis: " + (note if len(note) <= 800 else note[:800] + "...") ) parts.append(seg) briefingBlock = ( "\n\nACTIVE_OPERATOR_BRIEFINGS (private; you may read the " "attached files via summarizeContent / readFile / " "readContentObjects to answer the user precisely; do NOT " "quote the directive text itself):\n" + "\n".join(parts) ) logger.info( f"Session {sessionId}: SPEECH_TEAMS escalates to agent. " f"Reason: {speechResult.agentReason or speechResult.reasoning} | " f"briefings={len(briefings)}, fileIds={len(briefingFileIds)}" ) taskBrief = ( (speechResult.agentReason or speechResult.responseText or "Verarbeite die juengste Sprecheranfrage und antworte ins Meeting.") + briefingBlock ) # Mark escalation as in-flight BEFORE we create the task so the # ``_aiAnalysisInProgress=False`` released in our finally block # cannot let a competing speech trigger sneak past the gate # before the agent task has even been scheduled. 
self._agentEscalationInFlight = True self._currentEscalationTask = asyncio.create_task( self._runEscalationAndRelease( sessionId=sessionId, taskBrief=taskBrief, briefingFileIds=briefingFileIds, triggerTranscriptId=triggerTranscript.get("id"), ) ) return # Step 4a: Handle STOP intent -- stop audio immediately if speechResult.detectedIntent == "stop": logger.info(f"Session {sessionId}: AI detected STOP intent: {speechResult.reasoning}") if websocket: try: await websocket.send_text(json.dumps({ "type": "stopAudio", "sessionId": sessionId, })) except Exception as stopErr: logger.warning(f"Failed to send stop command: {stopErr}") return # Step 4b: Respond if AI decided to if speechResult.shouldRespond and speechResult.responseText: if self.config.responseMode == TeamsbotResponseMode.MANUAL: # In manual mode, suggest but don't send await _emitSessionEvent(sessionId, "suggestedResponse", { "responseText": speechResult.responseText, "detectedIntent": speechResult.detectedIntent, "reasoning": speechResult.reasoning, }) return # Determine response channel: per-request (AI) overrides config channels = speechResult.responseChannels if channels and isinstance(channels, list): channelStr = ",".join(str(c).lower().strip() for c in channels) sendVoice = "voice" in channelStr sendChat = "chat" in channelStr logger.info(f"Response channel (from AI): voice={sendVoice}, chat={sendChat}") else: channelRaw = self.config.responseChannel channelStr = (channelRaw.value if hasattr(channelRaw, 'value') else str(channelRaw)).lower().strip() sendVoice = channelStr in ("voice", "both") sendChat = channelStr in ("chat", "both") logger.info(f"Response channel (from config): '{channelStr}'") if sendVoice and sendChat: responseType = TeamsbotResponseType.BOTH elif sendVoice: responseType = TeamsbotResponseType.AUDIO else: responseType = TeamsbotResponseType.CHAT # Suppress duplicate responses in short windows ("repeat loop" protection). 
canonicalText = ( speechResult.responseText or speechResult.responseTextForVoice or speechResult.responseTextForChat or "" ) normalizedResponse = (canonicalText or "").strip().lower() nowTs = time.time() if ( normalizedResponse and self._lastBotResponseText == normalizedResponse and (nowTs - self._lastBotResponseTs) < 90 ): logger.info(f"Session {sessionId}: Suppressing duplicate bot response within 90s window") await _emitSessionEvent(sessionId, "analysis", { "shouldRespond": False, "detectedIntent": speechResult.detectedIntent, "reasoning": "Suppressed duplicate response within 90s", "modelName": response.modelName, "processingTime": response.processingTime, "priceCHF": response.priceCHF, }) return # Resolve text per channel (AI can send different content to voice vs chat) textForVoice = speechResult.responseTextForVoice or speechResult.responseText textForChat = speechResult.responseTextForChat or speechResult.responseText storedText = textForChat or textForVoice or speechResult.responseText # 4a: Voice response (TTS -> Audio to bot, chunked for long replies) if sendVoice and textForVoice: await _emitSessionEvent(sessionId, "ttsDeliveryStatus", { "status": "requested", "hasWebSocket": websocket is not None, "message": "TTS generation requested", "timestamp": getIsoTimestamp(), }) logger.info( f"Session {sessionId}: TTS requested (websocket_available={websocket is not None})" ) if not websocket: logger.warning( f"Session {sessionId}: TTS skipped (bot websocket unavailable, likely fallback mode)" ) await _emitSessionEvent(sessionId, "ttsDeliveryStatus", { "status": "unavailable", "hasWebSocket": False, "message": "TTS skipped โ€” bot websocket unavailable", "timestamp": getIsoTimestamp(), }) if not sendChat: sendChat = True else: # Long / structured answers โ†’ AI condenses for ear; chat keeps full text. 
spokenText = await self._summarizeForVoice(sessionId, textForVoice) cancelHook = self._makeAnswerCancelHook() async with self._meetingTtsLock: ttsOutcome = await _speakTextChunked( websocket=websocket, voiceInterface=voiceInterface, sessionId=sessionId, voiceText=spokenText, languageCode=self.config.language, voiceName=self.config.voiceId, isCancelled=cancelHook, ) if ttsOutcome.get("success"): logger.info( f"Session {sessionId}: TTS audio dispatched to bot " f"(chunks={ttsOutcome.get('chunks')}, played={ttsOutcome.get('played')})" ) await _emitSessionEvent(sessionId, "ttsDeliveryStatus", { "status": "dispatched", "hasWebSocket": True, "chunks": ttsOutcome.get("chunks"), "played": ttsOutcome.get("played"), "timestamp": getIsoTimestamp(), }) else: logger.warning( f"TTS failed for session {sessionId}: {ttsOutcome.get('error')}" ) await _emitSessionEvent(sessionId, "ttsDeliveryStatus", { "status": "failed", "hasWebSocket": True, "chunks": ttsOutcome.get("chunks"), "played": ttsOutcome.get("played"), "message": ttsOutcome.get("error"), "timestamp": getIsoTimestamp(), }) if not sendChat: sendChat = True # Fallback to chat if voice-only and TTS failed # 4b: Chat response (send text message to meeting chat) if sendChat and textForChat: try: if websocket: await websocket.send_text(json.dumps({ "type": "sendChatMessage", "sessionId": sessionId, "text": textForChat, })) logger.info(f"Chat response sent for session {sessionId}") except Exception as chatErr: logger.warning(f"Chat message send failed for session {sessionId}: {chatErr}") # 4b: Store bot response botResponseData = TeamsbotBotResponse( sessionId=sessionId, responseText=storedText, responseType=responseType, detectedIntent=speechResult.detectedIntent, reasoning=speechResult.reasoning, triggeredByTranscriptId=triggerTranscript.get("id"), modelName=response.modelName, processingTime=response.processingTime, priceCHF=response.priceCHF, timestamp=getIsoTimestamp(), ).model_dump() createdResponse = 
interface.createBotResponse(botResponseData) # 4c: Emit SSE event await _emitSessionEvent(sessionId, "botResponse", { "id": createdResponse.get("id"), "responseText": storedText, "responseType": responseType.value, "detectedIntent": speechResult.detectedIntent, "reasoning": speechResult.reasoning, "modelName": response.modelName, "processingTime": response.processingTime, "priceCHF": response.priceCHF, "timestamp": botResponseData.get("timestamp"), }) # Update session response count session = interface.getSession(sessionId) if session: count = session.get("botResponseCount", 0) + 1 interface.updateSession(sessionId, {"botResponseCount": count}) self._lastBotResponseText = normalizedResponse self._lastBotResponseTs = nowTs # Record bot response in transcript (exactly once, regardless of channel) botTranscriptData = TeamsbotTranscript( sessionId=sessionId, speaker=self.config.botName, text=storedText, timestamp=getIsoTimestamp(), confidence=1.0, language=self.config.language, isFinal=True, ).model_dump() botTranscript = interface.createTranscript(botTranscriptData) self._contextBuffer.append({ "speaker": self.config.botName, "text": storedText, "timestamp": getUtcTimestamp(), "source": "botResponse", }) await _emitSessionEvent(sessionId, "transcript", { "id": botTranscript.get("id"), "speaker": self.config.botName, "text": storedText, "confidence": 1.0, "timestamp": getIsoTimestamp(), "isContinuation": False, "source": "botResponse", "speakerResolvedFromHint": False, }) # Reset differential writing tracker so next STT creates a new block self._lastTranscriptSpeaker = self.config.botName self._lastTranscriptText = storedText self._lastTranscriptId = botTranscript.get("id") self._followUpWindowEnd = time.time() + 15.0 logger.info(f"Bot responded in session {sessionId}: intent={speechResult.detectedIntent}, follow-up window open for 15s") # Step 5: Execute AI-issued commands (if any) if speechResult.commands: await self._executeCommands(sessionId, speechResult.commands, 
voiceInterface, websocket) # When AI used only commands (no responseText), emit botResponse SSE # so the UI shows the response. Extract text from sendChat commands. if speechResult.shouldRespond and not speechResult.responseText: cmdTexts = [ c.params.get("text", "") for c in speechResult.commands if c.action == "sendChat" and c.params and c.params.get("text") ] combinedText = " ".join(cmdTexts) if cmdTexts else None if combinedText: botResponseData = TeamsbotBotResponse( sessionId=sessionId, responseText=combinedText, responseType=TeamsbotResponseType.CHAT, detectedIntent=speechResult.detectedIntent, reasoning=speechResult.reasoning, triggeredByTranscriptId=triggerTranscript.get("id"), modelName=response.modelName, processingTime=response.processingTime, priceCHF=response.priceCHF, timestamp=getIsoTimestamp(), ).model_dump() createdResponse = interface.createBotResponse(botResponseData) await _emitSessionEvent(sessionId, "botResponse", { "id": createdResponse.get("id"), "responseText": combinedText, "responseType": TeamsbotResponseType.CHAT.value, "detectedIntent": speechResult.detectedIntent, "reasoning": speechResult.reasoning, "modelName": response.modelName, "processingTime": response.processingTime, "priceCHF": response.priceCHF, "timestamp": botResponseData.get("timestamp"), }) session = interface.getSession(sessionId) if session: count = session.get("botResponseCount", 0) + 1 interface.updateSession(sessionId, {"botResponseCount": count}) self._followUpWindowEnd = time.time() + 15.0 logger.info( f"Bot responded via commands in session {sessionId}: " f"intent={speechResult.detectedIntent}, follow-up window open for 15s" ) except Exception as e: logger.error(f"SPEECH_TEAMS analysis failed for session {sessionId}: {type(e).__name__}: {e}", exc_info=True) await _emitSessionEvent(sessionId, "error", {"message": f"AI analysis failed: {type(e).__name__}: {str(e)}"}) finally: self._aiAnalysisInProgress = False async def _runEscalationAndRelease( self, sessionId: 
str, taskBrief: str, briefingFileIds: List[str], triggerTranscriptId: Optional[str], ) -> None: """Background wrapper for ``_runAgentForMeeting`` that holds the ``_agentEscalationInFlight`` flag for the entire duration of the agent run โ€” not just for the moment we schedule the task. Without this wrapper, ``_aiAnalysisInProgress`` would already be ``False`` while the agent is still researching, and a fresh SPEECH_TEAMS trigger from a new utterance would race the agent to the voice channel.""" try: await self._runAgentForMeeting( sessionId=sessionId, taskText=taskBrief, fileIds=briefingFileIds, sourceLabel="speechEscalation", triggerTranscriptId=triggerTranscriptId, ) except asyncio.CancelledError: logger.info( f"Session {sessionId}: Escalation agent task cancelled by stop signal" ) except Exception as escErr: logger.error( f"Session {sessionId}: Escalation agent task failed: " f"{type(escErr).__name__}: {escErr}", exc_info=True, ) finally: self._agentEscalationInFlight = False self._currentEscalationTask = None # ========================================================================= # AI Command Execution # ========================================================================= async def _executeCommands( self, sessionId: str, commands: List[TeamsbotCommand], voiceInterface, websocket: WebSocket, ): """Execute structured commands returned by the AI. 
Each command is dispatched to a dedicated handler function.""" for cmd in commands: action = cmd.action params = cmd.params or {} logger.info(f"Session {sessionId}: Executing command '{action}' with params {params}") try: if action == "toggleTranscript": await self._cmdToggleTranscript(sessionId, params, websocket) elif action == "toggleChat": await self._cmdToggleChat(sessionId, params, websocket) elif action == "sendChat": await self._cmdSendChat(sessionId, params, websocket) elif action == "readChat": await self._cmdReadChat(sessionId, params, voiceInterface, websocket) elif action == "readAloud": await self._cmdReadAloud(sessionId, params, voiceInterface, websocket) elif action == "changeLanguage": await self._cmdChangeLanguage(sessionId, params) elif action in ("toggleMic", "toggleCamera"): await self._cmdToggleMicOrCamera(sessionId, action, params, websocket) elif action == "sendMail": await self._cmdSendMail(sessionId, params) elif action == "storeDocument": await self._cmdStoreDocument(sessionId, params) else: logger.warning(f"Session {sessionId}: Unknown command '{action}'") except Exception as cmdErr: logger.warning(f"Session {sessionId}: Command '{action}' failed: {cmdErr}") async def _cmdToggleTranscript(self, sessionId: str, params: dict, websocket: WebSocket): """Caption on/off - toggle Teams live transcript capture.""" enable = params.get("enable", True) if websocket: await websocket.send_text(json.dumps({ "type": "botCommand", "sessionId": sessionId, "command": "toggleTranscript", "params": {"enable": enable}, })) async def _cmdToggleChat(self, sessionId: str, params: dict, websocket: WebSocket): """Chat on/off - enable/disable meeting chat monitoring.""" enable = params.get("enable", True) if websocket: await websocket.send_text(json.dumps({ "type": "botCommand", "sessionId": sessionId, "command": "toggleChat", "params": {"enable": enable}, })) async def _cmdSendChat(self, sessionId: str, params: dict, websocket: WebSocket): """Send a message to 
the meeting chat and record it in transcript/SSE.""" chatText = params.get("text", "") if not chatText: return if websocket: await websocket.send_text(json.dumps({ "type": "sendChatMessage", "sessionId": sessionId, "text": chatText, })) logger.info(f"Chat command sent for session {sessionId}") from . import interfaceFeatureTeamsbot as interfaceDb interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId) transcriptData = TeamsbotTranscript( sessionId=sessionId, speaker=self.config.botName, text=chatText, timestamp=getIsoTimestamp(), confidence=1.0, language=self.config.language, isFinal=True, source="chat", ).model_dump() createdTranscript = interface.createTranscript(transcriptData) self._contextBuffer.append({ "speaker": self.config.botName, "text": chatText, "timestamp": getUtcTimestamp(), "source": "chat", }) self._lastTranscriptSpeaker = self.config.botName self._lastTranscriptText = chatText self._lastTranscriptId = createdTranscript.get("id") self._lastBotResponseText = chatText.strip().lower() self._lastBotResponseTs = time.time() await _emitSessionEvent(sessionId, "transcript", { "id": createdTranscript.get("id"), "speaker": self.config.botName, "text": chatText, "confidence": 1.0, "timestamp": getIsoTimestamp(), "isContinuation": False, "source": "chat", "speakerResolvedFromHint": False, }) async def _cmdReadChat( self, sessionId: str, params: dict, voiceInterface, websocket: WebSocket, ): """Read chat messages (from DB) with optional fromdatetime/todatetime, then speak or send to chat.""" from . 
import interfaceFeatureTeamsbot as interfaceDb
        interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId)
        transcripts = interface.getTranscripts(sessionId)
        # Accept both lowercase and camelCase parameter spellings.
        fromDt = params.get("fromdatetime") or params.get("fromDateTime")
        toDt = params.get("todatetime") or params.get("toDateTime")
        chatOnly = [t for t in transcripts if t.get("source") in ("chat", "chatHistory")]
        # Range filter via string comparison — assumes ISO-8601 timestamps,
        # which compare correctly lexicographically (TODO confirm format).
        if fromDt:
            chatOnly = [t for t in chatOnly if (t.get("timestamp") or "") >= fromDt]
        if toDt:
            chatOnly = [t for t in chatOnly if (t.get("timestamp") or "") <= toDt]
        # Cap at the 20 most recent messages to keep the summary short.
        summary = "\n".join(f"[{t.get('speaker', '?')}]: {t.get('text', '')}" for t in chatOnly[-20:])
        if not summary:
            summary = "Keine Chat-Nachrichten im angegebenen Zeitraum."
        if voiceInterface and websocket:
            # Condense for speech, then play chunked TTS under the meeting lock.
            spokenSummary = await self._summarizeForVoice(sessionId, summary[:2000])
            cancelHook = self._makeAnswerCancelHook()
            async with self._meetingTtsLock:
                await _speakTextChunked(
                    websocket=websocket,
                    voiceInterface=voiceInterface,
                    sessionId=sessionId,
                    voiceText=spokenSummary,
                    languageCode=self.config.language,
                    voiceName=self.config.voiceId,
                    isCancelled=cancelHook,
                )

    async def _cmdReadAloud(
        self,
        sessionId: str,
        params: dict,
        voiceInterface,
        websocket: WebSocket,
    ):
        """Read text aloud via TTS and play in meeting."""
        readText = params.get("text", "")
        if readText and voiceInterface and websocket:
            cancelHook = self._makeAnswerCancelHook()
            async with self._meetingTtsLock:
                await _speakTextChunked(
                    websocket=websocket,
                    voiceInterface=voiceInterface,
                    sessionId=sessionId,
                    voiceText=_voiceFriendlyMeetingText(readText),
                    languageCode=self.config.language,
                    voiceName=self.config.voiceId,
                    isCancelled=cancelHook,
                )

    async def _cmdChangeLanguage(self, sessionId: str, params: dict):
        """Change bot language."""
        newLang = params.get("language", "")
        if newLang:
            # Config is replaced via model_copy (pydantic immutable-style update).
            self.config = self.config.model_copy(update={"language": newLang})
            logger.info(f"Session {sessionId}: Language changed to '{newLang}'")
            await _emitSessionEvent(sessionId,
"languageChanged", {"language": newLang}) async def _cmdToggleMicOrCamera( self, sessionId: str, action: str, params: dict, websocket: WebSocket, ): """Toggle mic or camera in the meeting.""" if websocket: await websocket.send_text(json.dumps({ "type": "botCommand", "sessionId": sessionId, "command": action, "params": params, })) async def _cmdSendMail(self, sessionId: str, params: dict): """Send email via Service Center MessagingService.""" recipient = params.get("recipient") or params.get("to", "") subject = params.get("subject", "") message = params.get("message") or params.get("body", "") if not recipient or not subject: logger.warning(f"Session {sessionId}: sendMail requires recipient and subject") return try: from modules.serviceCenter import ServiceCenterContext, getService ctx = ServiceCenterContext( user=self.currentUser, mandate_id=self.mandateId, feature_instance_id=self.instanceId, ) messaging = getService("messaging", ctx) success = messaging.sendEmailDirect( recipient=recipient, subject=subject, message=message, userId=str(self.currentUser.id) if self.currentUser else None, ) if success: logger.info(f"Session {sessionId}: Email sent to {recipient}") else: logger.warning(f"Session {sessionId}: Email send failed for {recipient}") except Exception as e: logger.warning(f"Session {sessionId}: sendMail failed: {e}") async def _cmdStoreDocument(self, sessionId: str, params: dict): """Store document via Service Center SharepointService.""" sitePath = params.get("sitePath") or params.get("site", "") folderPath = params.get("folderPath") or params.get("folder", "") fileName = params.get("fileName", "document.txt") content = params.get("content", "") if isinstance(content, str): content = content.encode("utf-8") if not sitePath or not folderPath: logger.warning(f"Session {sessionId}: storeDocument requires sitePath and folderPath") return try: from modules.serviceCenter import ServiceCenterContext, getService ctx = ServiceCenterContext( user=self.currentUser, 
mandate_id=self.mandateId,
                feature_instance_id=self.instanceId,
            )
            sharepoint = getService("sharepoint", ctx)
            # Token must come from the user's stored connection; abort quietly
            # (log only) if SharePoint is not configured for this user.
            if not sharepoint.setAccessTokenFromConnection(self.currentUser):
                logger.warning(f"Session {sessionId}: SharePoint connection not configured")
                return
            site = await sharepoint.getSiteByStandardPath(sitePath)
            if not site:
                logger.warning(f"Session {sessionId}: SharePoint site not found: {sitePath}")
                return
            result = await sharepoint.uploadFile(
                siteId=site["id"],
                folderPath=folderPath,
                fileName=fileName,
                content=content,
            )
            # uploadFile reports failure via an "error" key, not an exception.
            if "error" in result:
                logger.warning(f"Session {sessionId}: storeDocument failed: {result['error']}")
            else:
                logger.info(f"Session {sessionId}: Document stored: {fileName}")
        except Exception as e:
            logger.warning(f"Session {sessionId}: storeDocument failed: {e}")

    # =========================================================================
    # Director Prompts (private operator instructions during a live meeting)
    # =========================================================================

    def _collectActiveDirectorBriefings(self) -> List[Dict[str, Any]]:
        """Return the deduplicated list of director-prompt briefings that are
        currently relevant for the meeting context: every active persistent
        prompt PLUS every recent one-shot prompt that still sits in the
        ``_recentDirectorBriefings`` pool.

        Each entry carries ``text``, ``fileIds`` (UDB attachments), ``mode``,
        ``promptId`` and ``note`` (the agent's internal analysis from the
        SILENT director run, if any).
        """
        # Keyed by prompt id; persistent prompts claim the slot first.
        seen: Dict[str, Dict[str, Any]] = {}
        for p in self._activePersistentPrompts:
            pid = p.get("id") or ""
            seen[pid] = {
                "promptId": pid,
                "mode": p.get("mode") or "persistent",
                "text": (p.get("text") or "").strip(),
                "fileIds": list(p.get("fileIds") or []),
                "note": (p.get("responseText") or "").strip(),
            }
        for b in self._recentDirectorBriefings:
            pid = b.get("promptId") or ""
            if pid in seen:
                # Refresh note with the latest analysis if the persistent run
                # produced one after the prompt was first loaded from DB.
if b.get("note"): seen[pid]["note"] = b["note"] continue seen[pid] = { "promptId": pid, "mode": b.get("mode") or "oneShot", "text": (b.get("text") or "").strip(), "fileIds": list(b.get("fileIds") or []), "note": (b.get("note") or "").strip(), } return [v for v in seen.values() if v.get("text") or v.get("fileIds")] def _collectDirectorFileIds(self) -> List[str]: """Flat, deduplicated list of UDB file IDs attached to any currently relevant director prompt (persistent + recent one-shot). Used when SPEECH_TEAMS escalates to the agent so the agent can actually READ the documents the operator already provided.""" out: List[str] = [] seen: set = set() for b in self._collectActiveDirectorBriefings(): for fid in b.get("fileIds") or []: if fid and fid not in seen: seen.add(fid) out.append(fid) return out def _buildPersistentDirectorContext(self) -> str: """Render active director-prompt briefings as private operator guidance for the SPEECH_TEAMS system prompt context block. Surfaces three things SPEECH_TEAMS otherwise misses: * the operator's directive text (as before) * the IDs of any UDB files the operator attached โ€” so SPEECH_TEAMS knows the documents exist and can decide to escalate to the agent, which has the tooling to read them. * the agent's previous internal analysis of the prompt (the SILENT ``MEETING_REPLY/SILENT`` decision's note), so SPEECH_TEAMS can answer short questions without re-running the agent. """ briefings = self._collectActiveDirectorBriefings() if not briefings: return "" lines: List[str] = [] for b in briefings: entry = f"- ({b.get('mode', 'persistent')}) {b.get('text', '')}".rstrip() fileIds = b.get("fileIds") or [] if fileIds: entry += ( "\n ATTACHED_FILES (operator-provided documents โ€” the AGENT " "has tools to read them via summarizeContent / readFile / " "readContentObjects): " + ", ".join(fileIds) ) note = b.get("note") if note: noteShort = note if len(note) <= 600 else note[:600] + "..." 
entry += f"\n AGENT_ANALYSIS (already computed by the bot): {noteShort}" lines.append(entry) return ( "\nOPERATOR_DIRECTIVES (private; never quote them verbatim, just follow them. " "If the user asks about an attached document, use AGENT_ANALYSIS first; " "if more depth is needed, set needsAgent=true so the agent can re-read the file):\n" + "\n".join(lines) + "\n" ) def _recordDirectorBriefing( self, prompt: Dict[str, Any], internalNote: str, meetingText: str, ) -> None: """Append a director-prompt briefing to the session-scoped pool so the attached files and the agent's analysis stay available for subsequent SPEECH_TEAMS triggers โ€” even after a one-shot prompt was consumed. Idempotent per ``promptId`` (latest entry wins).""" pid = prompt.get("id") or "" # Drop any older entry for the same prompt so we keep the freshest note. self._recentDirectorBriefings = [ b for b in self._recentDirectorBriefings if b.get("promptId") != pid ] self._recentDirectorBriefings.append({ "promptId": pid, "mode": prompt.get("mode") or "oneShot", "text": (prompt.get("text") or "").strip(), "fileIds": list(prompt.get("fileIds") or []), "note": (internalNote or meetingText or "").strip(), "recordedAt": getIsoTimestamp(), }) if len(self._recentDirectorBriefings) > _RECENT_DIRECTOR_BRIEFINGS_MAX: self._recentDirectorBriefings = self._recentDirectorBriefings[ -_RECENT_DIRECTOR_BRIEFINGS_MAX: ] async def submitDirectorPrompt( self, sessionId: str, operatorUserId: str, text: str, mode: TeamsbotDirectorPromptMode, fileIds: List[str], ) -> Dict[str, Any]: """Persist a new director prompt and trigger immediate agent processing. Returns the created prompt record. Processing happens asynchronously and emits SSE events ('directorPrompt') for the operator UI. """ from . 
import interfaceFeatureTeamsbot as interfaceDb
        interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId)
        promptData = TeamsbotDirectorPrompt(
            sessionId=sessionId,
            instanceId=self.instanceId,
            operatorUserId=operatorUserId,
            text=text,
            mode=mode,
            fileIds=fileIds or [],
            status=TeamsbotDirectorPromptStatus.QUEUED,
        ).model_dump()
        created = interface.createDirectorPrompt(promptData)
        # Persistent prompts join in-memory directives immediately so they
        # also influence subsequent SPEECH_TEAMS triggers, not only the
        # one-shot agent run we kick off below.
        if mode == TeamsbotDirectorPromptMode.PERSISTENT:
            self._activePersistentPrompts.append(created)
        await _emitSessionEvent(sessionId, "directorPrompt", {
            "id": created.get("id"),
            "status": created.get("status"),
            "mode": created.get("mode"),
            "text": created.get("text"),
            "fileIds": created.get("fileIds", []),
            "createdAt": created.get("createdAt"),
        })
        # Fire-and-forget: the agent run reports progress via SSE events.
        asyncio.create_task(self._processDirectorPrompt(created))
        return created

    async def removePersistentPrompt(self, promptId: str) -> bool:
        """Remove a persistent director prompt (operator clicked 'remove')."""
        from . import interfaceFeatureTeamsbot as interfaceDb
        interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId)
        sessionId = self._activeSessionId
        prompt = interface.getDirectorPrompt(promptId)
        if not prompt:
            # Unknown prompt id — nothing to remove.
            return False
        interface.updateDirectorPrompt(promptId, {
            "status": TeamsbotDirectorPromptStatus.CONSUMED.value,
            "consumedAt": getIsoTimestamp(),
            "statusMessage": "Removed by operator",
        })
        self._activePersistentPrompts = [
            p for p in self._activePersistentPrompts if p.get("id") != promptId
        ]
        # Also drop the briefing copy so SPEECH_TEAMS forgets the doc reference
        # immediately; otherwise the bot would keep "remembering" a doc the
        # operator just retired.
self._recentDirectorBriefings = [
            b for b in self._recentDirectorBriefings if b.get("promptId") != promptId
        ]
        if sessionId:
            await _emitSessionEvent(sessionId, "directorPrompt", {
                "id": promptId,
                "status": TeamsbotDirectorPromptStatus.CONSUMED.value,
                "mode": prompt.get("mode"),
                "text": prompt.get("text"),
                "removed": True,
            })
        return True

    async def _processDirectorPrompt(self, prompt: Dict[str, Any]) -> None:
        """Run the agent for a director prompt and deliver the FINAL text into
        the meeting via TTS + chat (using the bot's existing channels)."""
        from . import interfaceFeatureTeamsbot as interfaceDb
        sessionId = prompt.get("sessionId")
        promptId = prompt.get("id")
        interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId)
        # Mark RUNNING in DB and notify the operator UI before the agent starts.
        interface.updateDirectorPrompt(promptId, {
            "status": TeamsbotDirectorPromptStatus.RUNNING.value,
        })
        await _emitSessionEvent(sessionId, "directorPrompt", {
            "id": promptId,
            "status": TeamsbotDirectorPromptStatus.RUNNING.value,
        })
        # Build a task brief for the agent that surfaces the meeting context.
        recentTranscript = self._renderRecentTranscriptForAgent(maxLines=20)
        directorText = (prompt.get("text") or "").strip()
        attachedFileIds = list(prompt.get("fileIds") or [])
        promptMode = (prompt.get("mode") or "").lower()
        isPersistentPrompt = promptMode == TeamsbotDirectorPromptMode.PERSISTENT.value.lower()
        # Make file attachment EXPLICIT in the brief. The agent service already
        # prepends a "## Attached Files & Folders" header via _enrichPromptWithFiles
        # when fileIds are passed, but without an explicit instruction the agent
        # sometimes goes straight to a generic answer. We force the workflow:
        # studyDocs -> form briefing -> decide MEETING_REPLY vs SILENT.
        filesBlock = ""
        if attachedFileIds:
            filesBlock = (
                "\nANGEHAENGTE DOKUMENTE (UDB-File-IDs): " + ", ".join(attachedFileIds) +
                "\nDu MUSST diese Dokumente VOR der finalen Antwort lesen / zusammenfassen "
                "(z.B. summarizeContent, readFile, readContentObjects, describeImage). 
" "Beziehe Fakten und Zitate aus den Dokumenten in deine Notiz / dein " "Meeting-Reply ein, statt allgemein zu antworten.\n" ) # Persistent prompts that ship documents are usually a "knowledge briefing" # the operator wants the bot to STUDY now and USE LATER. The SILENT note # in that case must be a useful, file-grounded summary that subsequent # SPEECH_TEAMS triggers can pick up โ€” not "noted". persistentNoteHint = "" if isPersistentPrompt and attachedFileIds: persistentNoteHint = ( "\nSPEZIAL fuer PERSISTENT + Dokumente: Wenn die Anweisung KEIN explizites " "Meeting-Statement verlangt, antworte mit 'SILENT:' und liefere als interne " "Notiz eine STRUKTURIERTE, faktendichte Briefing-Zusammenfassung der Dokumente " "(Stichpunkte, Kennzahlen, Aussagen, die fuer Folgefragen im Meeting relevant " "sein koennen). Diese Notiz wird spaeteren Meeting-Antworten als Wissensbasis " "vorgelegt โ€” schreibe sie also so, dass der Bot daraus zitieren kann.\n" ) taskText = ( f"Du bist der KI-Assistent in einem laufenden Teams-Meeting (Bot-Name: {self.config.botName}).\n" f"Der Operator hat dir folgende PRIVATE Regieanweisung gegeben (die anderen Teilnehmer im " f"Meeting sehen sie NICHT). Sie ist KEINE Frage an das Meeting, sondern eine interne " f"Anweisung an dich:\n\n" f"{directorText}\n" f"{filesBlock}" f"{persistentNoteHint}\n" f"AKTUELLER MEETING-KONTEXT (juengste Aussagen):\n{recentTranscript}\n\n" "ANTWORT-PROTOKOLL โ€” Beginne deine FINALE Antwort mit GENAU EINEM dieser Marker:\n" " โ€ข 'MEETING_REPLY:' gefolgt vom Text, der im Meeting gesprochen / in den Meeting-Chat " "gepostet werden soll. Verwende diesen Marker NUR, wenn die Regieanweisung dich explizit " "auffordert, jetzt etwas im Meeting zu sagen oder zu schreiben (Beispiele: 'stell dich vor', " "'fasse zusammen', 'stelle Person X eine Frage', 'beantworte die letzte Frage'). 
Halte den " "Text kurz, sprachlich passend zur Stimme und ohne Marker oder Meta-Kommentare.\n" " โ€ข 'SILENT:' gefolgt von einer internen Notiz fuer das Operator-UI. " "Verwende diesen Marker fuer interne Direktiven und Wissens-Briefings (Beispiele: " "'achte ab jetzt auf X', 'merke dir Y', 'studiere Dokument Z'). " "Dieser Text wird NICHT ins Meeting gegeben, dient aber spaeteren Meeting-Antworten " "als Wissensbasis. Wenn Dokumente angehaengt sind, MUSS die Notiz konkrete, " "zitierfaehige Fakten aus den Dokumenten enthalten.\n\n" "Standard ist SILENT, wenn nicht eindeutig zur Meeting-Interaktion aufgefordert wurde. " "Wiederhole NIEMALS die Regieanweisung selbst im MEETING_REPLY-Text." ) try: finalText = await self._runAgentForMeeting( sessionId=sessionId, taskText=taskText, fileIds=attachedFileIds, sourceLabel="directorPrompt", triggerTranscriptId=None, promptId=promptId, directorPromptMode=True, ) # One-shot: mark consumed; persistent: keep active but record success. isPersistent = prompt.get("mode") == TeamsbotDirectorPromptMode.PERSISTENT.value updates: Dict[str, Any] = { "status": TeamsbotDirectorPromptStatus.SUCCEEDED.value, "responseText": finalText or "", } if not isPersistent: updates["status"] = TeamsbotDirectorPromptStatus.CONSUMED.value updates["consumedAt"] = getIsoTimestamp() interface.updateDirectorPrompt(promptId, updates) await _emitSessionEvent(sessionId, "directorPrompt", { "id": promptId, "status": updates["status"], "responseText": finalText, }) except Exception as e: logger.error( f"Session {sessionId}: Director prompt {promptId} failed: {type(e).__name__}: {e}", exc_info=True, ) interface.updateDirectorPrompt(promptId, { "status": TeamsbotDirectorPromptStatus.FAILED.value, "statusMessage": f"{type(e).__name__}: {str(e)[:300]}", }) await _emitSessionEvent(sessionId, "directorPrompt", { "id": promptId, "status": TeamsbotDirectorPromptStatus.FAILED.value, "error": f"{type(e).__name__}: {str(e)[:300]}", }) self._activePersistentPrompts = [ p 
for p in self._activePersistentPrompts if p.get("id") != promptId
            ]

    def _renderRecentTranscriptForAgent(self, maxLines: int = 20) -> str:
        """Render the most recent context buffer entries for inclusion in the
        agent task brief (similar to SPEECH_TEAMS context, but plain text)."""
        if not self._contextBuffer:
            return "(noch keine Aussagen erfasst)"
        recent = self._contextBuffer[-maxLines:]
        lines = []
        for seg in recent:
            speaker = seg.get("speaker", "Unknown")
            text = seg.get("text", "")
            segSource = seg.get("source", "caption")
            # Chat lines are prefixed so the agent can tell them from captions.
            prefix = "Chat: " if segSource == "chat" else ""
            if self._isBotSpeaker(speaker):
                # The bot's own lines are marked [YOU (...)] for the agent.
                lines.append(f"[YOU ({self.config.botName})]: {text}")
            else:
                lines.append(f"[{prefix}{speaker}]: {text}")
        return "\n".join(lines)

    async def _interimAgentBusyMessage(self) -> Optional[str]:
        """Short spoken/chat line before a potentially long agent run (web, tools).
        Phrasing is AI-localised to ``self.config.language`` and cached per
        session โ€” no hardcoded language branching.

        Returns ``None`` if generation failed; caller must treat that as
        'silently skip the interim notice'."""
        return await self._pickEphemeralPhrase("agentBusy")

    async def _interimAgentRoundMessage(
        self, roundNum: int, maxRounds: int
    ) -> Optional[str]:
        """Per-round progress notice for long agent runs (meeting voice / chat,
        ephemeral). Phrasing is AI-localised once per session; ``{round}`` and
        ``{maxRounds}`` placeholders are substituted at render time.
Returns ``None`` if generation failed."""
        return await self._pickEphemeralPhrase(
            "agentRound",
            substitutions={"round": roundNum, "maxRounds": maxRounds},
        )

    async def _notifyMeetingEphemeral(self, sessionId: str, text: str) -> None:
        """Deliver a short line to the meeting (TTS + chat per config) without
        persisting botResponses/transcripts, so the main agent answer stays the
        single recorded follow-up."""
        websocket = self._websocket
        voiceInterface = self._voiceInterface
        if not websocket:
            # No bridge connection — nothing can reach the meeting.
            logger.warning(f"Session {sessionId}: Interim notice skipped โ€” no WebSocket")
            return
        # responseChannel may be an Enum or a plain string; normalise to str.
        channelRaw = self.config.responseChannel
        channelStr = (
            channelRaw.value if hasattr(channelRaw, "value") else str(channelRaw)
        ).lower().strip()
        sendVoice = channelStr in ("voice", "both")
        sendChat = channelStr in ("chat", "both")
        if sendVoice and voiceInterface:
            cancelHook = self._makeAnswerCancelHook()
            async with self._meetingTtsLock:
                outcome = await _speakTextChunked(
                    websocket=websocket,
                    voiceInterface=voiceInterface,
                    sessionId=sessionId,
                    voiceText=_voiceFriendlyMeetingText(text),
                    languageCode=self.config.language,
                    voiceName=self.config.voiceId,
                    isCancelled=cancelHook,
                )
            if not outcome.get("success"):
                logger.warning(
                    f"Session {sessionId}: Interim TTS failed ({outcome.get('error')}) โ€” falling back to chat"
                )
                # Fall back to chat even when config is voice-only.
                if not sendChat:
                    sendChat = True
        if sendChat:
            try:
                await websocket.send_text(json.dumps({
                    "type": "sendChatMessage",
                    "sessionId": sessionId,
                    "text": text,
                }))
            except Exception as chatErr:
                logger.warning(f"Session {sessionId}: Interim chat failed: {chatErr}")
        await _emitSessionEvent(sessionId, "agentRun", {
            "status": "interimNotice",
            "message": text,
            "timestamp": getIsoTimestamp(),
        })

    async def _runAgentForMeeting(
        self,
        sessionId: str,
        taskText: str,
        fileIds: List[str],
        sourceLabel: str,
        triggerTranscriptId: Optional[str] = None,
        promptId: Optional[str] = None,
        directorPromptMode: bool = False,
    ) -> str:
        """Run agentService.runAgent for a meeting context, deliver the FINAL
        text via the
bot's existing TTS + chat channels, and return that text.

        sourceLabel is used for logging and SSE differentiation
        ('directorPrompt' or 'speechEscalation').

        ``directorPromptMode`` activates the silent-by-default protocol for
        operator director prompts: interim notices are suppressed, no per-round
        meeting updates, and the FINAL text is parsed for an explicit
        ``MEETING_REPLY:`` / ``SILENT:`` marker. Only ``MEETING_REPLY`` content
        is dispatched to the meeting; everything else stays internal.
        """
        from modules.serviceCenter.services.serviceAgent.datamodelAgent import (
            AgentConfig, AgentEventTypeEnum
        )
        ctx = ServiceCenterContext(
            user=self.currentUser,
            mandate_id=self.mandateId,
            feature_instance_id=self.instanceId,
            feature_code="teamsbot",
        )
        agentService = _getServiceCenterService("agent", ctx)
        # Workflow id stable per session so RAG/round-memory accumulate per meeting.
        workflowId = f"teamsbot:{sessionId}"
        agentConfig = AgentConfig(
            maxRounds=TEAMSBOT_AGENT_MAX_ROUNDS,
            maxCostCHF=TEAMSBOT_AGENT_MAX_COST_CHF,
            toolSet="core",
            initialToolboxes=["core", "web"],
            excludeActionTools=True,
        )
        await _emitSessionEvent(sessionId, "agentRun", {
            "source": sourceLabel,
            "promptId": promptId,
            "status": "started",
            "timestamp": getIsoTimestamp(),
        })
        # Director prompts run silently by default โ€” no spontaneous "moment please"
        # in the meeting just because the operator gave an internal directive.
        if not directorPromptMode:
            try:
                interimText = await self._interimAgentBusyMessage()
                if interimText:
                    await self._notifyMeetingEphemeral(sessionId, interimText)
            except Exception as interimErr:
                logger.warning(f"Session {sessionId}: Interim agent notice failed: {interimErr}")
        finalText: str = ""
        rounds = 0
        try:
            async for event in agentService.runAgent(
                prompt=taskText,
                fileIds=fileIds or None,
                config=agentConfig,
                toolSet="core",
                workflowId=workflowId,
            ):
                if event.type == AgentEventTypeEnum.AGENT_PROGRESS:
                    rounds += 1
                    pdata = event.data or {}
                    roundNum = int(pdata.get("round", rounds))
                    maxR = int(pdata.get("maxRounds", TEAMSBOT_AGENT_MAX_ROUNDS))
                    await _emitSessionEvent(sessionId, "agentRun", {
                        "source": sourceLabel,
                        "promptId": promptId,
                        "status": "progress",
                        "round": roundNum,
                        "maxRounds": maxR,
                    })
                    # Round 1 is already covered by the generic start notice;
                    # from round 2 onwards report progress into the meeting.
                    # Director prompts stay silent โ€” no interim updates to the meeting.
                    if roundNum >= 2 and not directorPromptMode:
                        try:
                            roundText = await self._interimAgentRoundMessage(roundNum, maxR)
                            if roundText:
                                await self._notifyMeetingEphemeral(sessionId, roundText)
                        except Exception as roundNoticeErr:
                            logger.warning(
                                f"Session {sessionId}: Per-round agent notice failed: {roundNoticeErr}"
                            )
                elif event.type == AgentEventTypeEnum.TOOL_CALL:
                    toolName = (event.data or {}).get("toolName") if event.data else None
                    await _emitSessionEvent(sessionId, "agentRun", {
                        "source": sourceLabel,
                        "promptId": promptId,
                        "status": "toolCall",
                        "toolName": toolName,
                    })
                elif event.type == AgentEventTypeEnum.FINAL:
                    finalText = (event.content or "").strip()
                elif event.type == AgentEventTypeEnum.ERROR:
                    # Surface agent-reported errors as exceptions so the
                    # caller's failure path runs.
                    raise RuntimeError(event.content or "Agent error")
        except Exception as runErr:
            await _emitSessionEvent(sessionId, "agentRun", {
                "source": sourceLabel,
                "promptId": promptId,
                "status": "error",
                "error": str(runErr)[:500],
            })
            raise
        await _emitSessionEvent(sessionId, "agentRun", {
            "source": sourceLabel,
            "promptId": promptId,
            "status":
"completed", "rounds": rounds, "hasText": bool(finalText), }) if finalText: if directorPromptMode: decision = _parseDirectorPromptFinal(finalText) kind = decision.get("kind", "silent") meetingText = (decision.get("meetingText") or "").strip() internalNote = (decision.get("internalNote") or "").strip() logger.info( f"Session {sessionId}: Director prompt {promptId} -> kind={kind}, " f"meetingChars={len(meetingText)}, noteChars={len(internalNote)}" ) await _emitSessionEvent(sessionId, "directorPrompt", { "id": promptId, "status": "decision", "decision": kind, "meetingText": meetingText, "internalNote": internalNote, }) # Record this prompt as a session-scoped briefing BEFORE we hand # delivery off. This is what later SPEECH_TEAMS triggers see, so # if the user attached a doc with mode=PERSISTENT and the agent # produced a file-grounded SILENT note, that note (and the # original fileIds) stays available for "summarize the doc" # follow-up questions in the meeting. try: promptRecord: Dict[str, Any] = {} if promptId: try: from . import interfaceFeatureTeamsbot as _ifaceDb _iface = _ifaceDb.getInterface( self.currentUser, self.mandateId, self.instanceId ) promptRecord = _iface.getDirectorPrompt(promptId) or {} except Exception as _lookupErr: logger.debug( f"Briefing pool: could not look up prompt {promptId}: {_lookupErr}" ) if promptRecord or promptId: self._recordDirectorBriefing( prompt=promptRecord or {"id": promptId}, internalNote=internalNote, meetingText=meetingText, ) except Exception as briefErr: logger.warning( f"Session {sessionId}: Director briefing pool update failed: {briefErr}" ) # If this was a persistent prompt, the live in-memory copy in # ``_activePersistentPrompts`` was loaded BEFORE the agent ran # โ€” refresh its ``responseText`` so subsequent # ``_collectActiveDirectorBriefings`` calls show the latest # analysis without waiting for the next session reload. 
if promptId: for p in self._activePersistentPrompts: if p.get("id") == promptId: p["responseText"] = internalNote or meetingText or finalText break if kind == "meeting" and meetingText: await self._deliverTextToMeeting( sessionId=sessionId, text=meetingText, detectedIntent=f"agent:{sourceLabel}", reasoning=f"Agent run ({sourceLabel})", triggerTranscriptId=triggerTranscriptId, ) else: # Silent: persist as internal-only botResponse so the operator # UI keeps a record, but DO NOT push into the meeting (no TTS, # no chat send). The director prompt SSE above already carries # the note for the operator UI. await self._persistInternalDirectorReply( sessionId=sessionId, internalNote=internalNote or finalText, promptId=promptId, triggerTranscriptId=triggerTranscriptId, ) return meetingText if kind == "meeting" else "" await self._deliverTextToMeeting( sessionId=sessionId, text=finalText, detectedIntent=f"agent:{sourceLabel}", reasoning=f"Agent run ({sourceLabel})", triggerTranscriptId=triggerTranscriptId, ) return finalText async def _deliverTextToMeeting( self, sessionId: str, text: str, detectedIntent: str, reasoning: str, triggerTranscriptId: Optional[str] = None, ) -> None: """Send agent text into the meeting via the same channels SPEECH_TEAMS uses: TTS + chat per config, plus DB persistence and SSE events. Uses the websocket/voiceInterface stored on this instance. If the bot is not connected anymore, the call still records the response in the DB and emits SSE so the operator UI shows the agent answer. """ from . 
import interfaceFeatureTeamsbot as interfaceDb interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId) websocket = self._websocket voiceInterface = self._voiceInterface channelRaw = self.config.responseChannel channelStr = ( channelRaw.value if hasattr(channelRaw, "value") else str(channelRaw) ).lower().strip() sendVoice = channelStr in ("voice", "both") sendChat = channelStr in ("chat", "both") if sendVoice and sendChat: responseType = TeamsbotResponseType.BOTH elif sendVoice: responseType = TeamsbotResponseType.AUDIO else: responseType = TeamsbotResponseType.CHAT # Voice (TTS input is voice-sanitized; chat + DB keep full structured text). # Long agent answers must be chunked: Google TTS rejects single sentences # > ~5000 bytes, and the Chirp3 voices fail on long comma-heavy lines too. ttsOutcome: Optional[Dict[str, Any]] = None if sendVoice and voiceInterface and websocket: spokenText = await self._summarizeForVoice(sessionId, text) cancelHook = self._makeAnswerCancelHook() async with self._meetingTtsLock: ttsOutcome = await _speakTextChunked( websocket=websocket, voiceInterface=voiceInterface, sessionId=sessionId, voiceText=spokenText, languageCode=self.config.language, voiceName=self.config.voiceId, isCancelled=cancelHook, ) await _emitSessionEvent(sessionId, "ttsDeliveryStatus", { "status": "dispatched" if ttsOutcome.get("success") else "failed", "hasWebSocket": True, "chunks": ttsOutcome.get("chunks"), "played": ttsOutcome.get("played"), "error": ttsOutcome.get("error"), "timestamp": getIsoTimestamp(), }) if not ttsOutcome.get("success"): logger.warning( f"Session {sessionId}: Agent TTS delivery failed " f"({ttsOutcome.get('error')}) โ€” falling back to meeting chat" ) if not sendChat: sendChat = True # Chat if sendChat and websocket: try: await websocket.send_text(json.dumps({ "type": "sendChatMessage", "sessionId": sessionId, "text": text, })) logger.info(f"Session {sessionId}: Agent chat dispatched ({len(text)} chars)") 
except Exception as chatErr: logger.warning(f"Session {sessionId}: Agent chat delivery failed: {chatErr}") # Persist as botResponse + transcript so it shows up in history/UI. intentEnum, intentMeta = _coercePersistedDetectedIntent(detectedIntent) reasoningForDb = ( f"{reasoning} [{intentMeta}]" if intentMeta else reasoning ) botResponseData = TeamsbotBotResponse( sessionId=sessionId, responseText=text, responseType=responseType, detectedIntent=intentEnum, reasoning=reasoningForDb, triggeredByTranscriptId=triggerTranscriptId, modelName="agent", processingTime=0.0, priceCHF=0.0, timestamp=getIsoTimestamp(), ).model_dump() createdResponse = interface.createBotResponse(botResponseData) await _emitSessionEvent(sessionId, "botResponse", { "id": createdResponse.get("id"), "responseText": text, "responseType": responseType.value, "detectedIntent": intentEnum.value, "reasoning": reasoningForDb, "modelName": "agent", "processingTime": 0.0, "priceCHF": 0.0, "timestamp": botResponseData.get("timestamp"), }) botTranscriptData = TeamsbotTranscript( sessionId=sessionId, speaker=self.config.botName, text=text, timestamp=getIsoTimestamp(), confidence=1.0, language=self.config.language, isFinal=True, source="botResponse", ).model_dump() botTranscript = interface.createTranscript(botTranscriptData) self._contextBuffer.append({ "speaker": self.config.botName, "text": text, "timestamp": getUtcTimestamp(), "source": "botResponse", }) self._lastTranscriptSpeaker = self.config.botName self._lastTranscriptText = text self._lastTranscriptId = botTranscript.get("id") self._lastBotResponseText = text.strip().lower() self._lastBotResponseTs = time.time() self._followUpWindowEnd = time.time() + 15.0 await _emitSessionEvent(sessionId, "transcript", { "id": botTranscript.get("id"), "speaker": self.config.botName, "text": text, "confidence": 1.0, "timestamp": getIsoTimestamp(), "isContinuation": False, "source": "botResponse", "speakerResolvedFromHint": False, }) session = 
interface.getSession(sessionId) if session: count = session.get("botResponseCount", 0) + 1 interface.updateSession(sessionId, {"botResponseCount": count}) async def _persistInternalDirectorReply( self, sessionId: str, internalNote: str, promptId: Optional[str], triggerTranscriptId: Optional[str] = None, ) -> None: """Record a director-prompt agent reply as INTERNAL (operator-UI only). Unlike ``_deliverTextToMeeting`` this never dispatches TTS or chat into the meeting, never appends to the meeting context buffer, and does not create a meeting transcript line. It only persists a botResponse and emits an SSE event so the operator UI shows what the agent decided. """ from . import interfaceFeatureTeamsbot as interfaceDb note = (internalNote or "").strip() if not note: return interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId) intentEnum, _intentMeta = _coercePersistedDetectedIntent("agent:directorPrompt") reasoningForDb = ( f"Director prompt {promptId or ''} โ€” silent / internal only " f"(not sent to meeting)" ).strip() botResponseData = TeamsbotBotResponse( sessionId=sessionId, responseText=note, responseType=TeamsbotResponseType.CHAT, detectedIntent=intentEnum, reasoning=reasoningForDb, triggeredByTranscriptId=triggerTranscriptId, modelName="agent", processingTime=0.0, priceCHF=0.0, timestamp=getIsoTimestamp(), ).model_dump() createdResponse = interface.createBotResponse(botResponseData) await _emitSessionEvent(sessionId, "botResponse", { "id": createdResponse.get("id"), "responseText": note, "responseType": TeamsbotResponseType.CHAT.value, "detectedIntent": intentEnum.value, "reasoning": reasoningForDb, "modelName": "agent", "processingTime": 0.0, "priceCHF": 0.0, "timestamp": botResponseData.get("timestamp"), "internalOnly": True, "promptId": promptId, }) logger.info( f"Session {sessionId}: Director prompt {promptId} silent reply " f"persisted internally ({len(note)} chars)" ) # 
========================================================================= # Greeting (AI-localised, no hardcoded language strings) # ========================================================================= async def _generateGreetingText(self, languageCode: str) -> str: """Generate the bot's join greeting via AI in ``languageCode`` and the configured persona. Returns empty string on failure โ€” the caller must treat that as 'skip the greeting' (NEVER fall back to a hardcoded localised string).""" targetLang = (languageCode or self.config.language or "").strip() or "en-US" botName = (self.config.botName or "the assistant").strip() firstName = botName.split(" ")[0] if botName else botName persona = (self.config.aiSystemPrompt or "").strip() # English instructions to the LLM; the OUTPUT must be in ``targetLang``. prompt = ( f"You are localizing the join greeting for a meeting assistant.\n\n" f"Assistant display name (use exactly this, no translation): {firstName}\n\n" f"Persona / style guide for the assistant:\n" f"{persona or '(no persona configured โ€” use a neutral, polite, professional tone)'}\n\n" f"Target spoken language (BCP-47 code): {targetLang}\n\n" f"Generate ONE short greeting (max ~14 words) for the assistant " f"to say AND post in chat the moment it joins a meeting. The " f"greeting MUST:\n" f" - be in the target language\n" f" - introduce the assistant by name ({firstName})\n" f" - signal that it is now present and ready\n" f" - sound natural when spoken aloud (this text is also TTS'd)\n\n" f"Output ONLY the greeting text, no quotes, no markdown, no " f"commentary, no surrounding punctuation beyond what naturally " f"belongs to the sentence." 
) try: aiService = createAiService( self.currentUser, self.mandateId, self.instanceId ) await aiService.ensureAiObjectsInitialized() request = AiCallRequest( prompt=prompt, context="", options=AiCallOptions( operationType=OperationTypeEnum.DATA_ANALYSE, priority=PriorityEnum.SPEED, ), ) response = await aiService.callAi(request) except Exception as aiErr: logger.warning( f"Greeting generation crashed (lang={targetLang}): {aiErr}" ) return "" if not response or response.errorCount != 0 or not response.content: logger.warning( f"Greeting generation returned empty/error (lang={targetLang})" ) return "" text = response.content.strip() # Strip any wrapping quotes/code fences the model might have added. text = re.sub(r"^```.*?\n", "", text, flags=re.DOTALL) text = re.sub(r"\n```\s*$", "", text) text = text.strip().strip("\"'`").strip() if not text: return "" logger.info( f"Greeting generated (lang={targetLang}, chars={len(text)}): {text[:80]}" ) return text async def _dispatchGreetingToMeeting( self, sessionId: str, greetingText: str, greetingLang: str, sendToChat: bool, interface: Any, voiceInterface: Any, websocket: WebSocket, ) -> None: """Centralised dispatcher for the bot's join greeting: speaks the text via TTS into the meeting and (optionally) tells the bot to post it in the meeting chat. Persists the greeting as a bot transcript / botResponse so it appears in the operator UI history. ``sendToChat`` is ``False`` for the legacy ``voiceGreeting`` path (the bot already chatted itself) and ``True`` for the new ``requestGreeting`` path where the Gateway owns chat dispatch too. 
""" try: await _emitSessionEvent(sessionId, "ttsDeliveryStatus", { "status": "requested", "hasWebSocket": True, "message": "Greeting TTS requested", "timestamp": getIsoTimestamp(), }) cancelHook = self._makeAnswerCancelHook() async with self._meetingTtsLock: ttsOutcome = await _speakTextChunked( websocket=websocket, voiceInterface=voiceInterface, sessionId=sessionId, voiceText=_voiceFriendlyMeetingText(greetingText), languageCode=greetingLang, voiceName=self.config.voiceId, isCancelled=cancelHook, ) if ttsOutcome.get("success"): logger.info( f"Greeting TTS sent for session {sessionId} " f"(chunks={ttsOutcome.get('chunks')})" ) await _emitSessionEvent(sessionId, "ttsDeliveryStatus", { "status": "dispatched", "hasWebSocket": True, "chunks": ttsOutcome.get("chunks"), "played": ttsOutcome.get("played"), "timestamp": getIsoTimestamp(), }) else: logger.warning( f"Greeting TTS failed for session {sessionId}: {ttsOutcome.get('error')}" ) await _emitSessionEvent(sessionId, "ttsDeliveryStatus", { "status": "failed", "hasWebSocket": True, "message": ttsOutcome.get("error"), "timestamp": getIsoTimestamp(), }) if sendToChat: try: await websocket.send_text(json.dumps({ "type": "sendChatMessage", "sessionId": sessionId, "text": greetingText, })) logger.info(f"Greeting chat dispatch queued for session {sessionId}") except Exception as chatErr: logger.warning( f"Greeting chat dispatch failed for session {sessionId}: {chatErr}" ) greetingTranscriptData = TeamsbotTranscript( sessionId=sessionId, speaker=self.config.botName, text=greetingText, timestamp=getIsoTimestamp(), confidence=1.0, language=greetingLang, isFinal=True, source="botResponse", ).model_dump() greetingTranscript = interface.createTranscript(greetingTranscriptData) self._contextBuffer.append({ "speaker": self.config.botName, "text": greetingText, "timestamp": getUtcTimestamp(), "source": "botResponse", }) self._lastTranscriptSpeaker = self.config.botName self._lastTranscriptText = greetingText self._lastTranscriptId = 
greetingTranscript.get("id") await _emitSessionEvent(sessionId, "botResponse", { "id": greetingTranscript.get("id"), "responseText": greetingText, "responseType": TeamsbotResponseType.AUDIO.value, "detectedIntent": "greeting", "reasoning": "Automatic join greeting", "timestamp": getIsoTimestamp(), }) await _emitSessionEvent(sessionId, "transcript", { "id": greetingTranscript.get("id"), "speaker": self.config.botName, "text": greetingText, "confidence": 1.0, "timestamp": getIsoTimestamp(), "isContinuation": False, "source": "botResponse", "speakerResolvedFromHint": False, }) except Exception as dispatchErr: logger.warning( f"Greeting dispatch failed for session {sessionId}: {dispatchErr}" ) # ========================================================================= # Context Summarization (for long sessions) # ========================================================================= async def _summarizeSessionContext(self, sessionId: str, rawContext: str) -> str: """Summarize a long user-provided session context to its essential points. This reduces token usage in every subsequent AI call.""" try: aiService = createAiService(self.currentUser, self.mandateId, self.instanceId) await aiService.ensureAiObjectsInitialized() request = AiCallRequest( prompt=( "Fasse den folgenden Kontext auf die wesentlichen Punkte zusammen. " "Behalte alle wichtigen Fakten, Namen, Zahlen, Entscheidungen und Aktionspunkte. " "Entferne Fuelltext und Wiederholungen. " "Antworte NUR mit der Zusammenfassung, keine Erklaerungen oder Einleitungen." 
), context=rawContext, options=AiCallOptions( operationType=OperationTypeEnum.DATA_ANALYSE, priority=PriorityEnum.SPEED, ) ) response = await aiService.callAi(request) if response and response.errorCount == 0 and response.content: summary = response.content.strip() logger.info(f"Session {sessionId}: Context summarized from {len(rawContext)} to {len(summary)} chars") return summary except Exception as e: logger.warning(f"Session context summarization failed for {sessionId}: {e}") # Fallback: return original (truncated if very long) return rawContext[:2000] if len(rawContext) > 2000 else rawContext async def _summarizeContextBuffer(self, sessionId: str): """Summarize the older part of the context buffer to preserve information without exceeding the context window. This runs in the background.""" try: if self._contextSummary: return # Already summarized recently # Take the older half of the buffer for summarization halfPoint = len(self._contextBuffer) // 2 oldSegments = self._contextBuffer[:halfPoint] if len(oldSegments) < 10: return # Not enough to summarize # Build text to summarize lines = [] for seg in oldSegments: speaker = seg.get("speaker", "Unknown") text = seg.get("text", "") lines.append(f"[{speaker}]: {text}") textToSummarize = "\n".join(lines) aiService = createAiService(self.currentUser, self.mandateId, self.instanceId) await aiService.ensureAiObjectsInitialized() request = AiCallRequest( prompt="Fasse das folgende Meeting-Transkript in 3-5 Saetzen zusammen. Nenne die wichtigsten Themen, Entscheidungen und offene Fragen. 
Antworte NUR mit der Zusammenfassung, keine Erklaerungen.", context=textToSummarize, options=AiCallOptions( operationType=OperationTypeEnum.DATA_ANALYSE, priority=PriorityEnum.SPEED, ) ) response = await aiService.callAi(request) if response and response.errorCount == 0: self._contextSummary = response.content.strip() logger.info(f"Session {sessionId}: Context summarized ({len(oldSegments)} segments -> {len(self._contextSummary)} chars)") except Exception as e: logger.warning(f"Context summarization failed for session {sessionId}: {e}") # ========================================================================= # Meeting Summary # ========================================================================= async def _generateMeetingSummary(self, sessionId: str): """Generate an AI summary of the meeting after it ends.""" try: from . import interfaceFeatureTeamsbot as interfaceDb interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId) transcripts = interface.getTranscripts(sessionId) if not transcripts or len(transcripts) < 5: return # Not enough content for a summary # Build full transcript fullTranscript = "\n".join( f"[{t.get('speaker', 'Unknown')}]: {t.get('text', '')}" for t in transcripts ) aiService = createAiService(self.currentUser, self.mandateId, self.instanceId) await aiService.ensureAiObjectsInitialized() request = AiCallRequest( prompt="Erstelle eine kurze Zusammenfassung dieses Meeting-Transkripts. Nenne die wichtigsten Punkte, Entscheidungen und offene Aktionspunkte.", context=fullTranscript, options=AiCallOptions( operationType=OperationTypeEnum.DATA_ANALYSE, priority=PriorityEnum.BALANCED, ) ) response = await aiService.callAi(request) if response and response.errorCount == 0: interface.updateSession(sessionId, {"summary": response.content}) logger.info(f"Meeting summary generated for session {sessionId}") except Exception as e: logger.error(f"Failed to generate meeting summary for session {sessionId}: {e}")