gateway/modules/features/teamsbot/service.py
2026-04-25 01:13:01 +02:00

4042 lines
181 KiB
Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Teamsbot Service - Pipeline Orchestrator.
Manages the audio processing pipeline: STT -> Context Buffer -> SPEECH_TEAMS -> TTS -> Bridge.
"""
import logging
import json
import re
import asyncio
import time
import base64
from typing import Optional, Dict, Any, List, Callable
from fastapi import WebSocket
from modules.datamodels.datamodelUam import User
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum, PriorityEnum
from modules.shared.timeUtils import getUtcTimestamp, getIsoTimestamp
from modules.serviceCenter import getService as _getServiceCenterService
from modules.serviceCenter.context import ServiceCenterContext
from .datamodelTeamsbot import (
TeamsbotSessionStatus,
TeamsbotTranscript,
TeamsbotBotResponse,
TeamsbotResponseType,
TeamsbotConfig,
TeamsbotResponseMode,
TeamsbotResponseChannel,
TeamsbotDetectedIntent,
SpeechTeamsResponse,
TeamsbotCommand,
TeamsbotDirectorPrompt,
TeamsbotDirectorPromptStatus,
TeamsbotDirectorPromptMode,
DIRECTOR_PROMPT_TEXT_LIMIT,
DIRECTOR_PROMPT_FILE_LIMIT,
)
from .browserBotConnector import BrowserBotConnector
logger = logging.getLogger(__name__)
# Agent run limits for director prompts / speech escalation (meeting context).
# Higher than default workspace agent: Teams research + tool chains need depth.
TEAMSBOT_AGENT_MAX_ROUNDS: int = 8
# Hard cost ceiling (CHF) per agent run in a meeting context.
TEAMSBOT_AGENT_MAX_COST_CHF: float = 0.12
# How many recent director-prompt briefings (one-shot + persistent) we keep in
# session memory so SPEECH_TEAMS triggers and speech escalation can still see
# the operator's attached files + analysis after the prompt itself was consumed.
_RECENT_DIRECTOR_BRIEFINGS_MAX: int = 6
# Quick-ack ("Moment...") UX: fire a SHORT TTS the moment the bot's name is
# detected so the speaker hears within ~1s that the bot reacted, instead of
# waiting for the full debounce + SPEECH_TEAMS + agent pipeline (~5-30s).
# Throttled per session to avoid acking every fragment of a long utterance.
_QUICK_ACK_MIN_INTERVAL_SEC: float = 25.0
# Number of phrase variants we generate per kind (rotated round-robin so back-
# to-back acks/notices don't sound identical).
_EPHEMERAL_PHRASE_VARIANTS: int = 4
# Localisation INTENTS for ephemeral phrases. Each kind describes WHAT the
# phrase should express; the actual wording is produced at runtime by the AI
# in the bot's configured language + persona. The intent text below is the
# instruction passed to the LLM (English, since it's a model directive — the
# OUTPUT will be in the configured spoken language). Add new ephemeral phrase
# kinds here, never inline string literals at the call site.
_EPHEMERAL_PHRASE_INTENTS: Dict[str, str] = {
    "quickAck": (
        "Very short verbal acknowledgment (1 to 4 words) the assistant says "
        "the moment its name is recognised, BEFORE it has formulated a full "
        "answer. The intent is purely 'I heard you, I'm thinking' — natural, "
        "conversational, never a complete sentence."
    ),
    "agentBusy": (
        "One short sentence (max ~12 words) the assistant says BEFORE starting "
        "a longer research / tool-call task, so the audience knows the answer "
        "will take a few seconds. Polite, professional, calm."
    ),
    "agentRound": (
        "One short sentence (max ~14 words) the assistant says BETWEEN rounds "
        "of a longer agent task to signal that work is still in progress. "
        "Include the placeholder tokens '{round}' and '{maxRounds}' so the "
        "caller can substitute the actual numbers — e.g. 'Step {round} of "
        "{maxRounds}, still working.'"
    ),
}
def _voiceLineLooksLikeBillingOrMeta(line: str) -> bool:
"""Heuristic: trailing lines that are separators or billing/usage footers."""
s = line.strip()
if not s:
return True
lower = s.lower()
if re.match(r"^[-=*_]{3,}\s*$", s):
return True
if re.match(r"^#{1,6}\s*(usage|billing|costs?|meta|technical|statistics)\b", lower):
return True
if "chf" in lower and re.search(r"\d", s):
if re.search(
r"\b(total|usage|cost|billing|token|spent|used|price|estimate|"
r"rounds?|calls?|duration|processing\s*time|model\s*calls?)\b",
lower,
):
return True
if "token" in lower and re.search(r"\d", s):
if re.search(r"\b(total|usage|prompt|completion)\b", lower):
return True
pl = lower.replace(" ", "")
if "progressafter" in pl and ("aicalls:" in pl or "toolcalls:" in pl):
return True
return False
_EMOJI_PATTERN = re.compile(
"["
"\U0001F300-\U0001FAFF" # symbols & pictographs, emoticons, transport, supplemental
"\U00002600-\U000027BF" # misc symbols + dingbats (incl. ⚙ 🔐 🔌 ✓ ✗)
"\U0001F1E6-\U0001F1FF" # regional indicator (flags)
"\U00002B00-\U00002BFF" # arrows, geometric
"\U0001F900-\U0001F9FF" # supplemental symbols (incl. 🤖 🧠)
"\U0000FE0F" # variation selector-16 (emoji presentation)
"]+",
flags=re.UNICODE,
)
def _voiceFriendlyMeetingText(raw: str) -> str:
"""Sanitise a chat/markdown response so it can be SPOKEN naturally.
Aggressive cleanup — when a TTS engine reads raw markdown out loud the
listener hears "hash hash hash Zusammenfassung pipe pipe pipe", which
is unbearable in a meeting. The chat / DB / UI keep the original text;
only the audio path goes through this sanitiser.
What we strip:
* Code fences and inline code
* Markdown emphasis (**bold**, *italic*, __bold__, _italic_)
* Markdown links → keep label
* Headings (# .. ######)
* Markdown tables (any line with two or more pipes is dropped wholesale)
* Horizontal rules (---, ***, ___ on their own line)
* Bullet markers (-, *, •, ·) and numbered list markers (1., 2)) at line start
* Emojis (full Unicode pictograph ranges + variation selector)
* Decorative trailing colons on bullet headings
* Stray pipes left over from inline tables
* Trailing billing / "maximum rounds reached" / "budget exceeded" footers
Whitespace is then collapsed to single spaces.
"""
if not raw:
return ""
# Trim trailing operator/billing footers BEFORE any structural rewrite
# so we don't waste effort sanitising a footer that gets dropped.
low = raw.lower()
if "maximum rounds reached" in low:
m = re.search(r"(?is)maximum\s+rounds\s+reached", raw)
if m:
head = raw[: m.start()].strip()
raw = head or (
"Die Abklaerung brauchte mehr Schritte als vorgesehen; Details stehen im Chat."
)
if "budget exceeded" in low:
m = re.search(r"(?is)budget\s+exceeded", raw)
if m:
head = raw[: m.start()].strip()
raw = head or "Das eingestellte Kostenlimit ist erreicht; Details stehen im Chat."
lines = raw.strip().split("\n")
while lines and _voiceLineLooksLikeBillingOrMeta(lines[-1]):
lines.pop()
t = "\n".join(lines).strip()
if not t:
t = raw.strip()
# 1) Strip code blocks (multi-line first, then inline)
t = re.sub(r"```[\s\S]*?```", " ", t)
t = re.sub(r"`([^`]+)`", r"\1", t)
# 2) Drop markdown table rows (any line with two or more pipes) and the
# separator lines they come with (|---|---|). A paragraph that just
# happens to contain ONE pipe survives.
cleanedLines: List[str] = []
for ln in t.split("\n"):
stripped = ln.strip()
if stripped.count("|") >= 2:
continue
if re.fullmatch(r"\s*\|?[\s\-:|]+\|?\s*", stripped) and "-" in stripped:
continue
cleanedLines.append(ln)
t = "\n".join(cleanedLines)
# 3) Drop horizontal rule lines (---, ***, ___, with optional spaces)
t = re.sub(r"(?m)^\s*([-*_])\s*\1\s*\1[\s\1]*$", "", t)
# 4) Headings: drop the leading hashes
t = re.sub(r"(?m)^\s*#{1,6}\s+", "", t)
# 5) Bullet markers at line start — keep the content, drop the bullet
t = re.sub(r"(?m)^\s*[-*•·]\s+", "", t)
# 6) Numbered list markers at line start ("1.", "2)", "3 -")
t = re.sub(r"(?m)^\s*\d+[\.\)]\s+", "", t)
# 7) Emphasis markers (after bullets so a "**Bold:**" heading is handled)
t = re.sub(r"\*\*([^*]+)\*\*", r"\1", t)
t = re.sub(r"\*([^*\n]+)\*", r"\1", t)
t = re.sub(r"__([^_]+)__", r"\1", t)
t = re.sub(r"(?<![A-Za-z0-9])_([^_\n]+)_(?![A-Za-z0-9])", r"\1", t)
# 8) Markdown links → keep the visible label
t = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", t)
# 9) Strip emojis (purely decorative for voice)
t = _EMOJI_PATTERN.sub("", t)
# 10) Final special-character pass for TTS. Engines like Google Chirp /
# Azure Neural read special chars out loud as words ("Stern", "Klammer
# auf", "Pfeil", "Unterstrich") which makes the meeting unbearable.
# We KEEP only the canonical spoken-language punctuation that TTS
# uses for prosody: . , ? ! : ; ' - (and the Unicode ellipsis …).
#
# Everything else gets stripped or replaced with a single space.
# 10a) Decorative dashes (en-dash / em-dash) between spaces → comma
# (gives a natural prosodic pause without the TTS reading "dash").
t = re.sub(r"\s+[\u2013\u2014\u2212]\s+", ", ", t)
# In-word en/em dash collapses to a regular hyphen.
t = re.sub(r"[\u2013\u2014\u2212]", "-", t)
# 10b) Smart / typographic quotes — TTS sometimes vocalises them.
t = re.sub(r"[\u201C\u201D\u201E\u201F\u00AB\u00BB\u2039\u203A]", "", t)
t = re.sub(r"[\u2018\u2019\u201A\u201B]", "'", t)
# 10c) Arrows (U+2190..U+21FF), math symbols (U+2200..U+22FF), misc
# technical (U+2300..U+23FF) — strip outright. Also a handful of
# common Latin-1 specials (±, °, ©, ®, ™, ¶, §, ¦, ·) that TTS
# engines tend to read out as words ("plus minus", "Grad").
t = re.sub(r"[\u2190-\u23FF]+", " ", t)
t = re.sub(r"[\u00A1-\u00BF\u00D7\u00F7\u2122]+", " ", t)
# 10d) Common ASCII / typographic specials that survived earlier passes.
# `*` `#` `~` `^` `=` `+` `|` `\` `<` `>` `{` `}` `[` `]` `(` `)`
# `_` `&` `@` `$` `%` `` -- replaced with a space so word
# boundaries are preserved.
t = re.sub(r"[*#~^=+|\\<>{}\[\]()_&@$%`]+", " ", t)
# 10e) Drop ASCII double-quote (single quotes are legitimate apostrophes
# in contractions like "don't" / "geht's", so we keep U+0027).
t = t.replace('"', "")
# 10f) Slash between letters/digits — TTS reads "slash". Replace with
# " or " for readability when it separates words like "und/oder".
t = re.sub(r"(?<=\w)\s*/\s*(?=\w)", " oder ", t)
# Any remaining stray slash is just whitespace.
t = t.replace("/", " ")
# 10g) Trim multiple punctuation runs ("...!!!" → "..." / "!" / etc.)
t = re.sub(r"([\.,;:!\?])\1{1,}", r"\1", t)
# Remove orphan punctuation directly preceded by whitespace
# (common after symbol stripping: " , ", " . ").
t = re.sub(r"\s+([\.,;:!\?])", r"\1", t)
# Collapse trailing colon at end of meaningful phrase to a period for
# nicer cadence ("Was ist PowerOn:" → "Was ist PowerOn.").
t = re.sub(r":\s*$", ".", t.rstrip())
# 10h) Collapse " :" tail of MULTI-LINE blocks the same way.
t = re.sub(r"\s+:\s*$", ":", t, flags=re.MULTILINE)
# 11) Collapse whitespace to single spaces; protect sentence breaks by
# turning paragraph blanks into a period if the previous chunk
# didn't already terminate.
paragraphs = [p.strip() for p in re.split(r"\n\s*\n", t) if p.strip()]
rebuilt: List[str] = []
for p in paragraphs:
p = re.sub(r"\s+", " ", p).strip()
if not p:
continue
if not re.search(r"[\.!\?\u2026:]\s*$", p):
p = p.rstrip() + "."
rebuilt.append(p)
t = " ".join(rebuilt)
t = re.sub(r"\s+", " ", t).strip()
# If we sanitised away everything (e.g. the input was *only* a markdown
# table or a wall of pictographs) return empty — the caller (TTS / voice
# summary) treats empty as "nothing to say", which is the safe default.
# Falling back to raw markdown here would leak the very symbols we just
# spent ten passes removing.
return t
# Google Cloud TTS rejects single sentences that exceed ~5000 bytes. The Chirp3
# voices are stricter: long, comma-heavy sentences (no terminating punctuation)
# also fail with "Sentence ... is too long". We chunk well below the documented
# limit AND inject sentence terminators so the synthesizer accepts every chunk.
_TTS_MAX_CHUNK_CHARS = 800
def _splitTextForTts(text: str, maxChars: int = _TTS_MAX_CHUNK_CHARS) -> List[str]:
"""Split a long voice line into TTS-safe chunks at sentence/paragraph boundaries.
The result preserves order and contains no empty strings. A single
sentence longer than ``maxChars`` is hard-cut at word boundaries.
"""
cleaned = (text or "").strip()
if not cleaned:
return []
if len(cleaned) <= maxChars:
return [cleaned]
sentencePattern = re.compile(r"(?<=[\.!\?\u2026])\s+|\n+")
rawSentences = [s.strip() for s in sentencePattern.split(cleaned) if s and s.strip()]
if not rawSentences:
rawSentences = [cleaned]
chunks: List[str] = []
buffer = ""
for sentence in rawSentences:
if len(sentence) > maxChars:
if buffer:
chunks.append(buffer.strip())
buffer = ""
words = sentence.split(" ")
current = ""
for word in words:
candidate = (current + " " + word).strip() if current else word
if len(candidate) > maxChars and current:
chunks.append(current.strip())
current = word
else:
current = candidate
if current:
if not re.search(r"[\.!\?\u2026]\s*$", current):
current = current.rstrip() + "."
chunks.append(current.strip())
continue
candidate = (buffer + " " + sentence).strip() if buffer else sentence
if len(candidate) > maxChars and buffer:
chunks.append(buffer.strip())
buffer = sentence
else:
buffer = candidate
if buffer:
chunks.append(buffer.strip())
finalized: List[str] = []
for c in chunks:
if not c:
continue
if not re.search(r"[\.!\?\u2026]\s*$", c):
c = c.rstrip() + "."
finalized.append(c)
return finalized
async def _speakTextChunked(
    websocket: Optional[WebSocket],
    voiceInterface: Any,
    sessionId: str,
    voiceText: str,
    languageCode: str,
    voiceName: Optional[str],
    isCancelled: Optional[Callable[[], bool]] = None,
) -> Dict[str, Any]:
    """Run TTS in chunks and dispatch each ``playAudio`` over the websocket.
    Returns ``{"success": bool, "chunks": int, "played": int, "error": Optional[str], "cancelled": bool}``.
    Failure for one chunk does NOT abort the rest; partial playback still
    counts as ``success=True`` so the caller can decide whether to add a chat
    fallback for the missing parts.
    ``isCancelled`` is an optional zero-arg predicate the caller passes in to
    signal "abort the remaining chunks". It is checked BEFORE each Google
    TTS round-trip and again BEFORE each websocket send, so a stop word in
    the meeting can interrupt a multi-chunk dispatch within at most one
    chunk boundary instead of waiting for the whole answer to finish.
    """
    chunks = _splitTextForTts(voiceText)
    result: Dict[str, Any] = {"success": False, "chunks": len(chunks), "played": 0, "error": None, "cancelled": False}
    if not chunks:
        result["error"] = "no text"
        return result
    if voiceInterface is None:
        result["error"] = "no voice interface"
        return result
    # Only the LAST error is reported; earlier per-chunk errors are logged.
    lastError: Optional[str] = None
    for idx, chunk in enumerate(chunks, start=1):
        # Cancellation checkpoint #1: before the (slow) TTS round-trip.
        if isCancelled is not None and isCancelled():
            result["cancelled"] = True
            logger.info(
                f"Session {sessionId}: TTS chunk loop cancelled before chunk "
                f"{idx}/{len(chunks)} (user stop or newer answer in flight)"
            )
            break
        try:
            ttsResult = await voiceInterface.textToSpeech(
                text=chunk,
                languageCode=languageCode,
                voiceName=voiceName,
            )
        except Exception as ttsErr:  # pragma: no cover - network/runtime errors
            # TTS failures are per-chunk best-effort: log and try the next one.
            lastError = f"chunk {idx}/{len(chunks)} raised: {ttsErr}"
            logger.warning(f"Session {sessionId}: TTS {lastError}")
            continue
        # Defensive: accept only a dict result that did not flag failure.
        if not isinstance(ttsResult, dict) or ttsResult.get("success") is False:
            err = (ttsResult or {}).get("error", "unknown") if isinstance(ttsResult, dict) else "no result"
            lastError = f"chunk {idx}/{len(chunks)} failed: {err}"
            logger.warning(f"Session {sessionId}: TTS {lastError}")
            continue
        audioContent = ttsResult.get("audioContent")
        if not audioContent:
            lastError = f"chunk {idx}/{len(chunks)} returned no audioContent"
            logger.warning(f"Session {sessionId}: TTS {lastError}")
            continue
        # Unlike TTS failures, a missing/broken websocket aborts the loop —
        # no later chunk could be delivered either.
        if websocket is None:
            lastError = "websocket unavailable"
            break
        # Cancellation checkpoint #2: after synthesis, before dispatch. The
        # already-synthesised audio for this chunk is deliberately dropped.
        if isCancelled is not None and isCancelled():
            result["cancelled"] = True
            logger.info(
                f"Session {sessionId}: TTS chunk loop cancelled before "
                f"sending chunk {idx}/{len(chunks)} (audio dropped)"
            )
            break
        try:
            await websocket.send_text(json.dumps({
                "type": "playAudio",
                "sessionId": sessionId,
                "audio": {
                    # audioContent may arrive as bytes or str depending on the
                    # voice backend; normalise to base64 text either way.
                    "data": base64.b64encode(
                        audioContent if isinstance(audioContent, bytes) else audioContent.encode()
                    ).decode(),
                    "format": "mp3",
                },
            }))
            result["played"] += 1
        except Exception as wsErr:  # pragma: no cover - websocket failures
            lastError = f"chunk {idx}/{len(chunks)} websocket send failed: {wsErr}"
            logger.warning(f"Session {sessionId}: TTS {lastError}")
            break
    # Partial playback counts as success; the caller decides on fallbacks.
    result["success"] = result["played"] > 0
    if lastError:
        result["error"] = lastError
    return result
def _coercePersistedDetectedIntent(raw: Optional[str]) -> tuple:
    """Map free-form intent labels (e.g. agent:directorPrompt) to TeamsbotDetectedIntent
    for DB persistence; return (enum, meta_suffix_or_None for reasoning)."""
    if not raw or not str(raw).strip():
        return TeamsbotDetectedIntent.NONE, None
    trimmed = str(raw).strip()
    lowered = trimmed.lower()
    # Exact enum value match → persist without any meta suffix.
    for candidate in TeamsbotDetectedIntent:
        if candidate.value == lowered:
            return candidate, None
    # Agent-style labels ("agent:...") collapse to PROACTIVE, keeping the
    # original label (capped) as a reasoning suffix for the operator UI.
    if lowered.startswith("agent:"):
        return TeamsbotDetectedIntent.PROACTIVE, trimmed[:120]
    # Unknown label: persist NONE but keep the raw label for diagnostics.
    return TeamsbotDetectedIntent.NONE, trimmed[:120]
# Director prompts are PRIVATE operator instructions — they must NOT be echoed
# verbatim into the meeting. The agent is asked to start its FINAL answer with
# either ``MEETING_REPLY:`` (followed by the text actually meant for the meeting)
# or ``SILENT:`` / ``INTERNAL_ONLY:`` (followed by an internal note for the
# operator UI). Anything else → treat as silent (safe default).
_DIRECTOR_REPLY_PATTERN = re.compile(
r"^\s*(MEETING_REPLY|MEETING|REPLY|SAY|SPEAK)\s*:\s*",
re.IGNORECASE,
)
_DIRECTOR_SILENT_PATTERN = re.compile(
r"^\s*(SILENT|INTERNAL(?:_ONLY)?|NOTE|NO_MEETING_OUTPUT|ACK(?:NOWLEDGE)?)\s*:\s*",
re.IGNORECASE,
)
def _parseDirectorPromptFinal(finalText: str) -> Dict[str, Any]:
"""Parse the agent's final answer for a director prompt.
Returns ``{"kind": "meeting"|"silent", "meetingText": str, "internalNote": str}``.
Default is ``silent`` so unmarked replies are NOT broadcast into the meeting.
"""
text = (finalText or "").strip()
if not text:
return {"kind": "silent", "meetingText": "", "internalNote": ""}
meetingMatch = _DIRECTOR_REPLY_PATTERN.match(text)
if meetingMatch:
body = text[meetingMatch.end():].strip()
return {"kind": "meeting", "meetingText": body, "internalNote": ""}
silentMatch = _DIRECTOR_SILENT_PATTERN.match(text)
if silentMatch:
body = text[silentMatch.end():].strip()
return {"kind": "silent", "meetingText": "", "internalNote": body}
# No marker → safe default: do NOT spam the meeting with the agent's
# internal reasoning. Keep the full text as an internal note for the
# operator UI so nothing is lost.
return {"kind": "silent", "meetingText": "", "internalNote": text}
# =========================================================================
# Active Service Registry (sessionId -> running TeamsbotService instance)
#
# Required so HTTP endpoints (e.g. director-prompt POST) can reach the
# TeamsbotService instance currently holding the live websocket + voice
# interface for that session, without going through the websocket loop.
# =========================================================================
# Populated in handleBotWebSocket when the bot's websocket connects;
# entries are keyed by sessionId.
_activeServices: Dict[str, "TeamsbotService"] = {}
def getActiveService(sessionId: str) -> Optional["TeamsbotService"]:
    """Return the running TeamsbotService for a session, or None if not active."""
    return _activeServices.get(sessionId)
# =========================================================================
# AI Service Factory (for billing-aware AI calls)
# =========================================================================
def _createAiService(user, mandateId, featureInstanceId=None):
    """Create a properly wired AiService via the service center.

    The context binds every AI call made through the returned service to the
    teamsbot feature (and optionally a specific feature instance) so billing
    lands on the right mandate.
    """
    return _getServiceCenterService(
        "ai",
        ServiceCenterContext(
            user=user,
            mandate_id=mandateId,
            feature_instance_id=featureInstanceId,
            feature_code="teamsbot",
        ),
    )
# =========================================================================
# Session Event Queues (for SSE streaming to frontend)
# =========================================================================
_sessionEvents: Dict[str, asyncio.Queue] = {}
async def _emitSessionEvent(sessionId: str, eventType: str, data: Any):
    """Emit an event to the session's SSE stream.
    Creates the queue on-demand so events are never silently dropped."""
    queue = _sessionEvents.get(sessionId)
    if queue is None:
        queue = asyncio.Queue()
        _sessionEvents[sessionId] = queue
    await queue.put({"type": eventType, "data": data, "timestamp": getIsoTimestamp()})
def _normalizeGatewayHostForBotWs(host: str) -> str:
"""Use IPv4 loopback for local dev WebSocket URLs passed to the Node browser-bot.
Node on Windows often resolves ``localhost`` to ``::1`` first; Uvicorn bound to
``0.0.0.0`` typically accepts IPv4 only, so the bot gets ``ECONNREFUSED ::1``.
"""
h = host.strip()
lower = h.lower()
if lower == "localhost":
return "127.0.0.1"
if lower.startswith("localhost:"):
return "127.0.0.1" + h[len("localhost"):]
if lower.startswith("[::1]:"):
return "127.0.0.1" + h.partition("]")[2]
if lower in ("[::1]", "::1"):
return "127.0.0.1"
return h
class TeamsbotService:
"""
Pipeline Orchestrator for Teams Bot sessions.
Coordinates VoiceObjects (STT/TTS), AiService (SPEECH_TEAMS), and Bridge communication.
"""
    def __init__(self, currentUser: User, mandateId: str, instanceId: str, config: TeamsbotConfig):
        """Wire up the per-session pipeline state.

        Args:
            currentUser: Authenticated user the bot acts on behalf of.
            mandateId: Mandate (tenant) id used for DB access and billing.
            instanceId: Feature-instance id of this teamsbot configuration.
            config: Effective teamsbot configuration (bot name, language,
                browser-bot URL, transfer mode, ...).
        """
        self.currentUser = currentUser
        self.mandateId = mandateId
        self.instanceId = instanceId
        self.config = config
        self.browserBotConnector = BrowserBotConnector(config._getEffectiveBrowserBotUrl())
        # State
        self._lastAiCallTime: float = 0.0
        self._aiAnalysisInProgress: bool = False
        self._contextBuffer: List[Dict[str, Any]] = []
        self._sessionContext: Optional[str] = None  # User-provided background context
        self._contextSummary: Optional[str] = None  # AI-generated summary of long context
        # Differential transcript tracking
        self._lastTranscriptSpeaker: Optional[str] = None
        self._lastTranscriptText: Optional[str] = None
        self._lastTranscriptId: Optional[str] = None
        self._lastSttTime: float = 0.0
        self._lastBotResponseText: Optional[str] = None
        self._lastBotResponseTs: float = 0.0
        # Speaker attribution: simple last-caption-speaker model
        self._lastCaptionSpeaker: Optional[str] = None
        self._unattributedTranscriptIds: List[str] = []
        self._knownSpeakers: set = set()
        # Debounced name trigger: wait for speaker to finish before AI analysis
        self._pendingNameTrigger: Optional[Dict[str, Any]] = None
        self._followUpWindowEnd: float = 0.0
        # Quick-ack throttle (timestamp of the last short "Moment..." ack we
        # spoke into the meeting). Without this guard a long sentence with
        # multiple name mentions would trigger several acks in a row.
        self._lastQuickAckTs: float = 0.0
        # Session-scoped phrase pool for SHORT ephemeral utterances (quick
        # acks, "checking..." notices, per-round progress). Lazily populated
        # by the AI in the bot's configured language + persona — no hardcoded
        # strings or hardcoded language branching anywhere downstream. Keyed
        # by the kinds defined in ``_EPHEMERAL_PHRASE_INTENTS``.
        # * ``self._phrasePool[kind]`` -> list of variants for that kind
        # * ``self._phrasePoolIdx[kind]`` -> round-robin pointer
        # Concurrent generation calls for the same kind are serialised by the
        # lock so we don't spawn duplicate AI requests on a burst.
        self._phrasePool: Dict[str, List[str]] = {}
        self._phrasePoolIdx: Dict[str, int] = {}
        self._phrasePoolLock: asyncio.Lock = asyncio.Lock()
        # Voice pipeline: a single per-session lock that serialises every TTS
        # dispatch into the meeting. Without it three independent code paths
        # (SPEECH_TEAMS direct answer, agent escalation final answer, and
        # operator-driven director prompt) can all reach
        # ``websocket.send_text({"type": "playAudio", ...})`` at the same time
        # and the browser bot then plays interleaved chunks — i.e. "two bots
        # talking over each other" exactly as the operator suspects. Chat
        # (text) sends are NOT locked: they're cheap and can interleave fine.
        self._meetingTtsLock: asyncio.Lock = asyncio.Lock()
        # Generation counter incremented every time we begin producing a NEW
        # meeting answer OR every time the user issues a hard stop. Any TTS
        # chunk loop captures the counter value at start; before sending
        # each chunk to the bot it re-checks the counter and bails out if
        # it has moved on. This is what makes "Stopp" actually feel
        # instantaneous: the in-flight TTS dispatch loop drops itself the
        # moment the next chunk would have been sent, without waiting for
        # any AI round-trip or extra Google TTS call to come back.
        self._answerGenerationCounter: int = 0
        # Tracking handles for cancellable background tasks. Keeping a
        # reference lets ``_cancelInFlightSpeech`` actually call
        # ``task.cancel()`` instead of just hoping the task notices the
        # generation counter has moved on. Cleared in the task's own
        # ``finally`` block.
        self._currentEscalationTask: Optional[asyncio.Task] = None
        self._currentQuickAckTask: Optional[asyncio.Task] = None
        # Whether an agent escalation task is in flight. Kept separate from
        # ``_aiAnalysisInProgress`` (which only covers the SPEECH_TEAMS phase)
        # so a new speech trigger that arrives WHILE the agent is still
        # researching does not start a parallel SPEECH_TEAMS that would then
        # answer at the same time as the agent.
        self._agentEscalationInFlight: bool = False
        # Live transport handles for out-of-band actions (director prompts, agent escalation).
        # Set in handleBotWebSocket once the bot connects; cleared on disconnect.
        self._activeSessionId: Optional[str] = None
        self._websocket: Optional[WebSocket] = None
        self._voiceInterface = None
        # Persistent director prompts kept in memory for context injection across triggers.
        # Loaded from DB on (re)connect; mutated by submit/delete director prompt routes.
        self._activePersistentPrompts: List[Dict[str, Any]] = []
        # Recent director-prompt briefings (one-shot AND persistent) — keeps the
        # operator's attached files and the agent's internal analysis available
        # for later SPEECH_TEAMS triggers, even after a one-shot prompt has been
        # consumed. Without this pool, the bot "forgets" attached docs as soon
        # as the director prompt finished, and answers later meeting questions
        # ("summarize the doc") with general babble instead of the file content.
        # Capped by ``_RECENT_DIRECTOR_BRIEFINGS_MAX`` to bound prompt size.
        self._recentDirectorBriefings: List[Dict[str, Any]] = []
# =========================================================================
# Session Lifecycle
# =========================================================================
async def joinMeeting(
self,
sessionId: str,
meetingLink: str,
connectionId: Optional[str] = None,
gatewayBaseUrl: str = "",
botAccountEmail: Optional[str] = None,
botAccountPassword: Optional[str] = None,
):
"""Send join command to the Browser Bot service.
The browser bot will:
1. Launch browser (headful if credentials provided, headless otherwise)
2. Navigate to Teams web app
3. Authenticate if credentials provided, otherwise join as anonymous guest
4. Enable captions/audio capture and start scraping
5. Connect back via WebSocket to send transcripts
"""
from . import interfaceFeatureTeamsbot as interfaceDb
interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId)
# Initialize SSE event queue
_sessionEvents[sessionId] = asyncio.Queue()
try:
# Update status to JOINING
interface.updateSession(sessionId, {"status": TeamsbotSessionStatus.JOINING.value})
await _emitSessionEvent(sessionId, "statusChange", {"status": "joining"})
# Send join command to browser bot
session = interface.getSession(sessionId)
if not session:
raise ValueError(f"Session {sessionId} not found")
# Build the full WebSocket URL for the bot to connect back to this gateway instance
# gatewayBaseUrl is passed from the route handler (derived from request.base_url)
wsScheme = "wss" if gatewayBaseUrl.startswith("https") else "ws"
gatewayHost = gatewayBaseUrl.replace("https://", "").replace("http://", "").rstrip("/")
gatewayHost = _normalizeGatewayHostForBotWs(gatewayHost)
fullGatewayWsUrl = f"{wsScheme}://{gatewayHost}/api/teamsbot/{self.instanceId}/bot/ws/{sessionId}"
hasAuth = bool(botAccountEmail and botAccountPassword)
logger.info(f"Joining meeting for session {sessionId}: auth={hasAuth}, email={botAccountEmail or 'N/A'}, transferMode={self.config.transferMode}")
result = await self.browserBotConnector.joinMeeting(
sessionId=sessionId,
meetingUrl=meetingLink,
botName=session.get("botName", self.config.botName),
instanceId=self.instanceId,
gatewayWsUrl=fullGatewayWsUrl,
language=self.config.language,
botAccountEmail=botAccountEmail,
botAccountPassword=botAccountPassword,
transferMode=self.config.transferMode if hasattr(self.config, 'transferMode') else "auto",
debugMode=self.config.debugMode if hasattr(self.config, 'debugMode') else False,
)
if result.get("success"):
interface.updateSession(sessionId, {
"status": TeamsbotSessionStatus.JOINING.value, # Will become ACTIVE when bot connects via WS
})
logger.info(f"Browser bot deployment started for session {sessionId}")
else:
errorMsg = result.get("error", "Unknown error joining meeting")
interface.updateSession(sessionId, {
"status": TeamsbotSessionStatus.ERROR.value,
"errorMessage": errorMsg,
})
await _emitSessionEvent(sessionId, "statusChange", {"status": "error", "errorMessage": errorMsg})
logger.error(f"Failed to deploy browser bot for session {sessionId}: {errorMsg}")
except Exception as e:
logger.error(f"Error joining meeting for session {sessionId}: {e}")
interface.updateSession(sessionId, {
"status": TeamsbotSessionStatus.ERROR.value,
"errorMessage": str(e),
})
await _emitSessionEvent(sessionId, "statusChange", {"status": "error", "errorMessage": str(e)})
    async def leaveMeeting(self, sessionId: str):
        """Send leave command to the Browser Bot service.

        On success the session transitions LEAVING → ENDED and a meeting
        summary is generated in a background task; on failure the session is
        marked ERROR (with ``endedAt`` set so it is not considered live).
        The SSE event queue is removed in both cases.
        """
        from . import interfaceFeatureTeamsbot as interfaceDb
        interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId)
        try:
            interface.updateSession(sessionId, {"status": TeamsbotSessionStatus.LEAVING.value})
            await _emitSessionEvent(sessionId, "statusChange", {"status": "leaving"})
            await self.browserBotConnector.leaveMeeting(sessionId)
            interface.updateSession(sessionId, {
                "status": TeamsbotSessionStatus.ENDED.value,
                "endedAt": getIsoTimestamp(),
            })
            await _emitSessionEvent(sessionId, "statusChange", {"status": "ended"})
            # Generate meeting summary in background (fire-and-forget; must
            # not block the leave flow).
            asyncio.create_task(self._generateMeetingSummary(sessionId))
            logger.info(f"Bot left meeting for session {sessionId}")
        except Exception as e:
            logger.error(f"Error leaving meeting for session {sessionId}: {e}")
            interface.updateSession(sessionId, {
                "status": TeamsbotSessionStatus.ERROR.value,
                "errorMessage": str(e),
                "endedAt": getIsoTimestamp(),
            })
        # Cleanup event queue (runs on both the success and the error path).
        _sessionEvents.pop(sessionId, None)
# =========================================================================
# Browser Bot WebSocket Communication
# =========================================================================
async def handleBotWebSocket(self, websocket: WebSocket, sessionId: str):
    """
    Main WebSocket handler for Browser Bot communication.

    Runs for the lifetime of the bot's connection: performs one-time
    session setup (context loading, service registration, prompt restore,
    phrase-pool warmup), then loops over incoming JSON messages until the
    socket disconnects, dispatching on the message ``type`` field.

    Receives:
    - transcript: Caption text scraped from Teams
    - chatMessage: Teams chat posts (live and pre-join history)
    - status: Bot state changes (joined, in_lobby, left, error)
    - audioChunk: Raw captured meeting audio for STT
    - voiceGreeting / requestGreeting: legacy vs. gateway-generated greeting
    - ttsPlaybackAck / chatSendFailed: delivery feedback from the bot
    - mfaChallenge / mfaResolved: interactive MFA relay
    - ping: keepalive
    Sends:
    - playAudio: TTS audio for the bot to play in the meeting
    - mfaResponse / pong: control frames
    """
    from . import interfaceFeatureTeamsbot as interfaceDb
    from modules.interfaces.interfaceVoiceObjects import getVoiceInterface
    interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId)
    voiceInterface = getVoiceInterface(self.currentUser, self.mandateId)
    # Load session context (user-provided background knowledge)
    # If the context is long (>500 chars), summarize it to reduce token usage
    session = interface.getSession(sessionId)
    if session:
        rawContext = session.get("sessionContext")
        if rawContext and len(rawContext) > 500:
            logger.info(f"Session {sessionId}: Summarizing long session context ({len(rawContext)} chars)...")
            self._sessionContext = await self._summarizeSessionContext(sessionId, rawContext)
        elif rawContext:
            self._sessionContext = rawContext
        if self._sessionContext:
            logger.info(f"Session {sessionId}: Session context ready ({len(self._sessionContext)} chars)")
    # Resolve system bot email for speaker detection (prevents bot from triggering AI on own speech)
    try:
        systemBot = interface.getActiveSystemBot(self.mandateId)
        self._botAccountEmail = systemBot.get("email") if systemBot else None
        if self._botAccountEmail:
            logger.info(f"Session {sessionId}: Bot account email resolved: {self._botAccountEmail}")
    except Exception:
        # Best-effort: without the email, _isBotSpeaker falls back to the
        # configured bot name only.
        self._botAccountEmail = None
    # Register the live service so out-of-band callers (director prompts,
    # agent escalation) can deliver text/audio through this same websocket.
    self._activeSessionId = sessionId
    self._websocket = websocket
    self._voiceInterface = voiceInterface
    _activeServices[sessionId] = self
    # Notify the operator UI that the bot's WebSocket is now live so the
    # director-prompt panel can enable its submit button.
    try:
        await _emitSessionEvent(sessionId, "botConnectionState", {
            "connected": True,
            "timestamp": getIsoTimestamp(),
        })
    except Exception:
        pass
    # Restore active persistent director prompts from DB (survives reconnects).
    try:
        self._activePersistentPrompts = interface.getActivePersistentPrompts(sessionId) or []
        if self._activePersistentPrompts:
            logger.info(
                f"Session {sessionId}: Loaded {len(self._activePersistentPrompts)} active persistent director prompt(s)"
            )
    except Exception as restoreErr:
        logger.warning(f"Session {sessionId}: Could not restore persistent director prompts: {restoreErr}")
        self._activePersistentPrompts = []
    # Pre-warm the ephemeral phrase pool in the background so the first
    # quick-ack ("Moment...") and interim agent notice don't have to wait
    # for the AI round-trip. Best-effort: if generation fails, the
    # corresponding ephemeral cue is silently skipped at runtime — never
    # falls back to hardcoded language strings.
    asyncio.create_task(self._warmEphemeralPhrasePool(sessionId))
    logger.info(f"[WS] Handler started for session {sessionId}")
    try:
        msgCount = 0
        while True:
            data = await websocket.receive()
            msgCount += 1
            if "text" not in data:
                # Binary/disconnect frames carry no JSON payload — skip.
                logger.debug(f"[WS] session={sessionId} msg #{msgCount}: non-text data (keys: {list(data.keys())})")
                continue
            message = json.loads(data["text"])
            msgType = message.get("type")
            # High-frequency frame types are logged at debug level only.
            if msgType not in ("audioChunk", "ping"):
                logger.info(f"[WS] session={sessionId} msg #{msgCount}: type={msgType}")
            if msgType == "transcript":
                transcript = message.get("transcript", {})
                source = transcript.get("source", "caption")
                speaker = transcript.get("speaker", "Unknown")
                textPreview = (transcript.get("text", "") or "")[:60]
                # Caption/speakerHint: name resolution only; transcript comes from STT
                logger.info(f"[WS] Transcript (source={source}, speaker={speaker}): {textPreview}...")
                await self._processTranscript(
                    sessionId=sessionId,
                    speaker=transcript.get("speaker", "Unknown"),
                    text=transcript.get("text", ""),
                    isFinal=transcript.get("isFinal", True),
                    interface=interface,
                    voiceInterface=voiceInterface,
                    websocket=websocket,
                    source=source,
                )
            elif msgType == "chatMessage":
                chat = message.get("chat", {})
                isHistory = chat.get("isHistory", False)
                # Pre-join history is stored but never triggers AI analysis.
                source = "chatHistory" if isHistory else "chat"
                logger.info(
                    f"[WS] Chat{'[HISTORY]' if isHistory else ''}: "
                    f"speaker={chat.get('speaker')}, text={chat.get('text', '')[:60]}..."
                )
                await self._processTranscript(
                    sessionId=sessionId,
                    speaker=chat.get("speaker", "Unknown"),
                    text=chat.get("text", ""),
                    isFinal=True,
                    interface=interface,
                    voiceInterface=voiceInterface,
                    websocket=websocket,
                    source=source,
                )
            elif msgType == "status":
                status = message.get("status")
                errorMessage = message.get("message")
                logger.info(f"[WS] Status: status={status}, message={errorMessage}")
                await self._handleBotStatus(sessionId, status, errorMessage, interface)
            elif msgType == "audioChunk":
                audioData = message.get("audio", {})
                audioBase64 = audioData.get("data", "")
                sampleRate = audioData.get("sampleRate", 16000)
                captureDiagnostics = audioData.get("captureDiagnostics") or {}
                if audioBase64:
                    await self._processAudioChunk(
                        sessionId=sessionId,
                        audioBase64=audioBase64,
                        sampleRate=sampleRate,
                        captureDiagnostics=captureDiagnostics,
                        interface=interface,
                        voiceInterface=voiceInterface,
                        websocket=websocket,
                    )
            elif msgType == "voiceGreeting":
                # Legacy path: older bot images send a pre-built greeting
                # text. New bots use ``requestGreeting`` and let the
                # Gateway own greeting generation.
                greetingText = message.get("text", "")
                greetingLang = message.get("language", self.config.language)
                logger.info(
                    f"[WS] Voice greeting (legacy): text={greetingText[:60]}..., language={greetingLang}"
                )
                if greetingText and voiceInterface:
                    await self._dispatchGreetingToMeeting(
                        sessionId=sessionId,
                        greetingText=greetingText,
                        greetingLang=greetingLang,
                        sendToChat=False,
                        interface=interface,
                        voiceInterface=voiceInterface,
                        websocket=websocket,
                    )
            elif msgType == "requestGreeting":
                # New path: bot just signals "I have joined" — Gateway
                # generates the greeting text via AI in the configured
                # language + persona, then dispatches it to BOTH the
                # meeting chat (sendChatMessage command) and TTS. No
                # hardcoded language strings on the bot side.
                requestedLang = (
                    message.get("language") or self.config.language or ""
                ).strip() or "en-US"
                botNameHint = (
                    message.get("botName") or self.config.botName or ""
                ).strip() or self.config.botName
                logger.info(
                    f"[WS] Greeting request from bot: language={requestedLang}, name={botNameHint}"
                )
                if voiceInterface:
                    try:
                        greetingText = await self._generateGreetingText(
                            requestedLang
                        )
                    except Exception as genErr:
                        logger.warning(
                            f"Greeting generation failed for session {sessionId}: {genErr}"
                        )
                        greetingText = ""
                    if greetingText:
                        await self._dispatchGreetingToMeeting(
                            sessionId=sessionId,
                            greetingText=greetingText,
                            greetingLang=requestedLang,
                            sendToChat=True,
                            interface=interface,
                            voiceInterface=voiceInterface,
                            websocket=websocket,
                        )
                    else:
                        logger.warning(
                            f"Session {sessionId}: Skipping greeting — AI generation produced no text"
                        )
            elif msgType == "ping":
                await websocket.send_text(json.dumps({"type": "pong"}))
            elif msgType == "ttsPlaybackAck":
                # Delivery feedback from the bot's audio player — relayed
                # to the operator UI as a ttsDeliveryStatus event.
                playback = message.get("playback", {}) or {}
                status = playback.get("status", "unknown")
                ackMessage = playback.get("message") or "Bot playback status update"
                logger.info(
                    f"[WS] TTS playback ack: status={status}, format={playback.get('format')}, "
                    f"bytesBase64={playback.get('bytesBase64')}"
                )
                await _emitSessionEvent(sessionId, "ttsDeliveryStatus", {
                    "status": f"playback_{status}",
                    "hasWebSocket": True,
                    "message": ackMessage,
                    "timestamp": playback.get("timestamp") or getIsoTimestamp(),
                    "format": playback.get("format"),
                    "bytesBase64": playback.get("bytesBase64"),
                })
            elif msgType == "mfaChallenge":
                mfaData = message.get("mfa", {})
                mfaType = mfaData.get("type", "unknown")
                displayNumber = mfaData.get("displayNumber")
                prompt = mfaData.get("prompt", "")
                logger.info(f"[WS] MFA challenge: type={mfaType}, number={displayNumber}, prompt={prompt[:60]}")
                await _emitSessionEvent(sessionId, "mfaChallenge", {
                    "mfaType": mfaType,
                    "displayNumber": displayNumber,
                    "prompt": prompt,
                    "timestamp": getIsoTimestamp(),
                })
                # The operator answers the challenge via an HTTP route; a
                # background task waits on the shared queue and forwards
                # the answer (or a timeout) back over this websocket.
                from .routeFeatureTeamsbot import _mfaCodeQueues, _mfaWaitTasks
                mfaQueue = asyncio.Queue()
                _mfaCodeQueues[sessionId] = mfaQueue
                async def _waitAndForwardMfa(sid, queue, ws):
                    # Bridge: HTTP-submitted MFA answer -> bot websocket.
                    try:
                        mfaResponse = await asyncio.wait_for(queue.get(), timeout=120.0)
                        logger.info(f"[WS] MFA response received for session {sid}: action={mfaResponse.get('action')}")
                        await ws.send_text(json.dumps({
                            "type": "mfaResponse",
                            "sessionId": sid,
                            "mfa": mfaResponse,
                        }))
                    except asyncio.TimeoutError:
                        logger.warning(f"[WS] MFA response timeout for session {sid}")
                        await ws.send_text(json.dumps({
                            "type": "mfaResponse",
                            "sessionId": sid,
                            "mfa": {"action": "timeout"},
                        }))
                        await _emitSessionEvent(sid, "mfaChallenge", {
                            "mfaType": "timeout",
                            "prompt": "MFA-Zeitlimit ueberschritten. Bitte erneut versuchen.",
                        })
                    except asyncio.CancelledError:
                        # Cancelled when mfaResolved arrives first.
                        logger.info(f"[WS] MFA wait cancelled for session {sid} (resolved via page)")
                    finally:
                        _mfaCodeQueues.pop(sid, None)
                        _mfaWaitTasks.pop(sid, None)
                _mfaWaitTasks[sessionId] = asyncio.create_task(
                    _waitAndForwardMfa(sessionId, mfaQueue, websocket)
                )
            elif msgType == "chatSendFailed":
                errorData = message.get("error", {})
                reason = errorData.get("reason", "unknown")
                failedText = errorData.get("text", "")
                logger.warning(
                    f"[WS] Chat send failed for session {sessionId}: "
                    f"reason={reason}, text={failedText[:60]}"
                )
                await _emitSessionEvent(sessionId, "chatSendFailed", {
                    "reason": reason,
                    "message": errorData.get("message", "Chat message could not be sent"),
                    "text": failedText,
                    "timestamp": getIsoTimestamp(),
                })
            elif msgType == "mfaResolved":
                success = message.get("success", False)
                logger.info(f"[WS] MFA resolved: success={success}")
                from .routeFeatureTeamsbot import _mfaCodeQueues, _mfaWaitTasks
                # Stop the pending wait task — the bot resolved MFA on
                # the sign-in page itself.
                task = _mfaWaitTasks.pop(sessionId, None)
                if task and not task.done():
                    task.cancel()
                _mfaCodeQueues.pop(sessionId, None)
                await _emitSessionEvent(sessionId, "mfaResolved", {
                    "success": success,
                    "timestamp": getIsoTimestamp(),
                })
    except Exception as e:
        # Normal disconnects surface as exceptions too — only log real errors.
        if "disconnect" not in str(e).lower():
            logger.error(f"[WS] Error for session {sessionId}: {type(e).__name__}: {e}")
    finally:
        # Deregister only if we're still the registered live service (a
        # newer handler may have replaced us on reconnect).
        if _activeServices.get(sessionId) is self:
            _activeServices.pop(sessionId, None)
            self._websocket = None
            self._voiceInterface = None
            self._activeSessionId = None
        try:
            await _emitSessionEvent(sessionId, "botConnectionState", {
                "connected": False,
                "timestamp": getIsoTimestamp(),
            })
        except Exception:
            pass
        logger.info(f"[WS] Handler ended for session {sessionId} after {msgCount} messages")
async def _handleBotStatus(
    self,
    sessionId: str,
    status: str,
    errorMessage: Optional[str],
    interface,
):
    """Translate a browser-bot status message into session state.

    Maps the bot's raw status string onto the DB session status, stamps
    startedAt/endedAt where appropriate, persists the update, relays the
    raw status to the operator UI, and schedules summary generation once
    the session has ended.
    """
    logger.info(f"Bot status update for session {sessionId}: {status}")
    joining = TeamsbotSessionStatus.JOINING.value
    active = TeamsbotSessionStatus.ACTIVE.value
    ended = TeamsbotSessionStatus.ENDED.value
    failed = TeamsbotSessionStatus.ERROR.value
    # All pre-meeting phases collapse to JOINING; unknown statuses are
    # treated as ACTIVE.
    statusMap = {
        "connecting": joining,
        "launching": joining,
        "navigating": joining,
        "in_lobby": joining,
        "joined": active,
        "in_meeting": active,
        "left": ended,
        "error": failed,
    }
    dbStatus = statusMap.get(status, active)
    updates = {"status": dbStatus}
    if errorMessage:
        updates["errorMessage"] = errorMessage
    if dbStatus == active:
        updates["startedAt"] = getIsoTimestamp()
    elif dbStatus in (ended, failed):
        updates["endedAt"] = getIsoTimestamp()
    interface.updateSession(sessionId, updates)
    await _emitSessionEvent(sessionId, "statusChange", {"status": status, "errorMessage": errorMessage})
    # Generate summary when session ends
    if dbStatus == ended:
        asyncio.create_task(self._generateMeetingSummary(sessionId))
async def _processAudioChunk(
    self,
    sessionId: str,
    audioBase64: str,
    sampleRate: int,
    captureDiagnostics: Optional[Dict[str, Any]],
    interface,
    voiceInterface,
    websocket: WebSocket,
):
    """Process an audio chunk from WebRTC capture — run STT and feed into transcript pipeline.

    Args:
        sessionId: Teamsbot session the chunk belongs to.
        audioBase64: Base64-encoded audio payload from the browser bot.
        sampleRate: Reported sample rate; 0 (or falsy) means unknown and
            triggers backend auto-detection.
        captureDiagnostics: Optional capture metadata (trackId, readyState,
            rms, nativeSampleRate) used for logging and silence gating.
        interface: Teamsbot DB interface, forwarded to the transcript pipeline.
        voiceInterface: Voice service providing speechToText.
        websocket: Live browser-bot websocket, forwarded downstream.
    """
    # ``base64`` is imported at module level; the previous function-local
    # ``import base64`` was redundant and has been removed.
    try:
        audioBytes = base64.b64decode(audioBase64)
        # Tiny payloads cannot contain intelligible speech — skip early.
        if len(audioBytes) < 1000:
            return
        if captureDiagnostics:
            trackId = captureDiagnostics.get("trackId")
            readyState = captureDiagnostics.get("readyState")
            rms = captureDiagnostics.get("rms")
            nativeSampleRate = captureDiagnostics.get("nativeSampleRate")
            logger.debug(
                f"[AudioChunk] diagnostics: track={trackId}, readyState={readyState}, "
                f"rms={rms}, nativeRate={nativeSampleRate}, bytes={len(audioBytes)}"
            )
        # Use RMS from capture diagnostics to skip real silence.
        # Byte-variation heuristics produced false positives and dropped valid speech.
        if captureDiagnostics and captureDiagnostics.get("rms") is not None:
            try:
                rmsVal = float(captureDiagnostics.get("rms"))
                if rmsVal < 0.0003:
                    logger.debug(f"[AudioChunk] Skipping silent audio ({len(audioBytes)} bytes, rms={rmsVal:.6f})")
                    return
            except Exception:
                # Unparseable RMS: fall through and let STT decide.
                pass
        if not voiceInterface:
            logger.warning(f"[AudioChunk] No voice interface available for session {sessionId}")
            return
        # Treat sampleRate=0 as unknown (triggers auto-detection)
        effectiveSampleRate = sampleRate if sampleRate and sampleRate > 0 else None
        # Known participant names (and the bot's own name) bias STT
        # towards correct name recognition.
        phraseHints = list(self._knownSpeakers)
        if self.config.botName:
            phraseHints.append(self.config.botName)
        sttResult = await voiceInterface.speechToText(
            audioContent=audioBytes,
            language=self.config.language or "de-DE",
            sampleRate=effectiveSampleRate,
            channels=1,
            skipFallbacks=True,
            phraseHints=phraseHints if phraseHints else None,
            alternativeLanguages=["en-US"],
        )
        if sttResult and sttResult.get("success") and sttResult.get("text"):
            text = sttResult["text"].strip()
            if text:
                # Audio carries no speaker metadata — attribute via the
                # most recent caption speaker when available.
                resolvedSpeaker = self._resolveSpeakerForAudioCapture()
                fromCaption = resolvedSpeaker.get("speakerResolvedFromHint", False)
                logger.info(
                    f"[AudioChunk] STT result: speaker={resolvedSpeaker.get('speaker', 'Meeting Audio')} "
                    f"(fromCaption={fromCaption}), text={text[:80]}..."
                )
                await self._processTranscript(
                    sessionId=sessionId,
                    speaker=resolvedSpeaker["speaker"],
                    text=text,
                    isFinal=True,
                    interface=interface,
                    voiceInterface=voiceInterface,
                    websocket=websocket,
                    source="audioCapture",
                    speakerResolvedFromHint=resolvedSpeaker["speakerResolvedFromHint"],
                )
    except Exception as e:
        logger.error(f"[AudioChunk] STT error for session {sessionId}: {type(e).__name__}: {e}")
def _registerSpeakerHint(self, speaker: str, text: str, sessionId: str = ""):
    """Track current speaker from captions for STT attribution.

    When the first non-bot caption arrives, retroactively attributes
    any STT segments that were created before a speaker was known.

    Args:
        speaker: Speaker name as shown in the Teams caption.
        text: Caption text (unused here; only the speaker matters).
        sessionId: Session id, used for logging only.
    """
    if not speaker:
        return
    normalizedSpeaker = speaker.strip()
    # Never track the bot itself as "current speaker".
    if not normalizedSpeaker or self._isBotSpeaker(normalizedSpeaker):
        return
    prevSpeaker = self._lastCaptionSpeaker
    self._lastCaptionSpeaker = normalizedSpeaker
    self._knownSpeakers.add(normalizedSpeaker)
    # First-ever known speaker: retroactively attribute the "Unknown"
    # audio-capture segments recorded before any caption arrived.
    if prevSpeaker is None and self._unattributedTranscriptIds:
        from . import interfaceFeatureTeamsbot as interfaceDb
        interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId)
        for tid in self._unattributedTranscriptIds:
            interface.updateTranscript(tid, {"speaker": normalizedSpeaker})
        # Keep the in-memory context buffer consistent with the DB fix-up.
        for seg in self._contextBuffer:
            if seg.get("speaker") == "Unknown" and seg.get("source") == "audioCapture":
                seg["speaker"] = normalizedSpeaker
        if self._lastTranscriptSpeaker == "Unknown":
            self._lastTranscriptSpeaker = normalizedSpeaker
        logger.info(
            f"Session {sessionId}: Retroactive speaker attribution: "
            f"{len(self._unattributedTranscriptIds)} segments -> {normalizedSpeaker}"
        )
        self._unattributedTranscriptIds.clear()
    # A fresh caption counts as activity for the debounced name trigger.
    if self._pendingNameTrigger:
        self._pendingNameTrigger["lastActivity"] = time.time()
def _resolveSpeakerForAudioCapture(self) -> Dict[str, Any]:
"""Speaker name for audio chunks — uses the last caption speaker."""
if self._lastCaptionSpeaker:
return {"speaker": self._lastCaptionSpeaker, "speakerResolvedFromHint": True}
return {"speaker": "Unknown", "speakerResolvedFromHint": False}
async def _processTranscript(
    self,
    sessionId: str,
    speaker: str,
    text: str,
    isFinal: bool,
    interface,
    voiceInterface,
    websocket: WebSocket,
    source: str = "caption",
    speakerResolvedFromHint: Optional[bool] = None,
):
    """Process a transcript segment from captions or chat messages.

    Differential writing: When the same speaker continues (text grows
    incrementally as captions stream), we UPDATE the existing DB record
    instead of creating a cascade of near-duplicate rows. A new record
    is only created when the speaker changes or the text is not a
    continuation of the previous segment.

    Args:
        sessionId: Teamsbot session the segment belongs to.
        speaker: Display name of the speaker ("Unknown" if unresolved).
        text: Segment text; leading/trailing whitespace is stripped.
        isFinal: Whether the segment is complete (partials never trigger AI).
        interface: Teamsbot DB interface for transcript persistence.
        voiceInterface: Voice service, forwarded to downstream triggers.
        websocket: Live browser-bot websocket, forwarded downstream.
        source: Origin of the segment — "caption", "speakerHint", "chat",
            "chatHistory" or "audioCapture".
        speakerResolvedFromHint: Whether the speaker came from a caption
            hint (relayed to the UI event; defaults to False when None).
    """
    text = text.strip()
    if not text:
        return
    # Captions are used ONLY for speaker name resolution (never as transcript).
    # Transcript text comes exclusively from audio STT or chat.
    # Address detection (bot name in caption) still triggers AI analysis
    # using existing audio-based context — but caption text itself is NOT
    # added to the context buffer.
    if source in ("caption", "speakerHint"):
        self._registerSpeakerHint(speaker, text, sessionId)
        if (
            source == "speakerHint"
            and isFinal
            and not self._isBotSpeaker(speaker)
            and self.config.responseMode != TeamsbotResponseMode.TRANSCRIBE_ONLY
            and self._detectBotName(text)
        ):
            triggerTranscript = {"id": None, "speaker": speaker, "text": text, "source": source}
            isNew = self._setPendingNameTrigger(sessionId, interface, voiceInterface, websocket, triggerTranscript)
            if isNew:
                logger.info(f"Session {sessionId}: Bot name in caption, debounce trigger started")
                asyncio.create_task(self._checkPendingNameTrigger())
                # Fire a short audible "Moment..." in parallel so the
                # speaker hears the bot react immediately, instead of
                # waiting for debounce + SPEECH_TEAMS + agent (~5-30s).
                self._currentQuickAckTask = asyncio.create_task(
                    self._runQuickAck(sessionId)
                )
        return
    # Chat history: messages sent before the bot joined the meeting.
    # Stored in DB for reference but NOT added to the AI context buffer,
    # because old messages (e.g. "nyla, summarize the protocol") would
    # be treated as current requests when AI analysis is triggered.
    if source == "chatHistory":
        transcriptData = TeamsbotTranscript(
            sessionId=sessionId,
            speaker=speaker,
            text=text,
            timestamp=getIsoTimestamp(),
            confidence=1.0,
            language=self.config.language,
            isFinal=True,
            source="chatHistory",
        ).model_dump()
        createdTranscript = interface.createTranscript(transcriptData)
        await _emitSessionEvent(sessionId, "transcript", {
            "id": createdTranscript.get("id"),
            "speaker": speaker,
            "text": text,
            "confidence": 1.0,
            "timestamp": getIsoTimestamp(),
            "isContinuation": False,
            "source": "chatHistory",
            "isHistory": True,
        })
        logger.debug(f"Session {sessionId}: Chat history stored (no AI trigger): [{speaker}] {text[:60]}")
        return
    # Filter out the bot's own speech (caption/audioCapture) — garbled text
    # pollutes context. Chat from the bot is clean text and must appear in
    # the transcript for all participants.
    isBotSpeaker = self._isBotSpeaker(speaker)
    if isBotSpeaker and source != "chat":
        logger.debug(f"Session {sessionId}: Ignoring own bot caption from: [{speaker}] {text[:80]}...")
        return
    # Differential transcript writing:
    # audioCapture from same speaker → append text (merge STT chunks into one block)
    # Start a new block after a pause (>5s gap between STT results)
    sttPauseThreshold = 5.0
    isMerge = (
        source == "audioCapture"
        and self._lastTranscriptSpeaker == speaker
        and self._lastTranscriptText is not None
        and self._lastTranscriptId is not None
        and (time.time() - self._lastSttTime) < sttPauseThreshold
    )
    if isMerge:
        # Continuation: grow the existing DB row instead of creating a new one.
        mergedText = f"{self._lastTranscriptText} {text}"
        interface.updateTranscript(self._lastTranscriptId, {
            "text": mergedText,
            "isFinal": isFinal,
        })
        self._lastTranscriptText = mergedText
        createdTranscript = {"id": self._lastTranscriptId}
        # Mirror the merge into the in-memory context buffer.
        if self._contextBuffer and self._contextBuffer[-1].get("speaker") == speaker:
            self._contextBuffer[-1]["text"] = mergedText
    else:
        transcriptData = TeamsbotTranscript(
            sessionId=sessionId,
            speaker=speaker,
            text=text,
            timestamp=getIsoTimestamp(),
            confidence=1.0,
            language=self.config.language,
            isFinal=isFinal,
            source=source,
        ).model_dump()
        createdTranscript = interface.createTranscript(transcriptData)
        self._lastTranscriptSpeaker = speaker
        self._lastTranscriptText = text
        self._lastTranscriptId = createdTranscript.get("id")
        # Remember unattributed STT rows for later retroactive attribution.
        if source == "audioCapture" and speaker == "Unknown":
            self._unattributedTranscriptIds.append(createdTranscript.get("id"))
        self._contextBuffer.append({
            "speaker": speaker or "Unknown",
            "text": text,
            "timestamp": getUtcTimestamp(),
            "source": source,
        })
        # Bound the context buffer; when it overflows noticeably, kick off
        # a background summarization before trimming.
        maxSegments = self.config.contextWindowSegments
        if len(self._contextBuffer) > maxSegments:
            if not self._contextSummary and len(self._contextBuffer) > maxSegments * 1.5:
                asyncio.create_task(self._summarizeContextBuffer(sessionId))
            self._contextBuffer = self._contextBuffer[-maxSegments:]
    session = interface.getSession(sessionId)
    if session:
        count = session.get("transcriptSegmentCount", 0) + 1
        interface.updateSession(sessionId, {"transcriptSegmentCount": count})
    if source == "audioCapture":
        self._lastSttTime = time.time()
    # The UI always shows the full (possibly merged) block text.
    displayText = self._lastTranscriptText if isMerge else text
    await _emitSessionEvent(sessionId, "transcript", {
        "id": createdTranscript.get("id"),
        "speaker": speaker,
        "text": displayText,
        "confidence": 1.0,
        "timestamp": getIsoTimestamp(),
        "isContinuation": isMerge,
        "source": source,
        "speakerResolvedFromHint": (
            speakerResolvedFromHint
            if speakerResolvedFromHint is not None
            else False
        ),
    })
    if not isFinal:
        return
    if self.config.responseMode == TeamsbotResponseMode.TRANSCRIBE_ONLY:
        return
    # Bot's own chat: stored for display only, never trigger AI
    if source == "chat" and isBotSpeaker:
        return
    # Stop phrases: HARD STOP, no AI round-trip. We previously routed
    # this through ``_analyzeAndRespond`` which spent 1-2 seconds in
    # the speech LLM just to classify the intent, during which the
    # current TTS kept playing — and the LLM round-trip would also
    # produce yet another response that joined the queue. The new
    # path goes straight to the browser bot's audio cancel and
    # invalidates everything else in flight.
    if self._isStopPhrase(text):
        logger.info(
            f"Session {sessionId}: Stop phrase detected ('{text.strip()[:60]}'), "
            f"hard-cancelling in-flight speech immediately"
        )
        await self._cancelInFlightSpeech(
            sessionId=sessionId,
            websocket=websocket,
            reason="userStopPhrase",
        )
        return
    # Update activity for any pending debounced trigger
    if self._pendingNameTrigger:
        self._pendingNameTrigger["lastActivity"] = time.time()
    # Bot name detection → debounced trigger (wait for speaker to finish)
    if self._detectBotName(text):
        isNew = self._setPendingNameTrigger(sessionId, interface, voiceInterface, websocket, createdTranscript)
        if isNew:
            asyncio.create_task(self._checkPendingNameTrigger())
            # Audible early-feedback ack ("Moment...") in parallel — runs
            # while we still wait the debounce window and SPEECH_TEAMS
            # decides what to actually answer.
            self._currentQuickAckTask = asyncio.create_task(
                self._runQuickAck(sessionId)
            )
        return
    # Follow-up window: after a bot response, trigger AI for any human speech
    # without requiring the bot name — the AI decides via shouldRespond
    if (
        source == "audioCapture"
        and not self._isBotSpeaker(speaker)
        and time.time() < self._followUpWindowEnd
        and not self._pendingNameTrigger
    ):
        isNew = self._setPendingNameTrigger(sessionId, interface, voiceInterface, websocket, createdTranscript)
        if isNew:
            logger.info(f"Session {sessionId}: Follow-up window trigger (no name needed)")
            asyncio.create_task(self._checkPendingNameTrigger())
        return
    # Periodic trigger (only when no debounce pending)
    if not self._pendingNameTrigger:
        shouldTrigger = self._shouldTriggerAnalysis(text)
        if shouldTrigger:
            logger.info(f"Session {sessionId}: Periodic trigger (buffer: {len(self._contextBuffer)} segments)")
            await self._analyzeAndRespond(sessionId, interface, voiceInterface, websocket, createdTranscript)
def _isBotSpeaker(self, speaker: str) -> bool:
"""Check if a transcript speaker is the bot itself.
Teams captions show the bot as e.g. "BotName (Unverified)" or
"Nyla Larsson" depending on auth/anonymous join. We match against:
- The configured/derived bot name
- The bot account display name if authenticated
"""
if not speaker:
return False
speakerLower = speaker.lower().strip()
# Match against configured bot name
botName = self.config.botName.lower().strip()
if botName and botName in speakerLower:
return True
# Match against bot account email prefix (e.g. "nyla.larsson" from "nyla.larsson@poweron.swiss")
botAccountEmail = getattr(self, '_botAccountEmail', None) or getattr(self.config, 'botAccountEmail', None)
if botAccountEmail:
emailPrefix = botAccountEmail.split("@")[0].lower().replace(".", " ")
if emailPrefix in speakerLower:
return True
return False
def _shouldTriggerAnalysis(self, transcriptText: str, allowPeriodic: bool = True) -> bool:
"""
Decide whether to trigger AI analysis based on the latest transcript.
Bot name detection is handled separately via debounce.
This method only checks periodic/cooldown triggers.
"""
now = time.time()
timeSinceLastCall = now - self._lastAiCallTime
if timeSinceLastCall < self.config.triggerCooldownSeconds:
return False
if allowPeriodic and timeSinceLastCall >= self.config.triggerIntervalSeconds:
logger.info(f"Trigger: Periodic interval ({self.config.triggerIntervalSeconds}s) elapsed ({timeSinceLastCall:.1f}s)")
return True
return False
def _isStopPhrase(self, text: str) -> bool:
"""Check if text is an immediate-cancel command from the meeting.
Recognised intents (any language we hear in practice):
* Hard stop: stop / stopp / halt / ruhe / stille / arrete / quiet / shut
* Pause / wait: warte / wait / moment / pause / hold (hold on)
* Silence: sei still / be quiet / shut up / aufhoeren / aufhören / silence
Hits trigger the direct stop pipeline in ``_cancelInFlightSpeech``:
kill TTS, invalidate pending generations, clear name-trigger debounce.
Critically: NO new AI call is fired — the user explicitly asked the
bot to be quiet, so the worst thing we could do is generate yet
another response on top of the one we just cancelled.
"""
if not text or len(text.strip()) < 2:
return False
t = text.strip().lower()
words = [w.strip(".,!?:;\"'()[]") for w in t.split() if w.strip()]
wordSet = set(words)
stopWords = {
# Hard-stop verbs
"stop", "stopp", "halt", "ruhe", "stille", "schweig",
"arrete", "quiet", "shut", "silence",
# Pause / wait verbs (still "be quiet now" semantics)
"warte", "wait", "moment", "pause",
}
if wordSet & stopWords:
return True
if (
"sei still" in t
or "be quiet" in t
or "shut up" in t
or "hold on" in t
or "aufhoeren" in t
or "aufhören" in t
):
return True
return False
def _makeAnswerCancelHook(self) -> Callable[[], bool]:
"""Capture the current ``_answerGenerationCounter`` and return a
zero-arg predicate that returns ``True`` once a hard stop (or any
future "supersede this answer" event) has bumped the counter.
Pass the returned predicate as ``isCancelled`` into
``_speakTextChunked`` so a multi-chunk dispatch can bail out
between chunks instead of speaking a 30-second answer to the end.
"""
snapshot = self._answerGenerationCounter
return lambda: self._answerGenerationCounter != snapshot
async def _cancelInFlightSpeech(
    self,
    sessionId: str,
    websocket: Optional[WebSocket],
    reason: str,
) -> None:
    """Hard stop everything the bot is currently doing in the meeting.

    Pipeline (ALL synchronous from the caller's point of view, no AI
    round-trips):
    1. Bump ``_answerGenerationCounter`` so any in-flight TTS chunk
       loop, agent escalation or quick-ack drops its remaining work
       the moment it next checks the counter.
    2. Clear ``_pendingNameTrigger`` so a debounced "speaker just said
       the bot name" trigger that was queued before the stop word
       cannot wake up 3 seconds later and answer anyway.
    3. Cancel tracked background tasks (escalation, quick-ack). The
       tasks themselves swallow ``CancelledError`` in their finally
       block.
    4. Send ``{"type":"stopAudio"}`` to the browser bot — it stops the
       current playback in the AudioContext and clears its play queue
       so nothing buffered comes through afterwards.

    Deliberately does NOT generate a new response. The user just told
    the bot to be quiet; producing a "Okay, ich bin still" reply on
    top would be the exact opposite of what was asked for.

    Args:
        sessionId: Session whose speech is being cancelled.
        websocket: Live browser-bot websocket, or None when disconnected
            (then only local state is invalidated).
        reason: Machine-readable cause, relayed to the bot and the UI.
    """
    # Step 1: invalidate every generation-scoped piece of work.
    self._answerGenerationCounter += 1
    gen = self._answerGenerationCounter
    logger.info(
        f"Session {sessionId}: Cancelling in-flight speech "
        f"(reason={reason}, gen={gen})"
    )
    # Step 2: drop a not-yet-fired debounced name trigger.
    if self._pendingNameTrigger:
        logger.info(
            f"Session {sessionId}: Dropping pending debounced name "
            f"trigger (was queued before stop)"
        )
        self._pendingNameTrigger = None
    # Step 3: cancel tracked background tasks.
    for taskAttr in ("_currentEscalationTask", "_currentQuickAckTask"):
        task = getattr(self, taskAttr, None)
        if task is not None and not task.done():
            logger.info(
                f"Session {sessionId}: Cancelling background task "
                f"{taskAttr}"
            )
            task.cancel()
    # Step 4: tell the browser bot to kill playback + queued audio.
    if websocket is not None:
        try:
            await websocket.send_text(json.dumps({
                "type": "stopAudio",
                "sessionId": sessionId,
                "reason": reason,
            }))
        except Exception as stopErr:
            logger.warning(
                f"Session {sessionId}: Failed to send stopAudio to "
                f"browser bot: {stopErr}"
            )
    # Best-effort UI notification; never let it break the cancel path.
    try:
        await _emitSessionEvent(sessionId, "speechCancelled", {
            "reason": reason,
            "generation": gen,
            "timestamp": getIsoTimestamp(),
        })
    except Exception:
        pass
def _detectBotName(self, text: str) -> bool:
"""Check if text contains the bot's name (exact or phonetically similar)."""
botNameLower = self.config.botName.lower()
textLower = text.lower()
if botNameLower in textLower:
return True
botFirstName = botNameLower.split()[0] if " " in botNameLower else botNameLower
if len(botFirstName) >= 3:
for word in textLower.split():
cleanWord = word.strip(".,!?:;\"'()[]")
if not cleanWord or len(cleanWord) < 3:
continue
if cleanWord == botFirstName:
return True
if cleanWord[0] == botFirstName[0] and abs(len(cleanWord) - len(botFirstName)) <= 2:
common = sum(1 for c in set(botFirstName) if c in cleanWord)
similarity = common / max(len(set(botFirstName)), len(set(cleanWord)))
if similarity >= 0.6:
return True
return False
def _setPendingNameTrigger(self, sessionId, interface, voiceInterface, websocket, triggerTranscript) -> bool:
"""Set or update a debounced name trigger. Returns True if newly set."""
if self._pendingNameTrigger:
self._pendingNameTrigger["lastActivity"] = time.time()
return False
self._pendingNameTrigger = {
"sessionId": sessionId,
"interface": interface,
"voiceInterface": voiceInterface,
"websocket": websocket,
"triggerTranscript": triggerTranscript,
"detectedAt": time.time(),
"lastActivity": time.time(),
}
return True
async def _warmEphemeralPhrasePool(self, sessionId: str) -> None:
    """Fire-and-forget warmup of the ephemeral phrase pool.

    Generates the pool for every kind in ``_EPHEMERAL_PHRASE_INTENTS`` up
    front so the first quick-ack / interim notice doesn't pay the AI
    round-trip latency at runtime. All failures are logged and swallowed
    — the runtime selectors handle empty pools by silently skipping the
    cue.
    """
    try:
        for phraseKind in _EPHEMERAL_PHRASE_INTENTS:
            try:
                await self._getEphemeralPhrases(phraseKind)
            except Exception as innerErr:
                # One failed kind must not abort warmup of the others.
                logger.warning(
                    f"Session {sessionId}: Phrase pool warmup failed for "
                    f"kind={phraseKind}: {innerErr}"
                )
    except Exception as warmErr:
        logger.warning(
            f"Session {sessionId}: Phrase pool warmup task crashed: {warmErr}"
        )
# ---------------------------------------------------------------- Voice
# When the bot's full answer is a long structured chat post (markdown
# tables, bullet lists, headings, multi-paragraph) we MUST NOT read it
# out verbatim into the meeting — even after sanitisation it sounds
# like a wall of text and easily takes 5+ minutes. The chat keeps the
# full answer; the audio path goes through ``_summarizeForVoice`` which
# asks the AI for a 1-3 sentence spoken paraphrase in the configured
# bot persona / language.
# Threshold: anything longer than this many characters (after sanitise)
# OR any answer whose source contains markdown structure (tables /
# multiple bullets / multiple headings) gets condensed before TTS.
# Sanitised answers up to this many characters may be spoken verbatim.
_VOICE_DIRECT_MAX_CHARS = 600
# Target ceiling for the AI-condensed spoken paraphrase of long answers.
_VOICE_SUMMARY_MAX_CHARS = 350
@staticmethod
def _looksLikeStructuredText(raw: str) -> bool:
"""Heuristic: does the original answer have markdown structure that
would be miserable to listen to verbatim? Used to trigger the
AI summary path even when the sanitised text is short enough."""
if not raw:
return False
if raw.count("|") >= 4: # at least one markdown table row
return True
if raw.count("\n#") >= 1: # at least one heading after newline
return True
if raw.count("\n- ") + raw.count("\n* ") + raw.count("\n") >= 3:
return True # 3+ bullets → list-like
if re.search(r"\n\d+[\.\)]\s", raw): # numbered list
count = len(re.findall(r"(?m)^\s*\d+[\.\)]\s", raw))
if count >= 3:
return True
return False
async def _summarizeForVoice(
    self,
    sessionId: str,
    rawAnswer: str,
) -> str:
    """Return a SHORT, naturally-spoken paraphrase of ``rawAnswer`` for
    TTS playback.

    The chat / DB / UI keep the original ``rawAnswer`` untouched — only
    the voice channel goes through this condensation. Short, unstructured
    answers are spoken as-is with no AI round-trip. On any AI failure the
    sanitised original, truncated to ``_VOICE_DIRECT_MAX_CHARS``, is
    returned instead — this method never blocks or fails the response.

    Args:
        sessionId: Session id, used for logging only.
        rawAnswer: The full written answer (may contain markdown).

    Returns:
        Text to hand to TTS; ``""`` when ``rawAnswer`` is empty/blank.
    """
    if not rawAnswer or not rawAnswer.strip():
        return ""
    sanitised = _voiceFriendlyMeetingText(rawAnswer)
    # Short + unstructured → speak as-is, no AI round-trip
    if (
        len(sanitised) <= self._VOICE_DIRECT_MAX_CHARS
        and not self._looksLikeStructuredText(rawAnswer)
    ):
        return sanitised
    # CONSISTENCY FIX (matches _generateEphemeralPhrases): strip first,
    # then fall back — a whitespace-only configured language previously
    # produced an empty targetLang instead of the "de-DE" default.
    targetLang = (self.config.language or "").strip() or "de-DE"
    botName = (self.config.botName or "").strip() or "the assistant"
    persona = (self.config.aiSystemPrompt or "").strip()
    personaBlock = (
        f"\n\nBOT PERSONA / TONE:\n{persona}\n"
        if persona else ""
    )
    prompt = (
        f"You are condensing a long written answer into a SHORT spoken "
        f"paraphrase that the assistant '{botName}' will say out loud "
        f"into a Microsoft Teams meeting. The full written answer is "
        f"already in the meeting chat — your job is to summarise it for "
        f"the EAR, not the eye.\n\n"
        f"STRICT REQUIREMENTS:\n"
        f"1. Output language: BCP-47 '{targetLang}'. No other language.\n"
        f"2. 1 to 3 sentences, max ~{self._VOICE_SUMMARY_MAX_CHARS} characters total.\n"
        f"3. Natural spoken style — no headings, no bullet points, no "
        f"tables, no markdown, no emojis, no enumerations like 'Erstens... "
        f"Zweitens...' unless that genuinely flows in speech.\n"
        f"4. Capture the essence and the most important conclusion. Do "
        f"NOT try to fit every detail. Listeners can read the chat for "
        f"the full version.\n"
        f"5. End by gently pointing the audience to the chat for details, "
        f"e.g. 'Details stehen im Chat.' (adapted to the target language).\n"
        f"6. Output ONLY the spoken text. No JSON, no quotes around it, "
        f"no preamble like 'Here is the summary:'.\n"
        f"{personaBlock}\n"
        f"FULL WRITTEN ANSWER (markdown-formatted, sometimes long):\n"
        f"---\n{rawAnswer.strip()[:6000]}\n---\n"
    )
    try:
        aiService = _createAiService(
            self.currentUser, self.mandateId, self.instanceId
        )
        await aiService.ensureAiObjectsInitialized()
        request = AiCallRequest(
            prompt=prompt,
            context="",
            options=AiCallOptions(
                operationType=OperationTypeEnum.DATA_ANALYSE,
                priority=PriorityEnum.SPEED,
            ),
        )
        response = await aiService.callAi(request)
    except Exception as aiErr:
        logger.warning(
            f"Session {sessionId}: Voice summary AI call failed: {aiErr}"
        )
        # Fallback: truncated sanitised original — never block the answer.
        return sanitised[: self._VOICE_DIRECT_MAX_CHARS]
    if not response or response.errorCount != 0 or not response.content:
        logger.warning(
            f"Session {sessionId}: Voice summary returned empty/error"
        )
        return sanitised[: self._VOICE_DIRECT_MAX_CHARS]
    spoken = response.content.strip()
    # Defensive sanitiser pass — the model usually obeys the
    # "no markdown" instruction but not always.
    spoken = _voiceFriendlyMeetingText(spoken)
    if not spoken:
        return sanitised[: self._VOICE_DIRECT_MAX_CHARS]
    logger.info(
        f"Session {sessionId}: Voice summary generated "
        f"(orig={len(rawAnswer)} chars, sanitised={len(sanitised)}, "
        f"spoken={len(spoken)})"
    )
    return spoken
async def _pickQuickAckText(self) -> Optional[str]:
"""Return a short ack text in the bot's configured language. The
actual phrases are AI-generated once per session (cached) and rotated
round-robin so consecutive acks don't sound identical. Returns
``None`` only if AI generation completely failed and no fallback
variant could be produced — in that case the caller silently skips
the ack."""
return await self._pickEphemeralPhrase("quickAck")
async def _pickEphemeralPhrase(
self,
kind: str,
substitutions: Optional[Dict[str, Any]] = None,
) -> Optional[str]:
"""Round-robin selector over the cached phrase pool for ``kind``.
Lazily generates the pool on first use. ``substitutions`` is applied
to the chosen phrase via ``str.format(**substitutions)`` so kinds
like ``agentRound`` can render ``{round}`` / ``{maxRounds}``.
Returns ``None`` if no phrases are available."""
variants = await self._getEphemeralPhrases(kind)
if not variants:
return None
idx = self._phrasePoolIdx.get(kind, 0) % len(variants)
self._phrasePoolIdx[kind] = (idx + 1) % len(variants)
chosen = variants[idx]
if substitutions:
try:
chosen = chosen.format(**substitutions)
except (KeyError, IndexError, ValueError) as fmtErr:
# The AI didn't include the expected placeholder — return the
# raw phrase rather than crash. The user still hears something
# in the right language; only the numeric hint is missing.
logger.debug(
f"Ephemeral phrase substitution failed for kind={kind}: {fmtErr}"
)
return chosen
async def _getEphemeralPhrases(self, kind: str) -> List[str]:
"""Return the cached pool of AI-generated variants for ``kind``,
generating it on first request. Subsequent calls hit the in-memory
cache. Concurrent first-time callers are serialised by the pool lock
so only ONE AI request is fired per kind per session."""
cached = self._phrasePool.get(kind)
if cached:
return cached
async with self._phrasePoolLock:
cached = self._phrasePool.get(kind)
if cached:
return cached
phrases = await self._generateEphemeralPhrases(
kind, _EPHEMERAL_PHRASE_VARIANTS
)
if phrases:
self._phrasePool[kind] = phrases
return phrases
async def _generateEphemeralPhrases(
    self, kind: str, count: int
) -> List[str]:
    """Ask the AI to produce ``count`` short utterances for ``kind`` in
    the bot's configured language and persona. Returns ``[]`` on any
    failure — callers must treat empty as 'silently skip this ephemeral
    cue', NEVER fall back to a hardcoded localized string.

    Failure modes all return ``[]``: unknown kind, AI call exception,
    empty/error AI response, unparseable or non-list JSON output.
    """
    intent = _EPHEMERAL_PHRASE_INTENTS.get(kind)
    if not intent:
        logger.warning(f"Unknown ephemeral phrase kind requested: {kind}")
        return []
    # Empty / whitespace-only configured language falls back to English.
    targetLang = (self.config.language or "").strip() or "en-US"
    botName = (self.config.botName or "the assistant").strip()
    persona = (self.config.aiSystemPrompt or "").strip()
    # The prompt is in English on purpose — these are instructions to the
    # LLM, not user-facing text. The OUTPUT is required to be in
    # ``targetLang``. We ask for a strict JSON array so parsing is robust.
    prompt = (
        f"You are localizing short SPOKEN-LANGUAGE utterances for a "
        f"meeting assistant named '{botName}'.\n\n"
        f"Persona / style guide for the assistant:\n"
        f"{persona or '(no persona configured — use a neutral, polite, professional tone)'}\n\n"
        f"Target spoken language (BCP-47 code): {targetLang}\n\n"
        f"Utterance intent:\n{intent}\n\n"
        f"Generate {count} DIFFERENT variants matching this intent, in "
        f"the target language. Variants should feel natural when spoken "
        f"aloud, not robotic. Do NOT include the assistant's name in "
        f"the variants.\n\n"
        f"Output STRICTLY a JSON array of {count} plain-text strings, "
        f"with no markdown fences, no commentary, no surrounding "
        f"quotation marks beyond the JSON syntax itself. Example "
        f"format: [\"...\", \"...\", \"...\", \"...\"]"
    )
    try:
        aiService = _createAiService(
            self.currentUser, self.mandateId, self.instanceId
        )
        await aiService.ensureAiObjectsInitialized()
        request = AiCallRequest(
            prompt=prompt,
            context="",
            options=AiCallOptions(
                operationType=OperationTypeEnum.DATA_ANALYSE,
                priority=PriorityEnum.SPEED,
            ),
        )
        response = await aiService.callAi(request)
    except Exception as aiErr:
        logger.warning(
            f"Ephemeral phrase generation failed (kind={kind}, lang={targetLang}): {aiErr}"
        )
        return []
    if not response or response.errorCount != 0 or not response.content:
        logger.warning(
            f"Ephemeral phrase generation returned empty/error "
            f"(kind={kind}, lang={targetLang})"
        )
        return []
    raw = response.content.strip()
    # Strip optional ```json ... ``` fences before parsing.
    raw = re.sub(r"^```(?:json)?\s*", "", raw)
    raw = re.sub(r"\s*```\s*$", "", raw)
    try:
        arr = json.loads(raw)
    except json.JSONDecodeError as parseErr:
        logger.warning(
            f"Ephemeral phrase generation: could not parse JSON "
            f"(kind={kind}, lang={targetLang}): {parseErr} "
            f"raw={raw[:200]}"
        )
        return []
    if not isinstance(arr, list):
        return []
    # Keep only non-empty string entries, capped at the requested count.
    cleaned = [
        str(v).strip()
        for v in arr
        if isinstance(v, str) and str(v).strip()
    ]
    cleaned = cleaned[:count]
    if cleaned:
        logger.info(
            f"Ephemeral phrase pool generated (kind={kind}, "
            f"lang={targetLang}, count={len(cleaned)})"
        )
    return cleaned
def _shouldFireQuickAck(self) -> bool:
    """Single gate for every quick-ack call site.

    The ack fires only when: the throttle interval has elapsed, no real
    answer or agent escalation is currently being produced, the voice
    channel is active, and the response mode actually allows autonomous
    speaking.
    """
    if (time.time() - self._lastQuickAckTs) < _QUICK_ACK_MIN_INTERVAL_SEC:
        return False
    # If we are already producing a real response, the ack would step on
    # the actual answer's TTS — skip it. Same for an in-flight agent
    # escalation: the agent will deliver its own answer (and we already
    # spoke an interim "moment please" when it started).
    if self._aiAnalysisInProgress or self._agentEscalationInFlight:
        return False
    # Voice channel must be active. Chat-only mode would just spam "...".
    rawChannel = self.config.responseChannel
    channelName = (
        rawChannel.value if hasattr(rawChannel, "value") else str(rawChannel)
    ).lower().strip()
    if channelName not in ("voice", "both"):
        return False
    # Manual / transcribe-only sessions never speak on their own.
    silentModes = (
        TeamsbotResponseMode.MANUAL,
        TeamsbotResponseMode.TRANSCRIBE_ONLY,
    )
    return self.config.responseMode not in silentModes
async def _runQuickAck(self, sessionId: str) -> None:
    """Background task: speak the short ack into the meeting via TTS.
    Designed to be fired as ``asyncio.create_task(self._runQuickAck(...))``
    the moment the bot's name is detected — does not block the regular
    debounced analysis pipeline. Persists nothing to the DB and emits no
    botResponse event; this is purely an audio cue ("Moment...") so the
    speaker hears within ~1s that the bot is reacting.

    All failures are swallowed (logged at info/warning) — a missed ack
    must never affect the main response pipeline.
    """
    websocket = self._websocket
    voiceInterface = self._voiceInterface
    # No bot websocket / voice interface (e.g. fallback mode) → no audio path.
    if websocket is None or voiceInterface is None:
        return
    if not self._shouldFireQuickAck():
        return
    ackText = await self._pickQuickAckText()
    if not ackText:
        # Phrase pool empty (AI generation failed) — silently skip the cue.
        return
    # Mark the throttle BEFORE TTS so two near-simultaneous detections
    # don't both fire (TTS dispatch can take a few hundred ms).
    self._lastQuickAckTs = time.time()
    try:
        await _emitSessionEvent(sessionId, "quickAck", {
            "text": ackText,
            "timestamp": getIsoTimestamp(),
        })
        cancelHook = self._makeAnswerCancelHook()
        # Serialise with any other meeting TTS so audio never overlaps.
        async with self._meetingTtsLock:
            outcome = await _speakTextChunked(
                websocket=websocket,
                voiceInterface=voiceInterface,
                sessionId=sessionId,
                voiceText=ackText,
                languageCode=self.config.language,
                voiceName=self.config.voiceId,
                isCancelled=cancelHook,
            )
        if not outcome.get("success"):
            logger.info(
                f"Session {sessionId}: Quick ack TTS failed silently "
                f"({outcome.get('error')}) — main response will still go through"
            )
    except asyncio.CancelledError:
        logger.info(f"Session {sessionId}: Quick ack cancelled by stop signal")
    except Exception as ackErr:
        logger.warning(f"Session {sessionId}: Quick ack failed: {ackErr}")
    finally:
        # Clear the task handle regardless of outcome.
        self._currentQuickAckTask = None
async def _checkPendingNameTrigger(self, delaySec: float = 3.0):
"""Async loop: fire the pending name trigger once the speaker is quiet."""
await asyncio.sleep(delaySec)
if not self._pendingNameTrigger:
return
now = time.time()
lastActivity = self._pendingNameTrigger.get("lastActivity", 0)
detectedAt = self._pendingNameTrigger.get("detectedAt", 0)
quietSec = now - lastActivity
totalWaitSec = now - detectedAt
if quietSec >= 3.0 or totalWaitSec >= 15.0:
trigger = self._pendingNameTrigger
self._pendingNameTrigger = None
logger.info(
f"Session {trigger['sessionId']}: Debounced name trigger fires "
f"(quiet={quietSec:.1f}s, totalWait={totalWaitSec:.1f}s)"
)
await self._analyzeAndRespond(
trigger["sessionId"],
trigger["interface"],
trigger["voiceInterface"],
trigger["websocket"],
trigger["triggerTranscript"],
)
else:
remaining = max(0.5, 3.0 - quietSec)
asyncio.create_task(self._checkPendingNameTrigger(remaining))
async def _analyzeAndRespond(
    self,
    sessionId: str,
    interface,
    voiceInterface,
    websocket: WebSocket,
    triggerTranscript: Dict[str, Any],
):
    """Run SPEECH_TEAMS AI analysis and respond if needed.

    Pipeline: build transcript context from the rolling buffer →
    SPEECH_TEAMS call → then one of: (a) hand off to the meeting agent
    when the AI flags ``needsAgent``, (b) stop audio on a STOP intent,
    or (c) deliver the response over the voice and/or chat channels,
    persist transcript + bot response, and emit SSE events.

    Args:
        sessionId: Active teamsbot session id.
        interface: DB interface (transcripts / bot responses / session).
        voiceInterface: TTS interface passed to the speak helper.
        websocket: Browser-bot websocket; may be None in fallback mode.
        triggerTranscript: The transcript dict that triggered this run.
    """
    # Re-entrancy guard: only one SPEECH_TEAMS analysis at a time.
    if self._aiAnalysisInProgress:
        logger.info(f"Session {sessionId}: AI analysis already in progress, skipping duplicate trigger")
        return
    # An agent escalation from a previous trigger may still be researching
    # (it lives in its own task, ``_aiAnalysisInProgress`` was already
    # released when SPEECH_TEAMS returned). If we let a fresh SPEECH_TEAMS
    # run now, both pipelines would race to the meeting voice channel and
    # the operator would hear "two bots talking". Skip until the agent
    # finishes; the speaker can re-trigger by saying the bot name again
    # if they have a new question.
    if self._agentEscalationInFlight:
        logger.info(
            f"Session {sessionId}: Agent escalation still in flight — "
            f"skipping new SPEECH_TEAMS trigger to prevent overlapping replies"
        )
        return
    self._aiAnalysisInProgress = True
    self._lastAiCallTime = time.time()
    # Build transcript context from buffer.
    # Mark bot's own utterances and chat messages for the AI.
    contextLines = []
    for segment in self._contextBuffer:
        speaker = segment.get("speaker", "Unknown")
        text = segment.get("text", "")
        segSource = segment.get("source", "caption")
        prefix = "Chat" if segSource == "chat" else ""
        if self._isBotSpeaker(speaker):
            contextLines.append(f"[YOU ({self.config.botName})]: {text}")
        elif prefix:
            contextLines.append(f"[{prefix}: {speaker}]: {text}")
        else:
            contextLines.append(f"[{speaker}]: {text}")
    # Include session context if provided by the user at session start
    sessionContextStr = ""
    if self._sessionContext:
        sessionContextStr = f"\nSESSION_CONTEXT (background knowledge provided by the user):\n{self._sessionContext}\n"
    # Include summary of earlier conversation if available
    summaryStr = ""
    if self._contextSummary:
        summaryStr = f"\nEARLIER_CONVERSATION_SUMMARY:\n{self._contextSummary}\n"
    # Persistent director prompts: private operator instructions that stay
    # in effect across triggers (e.g. "respond in English", "always be brief").
    directorStr = self._buildPersistentDirectorContext()
    transcriptContext = f"BOT_NAME:{self.config.botName}{sessionContextStr}{summaryStr}{directorStr}\nRECENT_TRANSCRIPT:\n" + "\n".join(contextLines)
    # Call SPEECH_TEAMS
    try:
        aiService = _createAiService(self.currentUser, self.mandateId, self.instanceId)
        await aiService.ensureAiObjectsInitialized()
        request = AiCallRequest(
            prompt=self.config.aiSystemPrompt,
            context=transcriptContext,
            options=AiCallOptions(
                operationType=OperationTypeEnum.SPEECH_TEAMS,
                priority=PriorityEnum.SPEED,
            )
        )
        response = await aiService.callAi(request)
        # Parse structured response
        try:
            speechResult = SpeechTeamsResponse.model_validate_json(response.content)
        except Exception:
            # Try to extract JSON from response content (the model may
            # wrap the payload in markdown code fences).
            try:
                jsonStr = response.content
                if "```json" in jsonStr:
                    jsonStr = jsonStr.split("```json")[1].split("```")[0]
                elif "```" in jsonStr:
                    jsonStr = jsonStr.split("```")[1].split("```")[0]
                speechResult = SpeechTeamsResponse.model_validate_json(jsonStr.strip())
            except Exception as parseErr:
                # Unparseable → treat as "no response" rather than crash.
                logger.warning(f"Failed to parse SPEECH_TEAMS response: {parseErr}")
                speechResult = SpeechTeamsResponse(
                    shouldRespond=False,
                    reasoning=f"Parse error: {str(parseErr)[:100]}",
                    detectedIntent="none"
                )
        logger.info(
            f"SPEECH_TEAMS result: shouldRespond={speechResult.shouldRespond}, "
            f"intent={speechResult.detectedIntent}, "
            f"reasoning={speechResult.reasoning[:80]}..."
        )
        # Emit analysis event (always, for debug/UI)
        await _emitSessionEvent(sessionId, "analysis", {
            "shouldRespond": speechResult.shouldRespond,
            "detectedIntent": speechResult.detectedIntent,
            "reasoning": speechResult.reasoning,
            "modelName": response.modelName,
            "processingTime": response.processingTime,
            "priceCHF": response.priceCHF,
            "needsAgent": speechResult.needsAgent,
            "agentReason": speechResult.agentReason,
        })
        # Hybrid routing: SPEECH_TEAMS detected a complex request that
        # requires the full agent (web research, mail, multi-step). Hand
        # off to the agent path; do NOT speak the SPEECH_TEAMS placeholder.
        if speechResult.needsAgent:
            # Director prompts (persistent + recent one-shot) have already
            # delivered files to the operator. The escalation agent MUST see
            # them — otherwise it answers "summarize the doc" with general
            # babble because the SPEECH_TEAMS prompt itself never had file
            # access. We also forward the prior agent analysis so the
            # escalation can build on, not duplicate, the earlier work.
            briefings = self._collectActiveDirectorBriefings()
            briefingFileIds = self._collectDirectorFileIds()
            briefingBlock = ""
            if briefings:
                parts = []
                for b in briefings:
                    seg = f"- ({b.get('mode')}) {b.get('text', '')}".rstrip()
                    if b.get("fileIds"):
                        seg += f"\n attachedFileIds: {', '.join(b['fileIds'])}"
                    if b.get("note"):
                        note = b["note"]
                        seg += (
                            "\n priorAgentAnalysis: "
                            + (note if len(note) <= 800 else note[:800] + "...")
                        )
                    parts.append(seg)
                briefingBlock = (
                    "\n\nACTIVE_OPERATOR_BRIEFINGS (private; you may read the "
                    "attached files via summarizeContent / readFile / "
                    "readContentObjects to answer the user precisely; do NOT "
                    "quote the directive text itself):\n" + "\n".join(parts)
                )
            logger.info(
                f"Session {sessionId}: SPEECH_TEAMS escalates to agent. "
                f"Reason: {speechResult.agentReason or speechResult.reasoning} | "
                f"briefings={len(briefings)}, fileIds={len(briefingFileIds)}"
            )
            taskBrief = (
                (speechResult.agentReason
                 or speechResult.responseText
                 or "Verarbeite die juengste Sprecheranfrage und antworte ins Meeting.")
                + briefingBlock
            )
            # Mark escalation as in-flight BEFORE we create the task so the
            # ``_aiAnalysisInProgress=False`` released in our finally block
            # cannot let a competing speech trigger sneak past the gate
            # before the agent task has even been scheduled.
            self._agentEscalationInFlight = True
            self._currentEscalationTask = asyncio.create_task(
                self._runEscalationAndRelease(
                    sessionId=sessionId,
                    taskBrief=taskBrief,
                    briefingFileIds=briefingFileIds,
                    triggerTranscriptId=triggerTranscript.get("id"),
                )
            )
            return
        # Step 4a: Handle STOP intent -- stop audio immediately
        if speechResult.detectedIntent == "stop":
            logger.info(f"Session {sessionId}: AI detected STOP intent: {speechResult.reasoning}")
            if websocket:
                try:
                    await websocket.send_text(json.dumps({
                        "type": "stopAudio",
                        "sessionId": sessionId,
                    }))
                except Exception as stopErr:
                    logger.warning(f"Failed to send stop command: {stopErr}")
            return
        # Step 4b: Respond if AI decided to
        if speechResult.shouldRespond and speechResult.responseText:
            if self.config.responseMode == TeamsbotResponseMode.MANUAL:
                # In manual mode, suggest but don't send
                await _emitSessionEvent(sessionId, "suggestedResponse", {
                    "responseText": speechResult.responseText,
                    "detectedIntent": speechResult.detectedIntent,
                    "reasoning": speechResult.reasoning,
                })
                return
            # Determine response channel: per-request (AI) overrides config
            channels = speechResult.responseChannels
            if channels and isinstance(channels, list):
                channelStr = ",".join(str(c).lower().strip() for c in channels)
                sendVoice = "voice" in channelStr
                sendChat = "chat" in channelStr
                logger.info(f"Response channel (from AI): voice={sendVoice}, chat={sendChat}")
            else:
                channelRaw = self.config.responseChannel
                channelStr = (channelRaw.value if hasattr(channelRaw, 'value') else str(channelRaw)).lower().strip()
                sendVoice = channelStr in ("voice", "both")
                sendChat = channelStr in ("chat", "both")
                logger.info(f"Response channel (from config): '{channelStr}'")
            if sendVoice and sendChat:
                responseType = TeamsbotResponseType.BOTH
            elif sendVoice:
                responseType = TeamsbotResponseType.AUDIO
            else:
                responseType = TeamsbotResponseType.CHAT
            # Suppress duplicate responses in short windows ("repeat loop" protection).
            canonicalText = (
                speechResult.responseText
                or speechResult.responseTextForVoice
                or speechResult.responseTextForChat
                or ""
            )
            normalizedResponse = (canonicalText or "").strip().lower()
            nowTs = time.time()
            if (
                normalizedResponse
                and self._lastBotResponseText == normalizedResponse
                and (nowTs - self._lastBotResponseTs) < 90
            ):
                logger.info(f"Session {sessionId}: Suppressing duplicate bot response within 90s window")
                await _emitSessionEvent(sessionId, "analysis", {
                    "shouldRespond": False,
                    "detectedIntent": speechResult.detectedIntent,
                    "reasoning": "Suppressed duplicate response within 90s",
                    "modelName": response.modelName,
                    "processingTime": response.processingTime,
                    "priceCHF": response.priceCHF,
                })
                return
            # Resolve text per channel (AI can send different content to voice vs chat)
            textForVoice = speechResult.responseTextForVoice or speechResult.responseText
            textForChat = speechResult.responseTextForChat or speechResult.responseText
            storedText = textForChat or textForVoice or speechResult.responseText
            # 4a: Voice response (TTS -> Audio to bot, chunked for long replies)
            if sendVoice and textForVoice:
                await _emitSessionEvent(sessionId, "ttsDeliveryStatus", {
                    "status": "requested",
                    "hasWebSocket": websocket is not None,
                    "message": "TTS generation requested",
                    "timestamp": getIsoTimestamp(),
                })
                logger.info(
                    f"Session {sessionId}: TTS requested (websocket_available={websocket is not None})"
                )
                if not websocket:
                    logger.warning(
                        f"Session {sessionId}: TTS skipped (bot websocket unavailable, likely fallback mode)"
                    )
                    await _emitSessionEvent(sessionId, "ttsDeliveryStatus", {
                        "status": "unavailable",
                        "hasWebSocket": False,
                        "message": "TTS skipped — bot websocket unavailable",
                        "timestamp": getIsoTimestamp(),
                    })
                    # No audio path at all → make sure the answer still
                    # reaches the meeting via chat.
                    if not sendChat:
                        sendChat = True
                else:
                    # Long / structured answers → AI condenses for ear; chat keeps full text.
                    spokenText = await self._summarizeForVoice(sessionId, textForVoice)
                    cancelHook = self._makeAnswerCancelHook()
                    async with self._meetingTtsLock:
                        ttsOutcome = await _speakTextChunked(
                            websocket=websocket,
                            voiceInterface=voiceInterface,
                            sessionId=sessionId,
                            voiceText=spokenText,
                            languageCode=self.config.language,
                            voiceName=self.config.voiceId,
                            isCancelled=cancelHook,
                        )
                    if ttsOutcome.get("success"):
                        logger.info(
                            f"Session {sessionId}: TTS audio dispatched to bot "
                            f"(chunks={ttsOutcome.get('chunks')}, played={ttsOutcome.get('played')})"
                        )
                        await _emitSessionEvent(sessionId, "ttsDeliveryStatus", {
                            "status": "dispatched",
                            "hasWebSocket": True,
                            "chunks": ttsOutcome.get("chunks"),
                            "played": ttsOutcome.get("played"),
                            "timestamp": getIsoTimestamp(),
                        })
                    else:
                        logger.warning(
                            f"TTS failed for session {sessionId}: {ttsOutcome.get('error')}"
                        )
                        await _emitSessionEvent(sessionId, "ttsDeliveryStatus", {
                            "status": "failed",
                            "hasWebSocket": True,
                            "chunks": ttsOutcome.get("chunks"),
                            "played": ttsOutcome.get("played"),
                            "message": ttsOutcome.get("error"),
                            "timestamp": getIsoTimestamp(),
                        })
                        if not sendChat:
                            sendChat = True  # Fallback to chat if voice-only and TTS failed
            # 4b: Chat response (send text message to meeting chat)
            if sendChat and textForChat:
                try:
                    if websocket:
                        await websocket.send_text(json.dumps({
                            "type": "sendChatMessage",
                            "sessionId": sessionId,
                            "text": textForChat,
                        }))
                        logger.info(f"Chat response sent for session {sessionId}")
                except Exception as chatErr:
                    logger.warning(f"Chat message send failed for session {sessionId}: {chatErr}")
            # 4b: Store bot response
            botResponseData = TeamsbotBotResponse(
                sessionId=sessionId,
                responseText=storedText,
                responseType=responseType,
                detectedIntent=speechResult.detectedIntent,
                reasoning=speechResult.reasoning,
                triggeredByTranscriptId=triggerTranscript.get("id"),
                modelName=response.modelName,
                processingTime=response.processingTime,
                priceCHF=response.priceCHF,
                timestamp=getIsoTimestamp(),
            ).model_dump()
            createdResponse = interface.createBotResponse(botResponseData)
            # 4c: Emit SSE event
            await _emitSessionEvent(sessionId, "botResponse", {
                "id": createdResponse.get("id"),
                "responseText": storedText,
                "responseType": responseType.value,
                "detectedIntent": speechResult.detectedIntent,
                "reasoning": speechResult.reasoning,
                "modelName": response.modelName,
                "processingTime": response.processingTime,
                "priceCHF": response.priceCHF,
                "timestamp": botResponseData.get("timestamp"),
            })
            # Update session response count
            session = interface.getSession(sessionId)
            if session:
                count = session.get("botResponseCount", 0) + 1
                interface.updateSession(sessionId, {"botResponseCount": count})
            # Remember what we said for the 90s duplicate-suppression window.
            self._lastBotResponseText = normalizedResponse
            self._lastBotResponseTs = nowTs
            # Record bot response in transcript (exactly once, regardless of channel)
            botTranscriptData = TeamsbotTranscript(
                sessionId=sessionId,
                speaker=self.config.botName,
                text=storedText,
                timestamp=getIsoTimestamp(),
                confidence=1.0,
                language=self.config.language,
                isFinal=True,
            ).model_dump()
            botTranscript = interface.createTranscript(botTranscriptData)
            self._contextBuffer.append({
                "speaker": self.config.botName,
                "text": storedText,
                "timestamp": getUtcTimestamp(),
                "source": "botResponse",
            })
            await _emitSessionEvent(sessionId, "transcript", {
                "id": botTranscript.get("id"),
                "speaker": self.config.botName,
                "text": storedText,
                "confidence": 1.0,
                "timestamp": getIsoTimestamp(),
                "isContinuation": False,
                "source": "botResponse",
                "speakerResolvedFromHint": False,
            })
            # Reset differential writing tracker so next STT creates a new block
            self._lastTranscriptSpeaker = self.config.botName
            self._lastTranscriptText = storedText
            self._lastTranscriptId = botTranscript.get("id")
            self._followUpWindowEnd = time.time() + 15.0
            logger.info(f"Bot responded in session {sessionId}: intent={speechResult.detectedIntent}, follow-up window open for 15s")
        # Step 5: Execute AI-issued commands (if any)
        if speechResult.commands:
            await self._executeCommands(sessionId, speechResult.commands, voiceInterface, websocket)
            # When AI used only commands (no responseText), emit botResponse SSE
            # so the UI shows the response. Extract text from sendChat commands.
            if speechResult.shouldRespond and not speechResult.responseText:
                cmdTexts = [
                    c.params.get("text", "") for c in speechResult.commands
                    if c.action == "sendChat" and c.params and c.params.get("text")
                ]
                combinedText = " ".join(cmdTexts) if cmdTexts else None
                if combinedText:
                    botResponseData = TeamsbotBotResponse(
                        sessionId=sessionId,
                        responseText=combinedText,
                        responseType=TeamsbotResponseType.CHAT,
                        detectedIntent=speechResult.detectedIntent,
                        reasoning=speechResult.reasoning,
                        triggeredByTranscriptId=triggerTranscript.get("id"),
                        modelName=response.modelName,
                        processingTime=response.processingTime,
                        priceCHF=response.priceCHF,
                        timestamp=getIsoTimestamp(),
                    ).model_dump()
                    createdResponse = interface.createBotResponse(botResponseData)
                    await _emitSessionEvent(sessionId, "botResponse", {
                        "id": createdResponse.get("id"),
                        "responseText": combinedText,
                        "responseType": TeamsbotResponseType.CHAT.value,
                        "detectedIntent": speechResult.detectedIntent,
                        "reasoning": speechResult.reasoning,
                        "modelName": response.modelName,
                        "processingTime": response.processingTime,
                        "priceCHF": response.priceCHF,
                        "timestamp": botResponseData.get("timestamp"),
                    })
                    session = interface.getSession(sessionId)
                    if session:
                        count = session.get("botResponseCount", 0) + 1
                        interface.updateSession(sessionId, {"botResponseCount": count})
                    self._followUpWindowEnd = time.time() + 15.0
                    logger.info(
                        f"Bot responded via commands in session {sessionId}: "
                        f"intent={speechResult.detectedIntent}, follow-up window open for 15s"
                    )
    except Exception as e:
        logger.error(f"SPEECH_TEAMS analysis failed for session {sessionId}: {type(e).__name__}: {e}", exc_info=True)
        await _emitSessionEvent(sessionId, "error", {"message": f"AI analysis failed: {type(e).__name__}: {str(e)}"})
    finally:
        # Always release the re-entrancy gate, even on escalation hand-off.
        self._aiAnalysisInProgress = False
async def _runEscalationAndRelease(
self,
sessionId: str,
taskBrief: str,
briefingFileIds: List[str],
triggerTranscriptId: Optional[str],
) -> None:
"""Background wrapper for ``_runAgentForMeeting`` that holds the
``_agentEscalationInFlight`` flag for the entire duration of the agent
run — not just for the moment we schedule the task. Without this
wrapper, ``_aiAnalysisInProgress`` would already be ``False`` while
the agent is still researching, and a fresh SPEECH_TEAMS trigger from
a new utterance would race the agent to the voice channel."""
try:
await self._runAgentForMeeting(
sessionId=sessionId,
taskText=taskBrief,
fileIds=briefingFileIds,
sourceLabel="speechEscalation",
triggerTranscriptId=triggerTranscriptId,
)
except asyncio.CancelledError:
logger.info(
f"Session {sessionId}: Escalation agent task cancelled by stop signal"
)
except Exception as escErr:
logger.error(
f"Session {sessionId}: Escalation agent task failed: "
f"{type(escErr).__name__}: {escErr}",
exc_info=True,
)
finally:
self._agentEscalationInFlight = False
self._currentEscalationTask = None
# =========================================================================
# AI Command Execution
# =========================================================================
async def _executeCommands(
    self,
    sessionId: str,
    commands: List[TeamsbotCommand],
    voiceInterface,
    websocket: WebSocket,
):
    """Dispatch each AI-issued structured command to its handler.

    Unknown actions are logged and skipped; a failing handler is logged
    and never aborts the remaining commands.
    """
    for cmd in commands:
        action = cmd.action
        params = cmd.params or {}
        logger.info(f"Session {sessionId}: Executing command '{action}' with params {params}")
        # Dispatch table rebuilt per iteration so the lambdas capture the
        # current command's action/params without late-binding surprises.
        handlers = {
            "toggleTranscript": lambda: self._cmdToggleTranscript(sessionId, params, websocket),
            "toggleChat": lambda: self._cmdToggleChat(sessionId, params, websocket),
            "sendChat": lambda: self._cmdSendChat(sessionId, params, websocket),
            "readChat": lambda: self._cmdReadChat(sessionId, params, voiceInterface, websocket),
            "readAloud": lambda: self._cmdReadAloud(sessionId, params, voiceInterface, websocket),
            "changeLanguage": lambda: self._cmdChangeLanguage(sessionId, params),
            "toggleMic": lambda: self._cmdToggleMicOrCamera(sessionId, action, params, websocket),
            "toggleCamera": lambda: self._cmdToggleMicOrCamera(sessionId, action, params, websocket),
            "sendMail": lambda: self._cmdSendMail(sessionId, params),
            "storeDocument": lambda: self._cmdStoreDocument(sessionId, params),
        }
        try:
            handler = handlers.get(action)
            if handler is None:
                logger.warning(f"Session {sessionId}: Unknown command '{action}'")
            else:
                await handler()
        except Exception as cmdErr:
            logger.warning(f"Session {sessionId}: Command '{action}' failed: {cmdErr}")
async def _cmdToggleTranscript(self, sessionId: str, params: dict, websocket: WebSocket):
"""Caption on/off - toggle Teams live transcript capture."""
enable = params.get("enable", True)
if websocket:
await websocket.send_text(json.dumps({
"type": "botCommand",
"sessionId": sessionId,
"command": "toggleTranscript",
"params": {"enable": enable},
}))
async def _cmdToggleChat(self, sessionId: str, params: dict, websocket: WebSocket):
"""Chat on/off - enable/disable meeting chat monitoring."""
enable = params.get("enable", True)
if websocket:
await websocket.send_text(json.dumps({
"type": "botCommand",
"sessionId": sessionId,
"command": "toggleChat",
"params": {"enable": enable},
}))
async def _cmdSendChat(self, sessionId: str, params: dict, websocket: WebSocket):
    """Post a message into the meeting chat and mirror it into the
    transcript store, the rolling context buffer and the SSE stream."""
    messageText = params.get("text", "")
    if not messageText:
        return
    if websocket:
        await websocket.send_text(json.dumps({
            "type": "sendChatMessage",
            "sessionId": sessionId,
            "text": messageText,
        }))
        logger.info(f"Chat command sent for session {sessionId}")
    from . import interfaceFeatureTeamsbot as interfaceDb
    interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId)
    entry = TeamsbotTranscript(
        sessionId=sessionId,
        speaker=self.config.botName,
        text=messageText,
        timestamp=getIsoTimestamp(),
        confidence=1.0,
        language=self.config.language,
        isFinal=True,
        source="chat",
    ).model_dump()
    createdTranscript = interface.createTranscript(entry)
    self._contextBuffer.append({
        "speaker": self.config.botName,
        "text": messageText,
        "timestamp": getUtcTimestamp(),
        "source": "chat",
    })
    # Keep the differential-writing trackers and the duplicate-response
    # suppression fields in sync with what was just posted.
    self._lastTranscriptSpeaker = self.config.botName
    self._lastTranscriptText = messageText
    self._lastTranscriptId = createdTranscript.get("id")
    self._lastBotResponseText = messageText.strip().lower()
    self._lastBotResponseTs = time.time()
    await _emitSessionEvent(sessionId, "transcript", {
        "id": createdTranscript.get("id"),
        "speaker": self.config.botName,
        "text": messageText,
        "confidence": 1.0,
        "timestamp": getIsoTimestamp(),
        "isContinuation": False,
        "source": "chat",
        "speakerResolvedFromHint": False,
    })
async def _cmdReadChat(
self,
sessionId: str,
params: dict,
voiceInterface,
websocket: WebSocket,
):
"""Read chat messages (from DB) with optional fromdatetime/todatetime, then speak or send to chat."""
from . import interfaceFeatureTeamsbot as interfaceDb
interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId)
transcripts = interface.getTranscripts(sessionId)
fromDt = params.get("fromdatetime") or params.get("fromDateTime")
toDt = params.get("todatetime") or params.get("toDateTime")
chatOnly = [t for t in transcripts if t.get("source") in ("chat", "chatHistory")]
if fromDt:
chatOnly = [t for t in chatOnly if (t.get("timestamp") or "") >= fromDt]
if toDt:
chatOnly = [t for t in chatOnly if (t.get("timestamp") or "") <= toDt]
summary = "\n".join(f"[{t.get('speaker', '?')}]: {t.get('text', '')}" for t in chatOnly[-20:])
if not summary:
summary = "Keine Chat-Nachrichten im angegebenen Zeitraum."
if voiceInterface and websocket:
spokenSummary = await self._summarizeForVoice(sessionId, summary[:2000])
cancelHook = self._makeAnswerCancelHook()
async with self._meetingTtsLock:
await _speakTextChunked(
websocket=websocket,
voiceInterface=voiceInterface,
sessionId=sessionId,
voiceText=spokenSummary,
languageCode=self.config.language,
voiceName=self.config.voiceId,
isCancelled=cancelHook,
)
async def _cmdReadAloud(
self,
sessionId: str,
params: dict,
voiceInterface,
websocket: WebSocket,
):
"""Read text aloud via TTS and play in meeting."""
readText = params.get("text", "")
if readText and voiceInterface and websocket:
cancelHook = self._makeAnswerCancelHook()
async with self._meetingTtsLock:
await _speakTextChunked(
websocket=websocket,
voiceInterface=voiceInterface,
sessionId=sessionId,
voiceText=_voiceFriendlyMeetingText(readText),
languageCode=self.config.language,
voiceName=self.config.voiceId,
isCancelled=cancelHook,
)
async def _cmdChangeLanguage(self, sessionId: str, params: dict):
"""Change bot language."""
newLang = params.get("language", "")
if newLang:
self.config = self.config.model_copy(update={"language": newLang})
logger.info(f"Session {sessionId}: Language changed to '{newLang}'")
await _emitSessionEvent(sessionId, "languageChanged", {"language": newLang})
async def _cmdToggleMicOrCamera(
self,
sessionId: str,
action: str,
params: dict,
websocket: WebSocket,
):
"""Toggle mic or camera in the meeting."""
if websocket:
await websocket.send_text(json.dumps({
"type": "botCommand",
"sessionId": sessionId,
"command": action,
"params": params,
}))
async def _cmdSendMail(self, sessionId: str, params: dict):
"""Send email via Service Center MessagingService."""
recipient = params.get("recipient") or params.get("to", "")
subject = params.get("subject", "")
message = params.get("message") or params.get("body", "")
if not recipient or not subject:
logger.warning(f"Session {sessionId}: sendMail requires recipient and subject")
return
try:
from modules.serviceCenter import ServiceCenterContext, getService
ctx = ServiceCenterContext(
user=self.currentUser,
mandate_id=self.mandateId,
feature_instance_id=self.instanceId,
)
messaging = getService("messaging", ctx)
success = messaging.sendEmailDirect(
recipient=recipient,
subject=subject,
message=message,
userId=str(self.currentUser.id) if self.currentUser else None,
)
if success:
logger.info(f"Session {sessionId}: Email sent to {recipient}")
else:
logger.warning(f"Session {sessionId}: Email send failed for {recipient}")
except Exception as e:
logger.warning(f"Session {sessionId}: sendMail failed: {e}")
async def _cmdStoreDocument(self, sessionId: str, params: dict):
"""Store document via Service Center SharepointService."""
sitePath = params.get("sitePath") or params.get("site", "")
folderPath = params.get("folderPath") or params.get("folder", "")
fileName = params.get("fileName", "document.txt")
content = params.get("content", "")
if isinstance(content, str):
content = content.encode("utf-8")
if not sitePath or not folderPath:
logger.warning(f"Session {sessionId}: storeDocument requires sitePath and folderPath")
return
try:
from modules.serviceCenter import ServiceCenterContext, getService
ctx = ServiceCenterContext(
user=self.currentUser,
mandate_id=self.mandateId,
feature_instance_id=self.instanceId,
)
sharepoint = getService("sharepoint", ctx)
if not sharepoint.setAccessTokenFromConnection(self.currentUser):
logger.warning(f"Session {sessionId}: SharePoint connection not configured")
return
site = await sharepoint.getSiteByStandardPath(sitePath)
if not site:
logger.warning(f"Session {sessionId}: SharePoint site not found: {sitePath}")
return
result = await sharepoint.uploadFile(
siteId=site["id"],
folderPath=folderPath,
fileName=fileName,
content=content,
)
if "error" in result:
logger.warning(f"Session {sessionId}: storeDocument failed: {result['error']}")
else:
logger.info(f"Session {sessionId}: Document stored: {fileName}")
except Exception as e:
logger.warning(f"Session {sessionId}: storeDocument failed: {e}")
# =========================================================================
# Director Prompts (private operator instructions during a live meeting)
# =========================================================================
def _collectActiveDirectorBriefings(self) -> List[Dict[str, Any]]:
"""Return the deduplicated list of director-prompt briefings that are
currently relevant for the meeting context: every active persistent
prompt PLUS every recent one-shot prompt that still sits in the
``_recentDirectorBriefings`` pool. Each entry carries ``text``,
``fileIds`` (UDB attachments), ``mode``, ``promptId`` and ``note``
(the agent's internal analysis from the SILENT director run, if any).
"""
seen: Dict[str, Dict[str, Any]] = {}
for p in self._activePersistentPrompts:
pid = p.get("id") or ""
seen[pid] = {
"promptId": pid,
"mode": p.get("mode") or "persistent",
"text": (p.get("text") or "").strip(),
"fileIds": list(p.get("fileIds") or []),
"note": (p.get("responseText") or "").strip(),
}
for b in self._recentDirectorBriefings:
pid = b.get("promptId") or ""
if pid in seen:
# Refresh note with the latest analysis if the persistent run
# produced one after the prompt was first loaded from DB.
if b.get("note"):
seen[pid]["note"] = b["note"]
continue
seen[pid] = {
"promptId": pid,
"mode": b.get("mode") or "oneShot",
"text": (b.get("text") or "").strip(),
"fileIds": list(b.get("fileIds") or []),
"note": (b.get("note") or "").strip(),
}
return [v for v in seen.values() if v.get("text") or v.get("fileIds")]
def _collectDirectorFileIds(self) -> List[str]:
"""Flat, deduplicated list of UDB file IDs attached to any currently
relevant director prompt (persistent + recent one-shot). Used when
SPEECH_TEAMS escalates to the agent so the agent can actually READ the
documents the operator already provided."""
out: List[str] = []
seen: set = set()
for b in self._collectActiveDirectorBriefings():
for fid in b.get("fileIds") or []:
if fid and fid not in seen:
seen.add(fid)
out.append(fid)
return out
def _buildPersistentDirectorContext(self) -> str:
"""Render active director-prompt briefings as private operator guidance
for the SPEECH_TEAMS system prompt context block.
Surfaces three things SPEECH_TEAMS otherwise misses:
* the operator's directive text (as before)
* the IDs of any UDB files the operator attached — so SPEECH_TEAMS
knows the documents exist and can decide to escalate to the agent,
which has the tooling to read them.
* the agent's previous internal analysis of the prompt (the SILENT
``MEETING_REPLY/SILENT`` decision's note), so SPEECH_TEAMS can answer
short questions without re-running the agent.
"""
briefings = self._collectActiveDirectorBriefings()
if not briefings:
return ""
lines: List[str] = []
for b in briefings:
entry = f"- ({b.get('mode', 'persistent')}) {b.get('text', '')}".rstrip()
fileIds = b.get("fileIds") or []
if fileIds:
entry += (
"\n ATTACHED_FILES (operator-provided documents — the AGENT "
"has tools to read them via summarizeContent / readFile / "
"readContentObjects): "
+ ", ".join(fileIds)
)
note = b.get("note")
if note:
noteShort = note if len(note) <= 600 else note[:600] + "..."
entry += f"\n AGENT_ANALYSIS (already computed by the bot): {noteShort}"
lines.append(entry)
return (
"\nOPERATOR_DIRECTIVES (private; never quote them verbatim, just follow them. "
"If the user asks about an attached document, use AGENT_ANALYSIS first; "
"if more depth is needed, set needsAgent=true so the agent can re-read the file):\n"
+ "\n".join(lines)
+ "\n"
)
def _recordDirectorBriefing(
self,
prompt: Dict[str, Any],
internalNote: str,
meetingText: str,
) -> None:
"""Append a director-prompt briefing to the session-scoped pool so the
attached files and the agent's analysis stay available for subsequent
SPEECH_TEAMS triggers — even after a one-shot prompt was consumed.
Idempotent per ``promptId`` (latest entry wins)."""
pid = prompt.get("id") or ""
# Drop any older entry for the same prompt so we keep the freshest note.
self._recentDirectorBriefings = [
b for b in self._recentDirectorBriefings if b.get("promptId") != pid
]
self._recentDirectorBriefings.append({
"promptId": pid,
"mode": prompt.get("mode") or "oneShot",
"text": (prompt.get("text") or "").strip(),
"fileIds": list(prompt.get("fileIds") or []),
"note": (internalNote or meetingText or "").strip(),
"recordedAt": getIsoTimestamp(),
})
if len(self._recentDirectorBriefings) > _RECENT_DIRECTOR_BRIEFINGS_MAX:
self._recentDirectorBriefings = self._recentDirectorBriefings[
-_RECENT_DIRECTOR_BRIEFINGS_MAX:
]
    async def submitDirectorPrompt(
        self,
        sessionId: str,
        operatorUserId: str,
        text: str,
        mode: TeamsbotDirectorPromptMode,
        fileIds: List[str],
    ) -> Dict[str, Any]:
        """Persist a new director prompt and trigger immediate agent processing.

        Args:
            sessionId: Meeting session the prompt belongs to.
            operatorUserId: Operator who issued the private directive.
            text: Directive text (never shown to meeting participants).
            mode: One-shot vs persistent handling.
            fileIds: Optional UDB file attachments for the agent to read.

        Returns the created prompt record. Processing happens asynchronously
        and emits SSE events ('directorPrompt') for the operator UI.
        """
        from . import interfaceFeatureTeamsbot as interfaceDb
        interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId)
        promptData = TeamsbotDirectorPrompt(
            sessionId=sessionId,
            instanceId=self.instanceId,
            operatorUserId=operatorUserId,
            text=text,
            mode=mode,
            fileIds=fileIds or [],
            status=TeamsbotDirectorPromptStatus.QUEUED,
        ).model_dump()
        created = interface.createDirectorPrompt(promptData)
        # Persistent prompts join in-memory directives immediately so they
        # also influence subsequent SPEECH_TEAMS triggers, not only the
        # one-shot agent run we kick off below.
        if mode == TeamsbotDirectorPromptMode.PERSISTENT:
            self._activePersistentPrompts.append(created)
        await _emitSessionEvent(sessionId, "directorPrompt", {
            "id": created.get("id"),
            "status": created.get("status"),
            "mode": created.get("mode"),
            "text": created.get("text"),
            "fileIds": created.get("fileIds", []),
            "createdAt": created.get("createdAt"),
        })
        # Fire-and-forget: the agent run reports its own progress via SSE.
        asyncio.create_task(self._processDirectorPrompt(created))
        return created
async def removePersistentPrompt(self, promptId: str) -> bool:
"""Remove a persistent director prompt (operator clicked 'remove')."""
from . import interfaceFeatureTeamsbot as interfaceDb
interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId)
sessionId = self._activeSessionId
prompt = interface.getDirectorPrompt(promptId)
if not prompt:
return False
interface.updateDirectorPrompt(promptId, {
"status": TeamsbotDirectorPromptStatus.CONSUMED.value,
"consumedAt": getIsoTimestamp(),
"statusMessage": "Removed by operator",
})
self._activePersistentPrompts = [
p for p in self._activePersistentPrompts if p.get("id") != promptId
]
# Also drop the briefing copy so SPEECH_TEAMS forgets the doc reference
# immediately; otherwise the bot would keep "remembering" a doc the
# operator just retired.
self._recentDirectorBriefings = [
b for b in self._recentDirectorBriefings if b.get("promptId") != promptId
]
if sessionId:
await _emitSessionEvent(sessionId, "directorPrompt", {
"id": promptId,
"status": TeamsbotDirectorPromptStatus.CONSUMED.value,
"mode": prompt.get("mode"),
"text": prompt.get("text"),
"removed": True,
})
return True
    async def _processDirectorPrompt(self, prompt: Dict[str, Any]) -> None:
        """Run the agent for a director prompt and deliver the FINAL text into
        the meeting via TTS + chat (using the bot's existing channels).

        Flow: mark RUNNING -> build a (German) task brief surfacing the
        directive, attached file IDs, recent transcript and the
        MEETING_REPLY/SILENT answer protocol -> run the agent in
        directorPromptMode (silent by default) -> persist SUCCEEDED (or
        CONSUMED for one-shot prompts) / FAILED, emitting SSE throughout.
        """
        from . import interfaceFeatureTeamsbot as interfaceDb
        sessionId = prompt.get("sessionId")
        promptId = prompt.get("id")
        interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId)
        interface.updateDirectorPrompt(promptId, {
            "status": TeamsbotDirectorPromptStatus.RUNNING.value,
        })
        await _emitSessionEvent(sessionId, "directorPrompt", {
            "id": promptId,
            "status": TeamsbotDirectorPromptStatus.RUNNING.value,
        })
        # Build a task brief for the agent that surfaces the meeting context.
        recentTranscript = self._renderRecentTranscriptForAgent(maxLines=20)
        directorText = (prompt.get("text") or "").strip()
        attachedFileIds = list(prompt.get("fileIds") or [])
        promptMode = (prompt.get("mode") or "").lower()
        isPersistentPrompt = promptMode == TeamsbotDirectorPromptMode.PERSISTENT.value.lower()
        # Make file attachment EXPLICIT in the brief. The agent service already
        # prepends a "## Attached Files & Folders" header via _enrichPromptWithFiles
        # when fileIds are passed, but without an explicit instruction the agent
        # sometimes goes straight to a generic answer. We force the workflow:
        # studyDocs -> form briefing -> decide MEETING_REPLY vs SILENT.
        filesBlock = ""
        if attachedFileIds:
            filesBlock = (
                "\nANGEHAENGTE DOKUMENTE (UDB-File-IDs): "
                + ", ".join(attachedFileIds)
                + "\nDu MUSST diese Dokumente VOR der finalen Antwort lesen / zusammenfassen "
                "(z.B. summarizeContent, readFile, readContentObjects, describeImage). "
                "Beziehe Fakten und Zitate aus den Dokumenten in deine Notiz / dein "
                "Meeting-Reply ein, statt allgemein zu antworten.\n"
            )
        # Persistent prompts that ship documents are usually a "knowledge briefing"
        # the operator wants the bot to STUDY now and USE LATER. The SILENT note
        # in that case must be a useful, file-grounded summary that subsequent
        # SPEECH_TEAMS triggers can pick up — not "noted".
        persistentNoteHint = ""
        if isPersistentPrompt and attachedFileIds:
            persistentNoteHint = (
                "\nSPEZIAL fuer PERSISTENT + Dokumente: Wenn die Anweisung KEIN explizites "
                "Meeting-Statement verlangt, antworte mit 'SILENT:' und liefere als interne "
                "Notiz eine STRUKTURIERTE, faktendichte Briefing-Zusammenfassung der Dokumente "
                "(Stichpunkte, Kennzahlen, Aussagen, die fuer Folgefragen im Meeting relevant "
                "sein koennen). Diese Notiz wird spaeteren Meeting-Antworten als Wissensbasis "
                "vorgelegt — schreibe sie also so, dass der Bot daraus zitieren kann.\n"
            )
        # NOTE: the brief below is intentionally German (operator-facing
        # deployment language); it defines the MEETING_REPLY/SILENT marker
        # protocol that _parseDirectorPromptFinal relies on.
        taskText = (
            f"Du bist der KI-Assistent in einem laufenden Teams-Meeting (Bot-Name: {self.config.botName}).\n"
            f"Der Operator hat dir folgende PRIVATE Regieanweisung gegeben (die anderen Teilnehmer im "
            f"Meeting sehen sie NICHT). Sie ist KEINE Frage an das Meeting, sondern eine interne "
            f"Anweisung an dich:\n\n"
            f"{directorText}\n"
            f"{filesBlock}"
            f"{persistentNoteHint}\n"
            f"AKTUELLER MEETING-KONTEXT (juengste Aussagen):\n{recentTranscript}\n\n"
            "ANTWORT-PROTOKOLL — Beginne deine FINALE Antwort mit GENAU EINEM dieser Marker:\n"
            "'MEETING_REPLY:' gefolgt vom Text, der im Meeting gesprochen / in den Meeting-Chat "
            "gepostet werden soll. Verwende diesen Marker NUR, wenn die Regieanweisung dich explizit "
            "auffordert, jetzt etwas im Meeting zu sagen oder zu schreiben (Beispiele: 'stell dich vor', "
            "'fasse zusammen', 'stelle Person X eine Frage', 'beantworte die letzte Frage'). Halte den "
            "Text kurz, sprachlich passend zur Stimme und ohne Marker oder Meta-Kommentare.\n"
            "'SILENT:' gefolgt von einer internen Notiz fuer das Operator-UI. "
            "Verwende diesen Marker fuer interne Direktiven und Wissens-Briefings (Beispiele: "
            "'achte ab jetzt auf X', 'merke dir Y', 'studiere Dokument Z'). "
            "Dieser Text wird NICHT ins Meeting gegeben, dient aber spaeteren Meeting-Antworten "
            "als Wissensbasis. Wenn Dokumente angehaengt sind, MUSS die Notiz konkrete, "
            "zitierfaehige Fakten aus den Dokumenten enthalten.\n\n"
            "Standard ist SILENT, wenn nicht eindeutig zur Meeting-Interaktion aufgefordert wurde. "
            "Wiederhole NIEMALS die Regieanweisung selbst im MEETING_REPLY-Text."
        )
        try:
            finalText = await self._runAgentForMeeting(
                sessionId=sessionId,
                taskText=taskText,
                fileIds=attachedFileIds,
                sourceLabel="directorPrompt",
                triggerTranscriptId=None,
                promptId=promptId,
                directorPromptMode=True,
            )
            # One-shot: mark consumed; persistent: keep active but record success.
            isPersistent = prompt.get("mode") == TeamsbotDirectorPromptMode.PERSISTENT.value
            updates: Dict[str, Any] = {
                "status": TeamsbotDirectorPromptStatus.SUCCEEDED.value,
                "responseText": finalText or "",
            }
            if not isPersistent:
                updates["status"] = TeamsbotDirectorPromptStatus.CONSUMED.value
                updates["consumedAt"] = getIsoTimestamp()
            interface.updateDirectorPrompt(promptId, updates)
            await _emitSessionEvent(sessionId, "directorPrompt", {
                "id": promptId,
                "status": updates["status"],
                "responseText": finalText,
            })
        except Exception as e:
            logger.error(
                f"Session {sessionId}: Director prompt {promptId} failed: {type(e).__name__}: {e}",
                exc_info=True,
            )
            interface.updateDirectorPrompt(promptId, {
                "status": TeamsbotDirectorPromptStatus.FAILED.value,
                "statusMessage": f"{type(e).__name__}: {str(e)[:300]}",
            })
            await _emitSessionEvent(sessionId, "directorPrompt", {
                "id": promptId,
                "status": TeamsbotDirectorPromptStatus.FAILED.value,
                "error": f"{type(e).__name__}: {str(e)[:300]}",
            })
            # A FAILED prompt is retired from the in-memory directive list so
            # it no longer steers subsequent SPEECH_TEAMS triggers.
            self._activePersistentPrompts = [
                p for p in self._activePersistentPrompts if p.get("id") != promptId
            ]
def _renderRecentTranscriptForAgent(self, maxLines: int = 20) -> str:
"""Render the most recent context buffer entries for inclusion in the
agent task brief (similar to SPEECH_TEAMS context, but plain text)."""
if not self._contextBuffer:
return "(noch keine Aussagen erfasst)"
recent = self._contextBuffer[-maxLines:]
lines = []
for seg in recent:
speaker = seg.get("speaker", "Unknown")
text = seg.get("text", "")
segSource = seg.get("source", "caption")
prefix = "Chat: " if segSource == "chat" else ""
if self._isBotSpeaker(speaker):
lines.append(f"[YOU ({self.config.botName})]: {text}")
else:
lines.append(f"[{prefix}{speaker}]: {text}")
return "\n".join(lines)
async def _interimAgentBusyMessage(self) -> Optional[str]:
"""Short spoken/chat line before a potentially long agent run (web,
tools). Phrasing is AI-localised to ``self.config.language`` and
cached per session — no hardcoded language branching. Returns
``None`` if generation failed; caller must treat that as
'silently skip the interim notice'."""
return await self._pickEphemeralPhrase("agentBusy")
async def _interimAgentRoundMessage(
self, roundNum: int, maxRounds: int
) -> Optional[str]:
"""Per-round progress notice for long agent runs (meeting voice /
chat, ephemeral). Phrasing is AI-localised once per session;
``{round}`` and ``{maxRounds}`` placeholders are substituted at
render time. Returns ``None`` if generation failed."""
return await self._pickEphemeralPhrase(
"agentRound",
substitutions={"round": roundNum, "maxRounds": maxRounds},
)
    async def _notifyMeetingEphemeral(self, sessionId: str, text: str) -> None:
        """Deliver a short line to the meeting (TTS + chat per config) without
        persisting botResponses/transcripts, so the main agent answer stays the
        single recorded follow-up.

        Channel selection follows ``self.config.responseChannel``
        ('voice'/'chat'/'both'); a failed TTS dispatch falls back to chat.
        Always emits an 'agentRun'/'interimNotice' SSE event at the end.
        """
        websocket = self._websocket
        voiceInterface = self._voiceInterface
        if not websocket:
            logger.warning(f"Session {sessionId}: Interim notice skipped — no WebSocket")
            return
        # responseChannel may be an enum member or a plain string — normalize.
        channelRaw = self.config.responseChannel
        channelStr = (
            channelRaw.value if hasattr(channelRaw, "value") else str(channelRaw)
        ).lower().strip()
        sendVoice = channelStr in ("voice", "both")
        sendChat = channelStr in ("chat", "both")
        if sendVoice and voiceInterface:
            cancelHook = self._makeAnswerCancelHook()
            # Serialize with other meeting TTS so audio chunks don't interleave.
            async with self._meetingTtsLock:
                outcome = await _speakTextChunked(
                    websocket=websocket,
                    voiceInterface=voiceInterface,
                    sessionId=sessionId,
                    voiceText=_voiceFriendlyMeetingText(text),
                    languageCode=self.config.language,
                    voiceName=self.config.voiceId,
                    isCancelled=cancelHook,
                )
            if not outcome.get("success"):
                logger.warning(
                    f"Session {sessionId}: Interim TTS failed ({outcome.get('error')}) — falling back to chat"
                )
                # Force the chat fallback even for voice-only channel config.
                if not sendChat:
                    sendChat = True
        if sendChat:
            try:
                await websocket.send_text(json.dumps({
                    "type": "sendChatMessage",
                    "sessionId": sessionId,
                    "text": text,
                }))
            except Exception as chatErr:
                logger.warning(f"Session {sessionId}: Interim chat failed: {chatErr}")
        await _emitSessionEvent(sessionId, "agentRun", {
            "status": "interimNotice",
            "message": text,
            "timestamp": getIsoTimestamp(),
        })
    async def _runAgentForMeeting(
        self,
        sessionId: str,
        taskText: str,
        fileIds: List[str],
        sourceLabel: str,
        triggerTranscriptId: Optional[str] = None,
        promptId: Optional[str] = None,
        directorPromptMode: bool = False,
    ) -> str:
        """Run agentService.runAgent for a meeting context, deliver the FINAL
        text via the bot's existing TTS + chat channels, and return that text.

        sourceLabel is used for logging and SSE differentiation
        ('directorPrompt' or 'speechEscalation').

        ``directorPromptMode`` activates the silent-by-default protocol for
        operator director prompts: interim notices are suppressed, no per-round
        meeting updates, and the FINAL text is parsed for an explicit
        ``MEETING_REPLY:`` / ``SILENT:`` marker. Only ``MEETING_REPLY`` content
        is dispatched to the meeting; everything else stays internal.
        """
        from modules.serviceCenter.services.serviceAgent.datamodelAgent import (
            AgentConfig, AgentEventTypeEnum
        )
        ctx = ServiceCenterContext(
            user=self.currentUser,
            mandate_id=self.mandateId,
            feature_instance_id=self.instanceId,
            feature_code="teamsbot",
        )
        agentService = _getServiceCenterService("agent", ctx)
        # Workflow id stable per session so RAG/round-memory accumulate per meeting.
        workflowId = f"teamsbot:{sessionId}"
        agentConfig = AgentConfig(
            maxRounds=TEAMSBOT_AGENT_MAX_ROUNDS,
            maxCostCHF=TEAMSBOT_AGENT_MAX_COST_CHF,
            toolSet="core",
            initialToolboxes=["core", "web"],
            excludeActionTools=True,
        )
        await _emitSessionEvent(sessionId, "agentRun", {
            "source": sourceLabel,
            "promptId": promptId,
            "status": "started",
            "timestamp": getIsoTimestamp(),
        })
        # Director prompts run silently by default — no spontaneous "moment please"
        # in the meeting just because the operator gave an internal directive.
        if not directorPromptMode:
            try:
                interimText = await self._interimAgentBusyMessage()
                if interimText:
                    await self._notifyMeetingEphemeral(sessionId, interimText)
            except Exception as interimErr:
                logger.warning(f"Session {sessionId}: Interim agent notice failed: {interimErr}")
        finalText: str = ""
        rounds = 0
        try:
            # Consume the agent's event stream: progress -> SSE (+ optional
            # meeting notice), tool calls -> SSE, FINAL -> capture text,
            # ERROR -> raise so the caller records a failure.
            async for event in agentService.runAgent(
                prompt=taskText,
                fileIds=fileIds or None,
                config=agentConfig,
                toolSet="core",
                workflowId=workflowId,
            ):
                if event.type == AgentEventTypeEnum.AGENT_PROGRESS:
                    rounds += 1
                    pdata = event.data or {}
                    roundNum = int(pdata.get("round", rounds))
                    maxR = int(pdata.get("maxRounds", TEAMSBOT_AGENT_MAX_ROUNDS))
                    await _emitSessionEvent(sessionId, "agentRun", {
                        "source": sourceLabel,
                        "promptId": promptId,
                        "status": "progress",
                        "round": roundNum,
                        "maxRounds": maxR,
                    })
                    # Round 1 is already covered by the generic start notice;
                    # from round 2 on, report progress into the meeting.
                    # Director prompts stay silent — no interim meeting updates.
                    if roundNum >= 2 and not directorPromptMode:
                        try:
                            roundText = await self._interimAgentRoundMessage(roundNum, maxR)
                            if roundText:
                                await self._notifyMeetingEphemeral(sessionId, roundText)
                        except Exception as roundNoticeErr:
                            logger.warning(
                                f"Session {sessionId}: Per-round agent notice failed: {roundNoticeErr}"
                            )
                elif event.type == AgentEventTypeEnum.TOOL_CALL:
                    toolName = (event.data or {}).get("toolName") if event.data else None
                    await _emitSessionEvent(sessionId, "agentRun", {
                        "source": sourceLabel,
                        "promptId": promptId,
                        "status": "toolCall",
                        "toolName": toolName,
                    })
                elif event.type == AgentEventTypeEnum.FINAL:
                    finalText = (event.content or "").strip()
                elif event.type == AgentEventTypeEnum.ERROR:
                    raise RuntimeError(event.content or "Agent error")
        except Exception as runErr:
            await _emitSessionEvent(sessionId, "agentRun", {
                "source": sourceLabel,
                "promptId": promptId,
                "status": "error",
                "error": str(runErr)[:500],
            })
            raise
        await _emitSessionEvent(sessionId, "agentRun", {
            "source": sourceLabel,
            "promptId": promptId,
            "status": "completed",
            "rounds": rounds,
            "hasText": bool(finalText),
        })
        if finalText:
            if directorPromptMode:
                # Silent-by-default protocol: parse the MEETING_REPLY/SILENT
                # marker decision out of the agent's final answer.
                decision = _parseDirectorPromptFinal(finalText)
                kind = decision.get("kind", "silent")
                meetingText = (decision.get("meetingText") or "").strip()
                internalNote = (decision.get("internalNote") or "").strip()
                logger.info(
                    f"Session {sessionId}: Director prompt {promptId} -> kind={kind}, "
                    f"meetingChars={len(meetingText)}, noteChars={len(internalNote)}"
                )
                await _emitSessionEvent(sessionId, "directorPrompt", {
                    "id": promptId,
                    "status": "decision",
                    "decision": kind,
                    "meetingText": meetingText,
                    "internalNote": internalNote,
                })
                # Record this prompt as a session-scoped briefing BEFORE we hand
                # delivery off. This is what later SPEECH_TEAMS triggers see, so
                # if the user attached a doc with mode=PERSISTENT and the agent
                # produced a file-grounded SILENT note, that note (and the
                # original fileIds) stays available for "summarize the doc"
                # follow-up questions in the meeting.
                try:
                    promptRecord: Dict[str, Any] = {}
                    if promptId:
                        try:
                            from . import interfaceFeatureTeamsbot as _ifaceDb
                            _iface = _ifaceDb.getInterface(
                                self.currentUser, self.mandateId, self.instanceId
                            )
                            promptRecord = _iface.getDirectorPrompt(promptId) or {}
                        except Exception as _lookupErr:
                            logger.debug(
                                f"Briefing pool: could not look up prompt {promptId}: {_lookupErr}"
                            )
                    if promptRecord or promptId:
                        self._recordDirectorBriefing(
                            prompt=promptRecord or {"id": promptId},
                            internalNote=internalNote,
                            meetingText=meetingText,
                        )
                except Exception as briefErr:
                    logger.warning(
                        f"Session {sessionId}: Director briefing pool update failed: {briefErr}"
                    )
                # If this was a persistent prompt, the live in-memory copy in
                # ``_activePersistentPrompts`` was loaded BEFORE the agent ran
                # — refresh its ``responseText`` so subsequent
                # ``_collectActiveDirectorBriefings`` calls show the latest
                # analysis without waiting for the next session reload.
                if promptId:
                    for p in self._activePersistentPrompts:
                        if p.get("id") == promptId:
                            p["responseText"] = internalNote or meetingText or finalText
                            break
                if kind == "meeting" and meetingText:
                    await self._deliverTextToMeeting(
                        sessionId=sessionId,
                        text=meetingText,
                        detectedIntent=f"agent:{sourceLabel}",
                        reasoning=f"Agent run ({sourceLabel})",
                        triggerTranscriptId=triggerTranscriptId,
                    )
                else:
                    # Silent: persist as internal-only botResponse so the operator
                    # UI keeps a record, but DO NOT push into the meeting (no TTS,
                    # no chat send). The director prompt SSE above already carries
                    # the note for the operator UI.
                    await self._persistInternalDirectorReply(
                        sessionId=sessionId,
                        internalNote=internalNote or finalText,
                        promptId=promptId,
                        triggerTranscriptId=triggerTranscriptId,
                    )
                return meetingText if kind == "meeting" else ""
            # Non-director runs (e.g. speech escalation): deliver the full
            # agent answer straight into the meeting channels.
            await self._deliverTextToMeeting(
                sessionId=sessionId,
                text=finalText,
                detectedIntent=f"agent:{sourceLabel}",
                reasoning=f"Agent run ({sourceLabel})",
                triggerTranscriptId=triggerTranscriptId,
            )
        return finalText
    async def _deliverTextToMeeting(
        self,
        sessionId: str,
        text: str,
        detectedIntent: str,
        reasoning: str,
        triggerTranscriptId: Optional[str] = None,
    ) -> None:
        """Send agent text into the meeting via the same channels SPEECH_TEAMS
        uses: TTS + chat per config, plus DB persistence and SSE events.

        Uses the websocket/voiceInterface stored on this instance. If the bot
        is not connected anymore, the call still records the response in the DB
        and emits SSE so the operator UI shows the agent answer.
        """
        from . import interfaceFeatureTeamsbot as interfaceDb
        interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId)
        websocket = self._websocket
        voiceInterface = self._voiceInterface
        # responseChannel may be an enum member or a plain string — normalize.
        channelRaw = self.config.responseChannel
        channelStr = (
            channelRaw.value if hasattr(channelRaw, "value") else str(channelRaw)
        ).lower().strip()
        sendVoice = channelStr in ("voice", "both")
        sendChat = channelStr in ("chat", "both")
        if sendVoice and sendChat:
            responseType = TeamsbotResponseType.BOTH
        elif sendVoice:
            responseType = TeamsbotResponseType.AUDIO
        else:
            responseType = TeamsbotResponseType.CHAT
        # Voice (TTS input is voice-sanitized; chat + DB keep full structured text).
        # Long agent answers must be chunked: Google TTS rejects single sentences
        # > ~5000 bytes, and the Chirp3 voices fail on long comma-heavy lines too.
        ttsOutcome: Optional[Dict[str, Any]] = None
        if sendVoice and voiceInterface and websocket:
            spokenText = await self._summarizeForVoice(sessionId, text)
            cancelHook = self._makeAnswerCancelHook()
            async with self._meetingTtsLock:
                ttsOutcome = await _speakTextChunked(
                    websocket=websocket,
                    voiceInterface=voiceInterface,
                    sessionId=sessionId,
                    voiceText=spokenText,
                    languageCode=self.config.language,
                    voiceName=self.config.voiceId,
                    isCancelled=cancelHook,
                )
            await _emitSessionEvent(sessionId, "ttsDeliveryStatus", {
                "status": "dispatched" if ttsOutcome.get("success") else "failed",
                "hasWebSocket": True,
                "chunks": ttsOutcome.get("chunks"),
                "played": ttsOutcome.get("played"),
                "error": ttsOutcome.get("error"),
                "timestamp": getIsoTimestamp(),
            })
            if not ttsOutcome.get("success"):
                logger.warning(
                    f"Session {sessionId}: Agent TTS delivery failed "
                    f"({ttsOutcome.get('error')}) — falling back to meeting chat"
                )
                # Force the chat fallback even for voice-only channel config.
                if not sendChat:
                    sendChat = True
        # Chat
        if sendChat and websocket:
            try:
                await websocket.send_text(json.dumps({
                    "type": "sendChatMessage",
                    "sessionId": sessionId,
                    "text": text,
                }))
                logger.info(f"Session {sessionId}: Agent chat dispatched ({len(text)} chars)")
            except Exception as chatErr:
                logger.warning(f"Session {sessionId}: Agent chat delivery failed: {chatErr}")
        # Persist as botResponse + transcript so it shows up in history/UI.
        intentEnum, intentMeta = _coercePersistedDetectedIntent(detectedIntent)
        reasoningForDb = (
            f"{reasoning} [{intentMeta}]" if intentMeta else reasoning
        )
        botResponseData = TeamsbotBotResponse(
            sessionId=sessionId,
            responseText=text,
            responseType=responseType,
            detectedIntent=intentEnum,
            reasoning=reasoningForDb,
            triggeredByTranscriptId=triggerTranscriptId,
            modelName="agent",
            processingTime=0.0,
            priceCHF=0.0,
            timestamp=getIsoTimestamp(),
        ).model_dump()
        createdResponse = interface.createBotResponse(botResponseData)
        await _emitSessionEvent(sessionId, "botResponse", {
            "id": createdResponse.get("id"),
            "responseText": text,
            "responseType": responseType.value,
            "detectedIntent": intentEnum.value,
            "reasoning": reasoningForDb,
            "modelName": "agent",
            "processingTime": 0.0,
            "priceCHF": 0.0,
            "timestamp": botResponseData.get("timestamp"),
        })
        # Also record the answer as a bot-authored transcript line so the
        # conversation history reads naturally.
        botTranscriptData = TeamsbotTranscript(
            sessionId=sessionId,
            speaker=self.config.botName,
            text=text,
            timestamp=getIsoTimestamp(),
            confidence=1.0,
            language=self.config.language,
            isFinal=True,
            source="botResponse",
        ).model_dump()
        botTranscript = interface.createTranscript(botTranscriptData)
        # Mirror into the rolling meeting context + dedupe/echo state.
        self._contextBuffer.append({
            "speaker": self.config.botName,
            "text": text,
            "timestamp": getUtcTimestamp(),
            "source": "botResponse",
        })
        self._lastTranscriptSpeaker = self.config.botName
        self._lastTranscriptText = text
        self._lastTranscriptId = botTranscript.get("id")
        self._lastBotResponseText = text.strip().lower()
        self._lastBotResponseTs = time.time()
        # Allow trigger-word-free follow-up questions for 15s after answering.
        self._followUpWindowEnd = time.time() + 15.0
        await _emitSessionEvent(sessionId, "transcript", {
            "id": botTranscript.get("id"),
            "speaker": self.config.botName,
            "text": text,
            "confidence": 1.0,
            "timestamp": getIsoTimestamp(),
            "isContinuation": False,
            "source": "botResponse",
            "speakerResolvedFromHint": False,
        })
        session = interface.getSession(sessionId)
        if session:
            count = session.get("botResponseCount", 0) + 1
            interface.updateSession(sessionId, {"botResponseCount": count})
async def _persistInternalDirectorReply(
self,
sessionId: str,
internalNote: str,
promptId: Optional[str],
triggerTranscriptId: Optional[str] = None,
) -> None:
"""Record a director-prompt agent reply as INTERNAL (operator-UI only).
Unlike ``_deliverTextToMeeting`` this never dispatches TTS or chat into
the meeting, never appends to the meeting context buffer, and does not
create a meeting transcript line. It only persists a botResponse and
emits an SSE event so the operator UI shows what the agent decided.
"""
from . import interfaceFeatureTeamsbot as interfaceDb
note = (internalNote or "").strip()
if not note:
return
interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId)
intentEnum, _intentMeta = _coercePersistedDetectedIntent("agent:directorPrompt")
reasoningForDb = (
f"Director prompt {promptId or ''} — silent / internal only "
f"(not sent to meeting)"
).strip()
botResponseData = TeamsbotBotResponse(
sessionId=sessionId,
responseText=note,
responseType=TeamsbotResponseType.CHAT,
detectedIntent=intentEnum,
reasoning=reasoningForDb,
triggeredByTranscriptId=triggerTranscriptId,
modelName="agent",
processingTime=0.0,
priceCHF=0.0,
timestamp=getIsoTimestamp(),
).model_dump()
createdResponse = interface.createBotResponse(botResponseData)
await _emitSessionEvent(sessionId, "botResponse", {
"id": createdResponse.get("id"),
"responseText": note,
"responseType": TeamsbotResponseType.CHAT.value,
"detectedIntent": intentEnum.value,
"reasoning": reasoningForDb,
"modelName": "agent",
"processingTime": 0.0,
"priceCHF": 0.0,
"timestamp": botResponseData.get("timestamp"),
"internalOnly": True,
"promptId": promptId,
})
logger.info(
f"Session {sessionId}: Director prompt {promptId} silent reply "
f"persisted internally ({len(note)} chars)"
)
# =========================================================================
# Greeting (AI-localised, no hardcoded language strings)
# =========================================================================
async def _generateGreetingText(self, languageCode: str) -> str:
"""Generate the bot's join greeting via AI in ``languageCode`` and the
configured persona. Returns empty string on failure — the caller must
treat that as 'skip the greeting' (NEVER fall back to a hardcoded
localised string)."""
targetLang = (languageCode or self.config.language or "").strip() or "en-US"
botName = (self.config.botName or "the assistant").strip()
firstName = botName.split(" ")[0] if botName else botName
persona = (self.config.aiSystemPrompt or "").strip()
# English instructions to the LLM; the OUTPUT must be in ``targetLang``.
prompt = (
f"You are localizing the join greeting for a meeting assistant.\n\n"
f"Assistant display name (use exactly this, no translation): {firstName}\n\n"
f"Persona / style guide for the assistant:\n"
f"{persona or '(no persona configured — use a neutral, polite, professional tone)'}\n\n"
f"Target spoken language (BCP-47 code): {targetLang}\n\n"
f"Generate ONE short greeting (max ~14 words) for the assistant "
f"to say AND post in chat the moment it joins a meeting. The "
f"greeting MUST:\n"
f" - be in the target language\n"
f" - introduce the assistant by name ({firstName})\n"
f" - signal that it is now present and ready\n"
f" - sound natural when spoken aloud (this text is also TTS'd)\n\n"
f"Output ONLY the greeting text, no quotes, no markdown, no "
f"commentary, no surrounding punctuation beyond what naturally "
f"belongs to the sentence."
)
try:
aiService = _createAiService(
self.currentUser, self.mandateId, self.instanceId
)
await aiService.ensureAiObjectsInitialized()
request = AiCallRequest(
prompt=prompt,
context="",
options=AiCallOptions(
operationType=OperationTypeEnum.DATA_ANALYSE,
priority=PriorityEnum.SPEED,
),
)
response = await aiService.callAi(request)
except Exception as aiErr:
logger.warning(
f"Greeting generation crashed (lang={targetLang}): {aiErr}"
)
return ""
if not response or response.errorCount != 0 or not response.content:
logger.warning(
f"Greeting generation returned empty/error (lang={targetLang})"
)
return ""
text = response.content.strip()
# Strip any wrapping quotes/code fences the model might have added.
text = re.sub(r"^```.*?\n", "", text, flags=re.DOTALL)
text = re.sub(r"\n```\s*$", "", text)
text = text.strip().strip("\"'`").strip()
if not text:
return ""
logger.info(
f"Greeting generated (lang={targetLang}, chars={len(text)}): {text[:80]}"
)
return text
    async def _dispatchGreetingToMeeting(
        self,
        sessionId: str,
        greetingText: str,
        greetingLang: str,
        sendToChat: bool,
        interface: Any,
        voiceInterface: Any,
        websocket: WebSocket,
    ) -> None:
        """Centralised dispatcher for the bot's join greeting: speaks the
        text via TTS into the meeting and (optionally) tells the bot to post
        it in the meeting chat. Persists the greeting as a bot transcript /
        botResponse so it appears in the operator UI history.

        ``sendToChat`` is ``False`` for the legacy ``voiceGreeting`` path
        (the bot already chatted itself) and ``True`` for the new
        ``requestGreeting`` path where the Gateway owns chat dispatch too.

        Best-effort: any failure is logged and swallowed so a broken greeting
        never takes down the session pipeline.
        """
        try:
            # Let the operator UI know TTS work is starting before any audio
            # is produced.
            await _emitSessionEvent(sessionId, "ttsDeliveryStatus", {
                "status": "requested",
                "hasWebSocket": True,
                "message": "Greeting TTS requested",
                "timestamp": getIsoTimestamp(),
            })
            cancelHook = self._makeAnswerCancelHook()
            # Serialise meeting TTS so the greeting never overlaps another
            # utterance already being spoken.
            async with self._meetingTtsLock:
                ttsOutcome = await _speakTextChunked(
                    websocket=websocket,
                    voiceInterface=voiceInterface,
                    sessionId=sessionId,
                    voiceText=_voiceFriendlyMeetingText(greetingText),
                    languageCode=greetingLang,
                    voiceName=self.config.voiceId,
                    isCancelled=cancelHook,
                )
            if ttsOutcome.get("success"):
                logger.info(
                    f"Greeting TTS sent for session {sessionId} "
                    f"(chunks={ttsOutcome.get('chunks')})"
                )
                await _emitSessionEvent(sessionId, "ttsDeliveryStatus", {
                    "status": "dispatched",
                    "hasWebSocket": True,
                    "chunks": ttsOutcome.get("chunks"),
                    "played": ttsOutcome.get("played"),
                    "timestamp": getIsoTimestamp(),
                })
            else:
                # TTS failure is reported but does not abort chat dispatch or
                # persistence below.
                logger.warning(
                    f"Greeting TTS failed for session {sessionId}: {ttsOutcome.get('error')}"
                )
                await _emitSessionEvent(sessionId, "ttsDeliveryStatus", {
                    "status": "failed",
                    "hasWebSocket": True,
                    "message": ttsOutcome.get("error"),
                    "timestamp": getIsoTimestamp(),
                })
            if sendToChat:
                # requestGreeting path: the Gateway also posts the greeting
                # into the meeting chat via the bot's WebSocket.
                try:
                    await websocket.send_text(json.dumps({
                        "type": "sendChatMessage",
                        "sessionId": sessionId,
                        "text": greetingText,
                    }))
                    logger.info(f"Greeting chat dispatch queued for session {sessionId}")
                except Exception as chatErr:
                    logger.warning(
                        f"Greeting chat dispatch failed for session {sessionId}: {chatErr}"
                    )
            # Persist the greeting as a bot transcript so it shows up in the
            # session history, and mirror it into the in-memory context
            # buffer so follow-up AI calls see it.
            greetingTranscriptData = TeamsbotTranscript(
                sessionId=sessionId,
                speaker=self.config.botName,
                text=greetingText,
                timestamp=getIsoTimestamp(),
                confidence=1.0,
                language=greetingLang,
                isFinal=True,
                source="botResponse",
            ).model_dump()
            greetingTranscript = interface.createTranscript(greetingTranscriptData)
            self._contextBuffer.append({
                "speaker": self.config.botName,
                "text": greetingText,
                "timestamp": getUtcTimestamp(),
                "source": "botResponse",
            })
            # Track last-transcript state so continuation/merge logic treats
            # the greeting like any other bot utterance.
            self._lastTranscriptSpeaker = self.config.botName
            self._lastTranscriptText = greetingText
            self._lastTranscriptId = greetingTranscript.get("id")
            await _emitSessionEvent(sessionId, "botResponse", {
                "id": greetingTranscript.get("id"),
                "responseText": greetingText,
                "responseType": TeamsbotResponseType.AUDIO.value,
                "detectedIntent": "greeting",
                "reasoning": "Automatic join greeting",
                "timestamp": getIsoTimestamp(),
            })
            await _emitSessionEvent(sessionId, "transcript", {
                "id": greetingTranscript.get("id"),
                "speaker": self.config.botName,
                "text": greetingText,
                "confidence": 1.0,
                "timestamp": getIsoTimestamp(),
                "isContinuation": False,
                "source": "botResponse",
                "speakerResolvedFromHint": False,
            })
        except Exception as dispatchErr:
            logger.warning(
                f"Greeting dispatch failed for session {sessionId}: {dispatchErr}"
            )
# =========================================================================
# Context Summarization (for long sessions)
# =========================================================================
async def _summarizeSessionContext(self, sessionId: str, rawContext: str) -> str:
"""Summarize a long user-provided session context to its essential points.
This reduces token usage in every subsequent AI call."""
try:
aiService = _createAiService(self.currentUser, self.mandateId, self.instanceId)
await aiService.ensureAiObjectsInitialized()
request = AiCallRequest(
prompt=(
"Fasse den folgenden Kontext auf die wesentlichen Punkte zusammen. "
"Behalte alle wichtigen Fakten, Namen, Zahlen, Entscheidungen und Aktionspunkte. "
"Entferne Fuelltext und Wiederholungen. "
"Antworte NUR mit der Zusammenfassung, keine Erklaerungen oder Einleitungen."
),
context=rawContext,
options=AiCallOptions(
operationType=OperationTypeEnum.DATA_ANALYSE,
priority=PriorityEnum.SPEED,
)
)
response = await aiService.callAi(request)
if response and response.errorCount == 0 and response.content:
summary = response.content.strip()
logger.info(f"Session {sessionId}: Context summarized from {len(rawContext)} to {len(summary)} chars")
return summary
except Exception as e:
logger.warning(f"Session context summarization failed for {sessionId}: {e}")
# Fallback: return original (truncated if very long)
return rawContext[:2000] if len(rawContext) > 2000 else rawContext
async def _summarizeContextBuffer(self, sessionId: str):
"""Summarize the older part of the context buffer to preserve information
without exceeding the context window. This runs in the background."""
try:
if self._contextSummary:
return # Already summarized recently
# Take the older half of the buffer for summarization
halfPoint = len(self._contextBuffer) // 2
oldSegments = self._contextBuffer[:halfPoint]
if len(oldSegments) < 10:
return # Not enough to summarize
# Build text to summarize
lines = []
for seg in oldSegments:
speaker = seg.get("speaker", "Unknown")
text = seg.get("text", "")
lines.append(f"[{speaker}]: {text}")
textToSummarize = "\n".join(lines)
aiService = _createAiService(self.currentUser, self.mandateId, self.instanceId)
await aiService.ensureAiObjectsInitialized()
request = AiCallRequest(
prompt="Fasse das folgende Meeting-Transkript in 3-5 Saetzen zusammen. Nenne die wichtigsten Themen, Entscheidungen und offene Fragen. Antworte NUR mit der Zusammenfassung, keine Erklaerungen.",
context=textToSummarize,
options=AiCallOptions(
operationType=OperationTypeEnum.DATA_ANALYSE,
priority=PriorityEnum.SPEED,
)
)
response = await aiService.callAi(request)
if response and response.errorCount == 0:
self._contextSummary = response.content.strip()
logger.info(f"Session {sessionId}: Context summarized ({len(oldSegments)} segments -> {len(self._contextSummary)} chars)")
except Exception as e:
logger.warning(f"Context summarization failed for session {sessionId}: {e}")
# =========================================================================
# Meeting Summary
# =========================================================================
async def _generateMeetingSummary(self, sessionId: str):
"""Generate an AI summary of the meeting after it ends."""
try:
from . import interfaceFeatureTeamsbot as interfaceDb
interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId)
transcripts = interface.getTranscripts(sessionId)
if not transcripts or len(transcripts) < 5:
return # Not enough content for a summary
# Build full transcript
fullTranscript = "\n".join(
f"[{t.get('speaker', 'Unknown')}]: {t.get('text', '')}"
for t in transcripts
)
aiService = _createAiService(self.currentUser, self.mandateId, self.instanceId)
await aiService.ensureAiObjectsInitialized()
request = AiCallRequest(
prompt="Erstelle eine kurze Zusammenfassung dieses Meeting-Transkripts. Nenne die wichtigsten Punkte, Entscheidungen und offene Aktionspunkte.",
context=fullTranscript,
options=AiCallOptions(
operationType=OperationTypeEnum.DATA_ANALYSE,
priority=PriorityEnum.BALANCED,
)
)
response = await aiService.callAi(request)
if response and response.errorCount == 0:
interface.updateSession(sessionId, {"summary": response.content})
logger.info(f"Meeting summary generated for session {sessionId}")
except Exception as e:
logger.error(f"Failed to generate meeting summary for session {sessionId}: {e}")