# Copyright (c) 2025 Patrick Motsch
|
|
# All rights reserved.
|
|
"""
|
|
Teamsbot Service - Pipeline Orchestrator.
|
|
Manages the audio processing pipeline: STT -> Context Buffer -> SPEECH_TEAMS -> TTS -> Bridge.
|
|
"""
|
|
|
|
import logging
|
|
import json
|
|
import re
|
|
import asyncio
|
|
import time
|
|
import base64
|
|
from typing import Optional, Dict, Any, List, Callable
|
|
|
|
from fastapi import WebSocket
|
|
|
|
from modules.datamodels.datamodelUam import User
|
|
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum, PriorityEnum
|
|
from modules.shared.timeUtils import getUtcTimestamp, getIsoTimestamp
|
|
from modules.serviceCenter import getService as _getServiceCenterService
|
|
from modules.serviceCenter.context import ServiceCenterContext
|
|
|
|
from .datamodelTeamsbot import (
|
|
TeamsbotSessionStatus,
|
|
TeamsbotTranscript,
|
|
TeamsbotBotResponse,
|
|
TeamsbotResponseType,
|
|
TeamsbotConfig,
|
|
TeamsbotResponseMode,
|
|
TeamsbotResponseChannel,
|
|
TeamsbotDetectedIntent,
|
|
SpeechTeamsResponse,
|
|
TeamsbotCommand,
|
|
TeamsbotDirectorPrompt,
|
|
TeamsbotDirectorPromptStatus,
|
|
TeamsbotDirectorPromptMode,
|
|
DIRECTOR_PROMPT_TEXT_LIMIT,
|
|
DIRECTOR_PROMPT_FILE_LIMIT,
|
|
)
|
|
from .browserBotConnector import BrowserBotConnector
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Agent run limits for director prompts / speech escalation (meeting context).
# Higher than default workspace agent: Teams research + tool chains need depth.
TEAMSBOT_AGENT_MAX_ROUNDS = 8
TEAMSBOT_AGENT_MAX_COST_CHF = 0.12

# How many recent director-prompt briefings (one-shot + persistent) we keep in
# session memory so SPEECH_TEAMS triggers and speech escalation can still see
# the operator's attached files + analysis after the prompt itself was consumed.
_RECENT_DIRECTOR_BRIEFINGS_MAX = 6

# Quick-ack ("Moment...") UX: fire a SHORT TTS the moment the bot's name is
# detected so the speaker hears within ~1s that the bot reacted, instead of
# waiting for the full debounce + SPEECH_TEAMS + agent pipeline (~5-30s).
# Throttled per session to avoid acking every fragment of a long utterance.
_QUICK_ACK_MIN_INTERVAL_SEC = 25.0

# Number of phrase variants we generate per kind (rotated round-robin so back-
# to-back acks/notices don't sound identical).
_EPHEMERAL_PHRASE_VARIANTS = 4

# Localisation INTENTS for ephemeral phrases. Each kind describes WHAT the
# phrase should express; the actual wording is produced at runtime by the AI
# in the bot's configured language + persona. The intent text below is the
# instruction passed to the LLM (English, since it's a model directive — the
# OUTPUT will be in the configured spoken language). Add new ephemeral phrase
# kinds here, never inline string literals at the call site.
_EPHEMERAL_PHRASE_INTENTS: Dict[str, str] = {
    "quickAck": (
        "Very short verbal acknowledgment (1 to 4 words) the assistant says "
        "the moment its name is recognised, BEFORE it has formulated a full "
        "answer. The intent is purely 'I heard you, I'm thinking' — natural, "
        "conversational, never a complete sentence."
    ),
    "agentBusy": (
        "One short sentence (max ~12 words) the assistant says BEFORE starting "
        "a longer research / tool-call task, so the audience knows the answer "
        "will take a few seconds. Polite, professional, calm."
    ),
    "agentRound": (
        "One short sentence (max ~14 words) the assistant says BETWEEN rounds "
        "of a longer agent task to signal that work is still in progress. "
        "Include the placeholder tokens '{round}' and '{maxRounds}' so the "
        "caller can substitute the actual numbers — e.g. 'Step {round} of "
        "{maxRounds}, still working.'"
    ),
}
|
|
|
|
|
|
def _voiceLineLooksLikeBillingOrMeta(line: str) -> bool:
|
|
"""Heuristic: trailing lines that are separators or billing/usage footers."""
|
|
s = line.strip()
|
|
if not s:
|
|
return True
|
|
lower = s.lower()
|
|
if re.match(r"^[-=*_]{3,}\s*$", s):
|
|
return True
|
|
if re.match(r"^#{1,6}\s*(usage|billing|costs?|meta|technical|statistics)\b", lower):
|
|
return True
|
|
if "chf" in lower and re.search(r"\d", s):
|
|
if re.search(
|
|
r"\b(total|usage|cost|billing|token|spent|used|price|estimate|"
|
|
r"rounds?|calls?|duration|processing\s*time|model\s*calls?)\b",
|
|
lower,
|
|
):
|
|
return True
|
|
if "token" in lower and re.search(r"\d", s):
|
|
if re.search(r"\b(total|usage|prompt|completion)\b", lower):
|
|
return True
|
|
pl = lower.replace(" ", "")
|
|
if "progressafter" in pl and ("aicalls:" in pl or "toolcalls:" in pl):
|
|
return True
|
|
return False
|
|
|
|
|
|
# Unicode ranges of emoji / pictograph characters that get stripped from the
# voice path (TTS engines either skip them or vocalise them awkwardly).
# Matches one-or-more consecutive characters in any of the listed ranges.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F300-\U0001FAFF"  # symbols & pictographs, emoticons, transport, supplemental
    "\U00002600-\U000027BF"  # misc symbols + dingbats (incl. ⚙ 🔐 🔌 ✓ ✗)
    "\U0001F1E6-\U0001F1FF"  # regional indicator (flags)
    "\U00002B00-\U00002BFF"  # arrows, geometric
    "\U0001F900-\U0001F9FF"  # supplemental symbols (incl. 🤖 🧠)
    "\U0000FE0F"  # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)
|
|
|
|
|
|
def _voiceFriendlyMeetingText(raw: str) -> str:
    """Sanitise a chat/markdown response so it can be SPOKEN naturally.

    Aggressive cleanup — when a TTS engine reads raw markdown out loud the
    listener hears "hash hash hash Zusammenfassung pipe pipe pipe", which
    is unbearable in a meeting. The chat / DB / UI keep the original text;
    only the audio path goes through this sanitiser.

    What we strip:
    * Code fences and inline code
    * Markdown emphasis (**bold**, *italic*, __bold__, _italic_)
    * Markdown links → keep label
    * Headings (# .. ######)
    * Markdown tables (any line with two or more pipes is dropped wholesale)
    * Horizontal rules (---, ***, ___ on their own line)
    * Bullet markers (-, *, •, ·) and numbered list markers (1., 2)) at line start
    * Emojis (full Unicode pictograph ranges + variation selector)
    * Decorative trailing colons on bullet headings
    * Stray pipes left over from inline tables
    * Trailing billing / "maximum rounds reached" / "budget exceeded" footers

    Whitespace is then collapsed to single spaces. Returns "" when nothing
    speakable survives (caller treats empty as "nothing to say").

    NOTE: the pass ORDER matters — e.g. emphasis is stripped only after
    bullets so a "**Bold:**" list heading is handled correctly.
    """
    if not raw:
        return ""

    # Trim trailing operator/billing footers BEFORE any structural rewrite
    # so we don't waste effort sanitising a footer that gets dropped.
    low = raw.lower()
    if "maximum rounds reached" in low:
        m = re.search(r"(?is)maximum\s+rounds\s+reached", raw)
        if m:
            # Keep everything before the footer; if the footer was the whole
            # text, substitute a short spoken notice instead.
            head = raw[: m.start()].strip()
            raw = head or (
                "Die Abklaerung brauchte mehr Schritte als vorgesehen; Details stehen im Chat."
            )
    if "budget exceeded" in low:
        m = re.search(r"(?is)budget\s+exceeded", raw)
        if m:
            head = raw[: m.start()].strip()
            raw = head or "Das eingestellte Kostenlimit ist erreicht; Details stehen im Chat."

    # Pop trailing separator / billing / meta lines one by one.
    lines = raw.strip().split("\n")
    while lines and _voiceLineLooksLikeBillingOrMeta(lines[-1]):
        lines.pop()
    t = "\n".join(lines).strip()
    if not t:
        # Footer stripping consumed everything — fall back to the raw text
        # so the later passes still get a chance to salvage content.
        t = raw.strip()

    # 1) Strip code blocks (multi-line first, then inline)
    t = re.sub(r"```[\s\S]*?```", " ", t)
    t = re.sub(r"`([^`]+)`", r"\1", t)

    # 2) Drop markdown table rows (any line with two or more pipes) and the
    #    separator lines they come with (|---|---|). A paragraph that just
    #    happens to contain ONE pipe survives.
    cleanedLines: List[str] = []
    for ln in t.split("\n"):
        stripped = ln.strip()
        if stripped.count("|") >= 2:
            continue
        if re.fullmatch(r"\s*\|?[\s\-:|]+\|?\s*", stripped) and "-" in stripped:
            continue
        cleanedLines.append(ln)
    t = "\n".join(cleanedLines)

    # 3) Drop horizontal rule lines (---, ***, ___, with optional spaces)
    t = re.sub(r"(?m)^\s*([-*_])\s*\1\s*\1[\s\1]*$", "", t)

    # 4) Headings: drop the leading hashes
    t = re.sub(r"(?m)^\s*#{1,6}\s+", "", t)

    # 5) Bullet markers at line start — keep the content, drop the bullet
    t = re.sub(r"(?m)^\s*[-*•·]\s+", "", t)
    # 6) Numbered list markers at line start ("1.", "2)", "3 -")
    t = re.sub(r"(?m)^\s*\d+[\.\)]\s+", "", t)

    # 7) Emphasis markers (after bullets so a "**Bold:**" heading is handled)
    t = re.sub(r"\*\*([^*]+)\*\*", r"\1", t)
    t = re.sub(r"\*([^*\n]+)\*", r"\1", t)
    t = re.sub(r"__([^_]+)__", r"\1", t)
    t = re.sub(r"(?<![A-Za-z0-9])_([^_\n]+)_(?![A-Za-z0-9])", r"\1", t)

    # 8) Markdown links → keep the visible label
    t = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", t)

    # 9) Strip emojis (purely decorative for voice)
    t = _EMOJI_PATTERN.sub("", t)

    # 10) Final special-character pass for TTS. Engines like Google Chirp /
    #     Azure Neural read special chars out loud as words ("Stern", "Klammer
    #     auf", "Pfeil", "Unterstrich") which makes the meeting unbearable.
    #     We KEEP only the canonical spoken-language punctuation that TTS
    #     uses for prosody: . , ? ! : ; ' - (and the Unicode ellipsis …).
    #     Everything else gets stripped or replaced with a single space.

    # 10a) Decorative dashes (en-dash / em-dash) between spaces → comma
    #      (gives a natural prosodic pause without the TTS reading "dash").
    t = re.sub(r"\s+[\u2013\u2014\u2212]\s+", ", ", t)
    # In-word en/em dash collapses to a regular hyphen.
    t = re.sub(r"[\u2013\u2014\u2212]", "-", t)

    # 10b) Smart / typographic quotes — TTS sometimes vocalises them.
    t = re.sub(r"[\u201C\u201D\u201E\u201F\u00AB\u00BB\u2039\u203A]", "", t)
    t = re.sub(r"[\u2018\u2019\u201A\u201B]", "'", t)

    # 10c) Arrows (U+2190..U+21FF), math symbols (U+2200..U+22FF), misc
    #      technical (U+2300..U+23FF) — strip outright. Also a handful of
    #      common Latin-1 specials (±, °, ©, ®, ™, ¶, §, ¦, ·) that TTS
    #      engines tend to read out as words ("plus minus", "Grad").
    t = re.sub(r"[\u2190-\u23FF]+", " ", t)
    t = re.sub(r"[\u00A1-\u00BF\u00D7\u00F7\u2122]+", " ", t)

    # 10d) Common ASCII / typographic specials that survived earlier passes,
    #      replaced with a space so word boundaries are preserved.
    t = re.sub(r"[*#~^=+|\\<>{}\[\]()_&@$%`]+", " ", t)

    # 10e) Drop ASCII double-quote (single quotes are legitimate apostrophes
    #      in contractions like "don't" / "geht's", so we keep U+0027).
    t = t.replace('"', "")

    # 10f) Slash between letters/digits — TTS reads "slash". Replace with
    #      " oder " so "und/oder"-style pairs stay speakable.
    t = re.sub(r"(?<=\w)\s*/\s*(?=\w)", " oder ", t)
    # Any remaining stray slash is just whitespace.
    t = t.replace("/", " ")

    # 10g) Trim multiple punctuation runs ("...!!!" → "." / "!" / etc.)
    t = re.sub(r"([\.,;:!\?])\1{1,}", r"\1", t)
    # Remove orphan punctuation directly preceded by whitespace
    # (common after symbol stripping: " , ", " . ").
    t = re.sub(r"\s+([\.,;:!\?])", r"\1", t)
    # Collapse trailing colon at end of meaningful phrase to a period for
    # nicer cadence ("Was ist PowerOn:" → "Was ist PowerOn.").
    t = re.sub(r":\s*$", ".", t.rstrip())
    # 10h) NOTE(review): per-line this only tightens " :" to ":" rather than
    #      converting to a period — looks intentional, but the "same way"
    #      wording in the original comment is misleading; confirm intent.
    t = re.sub(r"\s+:\s*$", ":", t, flags=re.MULTILINE)

    # 11) Collapse whitespace to single spaces; protect sentence breaks by
    #     turning paragraph blanks into a period if the previous chunk
    #     didn't already terminate.
    paragraphs = [p.strip() for p in re.split(r"\n\s*\n", t) if p.strip()]
    rebuilt: List[str] = []
    for p in paragraphs:
        p = re.sub(r"\s+", " ", p).strip()
        if not p:
            continue
        if not re.search(r"[\.!\?\u2026:]\s*$", p):
            p = p.rstrip() + "."
        rebuilt.append(p)
    t = " ".join(rebuilt)
    t = re.sub(r"\s+", " ", t).strip()

    # If we sanitised away everything (e.g. the input was *only* a markdown
    # table or a wall of pictographs) return empty — the caller (TTS / voice
    # summary) treats empty as "nothing to say", which is the safe default.
    # Falling back to raw markdown here would leak the very symbols we just
    # spent ten passes removing.
    return t
|
|
|
|
|
|
# Google Cloud TTS rejects single sentences that exceed ~5000 bytes. The Chirp3
|
|
# voices are stricter: long, comma-heavy sentences (no terminating punctuation)
|
|
# also fail with "Sentence ... is too long". We chunk well below the documented
|
|
# limit AND inject sentence terminators so the synthesizer accepts every chunk.
|
|
_TTS_MAX_CHUNK_CHARS = 800
|
|
|
|
|
|
def _splitTextForTts(text: str, maxChars: int = _TTS_MAX_CHUNK_CHARS) -> List[str]:
|
|
"""Split a long voice line into TTS-safe chunks at sentence/paragraph boundaries.
|
|
|
|
The result preserves order and contains no empty strings. A single
|
|
sentence longer than ``maxChars`` is hard-cut at word boundaries.
|
|
"""
|
|
cleaned = (text or "").strip()
|
|
if not cleaned:
|
|
return []
|
|
if len(cleaned) <= maxChars:
|
|
return [cleaned]
|
|
|
|
sentencePattern = re.compile(r"(?<=[\.!\?\u2026])\s+|\n+")
|
|
rawSentences = [s.strip() for s in sentencePattern.split(cleaned) if s and s.strip()]
|
|
if not rawSentences:
|
|
rawSentences = [cleaned]
|
|
|
|
chunks: List[str] = []
|
|
buffer = ""
|
|
for sentence in rawSentences:
|
|
if len(sentence) > maxChars:
|
|
if buffer:
|
|
chunks.append(buffer.strip())
|
|
buffer = ""
|
|
words = sentence.split(" ")
|
|
current = ""
|
|
for word in words:
|
|
candidate = (current + " " + word).strip() if current else word
|
|
if len(candidate) > maxChars and current:
|
|
chunks.append(current.strip())
|
|
current = word
|
|
else:
|
|
current = candidate
|
|
if current:
|
|
if not re.search(r"[\.!\?\u2026]\s*$", current):
|
|
current = current.rstrip() + "."
|
|
chunks.append(current.strip())
|
|
continue
|
|
|
|
candidate = (buffer + " " + sentence).strip() if buffer else sentence
|
|
if len(candidate) > maxChars and buffer:
|
|
chunks.append(buffer.strip())
|
|
buffer = sentence
|
|
else:
|
|
buffer = candidate
|
|
|
|
if buffer:
|
|
chunks.append(buffer.strip())
|
|
|
|
finalized: List[str] = []
|
|
for c in chunks:
|
|
if not c:
|
|
continue
|
|
if not re.search(r"[\.!\?\u2026]\s*$", c):
|
|
c = c.rstrip() + "."
|
|
finalized.append(c)
|
|
return finalized
|
|
|
|
|
|
async def _speakTextChunked(
    websocket: Optional[WebSocket],
    voiceInterface: Any,
    sessionId: str,
    voiceText: str,
    languageCode: str,
    voiceName: Optional[str],
    isCancelled: Optional[Callable[[], bool]] = None,
) -> Dict[str, Any]:
    """Run TTS in chunks and dispatch each ``playAudio`` over the websocket.

    Returns ``{"success": bool, "chunks": int, "played": int, "error": Optional[str], "cancelled": bool}``.
    Failure for one chunk does NOT abort the rest; partial playback still
    counts as ``success=True`` so the caller can decide whether to add a chat
    fallback for the missing parts.

    ``isCancelled`` is an optional zero-arg predicate the caller passes in to
    signal "abort the remaining chunks". It is checked BEFORE each Google
    TTS round-trip and again BEFORE each websocket send, so a stop word in
    the meeting can interrupt a multi-chunk dispatch within at most one
    chunk boundary instead of waiting for the whole answer to finish.
    """
    chunks = _splitTextForTts(voiceText)
    result: Dict[str, Any] = {"success": False, "chunks": len(chunks), "played": 0, "error": None, "cancelled": False}
    if not chunks:
        result["error"] = "no text"
        return result
    if voiceInterface is None:
        result["error"] = "no voice interface"
        return result

    # Only the LAST error is reported; earlier chunk failures are logged but
    # do not stop the loop (best-effort playback).
    lastError: Optional[str] = None
    for idx, chunk in enumerate(chunks, start=1):
        # Cancellation checkpoint #1: before spending a TTS round-trip.
        if isCancelled is not None and isCancelled():
            result["cancelled"] = True
            logger.info(
                f"Session {sessionId}: TTS chunk loop cancelled before chunk "
                f"{idx}/{len(chunks)} (user stop or newer answer in flight)"
            )
            break
        try:
            ttsResult = await voiceInterface.textToSpeech(
                text=chunk,
                languageCode=languageCode,
                voiceName=voiceName,
            )
        except Exception as ttsErr:  # pragma: no cover - network/runtime errors
            lastError = f"chunk {idx}/{len(chunks)} raised: {ttsErr}"
            logger.warning(f"Session {sessionId}: TTS {lastError}")
            continue  # skip this chunk, keep trying the rest

        # Defensive: voiceInterface may return None or a failure dict.
        if not isinstance(ttsResult, dict) or ttsResult.get("success") is False:
            err = (ttsResult or {}).get("error", "unknown") if isinstance(ttsResult, dict) else "no result"
            lastError = f"chunk {idx}/{len(chunks)} failed: {err}"
            logger.warning(f"Session {sessionId}: TTS {lastError}")
            continue

        audioContent = ttsResult.get("audioContent")
        if not audioContent:
            lastError = f"chunk {idx}/{len(chunks)} returned no audioContent"
            logger.warning(f"Session {sessionId}: TTS {lastError}")
            continue

        # No transport → no point synthesising the remaining chunks.
        if websocket is None:
            lastError = "websocket unavailable"
            break

        # Cancellation checkpoint #2: audio is synthesised but NOT yet sent;
        # dropping it here is what makes "Stopp" feel instantaneous.
        if isCancelled is not None and isCancelled():
            result["cancelled"] = True
            logger.info(
                f"Session {sessionId}: TTS chunk loop cancelled before "
                f"sending chunk {idx}/{len(chunks)} (audio dropped)"
            )
            break

        try:
            # audioContent may be bytes or str depending on the TTS backend;
            # normalise to base64 text for the JSON payload.
            await websocket.send_text(json.dumps({
                "type": "playAudio",
                "sessionId": sessionId,
                "audio": {
                    "data": base64.b64encode(
                        audioContent if isinstance(audioContent, bytes) else audioContent.encode()
                    ).decode(),
                    "format": "mp3",
                },
            }))
            result["played"] += 1
        except Exception as wsErr:  # pragma: no cover - websocket failures
            # A dead websocket won't recover mid-loop → abort, don't continue.
            lastError = f"chunk {idx}/{len(chunks)} websocket send failed: {wsErr}"
            logger.warning(f"Session {sessionId}: TTS {lastError}")
            break

    result["success"] = result["played"] > 0
    if lastError:
        result["error"] = lastError
    return result
|
|
|
|
|
|
def _coercePersistedDetectedIntent(raw: Optional[str]) -> tuple:
    """Map free-form intent labels (e.g. agent:directorPrompt) to TeamsbotDetectedIntent
    for DB persistence; return (enum, meta_suffix_or_None for reasoning)."""
    trimmed = str(raw).strip() if raw else ""
    if not trimmed:
        return TeamsbotDetectedIntent.NONE, None
    needle = trimmed.lower()
    # Exact enum value match wins and needs no meta suffix.
    matched = next((member for member in TeamsbotDetectedIntent if member.value == needle), None)
    if matched is not None:
        return matched, None
    # Unknown label: keep a truncated copy of the original for reasoning/meta.
    suffix = trimmed[:120]
    if needle.startswith("agent:"):
        return TeamsbotDetectedIntent.PROACTIVE, suffix
    return TeamsbotDetectedIntent.NONE, suffix
|
|
|
|
|
|
# Director prompts are PRIVATE operator instructions — they must NOT be echoed
|
|
# verbatim into the meeting. The agent is asked to start its FINAL answer with
|
|
# either ``MEETING_REPLY:`` (followed by the text actually meant for the meeting)
|
|
# or ``SILENT:`` / ``INTERNAL_ONLY:`` (followed by an internal note for the
|
|
# operator UI). Anything else → treat as silent (safe default).
|
|
_DIRECTOR_REPLY_PATTERN = re.compile(
|
|
r"^\s*(MEETING_REPLY|MEETING|REPLY|SAY|SPEAK)\s*:\s*",
|
|
re.IGNORECASE,
|
|
)
|
|
_DIRECTOR_SILENT_PATTERN = re.compile(
|
|
r"^\s*(SILENT|INTERNAL(?:_ONLY)?|NOTE|NO_MEETING_OUTPUT|ACK(?:NOWLEDGE)?)\s*:\s*",
|
|
re.IGNORECASE,
|
|
)
|
|
|
|
|
|
def _parseDirectorPromptFinal(finalText: str) -> Dict[str, Any]:
|
|
"""Parse the agent's final answer for a director prompt.
|
|
|
|
Returns ``{"kind": "meeting"|"silent", "meetingText": str, "internalNote": str}``.
|
|
|
|
Default is ``silent`` so unmarked replies are NOT broadcast into the meeting.
|
|
"""
|
|
text = (finalText or "").strip()
|
|
if not text:
|
|
return {"kind": "silent", "meetingText": "", "internalNote": ""}
|
|
|
|
meetingMatch = _DIRECTOR_REPLY_PATTERN.match(text)
|
|
if meetingMatch:
|
|
body = text[meetingMatch.end():].strip()
|
|
return {"kind": "meeting", "meetingText": body, "internalNote": ""}
|
|
|
|
silentMatch = _DIRECTOR_SILENT_PATTERN.match(text)
|
|
if silentMatch:
|
|
body = text[silentMatch.end():].strip()
|
|
return {"kind": "silent", "meetingText": "", "internalNote": body}
|
|
|
|
# No marker → safe default: do NOT spam the meeting with the agent's
|
|
# internal reasoning. Keep the full text as an internal note for the
|
|
# operator UI so nothing is lost.
|
|
return {"kind": "silent", "meetingText": "", "internalNote": text}
|
|
|
|
|
|
# =========================================================================
|
|
# Active Service Registry (sessionId -> running TeamsbotService instance)
|
|
#
|
|
# Required so HTTP endpoints (e.g. director-prompt POST) can reach the
|
|
# TeamsbotService instance currently holding the live websocket + voice
|
|
# interface for that session, without going through the websocket loop.
|
|
# =========================================================================
|
|
# sessionId -> live TeamsbotService holding the websocket + voice interface.
_activeServices: Dict[str, "TeamsbotService"] = {}


def getActiveService(sessionId: str) -> Optional["TeamsbotService"]:
    """Return the running TeamsbotService for a session, or None if not active."""
    return _activeServices.get(sessionId, None)
|
|
|
|
|
|
# =========================================================================
|
|
# AI Service Factory (for billing-aware AI calls)
|
|
# =========================================================================
|
|
|
|
def _createAiService(user, mandateId, featureInstanceId=None):
    """Create a properly wired AiService via the service center."""
    # Scope the AI service to this user/mandate/instance so billing lands
    # on the teamsbot feature.
    context = ServiceCenterContext(
        user=user,
        mandate_id=mandateId,
        feature_instance_id=featureInstanceId,
        feature_code="teamsbot",
    )
    return _getServiceCenterService("ai", context)
|
|
|
|
|
|
# =========================================================================
|
|
# Session Event Queues (for SSE streaming to frontend)
|
|
# =========================================================================
|
|
# sessionId -> queue of events consumed by the SSE streaming endpoint.
_sessionEvents: Dict[str, asyncio.Queue] = {}


async def _emitSessionEvent(sessionId: str, eventType: str, data: Any):
    """Emit an event to the session's SSE stream.
    Creates the queue on-demand so events are never silently dropped."""
    queue = _sessionEvents.get(sessionId)
    if queue is None:
        queue = asyncio.Queue()
        _sessionEvents[sessionId] = queue
    await queue.put({"type": eventType, "data": data, "timestamp": getIsoTimestamp()})
|
|
|
|
|
|
def _normalizeGatewayHostForBotWs(host: str) -> str:
|
|
"""Use IPv4 loopback for local dev WebSocket URLs passed to the Node browser-bot.
|
|
|
|
Node on Windows often resolves ``localhost`` to ``::1`` first; Uvicorn bound to
|
|
``0.0.0.0`` typically accepts IPv4 only, so the bot gets ``ECONNREFUSED ::1``.
|
|
"""
|
|
h = host.strip()
|
|
lower = h.lower()
|
|
if lower == "localhost":
|
|
return "127.0.0.1"
|
|
if lower.startswith("localhost:"):
|
|
return "127.0.0.1" + h[len("localhost"):]
|
|
if lower.startswith("[::1]:"):
|
|
return "127.0.0.1" + h.partition("]")[2]
|
|
if lower in ("[::1]", "::1"):
|
|
return "127.0.0.1"
|
|
return h
|
|
|
|
|
|
class TeamsbotService:
    """
    Pipeline Orchestrator for Teams Bot sessions.
    Coordinates VoiceObjects (STT/TTS), AiService (SPEECH_TEAMS), and Bridge communication.
    """
|
|
|
|
    def __init__(self, currentUser: User, mandateId: str, instanceId: str, config: TeamsbotConfig):
        """Wire up per-session state for one Teams meeting bot.

        Args:
            currentUser: Authenticated user the session runs/bills under.
            mandateId: Mandate scope for DB and AI service calls.
            instanceId: Feature instance this bot belongs to.
            config: Effective TeamsbotConfig (bot name, language, transfer mode, ...).
        """
        self.currentUser = currentUser
        self.mandateId = mandateId
        self.instanceId = instanceId
        self.config = config
        # Outbound command channel to the Node browser-bot process.
        self.browserBotConnector = BrowserBotConnector(config._getEffectiveBrowserBotUrl())

        # --- AI / context state ---
        self._lastAiCallTime: float = 0.0
        self._aiAnalysisInProgress: bool = False
        self._contextBuffer: List[Dict[str, Any]] = []
        self._sessionContext: Optional[str] = None  # User-provided background context
        self._contextSummary: Optional[str] = None  # AI-generated summary of long context

        # --- Differential transcript tracking ---
        self._lastTranscriptSpeaker: Optional[str] = None
        self._lastTranscriptText: Optional[str] = None
        self._lastTranscriptId: Optional[str] = None
        self._lastSttTime: float = 0.0
        self._lastBotResponseText: Optional[str] = None
        self._lastBotResponseTs: float = 0.0

        # Speaker attribution: simple last-caption-speaker model.
        self._lastCaptionSpeaker: Optional[str] = None
        self._unattributedTranscriptIds: List[str] = []
        self._knownSpeakers: set = set()

        # Debounced name trigger: wait for the speaker to finish before
        # kicking off AI analysis.
        self._pendingNameTrigger: Optional[Dict[str, Any]] = None
        self._followUpWindowEnd: float = 0.0

        # Quick-ack throttle (timestamp of the last short "Moment..." ack we
        # spoke into the meeting). Without this guard a long sentence with
        # multiple name mentions would trigger several acks in a row.
        self._lastQuickAckTs: float = 0.0

        # Session-scoped phrase pool for SHORT ephemeral utterances (quick
        # acks, "checking..." notices, per-round progress). Lazily populated
        # by the AI in the bot's configured language + persona — no hardcoded
        # strings or language branching downstream. Keyed by the kinds in
        # ``_EPHEMERAL_PHRASE_INTENTS``:
        #   * ``self._phrasePool[kind]``    -> list of variants for that kind
        #   * ``self._phrasePoolIdx[kind]`` -> round-robin pointer
        # Concurrent generation calls for the same kind are serialised by the
        # lock so we don't spawn duplicate AI requests on a burst.
        self._phrasePool: Dict[str, List[str]] = {}
        self._phrasePoolIdx: Dict[str, int] = {}
        self._phrasePoolLock: asyncio.Lock = asyncio.Lock()

        # Voice pipeline: a single per-session lock that serialises every TTS
        # dispatch into the meeting. Without it three independent code paths
        # (SPEECH_TEAMS direct answer, agent escalation final answer, and
        # operator-driven director prompt) can all emit ``playAudio`` at the
        # same time and the browser bot plays interleaved chunks — "two bots
        # talking over each other". Chat (text) sends are NOT locked.
        self._meetingTtsLock: asyncio.Lock = asyncio.Lock()
        # Generation counter incremented every time we begin producing a NEW
        # meeting answer OR the user issues a hard stop. TTS chunk loops
        # capture the value at start and bail out before each send if it has
        # moved on — this is what makes "Stopp" feel instantaneous.
        self._answerGenerationCounter: int = 0
        # Tracking handles for cancellable background tasks, so
        # ``_cancelInFlightSpeech`` can actually call ``task.cancel()``
        # instead of only relying on the generation counter. Cleared in the
        # task's own ``finally`` block.
        self._currentEscalationTask: Optional[asyncio.Task] = None
        self._currentQuickAckTask: Optional[asyncio.Task] = None
        # Whether an agent escalation task is in flight. Kept separate from
        # ``_aiAnalysisInProgress`` (which only covers the SPEECH_TEAMS phase)
        # so a new speech trigger arriving WHILE the agent is researching does
        # not start a parallel SPEECH_TEAMS answering at the same time.
        self._agentEscalationInFlight: bool = False

        # Live transport handles for out-of-band actions (director prompts,
        # agent escalation). Set in handleBotWebSocket once the bot connects;
        # cleared on disconnect.
        self._activeSessionId: Optional[str] = None
        self._websocket: Optional[WebSocket] = None
        self._voiceInterface = None

        # Persistent director prompts kept in memory for context injection
        # across triggers. Loaded from DB on (re)connect; mutated by the
        # submit/delete director prompt routes.
        self._activePersistentPrompts: List[Dict[str, Any]] = []

        # Recent director-prompt briefings (one-shot AND persistent) — keeps
        # the operator's attached files and the agent's internal analysis
        # available for later SPEECH_TEAMS triggers, even after a one-shot
        # prompt has been consumed. Capped by
        # ``_RECENT_DIRECTOR_BRIEFINGS_MAX`` to bound prompt size.
        self._recentDirectorBriefings: List[Dict[str, Any]] = []
|
|
|
|
# =========================================================================
|
|
# Session Lifecycle
|
|
# =========================================================================
|
|
|
|
    async def joinMeeting(
        self,
        sessionId: str,
        meetingLink: str,
        connectionId: Optional[str] = None,
        gatewayBaseUrl: str = "",
        botAccountEmail: Optional[str] = None,
        botAccountPassword: Optional[str] = None,
    ):
        """Send join command to the Browser Bot service.

        The browser bot will:
        1. Launch browser (headful if credentials provided, headless otherwise)
        2. Navigate to Teams web app
        3. Authenticate if credentials provided, otherwise join as anonymous guest
        4. Enable captions/audio capture and start scraping
        5. Connect back via WebSocket to send transcripts

        On any failure the session row is flipped to ERROR and a statusChange
        SSE event is emitted; exceptions are not re-raised.
        """
        # Local import avoids a circular dependency with the interface module.
        from . import interfaceFeatureTeamsbot as interfaceDb

        interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId)

        # Initialize the SSE event queue (replaces any stale queue).
        _sessionEvents[sessionId] = asyncio.Queue()

        try:
            # Update status to JOINING
            interface.updateSession(sessionId, {"status": TeamsbotSessionStatus.JOINING.value})
            await _emitSessionEvent(sessionId, "statusChange", {"status": "joining"})

            # Send join command to browser bot
            session = interface.getSession(sessionId)
            if not session:
                raise ValueError(f"Session {sessionId} not found")

            # Build the full WebSocket URL for the bot to connect back to this
            # gateway instance. gatewayBaseUrl is passed from the route handler
            # (derived from request.base_url).
            wsScheme = "wss" if gatewayBaseUrl.startswith("https") else "ws"
            gatewayHost = gatewayBaseUrl.replace("https://", "").replace("http://", "").rstrip("/")
            # localhost/::1 → 127.0.0.1 so the Node bot doesn't hit IPv6-only loopback.
            gatewayHost = _normalizeGatewayHostForBotWs(gatewayHost)
            fullGatewayWsUrl = f"{wsScheme}://{gatewayHost}/api/teamsbot/{self.instanceId}/bot/ws/{sessionId}"

            hasAuth = bool(botAccountEmail and botAccountPassword)
            logger.info(f"Joining meeting for session {sessionId}: auth={hasAuth}, email={botAccountEmail or 'N/A'}, transferMode={self.config.transferMode}")

            # hasattr guards keep older TeamsbotConfig versions working.
            result = await self.browserBotConnector.joinMeeting(
                sessionId=sessionId,
                meetingUrl=meetingLink,
                botName=session.get("botName", self.config.botName),
                instanceId=self.instanceId,
                gatewayWsUrl=fullGatewayWsUrl,
                language=self.config.language,
                botAccountEmail=botAccountEmail,
                botAccountPassword=botAccountPassword,
                transferMode=self.config.transferMode if hasattr(self.config, 'transferMode') else "auto",
                debugMode=self.config.debugMode if hasattr(self.config, 'debugMode') else False,
            )

            if result.get("success"):
                interface.updateSession(sessionId, {
                    "status": TeamsbotSessionStatus.JOINING.value,  # Will become ACTIVE when bot connects via WS
                })
                logger.info(f"Browser bot deployment started for session {sessionId}")
            else:
                errorMsg = result.get("error", "Unknown error joining meeting")
                interface.updateSession(sessionId, {
                    "status": TeamsbotSessionStatus.ERROR.value,
                    "errorMessage": errorMsg,
                })
                await _emitSessionEvent(sessionId, "statusChange", {"status": "error", "errorMessage": errorMsg})
                logger.error(f"Failed to deploy browser bot for session {sessionId}: {errorMsg}")

        except Exception as e:
            # Best-effort error reporting: persist + stream the failure, never raise.
            logger.error(f"Error joining meeting for session {sessionId}: {e}")
            interface.updateSession(sessionId, {
                "status": TeamsbotSessionStatus.ERROR.value,
                "errorMessage": str(e),
            })
            await _emitSessionEvent(sessionId, "statusChange", {"status": "error", "errorMessage": str(e)})
|
|
|
|
    async def leaveMeeting(self, sessionId: str):
        """Send leave command to the Browser Bot service.

        Flow:
          1. Mark the session LEAVING in the DB and notify UI listeners.
          2. Ask the browser bot connector to actually leave the meeting.
          3. Mark the session ENDED (with timestamp), notify listeners, and
             schedule meeting-summary generation in the background.

        On any failure the session is flagged ERROR with the exception
        message and an end timestamp. In both paths the session's in-memory
        event queue is dropped afterwards.

        Args:
            sessionId: Identifier of the teamsbot session to terminate.
        """
        from . import interfaceFeatureTeamsbot as interfaceDb

        interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId)

        try:
            # Announce the transition before the (potentially slow) leave call
            # so the operator UI reflects "leaving" immediately.
            interface.updateSession(sessionId, {"status": TeamsbotSessionStatus.LEAVING.value})
            await _emitSessionEvent(sessionId, "statusChange", {"status": "leaving"})

            await self.browserBotConnector.leaveMeeting(sessionId)

            interface.updateSession(sessionId, {
                "status": TeamsbotSessionStatus.ENDED.value,
                "endedAt": getIsoTimestamp(),
            })
            await _emitSessionEvent(sessionId, "statusChange", {"status": "ended"})

            # Generate meeting summary in background
            # NOTE(review): the task handle is not retained, so the task could
            # in principle be garbage-collected before it finishes — confirm
            # summaries reliably persist.
            asyncio.create_task(self._generateMeetingSummary(sessionId))

            logger.info(f"Bot left meeting for session {sessionId}")

        except Exception as e:
            logger.error(f"Error leaving meeting for session {sessionId}: {e}")
            interface.updateSession(sessionId, {
                "status": TeamsbotSessionStatus.ERROR.value,
                "errorMessage": str(e),
                "endedAt": getIsoTimestamp(),
            })

        # Cleanup event queue
        _sessionEvents.pop(sessionId, None)
|
|
|
|
# =========================================================================
|
|
# Browser Bot WebSocket Communication
|
|
# =========================================================================
|
|
|
|
    async def handleBotWebSocket(self, websocket: WebSocket, sessionId: str):
        """
        Main WebSocket handler for Browser Bot communication.

        Runs for the lifetime of the bot's connection: performs one-time
        setup (session context, bot identity, live-service registration,
        persistent director prompts, phrase-pool warm-up), then enters a
        receive loop dispatching on message ``type``.

        Receives:
        - transcript: Caption text scraped from Teams
        - chatMessage: Meeting chat message (live or pre-join history)
        - status: Bot state changes (joined, in_lobby, left, error)
        - audioChunk: base64 PCM audio from WebRTC capture (fed to STT)
        - voiceGreeting / requestGreeting: legacy vs. gateway-owned greeting
        - ping, ttsPlaybackAck, mfaChallenge, mfaResolved, chatSendFailed

        Sends:
        - playAudio: TTS audio for the bot to play in the meeting
        - pong, mfaResponse
        """
        from . import interfaceFeatureTeamsbot as interfaceDb
        from modules.interfaces.interfaceVoiceObjects import getVoiceInterface

        interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId)
        voiceInterface = getVoiceInterface(self.currentUser, self.mandateId)

        # Load session context (user-provided background knowledge)
        # If the context is long (>500 chars), summarize it to reduce token usage
        session = interface.getSession(sessionId)
        if session:
            rawContext = session.get("sessionContext")
            if rawContext and len(rawContext) > 500:
                logger.info(f"Session {sessionId}: Summarizing long session context ({len(rawContext)} chars)...")
                self._sessionContext = await self._summarizeSessionContext(sessionId, rawContext)
            elif rawContext:
                self._sessionContext = rawContext
            if self._sessionContext:
                logger.info(f"Session {sessionId}: Session context ready ({len(self._sessionContext)} chars)")

        # Resolve system bot email for speaker detection (prevents bot from triggering AI on own speech)
        try:
            systemBot = interface.getActiveSystemBot(self.mandateId)
            self._botAccountEmail = systemBot.get("email") if systemBot else None
            if self._botAccountEmail:
                logger.info(f"Session {sessionId}: Bot account email resolved: {self._botAccountEmail}")
        except Exception:
            # Best-effort: without the email, speaker matching falls back to
            # the configured bot name only.
            self._botAccountEmail = None

        # Register the live service so out-of-band callers (director prompts,
        # agent escalation) can deliver text/audio through this same websocket.
        self._activeSessionId = sessionId
        self._websocket = websocket
        self._voiceInterface = voiceInterface
        _activeServices[sessionId] = self

        # Notify the operator UI that the bot's WebSocket is now live so the
        # director-prompt panel can enable its submit button.
        try:
            await _emitSessionEvent(sessionId, "botConnectionState", {
                "connected": True,
                "timestamp": getIsoTimestamp(),
            })
        except Exception:
            pass

        # Restore active persistent director prompts from DB (survives reconnects).
        try:
            self._activePersistentPrompts = interface.getActivePersistentPrompts(sessionId) or []
            if self._activePersistentPrompts:
                logger.info(
                    f"Session {sessionId}: Loaded {len(self._activePersistentPrompts)} active persistent director prompt(s)"
                )
        except Exception as restoreErr:
            logger.warning(f"Session {sessionId}: Could not restore persistent director prompts: {restoreErr}")
            self._activePersistentPrompts = []

        # Pre-warm the ephemeral phrase pool in the background so the first
        # quick-ack ("Moment...") and interim agent notice don't have to wait
        # for the AI round-trip. Best-effort: if generation fails, the
        # corresponding ephemeral cue is silently skipped at runtime — never
        # falls back to hardcoded language strings.
        asyncio.create_task(self._warmEphemeralPhrasePool(sessionId))

        logger.info(f"[WS] Handler started for session {sessionId}")

        try:
            msgCount = 0
            while True:
                data = await websocket.receive()
                msgCount += 1

                # Only JSON text frames carry protocol messages.
                if "text" not in data:
                    logger.debug(f"[WS] session={sessionId} msg #{msgCount}: non-text data (keys: {list(data.keys())})")
                    continue

                message = json.loads(data["text"])
                msgType = message.get("type")
                # High-frequency message types are excluded from info logging.
                if msgType not in ("audioChunk", "ping"):
                    logger.info(f"[WS] session={sessionId} msg #{msgCount}: type={msgType}")

                if msgType == "transcript":
                    transcript = message.get("transcript", {})
                    source = transcript.get("source", "caption")
                    speaker = transcript.get("speaker", "Unknown")
                    textPreview = (transcript.get("text", "") or "")[:60]
                    # Caption/speakerHint: name resolution only; transcript comes from STT
                    logger.info(f"[WS] Transcript (source={source}, speaker={speaker}): {textPreview}...")
                    await self._processTranscript(
                        sessionId=sessionId,
                        speaker=transcript.get("speaker", "Unknown"),
                        text=transcript.get("text", ""),
                        isFinal=transcript.get("isFinal", True),
                        interface=interface,
                        voiceInterface=voiceInterface,
                        websocket=websocket,
                        source=source,
                    )

                elif msgType == "chatMessage":
                    chat = message.get("chat", {})
                    isHistory = chat.get("isHistory", False)
                    source = "chatHistory" if isHistory else "chat"
                    logger.info(
                        f"[WS] Chat{'[HISTORY]' if isHistory else ''}: "
                        f"speaker={chat.get('speaker')}, text={chat.get('text', '')[:60]}..."
                    )
                    await self._processTranscript(
                        sessionId=sessionId,
                        speaker=chat.get("speaker", "Unknown"),
                        text=chat.get("text", ""),
                        isFinal=True,
                        interface=interface,
                        voiceInterface=voiceInterface,
                        websocket=websocket,
                        source=source,
                    )

                elif msgType == "status":
                    status = message.get("status")
                    errorMessage = message.get("message")
                    logger.info(f"[WS] Status: status={status}, message={errorMessage}")
                    await self._handleBotStatus(sessionId, status, errorMessage, interface)

                elif msgType == "audioChunk":
                    audioData = message.get("audio", {})
                    audioBase64 = audioData.get("data", "")
                    sampleRate = audioData.get("sampleRate", 16000)
                    captureDiagnostics = audioData.get("captureDiagnostics") or {}
                    if audioBase64:
                        await self._processAudioChunk(
                            sessionId=sessionId,
                            audioBase64=audioBase64,
                            sampleRate=sampleRate,
                            captureDiagnostics=captureDiagnostics,
                            interface=interface,
                            voiceInterface=voiceInterface,
                            websocket=websocket,
                        )

                elif msgType == "voiceGreeting":
                    # Legacy path: older bot images send a pre-built greeting
                    # text. New bots use ``requestGreeting`` and let the
                    # Gateway own greeting generation.
                    greetingText = message.get("text", "")
                    greetingLang = message.get("language", self.config.language)
                    logger.info(
                        f"[WS] Voice greeting (legacy): text={greetingText[:60]}..., language={greetingLang}"
                    )
                    if greetingText and voiceInterface:
                        await self._dispatchGreetingToMeeting(
                            sessionId=sessionId,
                            greetingText=greetingText,
                            greetingLang=greetingLang,
                            sendToChat=False,
                            interface=interface,
                            voiceInterface=voiceInterface,
                            websocket=websocket,
                        )

                elif msgType == "requestGreeting":
                    # New path: bot just signals "I have joined" — Gateway
                    # generates the greeting text via AI in the configured
                    # language + persona, then dispatches it to BOTH the
                    # meeting chat (sendChatMessage command) and TTS. No
                    # hardcoded language strings on the bot side.
                    requestedLang = (
                        message.get("language") or self.config.language or ""
                    ).strip() or "en-US"
                    botNameHint = (
                        message.get("botName") or self.config.botName or ""
                    ).strip() or self.config.botName
                    logger.info(
                        f"[WS] Greeting request from bot: language={requestedLang}, name={botNameHint}"
                    )
                    if voiceInterface:
                        try:
                            greetingText = await self._generateGreetingText(
                                requestedLang
                            )
                        except Exception as genErr:
                            logger.warning(
                                f"Greeting generation failed for session {sessionId}: {genErr}"
                            )
                            greetingText = ""
                        if greetingText:
                            await self._dispatchGreetingToMeeting(
                                sessionId=sessionId,
                                greetingText=greetingText,
                                greetingLang=requestedLang,
                                sendToChat=True,
                                interface=interface,
                                voiceInterface=voiceInterface,
                                websocket=websocket,
                            )
                        else:
                            logger.warning(
                                f"Session {sessionId}: Skipping greeting — AI generation produced no text"
                            )

                elif msgType == "ping":
                    await websocket.send_text(json.dumps({"type": "pong"}))

                elif msgType == "ttsPlaybackAck":
                    # Playback confirmation from the bot; forwarded to the UI
                    # so operators can track TTS delivery.
                    playback = message.get("playback", {}) or {}
                    status = playback.get("status", "unknown")
                    ackMessage = playback.get("message") or "Bot playback status update"
                    logger.info(
                        f"[WS] TTS playback ack: status={status}, format={playback.get('format')}, "
                        f"bytesBase64={playback.get('bytesBase64')}"
                    )
                    await _emitSessionEvent(sessionId, "ttsDeliveryStatus", {
                        "status": f"playback_{status}",
                        "hasWebSocket": True,
                        "message": ackMessage,
                        "timestamp": playback.get("timestamp") or getIsoTimestamp(),
                        "format": playback.get("format"),
                        "bytesBase64": playback.get("bytesBase64"),
                    })

                elif msgType == "mfaChallenge":
                    # The bot hit an MFA prompt while signing in. Surface it
                    # to the operator UI and wait (in a background task, max
                    # 120s) for the operator's response to forward back.
                    mfaData = message.get("mfa", {})
                    mfaType = mfaData.get("type", "unknown")
                    displayNumber = mfaData.get("displayNumber")
                    prompt = mfaData.get("prompt", "")
                    logger.info(f"[WS] MFA challenge: type={mfaType}, number={displayNumber}, prompt={prompt[:60]}")

                    await _emitSessionEvent(sessionId, "mfaChallenge", {
                        "mfaType": mfaType,
                        "displayNumber": displayNumber,
                        "prompt": prompt,
                        "timestamp": getIsoTimestamp(),
                    })

                    from .routeFeatureTeamsbot import _mfaCodeQueues, _mfaWaitTasks
                    mfaQueue = asyncio.Queue()
                    _mfaCodeQueues[sessionId] = mfaQueue

                    async def _waitAndForwardMfa(sid, queue, ws):
                        # Bridge: operator response (via route handler queue)
                        # -> bot websocket. Cancelled if the bot reports the
                        # challenge resolved on its own (mfaResolved).
                        try:
                            mfaResponse = await asyncio.wait_for(queue.get(), timeout=120.0)
                            logger.info(f"[WS] MFA response received for session {sid}: action={mfaResponse.get('action')}")
                            await ws.send_text(json.dumps({
                                "type": "mfaResponse",
                                "sessionId": sid,
                                "mfa": mfaResponse,
                            }))
                        except asyncio.TimeoutError:
                            logger.warning(f"[WS] MFA response timeout for session {sid}")
                            await ws.send_text(json.dumps({
                                "type": "mfaResponse",
                                "sessionId": sid,
                                "mfa": {"action": "timeout"},
                            }))
                            await _emitSessionEvent(sid, "mfaChallenge", {
                                "mfaType": "timeout",
                                "prompt": "MFA-Zeitlimit ueberschritten. Bitte erneut versuchen.",
                            })
                        except asyncio.CancelledError:
                            logger.info(f"[WS] MFA wait cancelled for session {sid} (resolved via page)")
                        finally:
                            # Always drop the per-session queue/task entries.
                            _mfaCodeQueues.pop(sid, None)
                            _mfaWaitTasks.pop(sid, None)

                    _mfaWaitTasks[sessionId] = asyncio.create_task(
                        _waitAndForwardMfa(sessionId, mfaQueue, websocket)
                    )

                elif msgType == "chatSendFailed":
                    errorData = message.get("error", {})
                    reason = errorData.get("reason", "unknown")
                    failedText = errorData.get("text", "")
                    logger.warning(
                        f"[WS] Chat send failed for session {sessionId}: "
                        f"reason={reason}, text={failedText[:60]}"
                    )
                    await _emitSessionEvent(sessionId, "chatSendFailed", {
                        "reason": reason,
                        "message": errorData.get("message", "Chat message could not be sent"),
                        "text": failedText,
                        "timestamp": getIsoTimestamp(),
                    })

                elif msgType == "mfaResolved":
                    # Challenge resolved on the bot page itself — cancel any
                    # pending operator wait task and clean up its queue.
                    success = message.get("success", False)
                    logger.info(f"[WS] MFA resolved: success={success}")
                    from .routeFeatureTeamsbot import _mfaCodeQueues, _mfaWaitTasks
                    task = _mfaWaitTasks.pop(sessionId, None)
                    if task and not task.done():
                        task.cancel()
                    _mfaCodeQueues.pop(sessionId, None)
                    await _emitSessionEvent(sessionId, "mfaResolved", {
                        "success": success,
                        "timestamp": getIsoTimestamp(),
                    })

        except Exception as e:
            # WebSocket disconnects surface as exceptions here; only log
            # genuinely unexpected errors.
            if "disconnect" not in str(e).lower():
                logger.error(f"[WS] Error for session {sessionId}: {type(e).__name__}: {e}")
        finally:
            # Deregister the live service, but only if this handler instance
            # is still the registered one (a reconnect may have replaced it).
            if _activeServices.get(sessionId) is self:
                _activeServices.pop(sessionId, None)
            self._websocket = None
            self._voiceInterface = None
            self._activeSessionId = None
            try:
                await _emitSessionEvent(sessionId, "botConnectionState", {
                    "connected": False,
                    "timestamp": getIsoTimestamp(),
                })
            except Exception:
                pass

        logger.info(f"[WS] Handler ended for session {sessionId} after {msgCount} messages")
|
|
|
|
async def _handleBotStatus(
|
|
self,
|
|
sessionId: str,
|
|
status: str,
|
|
errorMessage: Optional[str],
|
|
interface,
|
|
):
|
|
"""Handle status updates from the browser bot."""
|
|
logger.info(f"Bot status update for session {sessionId}: {status}")
|
|
|
|
statusMap = {
|
|
"connecting": TeamsbotSessionStatus.JOINING.value,
|
|
"launching": TeamsbotSessionStatus.JOINING.value,
|
|
"navigating": TeamsbotSessionStatus.JOINING.value,
|
|
"in_lobby": TeamsbotSessionStatus.JOINING.value,
|
|
"joined": TeamsbotSessionStatus.ACTIVE.value,
|
|
"in_meeting": TeamsbotSessionStatus.ACTIVE.value,
|
|
"left": TeamsbotSessionStatus.ENDED.value,
|
|
"error": TeamsbotSessionStatus.ERROR.value,
|
|
}
|
|
|
|
dbStatus = statusMap.get(status, TeamsbotSessionStatus.ACTIVE.value)
|
|
|
|
updates = {"status": dbStatus}
|
|
if errorMessage:
|
|
updates["errorMessage"] = errorMessage
|
|
if dbStatus == TeamsbotSessionStatus.ACTIVE.value:
|
|
updates["startedAt"] = getIsoTimestamp()
|
|
elif dbStatus in [TeamsbotSessionStatus.ENDED.value, TeamsbotSessionStatus.ERROR.value]:
|
|
updates["endedAt"] = getIsoTimestamp()
|
|
|
|
interface.updateSession(sessionId, updates)
|
|
await _emitSessionEvent(sessionId, "statusChange", {"status": status, "errorMessage": errorMessage})
|
|
|
|
# Generate summary when session ends
|
|
if dbStatus == TeamsbotSessionStatus.ENDED.value:
|
|
asyncio.create_task(self._generateMeetingSummary(sessionId))
|
|
|
|
async def _processAudioChunk(
|
|
self,
|
|
sessionId: str,
|
|
audioBase64: str,
|
|
sampleRate: int,
|
|
captureDiagnostics: Optional[Dict[str, Any]],
|
|
interface,
|
|
voiceInterface,
|
|
websocket: WebSocket,
|
|
):
|
|
"""Process an audio chunk from WebRTC capture — run STT and feed into transcript pipeline."""
|
|
import base64
|
|
try:
|
|
audioBytes = base64.b64decode(audioBase64)
|
|
if len(audioBytes) < 1000:
|
|
return
|
|
|
|
if captureDiagnostics:
|
|
trackId = captureDiagnostics.get("trackId")
|
|
readyState = captureDiagnostics.get("readyState")
|
|
rms = captureDiagnostics.get("rms")
|
|
nativeSampleRate = captureDiagnostics.get("nativeSampleRate")
|
|
logger.debug(
|
|
f"[AudioChunk] diagnostics: track={trackId}, readyState={readyState}, "
|
|
f"rms={rms}, nativeRate={nativeSampleRate}, bytes={len(audioBytes)}"
|
|
)
|
|
|
|
# Use RMS from capture diagnostics to skip real silence.
|
|
# Byte-variation heuristics produced false positives and dropped valid speech.
|
|
if captureDiagnostics and captureDiagnostics.get("rms") is not None:
|
|
try:
|
|
rmsVal = float(captureDiagnostics.get("rms"))
|
|
if rmsVal < 0.0003:
|
|
logger.debug(f"[AudioChunk] Skipping silent audio ({len(audioBytes)} bytes, rms={rmsVal:.6f})")
|
|
return
|
|
except Exception:
|
|
pass
|
|
|
|
if not voiceInterface:
|
|
logger.warning(f"[AudioChunk] No voice interface available for session {sessionId}")
|
|
return
|
|
|
|
# Treat sampleRate=0 as unknown (triggers auto-detection)
|
|
effectiveSampleRate = sampleRate if sampleRate and sampleRate > 0 else None
|
|
|
|
phraseHints = list(self._knownSpeakers)
|
|
if self.config.botName:
|
|
phraseHints.append(self.config.botName)
|
|
|
|
sttResult = await voiceInterface.speechToText(
|
|
audioContent=audioBytes,
|
|
language=self.config.language or "de-DE",
|
|
sampleRate=effectiveSampleRate,
|
|
channels=1,
|
|
skipFallbacks=True,
|
|
phraseHints=phraseHints if phraseHints else None,
|
|
alternativeLanguages=["en-US"],
|
|
)
|
|
|
|
if sttResult and sttResult.get("success") and sttResult.get("text"):
|
|
text = sttResult["text"].strip()
|
|
if text:
|
|
resolvedSpeaker = self._resolveSpeakerForAudioCapture()
|
|
fromCaption = resolvedSpeaker.get("speakerResolvedFromHint", False)
|
|
logger.info(
|
|
f"[AudioChunk] STT result: speaker={resolvedSpeaker.get('speaker', 'Meeting Audio')} "
|
|
f"(fromCaption={fromCaption}), text={text[:80]}..."
|
|
)
|
|
await self._processTranscript(
|
|
sessionId=sessionId,
|
|
speaker=resolvedSpeaker["speaker"],
|
|
text=text,
|
|
isFinal=True,
|
|
interface=interface,
|
|
voiceInterface=voiceInterface,
|
|
websocket=websocket,
|
|
source="audioCapture",
|
|
speakerResolvedFromHint=resolvedSpeaker["speakerResolvedFromHint"],
|
|
)
|
|
except Exception as e:
|
|
logger.error(f"[AudioChunk] STT error for session {sessionId}: {type(e).__name__}: {e}")
|
|
|
|
def _registerSpeakerHint(self, speaker: str, text: str, sessionId: str = ""):
|
|
"""Track current speaker from captions for STT attribution.
|
|
When the first non-bot caption arrives, retroactively attributes
|
|
any STT segments that were created before a speaker was known."""
|
|
if not speaker:
|
|
return
|
|
normalizedSpeaker = speaker.strip()
|
|
if not normalizedSpeaker or self._isBotSpeaker(normalizedSpeaker):
|
|
return
|
|
|
|
prevSpeaker = self._lastCaptionSpeaker
|
|
self._lastCaptionSpeaker = normalizedSpeaker
|
|
self._knownSpeakers.add(normalizedSpeaker)
|
|
|
|
if prevSpeaker is None and self._unattributedTranscriptIds:
|
|
from . import interfaceFeatureTeamsbot as interfaceDb
|
|
interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId)
|
|
for tid in self._unattributedTranscriptIds:
|
|
interface.updateTranscript(tid, {"speaker": normalizedSpeaker})
|
|
for seg in self._contextBuffer:
|
|
if seg.get("speaker") == "Unknown" and seg.get("source") == "audioCapture":
|
|
seg["speaker"] = normalizedSpeaker
|
|
if self._lastTranscriptSpeaker == "Unknown":
|
|
self._lastTranscriptSpeaker = normalizedSpeaker
|
|
logger.info(
|
|
f"Session {sessionId}: Retroactive speaker attribution: "
|
|
f"{len(self._unattributedTranscriptIds)} segments -> {normalizedSpeaker}"
|
|
)
|
|
self._unattributedTranscriptIds.clear()
|
|
|
|
if self._pendingNameTrigger:
|
|
self._pendingNameTrigger["lastActivity"] = time.time()
|
|
|
|
def _resolveSpeakerForAudioCapture(self) -> Dict[str, Any]:
|
|
"""Speaker name for audio chunks — uses the last caption speaker."""
|
|
if self._lastCaptionSpeaker:
|
|
return {"speaker": self._lastCaptionSpeaker, "speakerResolvedFromHint": True}
|
|
return {"speaker": "Unknown", "speakerResolvedFromHint": False}
|
|
|
|
    async def _processTranscript(
        self,
        sessionId: str,
        speaker: str,
        text: str,
        isFinal: bool,
        interface,
        voiceInterface,
        websocket: WebSocket,
        source: str = "caption",
        speakerResolvedFromHint: Optional[bool] = None,
    ):
        """Process a transcript segment from captions or chat messages.

        Differential writing: When the same speaker continues (text grows
        incrementally as captions stream), we UPDATE the existing DB record
        instead of creating a cascade of near-duplicate rows. A new record
        is only created when the speaker changes or the text is not a
        continuation of the previous segment.

        Args:
            sessionId: Session the segment belongs to.
            speaker: Display name of the speaker ("Unknown" if unresolved).
            text: Segment text; leading/trailing whitespace stripped here.
            isFinal: Whether the segment is final (non-final segments never
                trigger AI analysis).
            interface: Teamsbot DB interface for transcript persistence.
            voiceInterface: STT/TTS interface (forwarded to trigger paths).
            websocket: Live bot websocket (forwarded to trigger paths).
            source: One of "caption", "speakerHint", "chat", "chatHistory",
                "audioCapture" — controls routing below.
            speakerResolvedFromHint: Whether the speaker was derived from a
                caption hint (audioCapture path); surfaced to the UI event.
        """

        text = text.strip()
        if not text:
            return

        # Captions are used ONLY for speaker name resolution (never as transcript).
        # Transcript text comes exclusively from audio STT or chat.
        # Address detection (bot name in caption) still triggers AI analysis
        # using existing audio-based context — but caption text itself is NOT
        # added to the context buffer.
        if source in ("caption", "speakerHint"):
            self._registerSpeakerHint(speaker, text, sessionId)

            if (
                source == "speakerHint"
                and isFinal
                and not self._isBotSpeaker(speaker)
                and self.config.responseMode != TeamsbotResponseMode.TRANSCRIBE_ONLY
                and self._detectBotName(text)
            ):
                # Caption-based trigger has no DB row — id stays None.
                triggerTranscript = {"id": None, "speaker": speaker, "text": text, "source": source}
                isNew = self._setPendingNameTrigger(sessionId, interface, voiceInterface, websocket, triggerTranscript)
                if isNew:
                    logger.info(f"Session {sessionId}: Bot name in caption, debounce trigger started")
                    asyncio.create_task(self._checkPendingNameTrigger())
                    # Fire a short audible "Moment..." in parallel so the
                    # speaker hears the bot react immediately, instead of
                    # waiting for debounce + SPEECH_TEAMS + agent (~5-30s).
                    self._currentQuickAckTask = asyncio.create_task(
                        self._runQuickAck(sessionId)
                    )
            return

        # Chat history: messages sent before the bot joined the meeting.
        # Stored in DB for reference but NOT added to the AI context buffer,
        # because old messages (e.g. "nyla, summarize the protocol") would
        # be treated as current requests when AI analysis is triggered.
        if source == "chatHistory":
            transcriptData = TeamsbotTranscript(
                sessionId=sessionId,
                speaker=speaker,
                text=text,
                timestamp=getIsoTimestamp(),
                confidence=1.0,
                language=self.config.language,
                isFinal=True,
                source="chatHistory",
            ).model_dump()
            createdTranscript = interface.createTranscript(transcriptData)

            await _emitSessionEvent(sessionId, "transcript", {
                "id": createdTranscript.get("id"),
                "speaker": speaker,
                "text": text,
                "confidence": 1.0,
                "timestamp": getIsoTimestamp(),
                "isContinuation": False,
                "source": "chatHistory",
                "isHistory": True,
            })
            logger.debug(f"Session {sessionId}: Chat history stored (no AI trigger): [{speaker}] {text[:60]}")
            return

        # Filter out the bot's own speech (caption/audioCapture) — garbled text
        # pollutes context. Chat from the bot is clean text and must appear in
        # the transcript for all participants.
        isBotSpeaker = self._isBotSpeaker(speaker)
        if isBotSpeaker and source != "chat":
            logger.debug(f"Session {sessionId}: Ignoring own bot caption from: [{speaker}] {text[:80]}...")
            return

        # Differential transcript writing:
        # audioCapture from same speaker → append text (merge STT chunks into one block)
        # Start a new block after a pause (>5s gap between STT results)
        sttPauseThreshold = 5.0
        isMerge = (
            source == "audioCapture"
            and self._lastTranscriptSpeaker == speaker
            and self._lastTranscriptText is not None
            and self._lastTranscriptId is not None
            and (time.time() - self._lastSttTime) < sttPauseThreshold
        )

        if isMerge:
            # Extend the previous DB row and the in-memory context entry
            # instead of creating a new segment.
            mergedText = f"{self._lastTranscriptText} {text}"
            interface.updateTranscript(self._lastTranscriptId, {
                "text": mergedText,
                "isFinal": isFinal,
            })
            self._lastTranscriptText = mergedText
            createdTranscript = {"id": self._lastTranscriptId}

            if self._contextBuffer and self._contextBuffer[-1].get("speaker") == speaker:
                self._contextBuffer[-1]["text"] = mergedText
        else:
            transcriptData = TeamsbotTranscript(
                sessionId=sessionId,
                speaker=speaker,
                text=text,
                timestamp=getIsoTimestamp(),
                confidence=1.0,
                language=self.config.language,
                isFinal=isFinal,
                source=source,
            ).model_dump()

            createdTranscript = interface.createTranscript(transcriptData)

            self._lastTranscriptSpeaker = speaker
            self._lastTranscriptText = text
            self._lastTranscriptId = createdTranscript.get("id")

            # Remember unattributed STT rows so the first caption can
            # retroactively set their speaker (see _registerSpeakerHint).
            if source == "audioCapture" and speaker == "Unknown":
                self._unattributedTranscriptIds.append(createdTranscript.get("id"))

            self._contextBuffer.append({
                "speaker": speaker or "Unknown",
                "text": text,
                "timestamp": getUtcTimestamp(),
                "source": source,
            })

            # Keep the context buffer bounded; once it overflows well past
            # the window (1.5x) without a summary, summarize in background.
            maxSegments = self.config.contextWindowSegments
            if len(self._contextBuffer) > maxSegments:
                if not self._contextSummary and len(self._contextBuffer) > maxSegments * 1.5:
                    asyncio.create_task(self._summarizeContextBuffer(sessionId))
                self._contextBuffer = self._contextBuffer[-maxSegments:]

            session = interface.getSession(sessionId)
            if session:
                count = session.get("transcriptSegmentCount", 0) + 1
                interface.updateSession(sessionId, {"transcriptSegmentCount": count})

        if source == "audioCapture":
            # Timestamp of the last STT result — drives the merge-pause logic.
            self._lastSttTime = time.time()

        # UI event always shows the full (possibly merged) text.
        displayText = self._lastTranscriptText if isMerge else text
        await _emitSessionEvent(sessionId, "transcript", {
            "id": createdTranscript.get("id"),
            "speaker": speaker,
            "text": displayText,
            "confidence": 1.0,
            "timestamp": getIsoTimestamp(),
            "isContinuation": isMerge,
            "source": source,
            "speakerResolvedFromHint": (
                speakerResolvedFromHint
                if speakerResolvedFromHint is not None
                else False
            ),
        })

        if not isFinal:
            return

        if self.config.responseMode == TeamsbotResponseMode.TRANSCRIBE_ONLY:
            return

        # Bot's own chat: stored for display only, never trigger AI
        if source == "chat" and isBotSpeaker:
            return

        # Stop phrases: HARD STOP, no AI round-trip. We previously routed
        # this through ``_analyzeAndRespond`` which spent 1-2 seconds in
        # the speech LLM just to classify the intent, during which the
        # current TTS kept playing — and the LLM round-trip would also
        # produce yet another response that joined the queue. The new
        # path goes straight to the browser bot's audio cancel and
        # invalidates everything else in flight.
        if self._isStopPhrase(text):
            logger.info(
                f"Session {sessionId}: Stop phrase detected ('{text.strip()[:60]}'), "
                f"hard-cancelling in-flight speech immediately"
            )
            await self._cancelInFlightSpeech(
                sessionId=sessionId,
                websocket=websocket,
                reason="userStopPhrase",
            )
            return

        # Update activity for any pending debounced trigger
        if self._pendingNameTrigger:
            self._pendingNameTrigger["lastActivity"] = time.time()

        # Bot name detection → debounced trigger (wait for speaker to finish)
        if self._detectBotName(text):
            isNew = self._setPendingNameTrigger(sessionId, interface, voiceInterface, websocket, createdTranscript)
            if isNew:
                asyncio.create_task(self._checkPendingNameTrigger())
                # Audible early-feedback ack ("Moment...") in parallel — runs
                # while we still wait the debounce window and SPEECH_TEAMS
                # decides what to actually answer.
                self._currentQuickAckTask = asyncio.create_task(
                    self._runQuickAck(sessionId)
                )
            return

        # Follow-up window: after a bot response, trigger AI for any human speech
        # without requiring the bot name — the AI decides via shouldRespond
        if (
            source == "audioCapture"
            and not self._isBotSpeaker(speaker)
            and time.time() < self._followUpWindowEnd
            and not self._pendingNameTrigger
        ):
            isNew = self._setPendingNameTrigger(sessionId, interface, voiceInterface, websocket, createdTranscript)
            if isNew:
                logger.info(f"Session {sessionId}: Follow-up window trigger (no name needed)")
                asyncio.create_task(self._checkPendingNameTrigger())
            return

        # Periodic trigger (only when no debounce pending)
        if not self._pendingNameTrigger:
            shouldTrigger = self._shouldTriggerAnalysis(text)
            if shouldTrigger:
                logger.info(f"Session {sessionId}: Periodic trigger (buffer: {len(self._contextBuffer)} segments)")
                await self._analyzeAndRespond(sessionId, interface, voiceInterface, websocket, createdTranscript)
|
|
|
|
def _isBotSpeaker(self, speaker: str) -> bool:
|
|
"""Check if a transcript speaker is the bot itself.
|
|
|
|
Teams captions show the bot as e.g. "BotName (Unverified)" or
|
|
"Nyla Larsson" depending on auth/anonymous join. We match against:
|
|
- The configured/derived bot name
|
|
- The bot account display name if authenticated
|
|
"""
|
|
if not speaker:
|
|
return False
|
|
|
|
speakerLower = speaker.lower().strip()
|
|
|
|
# Match against configured bot name
|
|
botName = self.config.botName.lower().strip()
|
|
if botName and botName in speakerLower:
|
|
return True
|
|
|
|
# Match against bot account email prefix (e.g. "nyla.larsson" from "nyla.larsson@poweron.swiss")
|
|
botAccountEmail = getattr(self, '_botAccountEmail', None) or getattr(self.config, 'botAccountEmail', None)
|
|
if botAccountEmail:
|
|
emailPrefix = botAccountEmail.split("@")[0].lower().replace(".", " ")
|
|
if emailPrefix in speakerLower:
|
|
return True
|
|
|
|
return False
|
|
|
|
def _shouldTriggerAnalysis(self, transcriptText: str, allowPeriodic: bool = True) -> bool:
|
|
"""
|
|
Decide whether to trigger AI analysis based on the latest transcript.
|
|
Bot name detection is handled separately via debounce.
|
|
This method only checks periodic/cooldown triggers.
|
|
"""
|
|
now = time.time()
|
|
timeSinceLastCall = now - self._lastAiCallTime
|
|
|
|
if timeSinceLastCall < self.config.triggerCooldownSeconds:
|
|
return False
|
|
|
|
if allowPeriodic and timeSinceLastCall >= self.config.triggerIntervalSeconds:
|
|
logger.info(f"Trigger: Periodic interval ({self.config.triggerIntervalSeconds}s) elapsed ({timeSinceLastCall:.1f}s)")
|
|
return True
|
|
|
|
return False
|
|
|
|
def _isStopPhrase(self, text: str) -> bool:
|
|
"""Check if text is an immediate-cancel command from the meeting.
|
|
|
|
Recognised intents (any language we hear in practice):
|
|
* Hard stop: stop / stopp / halt / ruhe / stille / arrete / quiet / shut
|
|
* Pause / wait: warte / wait / moment / pause / hold (hold on)
|
|
* Silence: sei still / be quiet / shut up / aufhoeren / aufhören / silence
|
|
Hits trigger the direct stop pipeline in ``_cancelInFlightSpeech``:
|
|
kill TTS, invalidate pending generations, clear name-trigger debounce.
|
|
Critically: NO new AI call is fired — the user explicitly asked the
|
|
bot to be quiet, so the worst thing we could do is generate yet
|
|
another response on top of the one we just cancelled.
|
|
"""
|
|
if not text or len(text.strip()) < 2:
|
|
return False
|
|
t = text.strip().lower()
|
|
words = [w.strip(".,!?:;\"'()[]") for w in t.split() if w.strip()]
|
|
wordSet = set(words)
|
|
stopWords = {
|
|
# Hard-stop verbs
|
|
"stop", "stopp", "halt", "ruhe", "stille", "schweig",
|
|
"arrete", "quiet", "shut", "silence",
|
|
# Pause / wait verbs (still "be quiet now" semantics)
|
|
"warte", "wait", "moment", "pause",
|
|
}
|
|
if wordSet & stopWords:
|
|
return True
|
|
if (
|
|
"sei still" in t
|
|
or "be quiet" in t
|
|
or "shut up" in t
|
|
or "hold on" in t
|
|
or "aufhoeren" in t
|
|
or "aufhören" in t
|
|
):
|
|
return True
|
|
return False
|
|
|
|
def _makeAnswerCancelHook(self) -> Callable[[], bool]:
|
|
"""Capture the current ``_answerGenerationCounter`` and return a
|
|
zero-arg predicate that returns ``True`` once a hard stop (or any
|
|
future "supersede this answer" event) has bumped the counter.
|
|
|
|
Pass the returned predicate as ``isCancelled`` into
|
|
``_speakTextChunked`` so a multi-chunk dispatch can bail out
|
|
between chunks instead of speaking a 30-second answer to the end.
|
|
"""
|
|
snapshot = self._answerGenerationCounter
|
|
return lambda: self._answerGenerationCounter != snapshot
|
|
|
|
async def _cancelInFlightSpeech(
    self,
    sessionId: str,
    websocket: Optional[WebSocket],
    reason: str,
) -> None:
    """Hard stop everything the bot is currently doing in the meeting.

    Pipeline (ALL synchronous from the caller's point of view, no AI
    round-trips):

    1. Bump ``_answerGenerationCounter`` so any in-flight TTS chunk
       loop, agent escalation or quick-ack drops its remaining work
       the moment it next checks the counter.
    2. Clear ``_pendingNameTrigger`` so a debounced "speaker just said
       the bot name" trigger that was queued before the stop word
       cannot wake up 3 seconds later and answer anyway.
    3. Cancel tracked background tasks (escalation, quick-ack). The
       tasks themselves swallow ``CancelledError`` in their finally
       block.
    4. Send ``{"type":"stopAudio"}`` to the browser bot — it stops the
       current playback in the AudioContext and clears its play queue
       so nothing buffered comes through afterwards.

    Deliberately does NOT generate a new response. The user just told
    the bot to be quiet; producing a "Okay, ich bin still" reply on
    top would be the exact opposite of what was asked for.

    :param sessionId: session this stop applies to (log correlation).
    :param websocket: browser-bot socket; ``None`` skips the stopAudio send.
    :param reason: short label for logs and the emitted event.
    """
    # (1) Invalidate all in-flight generations: cancel hooks created via
    # _makeAnswerCancelHook() compare against their snapshot and bail.
    self._answerGenerationCounter += 1
    gen = self._answerGenerationCounter
    logger.info(
        f"Session {sessionId}: Cancelling in-flight speech "
        f"(reason={reason}, gen={gen})"
    )

    # (2) Drop a debounced name trigger queued before the stop word.
    if self._pendingNameTrigger:
        logger.info(
            f"Session {sessionId}: Dropping pending debounced name "
            f"trigger (was queued before stop)"
        )
        self._pendingNameTrigger = None

    # (3) Cancel tracked background tasks; attribute may be absent on
    # older sessions, hence getattr with default.
    for taskAttr in ("_currentEscalationTask", "_currentQuickAckTask"):
        task = getattr(self, taskAttr, None)
        if task is not None and not task.done():
            logger.info(
                f"Session {sessionId}: Cancelling background task "
                f"{taskAttr}"
            )
            task.cancel()

    # (4) Tell the browser bot to halt playback and flush its queue.
    # Best-effort: a dead socket must not break the rest of the stop.
    if websocket is not None:
        try:
            await websocket.send_text(json.dumps({
                "type": "stopAudio",
                "sessionId": sessionId,
                "reason": reason,
            }))
        except Exception as stopErr:
            logger.warning(
                f"Session {sessionId}: Failed to send stopAudio to "
                f"browser bot: {stopErr}"
            )

    # (5) Debug/UI event; deliberately swallowed — the stop itself has
    # already taken effect by this point.
    try:
        await _emitSessionEvent(sessionId, "speechCancelled", {
            "reason": reason,
            "generation": gen,
            "timestamp": getIsoTimestamp(),
        })
    except Exception:
        pass
|
|
|
|
def _detectBotName(self, text: str) -> bool:
|
|
"""Check if text contains the bot's name (exact or phonetically similar)."""
|
|
botNameLower = self.config.botName.lower()
|
|
textLower = text.lower()
|
|
|
|
if botNameLower in textLower:
|
|
return True
|
|
|
|
botFirstName = botNameLower.split()[0] if " " in botNameLower else botNameLower
|
|
if len(botFirstName) >= 3:
|
|
for word in textLower.split():
|
|
cleanWord = word.strip(".,!?:;\"'()[]")
|
|
if not cleanWord or len(cleanWord) < 3:
|
|
continue
|
|
if cleanWord == botFirstName:
|
|
return True
|
|
if cleanWord[0] == botFirstName[0] and abs(len(cleanWord) - len(botFirstName)) <= 2:
|
|
common = sum(1 for c in set(botFirstName) if c in cleanWord)
|
|
similarity = common / max(len(set(botFirstName)), len(set(cleanWord)))
|
|
if similarity >= 0.6:
|
|
return True
|
|
return False
|
|
|
|
def _setPendingNameTrigger(self, sessionId, interface, voiceInterface, websocket, triggerTranscript) -> bool:
|
|
"""Set or update a debounced name trigger. Returns True if newly set."""
|
|
if self._pendingNameTrigger:
|
|
self._pendingNameTrigger["lastActivity"] = time.time()
|
|
return False
|
|
self._pendingNameTrigger = {
|
|
"sessionId": sessionId,
|
|
"interface": interface,
|
|
"voiceInterface": voiceInterface,
|
|
"websocket": websocket,
|
|
"triggerTranscript": triggerTranscript,
|
|
"detectedAt": time.time(),
|
|
"lastActivity": time.time(),
|
|
}
|
|
return True
|
|
|
|
async def _warmEphemeralPhrasePool(self, sessionId: str) -> None:
    """Fire-and-forget background task: generate the ephemeral phrase
    pool for every kind defined in ``_EPHEMERAL_PHRASE_INTENTS`` so the
    first quick-ack / interim notice doesn't pay the AI round-trip
    latency at runtime. Failures are logged but never raised — the
    runtime selectors handle empty pools by silently skipping the cue.

    :param sessionId: session identifier, used for log correlation only.
    """
    try:
        for kind in _EPHEMERAL_PHRASE_INTENTS:
            try:
                # _getEphemeralPhrases caches per kind, so this call
                # simply pre-fills the cache the runtime selectors read.
                await self._getEphemeralPhrases(kind)
            except Exception as innerErr:
                # Per-kind failure: log and continue with the next kind.
                logger.warning(
                    f"Session {sessionId}: Phrase pool warmup failed for "
                    f"kind={kind}: {innerErr}"
                )
    except Exception as warmErr:
        # Defensive outer net — warmup must never crash the session.
        logger.warning(
            f"Session {sessionId}: Phrase pool warmup task crashed: {warmErr}"
        )
|
|
|
|
# ---------------------------------------------------------------- Voice
|
|
# When the bot's full answer is a long structured chat post (markdown
|
|
# tables, bullet lists, headings, multi-paragraph) we MUST NOT read it
|
|
# out verbatim into the meeting — even after sanitisation it sounds
|
|
# like a wall of text and easily takes 5+ minutes. The chat keeps the
|
|
# full answer; the audio path goes through ``_summarizeForVoice`` which
|
|
# asks the AI for a 1-3 sentence spoken paraphrase in the configured
|
|
# bot persona / language.
|
|
|
|
# Threshold: anything longer than this many characters (after sanitise)
|
|
# OR any answer whose source contains markdown structure (tables /
|
|
# multiple bullets / multiple headings) gets condensed before TTS.
|
|
# Used by _summarizeForVoice: answers at or below this length after
# sanitisation (and without markdown structure) are spoken verbatim.
_VOICE_DIRECT_MAX_CHARS: int = 600
# Character ceiling requested from the AI for the condensed spoken summary.
_VOICE_SUMMARY_MAX_CHARS: int = 350
|
|
|
|
@staticmethod
|
|
def _looksLikeStructuredText(raw: str) -> bool:
|
|
"""Heuristic: does the original answer have markdown structure that
|
|
would be miserable to listen to verbatim? Used to trigger the
|
|
AI summary path even when the sanitised text is short enough."""
|
|
if not raw:
|
|
return False
|
|
if raw.count("|") >= 4: # at least one markdown table row
|
|
return True
|
|
if raw.count("\n#") >= 1: # at least one heading after newline
|
|
return True
|
|
if raw.count("\n- ") + raw.count("\n* ") + raw.count("\n• ") >= 3:
|
|
return True # 3+ bullets → list-like
|
|
if re.search(r"\n\d+[\.\)]\s", raw): # numbered list
|
|
count = len(re.findall(r"(?m)^\s*\d+[\.\)]\s", raw))
|
|
if count >= 3:
|
|
return True
|
|
return False
|
|
|
|
async def _summarizeForVoice(
    self,
    sessionId: str,
    rawAnswer: str,
) -> str:
    """Return a SHORT, naturally-spoken paraphrase of ``rawAnswer`` for
    TTS playback. Falls back to the sanitised + truncated original if
    the AI call fails — never blocks the response.

    The chat / DB / UI keep the original ``rawAnswer`` untouched. Only
    the voice channel goes through this condensation.

    :param sessionId: session identifier, used for logging only.
    :param rawAnswer: full written answer (possibly long markdown).
    :return: spoken-style text; empty string for a blank input.
    """
    if not rawAnswer or not rawAnswer.strip():
        return ""

    # Sanitise markdown into speakable text first; the cheap path below
    # can use it directly without any AI round-trip.
    sanitised = _voiceFriendlyMeetingText(rawAnswer)
    # Short + unstructured → speak as-is, no AI round-trip
    if (
        len(sanitised) <= self._VOICE_DIRECT_MAX_CHARS
        and not self._looksLikeStructuredText(rawAnswer)
    ):
        return sanitised

    # Build the condensation prompt from session config: target
    # language, bot name, and the configured persona (if any).
    targetLang = (self.config.language or "de-DE").strip()
    botName = (self.config.botName or "").strip() or "the assistant"
    persona = (self.config.aiSystemPrompt or "").strip()
    personaBlock = (
        f"\n\nBOT PERSONA / TONE:\n{persona}\n"
        if persona else ""
    )

    prompt = (
        f"You are condensing a long written answer into a SHORT spoken "
        f"paraphrase that the assistant '{botName}' will say out loud "
        f"into a Microsoft Teams meeting. The full written answer is "
        f"already in the meeting chat — your job is to summarise it for "
        f"the EAR, not the eye.\n\n"
        f"STRICT REQUIREMENTS:\n"
        f"1. Output language: BCP-47 '{targetLang}'. No other language.\n"
        f"2. 1 to 3 sentences, max ~{self._VOICE_SUMMARY_MAX_CHARS} characters total.\n"
        f"3. Natural spoken style — no headings, no bullet points, no "
        f"tables, no markdown, no emojis, no enumerations like 'Erstens... "
        f"Zweitens...' unless that genuinely flows in speech.\n"
        f"4. Capture the essence and the most important conclusion. Do "
        f"NOT try to fit every detail. Listeners can read the chat for "
        f"the full version.\n"
        f"5. End by gently pointing the audience to the chat for details, "
        f"e.g. 'Details stehen im Chat.' (adapted to the target language).\n"
        f"6. Output ONLY the spoken text. No JSON, no quotes around it, "
        f"no preamble like 'Here is the summary:'.\n"
        f"{personaBlock}\n"
        f"FULL WRITTEN ANSWER (markdown-formatted, sometimes long):\n"
        f"---\n{rawAnswer.strip()[:6000]}\n---\n"
    )

    # SPEED-prioritised AI call; any failure falls back to the
    # truncated sanitised text so the voice channel is never blocked.
    try:
        aiService = _createAiService(
            self.currentUser, self.mandateId, self.instanceId
        )
        await aiService.ensureAiObjectsInitialized()
        request = AiCallRequest(
            prompt=prompt,
            context="",
            options=AiCallOptions(
                operationType=OperationTypeEnum.DATA_ANALYSE,
                priority=PriorityEnum.SPEED,
            ),
        )
        response = await aiService.callAi(request)
    except Exception as aiErr:
        logger.warning(
            f"Session {sessionId}: Voice summary AI call failed: {aiErr}"
        )
        return sanitised[: self._VOICE_DIRECT_MAX_CHARS]

    if not response or response.errorCount != 0 or not response.content:
        logger.warning(
            f"Session {sessionId}: Voice summary returned empty/error"
        )
        return sanitised[: self._VOICE_DIRECT_MAX_CHARS]

    spoken = response.content.strip()
    # Defensive sanitiser pass — the model usually obeys the
    # "no markdown" instruction but not always.
    spoken = _voiceFriendlyMeetingText(spoken)
    if not spoken:
        return sanitised[: self._VOICE_DIRECT_MAX_CHARS]

    logger.info(
        f"Session {sessionId}: Voice summary generated "
        f"(orig={len(rawAnswer)} chars, sanitised={len(sanitised)}, "
        f"spoken={len(spoken)})"
    )
    return spoken
|
|
|
|
async def _pickQuickAckText(self) -> Optional[str]:
|
|
"""Return a short ack text in the bot's configured language. The
|
|
actual phrases are AI-generated once per session (cached) and rotated
|
|
round-robin so consecutive acks don't sound identical. Returns
|
|
``None`` only if AI generation completely failed and no fallback
|
|
variant could be produced — in that case the caller silently skips
|
|
the ack."""
|
|
return await self._pickEphemeralPhrase("quickAck")
|
|
|
|
async def _pickEphemeralPhrase(
|
|
self,
|
|
kind: str,
|
|
substitutions: Optional[Dict[str, Any]] = None,
|
|
) -> Optional[str]:
|
|
"""Round-robin selector over the cached phrase pool for ``kind``.
|
|
Lazily generates the pool on first use. ``substitutions`` is applied
|
|
to the chosen phrase via ``str.format(**substitutions)`` so kinds
|
|
like ``agentRound`` can render ``{round}`` / ``{maxRounds}``.
|
|
Returns ``None`` if no phrases are available."""
|
|
variants = await self._getEphemeralPhrases(kind)
|
|
if not variants:
|
|
return None
|
|
idx = self._phrasePoolIdx.get(kind, 0) % len(variants)
|
|
self._phrasePoolIdx[kind] = (idx + 1) % len(variants)
|
|
chosen = variants[idx]
|
|
if substitutions:
|
|
try:
|
|
chosen = chosen.format(**substitutions)
|
|
except (KeyError, IndexError, ValueError) as fmtErr:
|
|
# The AI didn't include the expected placeholder — return the
|
|
# raw phrase rather than crash. The user still hears something
|
|
# in the right language; only the numeric hint is missing.
|
|
logger.debug(
|
|
f"Ephemeral phrase substitution failed for kind={kind}: {fmtErr}"
|
|
)
|
|
return chosen
|
|
|
|
async def _getEphemeralPhrases(self, kind: str) -> List[str]:
|
|
"""Return the cached pool of AI-generated variants for ``kind``,
|
|
generating it on first request. Subsequent calls hit the in-memory
|
|
cache. Concurrent first-time callers are serialised by the pool lock
|
|
so only ONE AI request is fired per kind per session."""
|
|
cached = self._phrasePool.get(kind)
|
|
if cached:
|
|
return cached
|
|
async with self._phrasePoolLock:
|
|
cached = self._phrasePool.get(kind)
|
|
if cached:
|
|
return cached
|
|
phrases = await self._generateEphemeralPhrases(
|
|
kind, _EPHEMERAL_PHRASE_VARIANTS
|
|
)
|
|
if phrases:
|
|
self._phrasePool[kind] = phrases
|
|
return phrases
|
|
|
|
async def _generateEphemeralPhrases(
    self, kind: str, count: int
) -> List[str]:
    """Ask the AI to produce ``count`` short utterances for ``kind`` in
    the bot's configured language and persona. Returns ``[]`` on any
    failure — callers must treat empty as 'silently skip this ephemeral
    cue', NEVER fall back to a hardcoded localized string.

    :param kind: key into ``_EPHEMERAL_PHRASE_INTENTS``; unknown kinds
        log a warning and yield ``[]``.
    :param count: number of variants requested (also the hard cap
        applied to the parsed result).
    :return: up to ``count`` non-empty phrase strings, or ``[]``.
    """
    intent = _EPHEMERAL_PHRASE_INTENTS.get(kind)
    if not intent:
        logger.warning(f"Unknown ephemeral phrase kind requested: {kind}")
        return []

    # Fall back to sensible defaults when config fields are unset.
    targetLang = (self.config.language or "").strip() or "en-US"
    botName = (self.config.botName or "the assistant").strip()
    persona = (self.config.aiSystemPrompt or "").strip()

    # The prompt is in English on purpose — these are instructions to the
    # LLM, not user-facing text. The OUTPUT is required to be in
    # ``targetLang``. We ask for a strict JSON array so parsing is robust.
    prompt = (
        f"You are localizing short SPOKEN-LANGUAGE utterances for a "
        f"meeting assistant named '{botName}'.\n\n"
        f"Persona / style guide for the assistant:\n"
        f"{persona or '(no persona configured — use a neutral, polite, professional tone)'}\n\n"
        f"Target spoken language (BCP-47 code): {targetLang}\n\n"
        f"Utterance intent:\n{intent}\n\n"
        f"Generate {count} DIFFERENT variants matching this intent, in "
        f"the target language. Variants should feel natural when spoken "
        f"aloud, not robotic. Do NOT include the assistant's name in "
        f"the variants.\n\n"
        f"Output STRICTLY a JSON array of {count} plain-text strings, "
        f"with no markdown fences, no commentary, no surrounding "
        f"quotation marks beyond the JSON syntax itself. Example "
        f"format: [\"...\", \"...\", \"...\", \"...\"]"
    )

    # SPEED-prioritised AI call; every failure path returns [] so the
    # caller's "skip the cue" contract holds.
    try:
        aiService = _createAiService(
            self.currentUser, self.mandateId, self.instanceId
        )
        await aiService.ensureAiObjectsInitialized()
        request = AiCallRequest(
            prompt=prompt,
            context="",
            options=AiCallOptions(
                operationType=OperationTypeEnum.DATA_ANALYSE,
                priority=PriorityEnum.SPEED,
            ),
        )
        response = await aiService.callAi(request)
    except Exception as aiErr:
        logger.warning(
            f"Ephemeral phrase generation failed (kind={kind}, lang={targetLang}): {aiErr}"
        )
        return []

    if not response or response.errorCount != 0 or not response.content:
        logger.warning(
            f"Ephemeral phrase generation returned empty/error "
            f"(kind={kind}, lang={targetLang})"
        )
        return []

    raw = response.content.strip()
    # Strip optional ```json ... ``` fences before parsing.
    raw = re.sub(r"^```(?:json)?\s*", "", raw)
    raw = re.sub(r"\s*```\s*$", "", raw)
    try:
        arr = json.loads(raw)
    except json.JSONDecodeError as parseErr:
        logger.warning(
            f"Ephemeral phrase generation: could not parse JSON "
            f"(kind={kind}, lang={targetLang}): {parseErr} "
            f"raw={raw[:200]}"
        )
        return []
    if not isinstance(arr, list):
        return []
    # Keep only non-empty strings; drop any non-string junk the model
    # may have emitted, then enforce the requested count as a hard cap.
    cleaned = [
        str(v).strip()
        for v in arr
        if isinstance(v, str) and str(v).strip()
    ]
    cleaned = cleaned[:count]
    if cleaned:
        logger.info(
            f"Ephemeral phrase pool generated (kind={kind}, "
            f"lang={targetLang}, count={len(cleaned)})"
        )
    return cleaned
|
|
|
|
def _shouldFireQuickAck(self) -> bool:
    """Single gate deciding whether a quick audio ack may fire now.

    Denies when: the ack throttle window has not elapsed; a real answer
    or agent escalation is already in flight (the ack would talk over
    it); the voice channel is inactive; or the session runs in a
    response mode that never speaks autonomously. Centralized so the
    call sites stay short and consistent.
    """
    # Throttle: never fire two acks within the minimum interval.
    if (time.time() - self._lastQuickAckTs) < _QUICK_ACK_MIN_INTERVAL_SEC:
        return False
    # If we are already producing a real response, the ack would step on
    # the actual answer's TTS — skip it. Same for an in-flight agent
    # escalation: the agent will deliver its own answer (and we already
    # spoke an interim "moment please" when it started).
    if self._aiAnalysisInProgress or self._agentEscalationInFlight:
        return False
    # Voice channel must be active. Chat-only mode would just spam "...".
    rawChannel = self.config.responseChannel
    channel = (
        rawChannel.value if hasattr(rawChannel, "value") else str(rawChannel)
    ).lower().strip()
    if channel not in ("voice", "both"):
        return False
    # Manual / transcribe-only sessions never speak on their own.
    blockedModes = (
        TeamsbotResponseMode.MANUAL,
        TeamsbotResponseMode.TRANSCRIBE_ONLY,
    )
    return self.config.responseMode not in blockedModes
|
|
|
|
async def _runQuickAck(self, sessionId: str) -> None:
    """Background task: speak the short ack into the meeting via TTS.

    Designed to be fired as ``asyncio.create_task(self._runQuickAck(...))``
    the moment the bot's name is detected — does not block the regular
    debounced analysis pipeline. Persists nothing to the DB and emits no
    botResponse event; this is purely an audio cue ("Moment...") so the
    speaker hears within ~1s that the bot is reacting.

    :param sessionId: session to speak into (log / event correlation).
    """
    # Snapshot shared references once — the session may tear them down
    # while this task is in flight.
    websocket = self._websocket
    voiceInterface = self._voiceInterface
    if websocket is None or voiceInterface is None:
        return
    if not self._shouldFireQuickAck():
        return
    ackText = await self._pickQuickAckText()
    if not ackText:
        # No phrase pool available — silently skip the cue.
        return
    # Mark the throttle BEFORE TTS so two near-simultaneous detections
    # don't both fire (TTS dispatch can take a few hundred ms).
    self._lastQuickAckTs = time.time()
    try:
        await _emitSessionEvent(sessionId, "quickAck", {
            "text": ackText,
            "timestamp": getIsoTimestamp(),
        })
        # Cancel hook lets a hard stop abort between TTS chunks.
        cancelHook = self._makeAnswerCancelHook()
        # Serialise with other meeting TTS so the ack never overlaps a
        # real answer mid-playback.
        async with self._meetingTtsLock:
            outcome = await _speakTextChunked(
                websocket=websocket,
                voiceInterface=voiceInterface,
                sessionId=sessionId,
                voiceText=ackText,
                languageCode=self.config.language,
                voiceName=self.config.voiceId,
                isCancelled=cancelHook,
            )
        if not outcome.get("success"):
            logger.info(
                f"Session {sessionId}: Quick ack TTS failed silently "
                f"({outcome.get('error')}) — main response will still go through"
            )
    except asyncio.CancelledError:
        logger.info(f"Session {sessionId}: Quick ack cancelled by stop signal")
    except Exception as ackErr:
        logger.warning(f"Session {sessionId}: Quick ack failed: {ackErr}")
    finally:
        # Always release the task slot so the stop path won't try to
        # cancel an already-finished task.
        self._currentQuickAckTask = None
|
|
|
|
async def _checkPendingNameTrigger(self, delaySec: float = 3.0):
|
|
"""Async loop: fire the pending name trigger once the speaker is quiet."""
|
|
await asyncio.sleep(delaySec)
|
|
if not self._pendingNameTrigger:
|
|
return
|
|
|
|
now = time.time()
|
|
lastActivity = self._pendingNameTrigger.get("lastActivity", 0)
|
|
detectedAt = self._pendingNameTrigger.get("detectedAt", 0)
|
|
quietSec = now - lastActivity
|
|
totalWaitSec = now - detectedAt
|
|
|
|
if quietSec >= 3.0 or totalWaitSec >= 15.0:
|
|
trigger = self._pendingNameTrigger
|
|
self._pendingNameTrigger = None
|
|
logger.info(
|
|
f"Session {trigger['sessionId']}: Debounced name trigger fires "
|
|
f"(quiet={quietSec:.1f}s, totalWait={totalWaitSec:.1f}s)"
|
|
)
|
|
await self._analyzeAndRespond(
|
|
trigger["sessionId"],
|
|
trigger["interface"],
|
|
trigger["voiceInterface"],
|
|
trigger["websocket"],
|
|
trigger["triggerTranscript"],
|
|
)
|
|
else:
|
|
remaining = max(0.5, 3.0 - quietSec)
|
|
asyncio.create_task(self._checkPendingNameTrigger(remaining))
|
|
|
|
async def _analyzeAndRespond(
|
|
self,
|
|
sessionId: str,
|
|
interface,
|
|
voiceInterface,
|
|
websocket: WebSocket,
|
|
triggerTranscript: Dict[str, Any],
|
|
):
|
|
"""Run SPEECH_TEAMS AI analysis and respond if needed."""
|
|
if self._aiAnalysisInProgress:
|
|
logger.info(f"Session {sessionId}: AI analysis already in progress, skipping duplicate trigger")
|
|
return
|
|
# An agent escalation from a previous trigger may still be researching
|
|
# (it lives in its own task, ``_aiAnalysisInProgress`` was already
|
|
# released when SPEECH_TEAMS returned). If we let a fresh SPEECH_TEAMS
|
|
# run now, both pipelines would race to the meeting voice channel and
|
|
# the operator would hear "two bots talking". Skip until the agent
|
|
# finishes; the speaker can re-trigger by saying the bot name again
|
|
# if they have a new question.
|
|
if self._agentEscalationInFlight:
|
|
logger.info(
|
|
f"Session {sessionId}: Agent escalation still in flight — "
|
|
f"skipping new SPEECH_TEAMS trigger to prevent overlapping replies"
|
|
)
|
|
return
|
|
self._aiAnalysisInProgress = True
|
|
self._lastAiCallTime = time.time()
|
|
|
|
# Build transcript context from buffer.
|
|
# Mark bot's own utterances and chat messages for the AI.
|
|
contextLines = []
|
|
for segment in self._contextBuffer:
|
|
speaker = segment.get("speaker", "Unknown")
|
|
text = segment.get("text", "")
|
|
segSource = segment.get("source", "caption")
|
|
prefix = "Chat" if segSource == "chat" else ""
|
|
if self._isBotSpeaker(speaker):
|
|
contextLines.append(f"[YOU ({self.config.botName})]: {text}")
|
|
elif prefix:
|
|
contextLines.append(f"[{prefix}: {speaker}]: {text}")
|
|
else:
|
|
contextLines.append(f"[{speaker}]: {text}")
|
|
|
|
# Include session context if provided by the user at session start
|
|
sessionContextStr = ""
|
|
if self._sessionContext:
|
|
sessionContextStr = f"\nSESSION_CONTEXT (background knowledge provided by the user):\n{self._sessionContext}\n"
|
|
|
|
# Include summary of earlier conversation if available
|
|
summaryStr = ""
|
|
if self._contextSummary:
|
|
summaryStr = f"\nEARLIER_CONVERSATION_SUMMARY:\n{self._contextSummary}\n"
|
|
|
|
# Persistent director prompts: private operator instructions that stay
|
|
# in effect across triggers (e.g. "respond in English", "always be brief").
|
|
directorStr = self._buildPersistentDirectorContext()
|
|
|
|
transcriptContext = f"BOT_NAME:{self.config.botName}{sessionContextStr}{summaryStr}{directorStr}\nRECENT_TRANSCRIPT:\n" + "\n".join(contextLines)
|
|
|
|
# Call SPEECH_TEAMS
|
|
try:
|
|
aiService = _createAiService(self.currentUser, self.mandateId, self.instanceId)
|
|
await aiService.ensureAiObjectsInitialized()
|
|
|
|
request = AiCallRequest(
|
|
prompt=self.config.aiSystemPrompt,
|
|
context=transcriptContext,
|
|
options=AiCallOptions(
|
|
operationType=OperationTypeEnum.SPEECH_TEAMS,
|
|
priority=PriorityEnum.SPEED,
|
|
)
|
|
)
|
|
|
|
response = await aiService.callAi(request)
|
|
|
|
# Parse structured response
|
|
try:
|
|
speechResult = SpeechTeamsResponse.model_validate_json(response.content)
|
|
except Exception:
|
|
# Try to extract JSON from response content
|
|
try:
|
|
jsonStr = response.content
|
|
if "```json" in jsonStr:
|
|
jsonStr = jsonStr.split("```json")[1].split("```")[0]
|
|
elif "```" in jsonStr:
|
|
jsonStr = jsonStr.split("```")[1].split("```")[0]
|
|
speechResult = SpeechTeamsResponse.model_validate_json(jsonStr.strip())
|
|
except Exception as parseErr:
|
|
logger.warning(f"Failed to parse SPEECH_TEAMS response: {parseErr}")
|
|
speechResult = SpeechTeamsResponse(
|
|
shouldRespond=False,
|
|
reasoning=f"Parse error: {str(parseErr)[:100]}",
|
|
detectedIntent="none"
|
|
)
|
|
|
|
logger.info(
|
|
f"SPEECH_TEAMS result: shouldRespond={speechResult.shouldRespond}, "
|
|
f"intent={speechResult.detectedIntent}, "
|
|
f"reasoning={speechResult.reasoning[:80]}..."
|
|
)
|
|
|
|
# Emit analysis event (always, for debug/UI)
|
|
await _emitSessionEvent(sessionId, "analysis", {
|
|
"shouldRespond": speechResult.shouldRespond,
|
|
"detectedIntent": speechResult.detectedIntent,
|
|
"reasoning": speechResult.reasoning,
|
|
"modelName": response.modelName,
|
|
"processingTime": response.processingTime,
|
|
"priceCHF": response.priceCHF,
|
|
"needsAgent": speechResult.needsAgent,
|
|
"agentReason": speechResult.agentReason,
|
|
})
|
|
|
|
# Hybrid routing: SPEECH_TEAMS detected a complex request that
|
|
# requires the full agent (web research, mail, multi-step). Hand
|
|
# off to the agent path; do NOT speak the SPEECH_TEAMS placeholder.
|
|
if speechResult.needsAgent:
|
|
# Director prompts (persistent + recent one-shot) have already
|
|
# delivered files to the operator. The escalation agent MUST see
|
|
# them — otherwise it answers "summarize the doc" with general
|
|
# babble because the SPEECH_TEAMS prompt itself never had file
|
|
# access. We also forward the prior agent analysis so the
|
|
# escalation can build on, not duplicate, the earlier work.
|
|
briefings = self._collectActiveDirectorBriefings()
|
|
briefingFileIds = self._collectDirectorFileIds()
|
|
briefingBlock = ""
|
|
if briefings:
|
|
parts = []
|
|
for b in briefings:
|
|
seg = f"- ({b.get('mode')}) {b.get('text', '')}".rstrip()
|
|
if b.get("fileIds"):
|
|
seg += f"\n attachedFileIds: {', '.join(b['fileIds'])}"
|
|
if b.get("note"):
|
|
note = b["note"]
|
|
seg += (
|
|
"\n priorAgentAnalysis: "
|
|
+ (note if len(note) <= 800 else note[:800] + "...")
|
|
)
|
|
parts.append(seg)
|
|
briefingBlock = (
|
|
"\n\nACTIVE_OPERATOR_BRIEFINGS (private; you may read the "
|
|
"attached files via summarizeContent / readFile / "
|
|
"readContentObjects to answer the user precisely; do NOT "
|
|
"quote the directive text itself):\n" + "\n".join(parts)
|
|
)
|
|
logger.info(
|
|
f"Session {sessionId}: SPEECH_TEAMS escalates to agent. "
|
|
f"Reason: {speechResult.agentReason or speechResult.reasoning} | "
|
|
f"briefings={len(briefings)}, fileIds={len(briefingFileIds)}"
|
|
)
|
|
taskBrief = (
|
|
(speechResult.agentReason
|
|
or speechResult.responseText
|
|
or "Verarbeite die juengste Sprecheranfrage und antworte ins Meeting.")
|
|
+ briefingBlock
|
|
)
|
|
# Mark escalation as in-flight BEFORE we create the task so the
|
|
# ``_aiAnalysisInProgress=False`` released in our finally block
|
|
# cannot let a competing speech trigger sneak past the gate
|
|
# before the agent task has even been scheduled.
|
|
self._agentEscalationInFlight = True
|
|
self._currentEscalationTask = asyncio.create_task(
|
|
self._runEscalationAndRelease(
|
|
sessionId=sessionId,
|
|
taskBrief=taskBrief,
|
|
briefingFileIds=briefingFileIds,
|
|
triggerTranscriptId=triggerTranscript.get("id"),
|
|
)
|
|
)
|
|
return
|
|
|
|
# Step 4a: Handle STOP intent -- stop audio immediately
|
|
if speechResult.detectedIntent == "stop":
|
|
logger.info(f"Session {sessionId}: AI detected STOP intent: {speechResult.reasoning}")
|
|
if websocket:
|
|
try:
|
|
await websocket.send_text(json.dumps({
|
|
"type": "stopAudio",
|
|
"sessionId": sessionId,
|
|
}))
|
|
except Exception as stopErr:
|
|
logger.warning(f"Failed to send stop command: {stopErr}")
|
|
return
|
|
|
|
# Step 4b: Respond if AI decided to
|
|
if speechResult.shouldRespond and speechResult.responseText:
|
|
|
|
if self.config.responseMode == TeamsbotResponseMode.MANUAL:
|
|
# In manual mode, suggest but don't send
|
|
await _emitSessionEvent(sessionId, "suggestedResponse", {
|
|
"responseText": speechResult.responseText,
|
|
"detectedIntent": speechResult.detectedIntent,
|
|
"reasoning": speechResult.reasoning,
|
|
})
|
|
return
|
|
|
|
# Determine response channel: per-request (AI) overrides config
|
|
channels = speechResult.responseChannels
|
|
if channels and isinstance(channels, list):
|
|
channelStr = ",".join(str(c).lower().strip() for c in channels)
|
|
sendVoice = "voice" in channelStr
|
|
sendChat = "chat" in channelStr
|
|
logger.info(f"Response channel (from AI): voice={sendVoice}, chat={sendChat}")
|
|
else:
|
|
channelRaw = self.config.responseChannel
|
|
channelStr = (channelRaw.value if hasattr(channelRaw, 'value') else str(channelRaw)).lower().strip()
|
|
sendVoice = channelStr in ("voice", "both")
|
|
sendChat = channelStr in ("chat", "both")
|
|
logger.info(f"Response channel (from config): '{channelStr}'")
|
|
|
|
if sendVoice and sendChat:
|
|
responseType = TeamsbotResponseType.BOTH
|
|
elif sendVoice:
|
|
responseType = TeamsbotResponseType.AUDIO
|
|
else:
|
|
responseType = TeamsbotResponseType.CHAT
|
|
|
|
# Suppress duplicate responses in short windows ("repeat loop" protection).
|
|
canonicalText = (
|
|
speechResult.responseText
|
|
or speechResult.responseTextForVoice
|
|
or speechResult.responseTextForChat
|
|
or ""
|
|
)
|
|
normalizedResponse = (canonicalText or "").strip().lower()
|
|
nowTs = time.time()
|
|
if (
|
|
normalizedResponse
|
|
and self._lastBotResponseText == normalizedResponse
|
|
and (nowTs - self._lastBotResponseTs) < 90
|
|
):
|
|
logger.info(f"Session {sessionId}: Suppressing duplicate bot response within 90s window")
|
|
await _emitSessionEvent(sessionId, "analysis", {
|
|
"shouldRespond": False,
|
|
"detectedIntent": speechResult.detectedIntent,
|
|
"reasoning": "Suppressed duplicate response within 90s",
|
|
"modelName": response.modelName,
|
|
"processingTime": response.processingTime,
|
|
"priceCHF": response.priceCHF,
|
|
})
|
|
return
|
|
|
|
# Resolve text per channel (AI can send different content to voice vs chat)
|
|
textForVoice = speechResult.responseTextForVoice or speechResult.responseText
|
|
textForChat = speechResult.responseTextForChat or speechResult.responseText
|
|
storedText = textForChat or textForVoice or speechResult.responseText
|
|
|
|
# 4a: Voice response (TTS -> Audio to bot, chunked for long replies)
|
|
if sendVoice and textForVoice:
|
|
await _emitSessionEvent(sessionId, "ttsDeliveryStatus", {
|
|
"status": "requested",
|
|
"hasWebSocket": websocket is not None,
|
|
"message": "TTS generation requested",
|
|
"timestamp": getIsoTimestamp(),
|
|
})
|
|
logger.info(
|
|
f"Session {sessionId}: TTS requested (websocket_available={websocket is not None})"
|
|
)
|
|
if not websocket:
|
|
logger.warning(
|
|
f"Session {sessionId}: TTS skipped (bot websocket unavailable, likely fallback mode)"
|
|
)
|
|
await _emitSessionEvent(sessionId, "ttsDeliveryStatus", {
|
|
"status": "unavailable",
|
|
"hasWebSocket": False,
|
|
"message": "TTS skipped — bot websocket unavailable",
|
|
"timestamp": getIsoTimestamp(),
|
|
})
|
|
if not sendChat:
|
|
sendChat = True
|
|
else:
|
|
# Long / structured answers → AI condenses for ear; chat keeps full text.
|
|
spokenText = await self._summarizeForVoice(sessionId, textForVoice)
|
|
cancelHook = self._makeAnswerCancelHook()
|
|
async with self._meetingTtsLock:
|
|
ttsOutcome = await _speakTextChunked(
|
|
websocket=websocket,
|
|
voiceInterface=voiceInterface,
|
|
sessionId=sessionId,
|
|
voiceText=spokenText,
|
|
languageCode=self.config.language,
|
|
voiceName=self.config.voiceId,
|
|
isCancelled=cancelHook,
|
|
)
|
|
if ttsOutcome.get("success"):
|
|
logger.info(
|
|
f"Session {sessionId}: TTS audio dispatched to bot "
|
|
f"(chunks={ttsOutcome.get('chunks')}, played={ttsOutcome.get('played')})"
|
|
)
|
|
await _emitSessionEvent(sessionId, "ttsDeliveryStatus", {
|
|
"status": "dispatched",
|
|
"hasWebSocket": True,
|
|
"chunks": ttsOutcome.get("chunks"),
|
|
"played": ttsOutcome.get("played"),
|
|
"timestamp": getIsoTimestamp(),
|
|
})
|
|
else:
|
|
logger.warning(
|
|
f"TTS failed for session {sessionId}: {ttsOutcome.get('error')}"
|
|
)
|
|
await _emitSessionEvent(sessionId, "ttsDeliveryStatus", {
|
|
"status": "failed",
|
|
"hasWebSocket": True,
|
|
"chunks": ttsOutcome.get("chunks"),
|
|
"played": ttsOutcome.get("played"),
|
|
"message": ttsOutcome.get("error"),
|
|
"timestamp": getIsoTimestamp(),
|
|
})
|
|
if not sendChat:
|
|
sendChat = True # Fallback to chat if voice-only and TTS failed
|
|
|
|
# 4b: Chat response (send text message to meeting chat)
|
|
if sendChat and textForChat:
|
|
try:
|
|
if websocket:
|
|
await websocket.send_text(json.dumps({
|
|
"type": "sendChatMessage",
|
|
"sessionId": sessionId,
|
|
"text": textForChat,
|
|
}))
|
|
logger.info(f"Chat response sent for session {sessionId}")
|
|
except Exception as chatErr:
|
|
logger.warning(f"Chat message send failed for session {sessionId}: {chatErr}")
|
|
|
|
# 4b: Store bot response
|
|
botResponseData = TeamsbotBotResponse(
|
|
sessionId=sessionId,
|
|
responseText=storedText,
|
|
responseType=responseType,
|
|
detectedIntent=speechResult.detectedIntent,
|
|
reasoning=speechResult.reasoning,
|
|
triggeredByTranscriptId=triggerTranscript.get("id"),
|
|
modelName=response.modelName,
|
|
processingTime=response.processingTime,
|
|
priceCHF=response.priceCHF,
|
|
timestamp=getIsoTimestamp(),
|
|
).model_dump()
|
|
|
|
createdResponse = interface.createBotResponse(botResponseData)
|
|
|
|
# 4c: Emit SSE event
|
|
await _emitSessionEvent(sessionId, "botResponse", {
|
|
"id": createdResponse.get("id"),
|
|
"responseText": storedText,
|
|
"responseType": responseType.value,
|
|
"detectedIntent": speechResult.detectedIntent,
|
|
"reasoning": speechResult.reasoning,
|
|
"modelName": response.modelName,
|
|
"processingTime": response.processingTime,
|
|
"priceCHF": response.priceCHF,
|
|
"timestamp": botResponseData.get("timestamp"),
|
|
})
|
|
|
|
# Update session response count
|
|
session = interface.getSession(sessionId)
|
|
if session:
|
|
count = session.get("botResponseCount", 0) + 1
|
|
interface.updateSession(sessionId, {"botResponseCount": count})
|
|
|
|
self._lastBotResponseText = normalizedResponse
|
|
self._lastBotResponseTs = nowTs
|
|
|
|
# Record bot response in transcript (exactly once, regardless of channel)
|
|
botTranscriptData = TeamsbotTranscript(
|
|
sessionId=sessionId,
|
|
speaker=self.config.botName,
|
|
text=storedText,
|
|
timestamp=getIsoTimestamp(),
|
|
confidence=1.0,
|
|
language=self.config.language,
|
|
isFinal=True,
|
|
).model_dump()
|
|
botTranscript = interface.createTranscript(botTranscriptData)
|
|
|
|
self._contextBuffer.append({
|
|
"speaker": self.config.botName,
|
|
"text": storedText,
|
|
"timestamp": getUtcTimestamp(),
|
|
"source": "botResponse",
|
|
})
|
|
|
|
await _emitSessionEvent(sessionId, "transcript", {
|
|
"id": botTranscript.get("id"),
|
|
"speaker": self.config.botName,
|
|
"text": storedText,
|
|
"confidence": 1.0,
|
|
"timestamp": getIsoTimestamp(),
|
|
"isContinuation": False,
|
|
"source": "botResponse",
|
|
"speakerResolvedFromHint": False,
|
|
})
|
|
|
|
# Reset differential writing tracker so next STT creates a new block
|
|
self._lastTranscriptSpeaker = self.config.botName
|
|
self._lastTranscriptText = storedText
|
|
self._lastTranscriptId = botTranscript.get("id")
|
|
|
|
self._followUpWindowEnd = time.time() + 15.0
|
|
logger.info(f"Bot responded in session {sessionId}: intent={speechResult.detectedIntent}, follow-up window open for 15s")
|
|
|
|
# Step 5: Execute AI-issued commands (if any)
|
|
if speechResult.commands:
|
|
await self._executeCommands(sessionId, speechResult.commands, voiceInterface, websocket)
|
|
|
|
# When AI used only commands (no responseText), emit botResponse SSE
|
|
# so the UI shows the response. Extract text from sendChat commands.
|
|
if speechResult.shouldRespond and not speechResult.responseText:
|
|
cmdTexts = [
|
|
c.params.get("text", "") for c in speechResult.commands
|
|
if c.action == "sendChat" and c.params and c.params.get("text")
|
|
]
|
|
combinedText = " ".join(cmdTexts) if cmdTexts else None
|
|
if combinedText:
|
|
botResponseData = TeamsbotBotResponse(
|
|
sessionId=sessionId,
|
|
responseText=combinedText,
|
|
responseType=TeamsbotResponseType.CHAT,
|
|
detectedIntent=speechResult.detectedIntent,
|
|
reasoning=speechResult.reasoning,
|
|
triggeredByTranscriptId=triggerTranscript.get("id"),
|
|
modelName=response.modelName,
|
|
processingTime=response.processingTime,
|
|
priceCHF=response.priceCHF,
|
|
timestamp=getIsoTimestamp(),
|
|
).model_dump()
|
|
createdResponse = interface.createBotResponse(botResponseData)
|
|
await _emitSessionEvent(sessionId, "botResponse", {
|
|
"id": createdResponse.get("id"),
|
|
"responseText": combinedText,
|
|
"responseType": TeamsbotResponseType.CHAT.value,
|
|
"detectedIntent": speechResult.detectedIntent,
|
|
"reasoning": speechResult.reasoning,
|
|
"modelName": response.modelName,
|
|
"processingTime": response.processingTime,
|
|
"priceCHF": response.priceCHF,
|
|
"timestamp": botResponseData.get("timestamp"),
|
|
})
|
|
|
|
session = interface.getSession(sessionId)
|
|
if session:
|
|
count = session.get("botResponseCount", 0) + 1
|
|
interface.updateSession(sessionId, {"botResponseCount": count})
|
|
|
|
self._followUpWindowEnd = time.time() + 15.0
|
|
logger.info(
|
|
f"Bot responded via commands in session {sessionId}: "
|
|
f"intent={speechResult.detectedIntent}, follow-up window open for 15s"
|
|
)
|
|
|
|
except Exception as e:
|
|
logger.error(f"SPEECH_TEAMS analysis failed for session {sessionId}: {type(e).__name__}: {e}", exc_info=True)
|
|
await _emitSessionEvent(sessionId, "error", {"message": f"AI analysis failed: {type(e).__name__}: {str(e)}"})
|
|
finally:
|
|
self._aiAnalysisInProgress = False
|
|
|
|
async def _runEscalationAndRelease(
|
|
self,
|
|
sessionId: str,
|
|
taskBrief: str,
|
|
briefingFileIds: List[str],
|
|
triggerTranscriptId: Optional[str],
|
|
) -> None:
|
|
"""Background wrapper for ``_runAgentForMeeting`` that holds the
|
|
``_agentEscalationInFlight`` flag for the entire duration of the agent
|
|
run — not just for the moment we schedule the task. Without this
|
|
wrapper, ``_aiAnalysisInProgress`` would already be ``False`` while
|
|
the agent is still researching, and a fresh SPEECH_TEAMS trigger from
|
|
a new utterance would race the agent to the voice channel."""
|
|
try:
|
|
await self._runAgentForMeeting(
|
|
sessionId=sessionId,
|
|
taskText=taskBrief,
|
|
fileIds=briefingFileIds,
|
|
sourceLabel="speechEscalation",
|
|
triggerTranscriptId=triggerTranscriptId,
|
|
)
|
|
except asyncio.CancelledError:
|
|
logger.info(
|
|
f"Session {sessionId}: Escalation agent task cancelled by stop signal"
|
|
)
|
|
except Exception as escErr:
|
|
logger.error(
|
|
f"Session {sessionId}: Escalation agent task failed: "
|
|
f"{type(escErr).__name__}: {escErr}",
|
|
exc_info=True,
|
|
)
|
|
finally:
|
|
self._agentEscalationInFlight = False
|
|
self._currentEscalationTask = None
|
|
|
|
# =========================================================================
|
|
# AI Command Execution
|
|
# =========================================================================
|
|
|
|
async def _executeCommands(
    self,
    sessionId: str,
    commands: List[TeamsbotCommand],
    voiceInterface,
    websocket: WebSocket,
):
    """Dispatch each AI-issued command to its dedicated handler.

    Unknown actions are logged and skipped; a failing handler is logged
    and never aborts the remaining commands in the batch.
    """
    for cmd in commands:
        action = cmd.action
        params = cmd.params or {}
        logger.info(f"Session {sessionId}: Executing command '{action}' with params {params}")
        try:
            # Lazy dispatch table: each entry is a zero-arg coroutine factory
            # so no handler coroutine is created for non-matching actions.
            dispatch = {
                "toggleTranscript": lambda: self._cmdToggleTranscript(sessionId, params, websocket),
                "toggleChat": lambda: self._cmdToggleChat(sessionId, params, websocket),
                "sendChat": lambda: self._cmdSendChat(sessionId, params, websocket),
                "readChat": lambda: self._cmdReadChat(sessionId, params, voiceInterface, websocket),
                "readAloud": lambda: self._cmdReadAloud(sessionId, params, voiceInterface, websocket),
                "changeLanguage": lambda: self._cmdChangeLanguage(sessionId, params),
                "toggleMic": lambda: self._cmdToggleMicOrCamera(sessionId, action, params, websocket),
                "toggleCamera": lambda: self._cmdToggleMicOrCamera(sessionId, action, params, websocket),
                "sendMail": lambda: self._cmdSendMail(sessionId, params),
                "storeDocument": lambda: self._cmdStoreDocument(sessionId, params),
            }
            handler = dispatch.get(action)
            if handler is None:
                logger.warning(f"Session {sessionId}: Unknown command '{action}'")
            else:
                await handler()
        except Exception as cmdErr:
            logger.warning(f"Session {sessionId}: Command '{action}' failed: {cmdErr}")
|
|
async def _cmdToggleTranscript(self, sessionId: str, params: dict, websocket: WebSocket):
    """Toggle Teams live-transcript (caption) capture on the browser bot.

    No-op when the bot websocket is unavailable (fallback mode).
    """
    if not websocket:
        return
    payload = {
        "type": "botCommand",
        "sessionId": sessionId,
        "command": "toggleTranscript",
        "params": {"enable": params.get("enable", True)},
    }
    await websocket.send_text(json.dumps(payload))
|
|
async def _cmdToggleChat(self, sessionId: str, params: dict, websocket: WebSocket):
    """Enable or disable meeting-chat monitoring on the browser bot.

    No-op when the bot websocket is unavailable (fallback mode).
    """
    if not websocket:
        return
    payload = {
        "type": "botCommand",
        "sessionId": sessionId,
        "command": "toggleChat",
        "params": {"enable": params.get("enable", True)},
    }
    await websocket.send_text(json.dumps(payload))
|
|
async def _cmdSendChat(self, sessionId: str, params: dict, websocket: WebSocket):
    """Post a message to the meeting chat and mirror it everywhere.

    Side effects, in order: send over the bot websocket (if available),
    persist a transcript record, append to the in-memory context buffer,
    update the differential-writing and duplicate-suppression trackers,
    and emit a 'transcript' SSE event for the operator UI.
    """
    messageText = params.get("text", "")
    if not messageText:
        return
    if websocket:
        await websocket.send_text(json.dumps({
            "type": "sendChatMessage",
            "sessionId": sessionId,
            "text": messageText,
        }))
        logger.info(f"Chat command sent for session {sessionId}")

    from . import interfaceFeatureTeamsbot as interfaceDb
    interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId)

    transcriptRecord = TeamsbotTranscript(
        sessionId=sessionId,
        speaker=self.config.botName,
        text=messageText,
        timestamp=getIsoTimestamp(),
        confidence=1.0,
        language=self.config.language,
        isFinal=True,
        source="chat",
    ).model_dump()
    createdTranscript = interface.createTranscript(transcriptRecord)

    self._contextBuffer.append({
        "speaker": self.config.botName,
        "text": messageText,
        "timestamp": getUtcTimestamp(),
        "source": "chat",
    })
    # Differential-writing trackers: next STT segment starts a new block.
    self._lastTranscriptSpeaker = self.config.botName
    self._lastTranscriptText = messageText
    self._lastTranscriptId = createdTranscript.get("id")
    # Duplicate-suppression window for subsequent bot responses.
    self._lastBotResponseText = messageText.strip().lower()
    self._lastBotResponseTs = time.time()

    await _emitSessionEvent(sessionId, "transcript", {
        "id": createdTranscript.get("id"),
        "speaker": self.config.botName,
        "text": messageText,
        "confidence": 1.0,
        "timestamp": getIsoTimestamp(),
        "isContinuation": False,
        "source": "chat",
        "speakerResolvedFromHint": False,
    })
|
|
async def _cmdReadChat(
    self,
    sessionId: str,
    params: dict,
    voiceInterface,
    websocket: WebSocket,
):
    """Summarize stored chat messages and speak the summary into the meeting.

    Accepts both lowercase and camelCase datetime bounds
    (``fromdatetime``/``fromDateTime``, ``todatetime``/``toDateTime``);
    ISO timestamps are compared lexicographically. Only the 20 most
    recent matching messages are rendered; speaking requires both a
    voice interface and a live bot websocket.
    """
    from . import interfaceFeatureTeamsbot as interfaceDb
    interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId)

    lowerBound = params.get("fromdatetime") or params.get("fromDateTime")
    upperBound = params.get("todatetime") or params.get("toDateTime")

    def _inWindow(entry: dict) -> bool:
        # Chat-sourced entries only, within the optional timestamp bounds.
        if entry.get("source") not in ("chat", "chatHistory"):
            return False
        ts = entry.get("timestamp") or ""
        if lowerBound and ts < lowerBound:
            return False
        if upperBound and ts > upperBound:
            return False
        return True

    selected = [t for t in interface.getTranscripts(sessionId) if _inWindow(t)]
    summary = "\n".join(
        f"[{t.get('speaker', '?')}]: {t.get('text', '')}" for t in selected[-20:]
    )
    if not summary:
        summary = "Keine Chat-Nachrichten im angegebenen Zeitraum."
    if voiceInterface and websocket:
        # Condense for the ear; cap input so the summarizer stays fast.
        spokenSummary = await self._summarizeForVoice(sessionId, summary[:2000])
        cancelHook = self._makeAnswerCancelHook()
        async with self._meetingTtsLock:
            await _speakTextChunked(
                websocket=websocket,
                voiceInterface=voiceInterface,
                sessionId=sessionId,
                voiceText=spokenSummary,
                languageCode=self.config.language,
                voiceName=self.config.voiceId,
                isCancelled=cancelHook,
            )
|
|
async def _cmdReadAloud(
    self,
    sessionId: str,
    params: dict,
    voiceInterface,
    websocket: WebSocket,
):
    """Speak arbitrary text into the meeting via chunked TTS.

    Silently does nothing when the text is empty or when either the
    voice interface or the bot websocket is unavailable.
    """
    readText = params.get("text", "")
    if not (readText and voiceInterface and websocket):
        return
    cancelHook = self._makeAnswerCancelHook()
    async with self._meetingTtsLock:
        await _speakTextChunked(
            websocket=websocket,
            voiceInterface=voiceInterface,
            sessionId=sessionId,
            voiceText=_voiceFriendlyMeetingText(readText),
            languageCode=self.config.language,
            voiceName=self.config.voiceId,
            isCancelled=cancelHook,
        )
|
|
async def _cmdChangeLanguage(self, sessionId: str, params: dict):
    """Switch the bot's working language at runtime and notify the UI.

    Replaces ``self.config`` with an updated copy (config is immutable)
    and emits a 'languageChanged' SSE event. No-op on empty language.
    """
    newLang = params.get("language", "")
    if not newLang:
        return
    self.config = self.config.model_copy(update={"language": newLang})
    logger.info(f"Session {sessionId}: Language changed to '{newLang}'")
    await _emitSessionEvent(sessionId, "languageChanged", {"language": newLang})
|
|
async def _cmdToggleMicOrCamera(
    self,
    sessionId: str,
    action: str,
    params: dict,
    websocket: WebSocket,
):
    """Forward a mic or camera toggle to the browser bot.

    ``action`` is 'toggleMic' or 'toggleCamera' (as dispatched by
    ``_executeCommands``). No-op when the websocket is unavailable.
    """
    if not websocket:
        return
    await websocket.send_text(json.dumps({
        "type": "botCommand",
        "sessionId": sessionId,
        "command": action,
        "params": params,
    }))
|
|
async def _cmdSendMail(self, sessionId: str, params: dict):
    """Send an email through the Service Center MessagingService.

    Accepts ``recipient``/``to`` and ``message``/``body`` parameter
    aliases. Best-effort: a missing recipient or subject aborts with a
    warning, and any failure is logged rather than raised.
    """
    recipient = params.get("recipient") or params.get("to", "")
    subject = params.get("subject", "")
    message = params.get("message") or params.get("body", "")
    if not recipient or not subject:
        logger.warning(f"Session {sessionId}: sendMail requires recipient and subject")
        return
    try:
        from modules.serviceCenter import ServiceCenterContext, getService
        ctx = ServiceCenterContext(
            user=self.currentUser,
            mandate_id=self.mandateId,
            feature_instance_id=self.instanceId,
        )
        messaging = getService("messaging", ctx)
        senderId = str(self.currentUser.id) if self.currentUser else None
        delivered = messaging.sendEmailDirect(
            recipient=recipient,
            subject=subject,
            message=message,
            userId=senderId,
        )
        if delivered:
            logger.info(f"Session {sessionId}: Email sent to {recipient}")
        else:
            logger.warning(f"Session {sessionId}: Email send failed for {recipient}")
    except Exception as e:
        logger.warning(f"Session {sessionId}: sendMail failed: {e}")
|
|
async def _cmdStoreDocument(self, sessionId: str, params: dict):
    """Upload a document to SharePoint via the Service Center.

    Accepts ``sitePath``/``site`` and ``folderPath``/``folder`` aliases;
    string content is UTF-8 encoded before upload. Best-effort: every
    failure path (missing params, missing connection, unknown site,
    upload error) is logged, never raised.
    """
    sitePath = params.get("sitePath") or params.get("site", "")
    folderPath = params.get("folderPath") or params.get("folder", "")
    fileName = params.get("fileName", "document.txt")
    content = params.get("content", "")
    if isinstance(content, str):
        content = content.encode("utf-8")
    if not sitePath or not folderPath:
        logger.warning(f"Session {sessionId}: storeDocument requires sitePath and folderPath")
        return
    try:
        from modules.serviceCenter import ServiceCenterContext, getService
        ctx = ServiceCenterContext(
            user=self.currentUser,
            mandate_id=self.mandateId,
            feature_instance_id=self.instanceId,
        )
        sharepoint = getService("sharepoint", ctx)
        if not sharepoint.setAccessTokenFromConnection(self.currentUser):
            logger.warning(f"Session {sessionId}: SharePoint connection not configured")
            return
        site = await sharepoint.getSiteByStandardPath(sitePath)
        if not site:
            logger.warning(f"Session {sessionId}: SharePoint site not found: {sitePath}")
            return
        uploadResult = await sharepoint.uploadFile(
            siteId=site["id"],
            folderPath=folderPath,
            fileName=fileName,
            content=content,
        )
        if "error" in uploadResult:
            logger.warning(f"Session {sessionId}: storeDocument failed: {uploadResult['error']}")
        else:
            logger.info(f"Session {sessionId}: Document stored: {fileName}")
    except Exception as e:
        logger.warning(f"Session {sessionId}: storeDocument failed: {e}")
|
|
# =========================================================================
|
|
# Director Prompts (private operator instructions during a live meeting)
|
|
# =========================================================================
|
|
|
|
def _collectActiveDirectorBriefings(self) -> List[Dict[str, Any]]:
|
|
"""Return the deduplicated list of director-prompt briefings that are
|
|
currently relevant for the meeting context: every active persistent
|
|
prompt PLUS every recent one-shot prompt that still sits in the
|
|
``_recentDirectorBriefings`` pool. Each entry carries ``text``,
|
|
``fileIds`` (UDB attachments), ``mode``, ``promptId`` and ``note``
|
|
(the agent's internal analysis from the SILENT director run, if any).
|
|
"""
|
|
seen: Dict[str, Dict[str, Any]] = {}
|
|
for p in self._activePersistentPrompts:
|
|
pid = p.get("id") or ""
|
|
seen[pid] = {
|
|
"promptId": pid,
|
|
"mode": p.get("mode") or "persistent",
|
|
"text": (p.get("text") or "").strip(),
|
|
"fileIds": list(p.get("fileIds") or []),
|
|
"note": (p.get("responseText") or "").strip(),
|
|
}
|
|
for b in self._recentDirectorBriefings:
|
|
pid = b.get("promptId") or ""
|
|
if pid in seen:
|
|
# Refresh note with the latest analysis if the persistent run
|
|
# produced one after the prompt was first loaded from DB.
|
|
if b.get("note"):
|
|
seen[pid]["note"] = b["note"]
|
|
continue
|
|
seen[pid] = {
|
|
"promptId": pid,
|
|
"mode": b.get("mode") or "oneShot",
|
|
"text": (b.get("text") or "").strip(),
|
|
"fileIds": list(b.get("fileIds") or []),
|
|
"note": (b.get("note") or "").strip(),
|
|
}
|
|
return [v for v in seen.values() if v.get("text") or v.get("fileIds")]
|
|
|
|
def _collectDirectorFileIds(self) -> List[str]:
|
|
"""Flat, deduplicated list of UDB file IDs attached to any currently
|
|
relevant director prompt (persistent + recent one-shot). Used when
|
|
SPEECH_TEAMS escalates to the agent so the agent can actually READ the
|
|
documents the operator already provided."""
|
|
out: List[str] = []
|
|
seen: set = set()
|
|
for b in self._collectActiveDirectorBriefings():
|
|
for fid in b.get("fileIds") or []:
|
|
if fid and fid not in seen:
|
|
seen.add(fid)
|
|
out.append(fid)
|
|
return out
|
|
|
|
def _buildPersistentDirectorContext(self) -> str:
|
|
"""Render active director-prompt briefings as private operator guidance
|
|
for the SPEECH_TEAMS system prompt context block.
|
|
|
|
Surfaces three things SPEECH_TEAMS otherwise misses:
|
|
|
|
* the operator's directive text (as before)
|
|
* the IDs of any UDB files the operator attached — so SPEECH_TEAMS
|
|
knows the documents exist and can decide to escalate to the agent,
|
|
which has the tooling to read them.
|
|
* the agent's previous internal analysis of the prompt (the SILENT
|
|
``MEETING_REPLY/SILENT`` decision's note), so SPEECH_TEAMS can answer
|
|
short questions without re-running the agent.
|
|
"""
|
|
briefings = self._collectActiveDirectorBriefings()
|
|
if not briefings:
|
|
return ""
|
|
lines: List[str] = []
|
|
for b in briefings:
|
|
entry = f"- ({b.get('mode', 'persistent')}) {b.get('text', '')}".rstrip()
|
|
fileIds = b.get("fileIds") or []
|
|
if fileIds:
|
|
entry += (
|
|
"\n ATTACHED_FILES (operator-provided documents — the AGENT "
|
|
"has tools to read them via summarizeContent / readFile / "
|
|
"readContentObjects): "
|
|
+ ", ".join(fileIds)
|
|
)
|
|
note = b.get("note")
|
|
if note:
|
|
noteShort = note if len(note) <= 600 else note[:600] + "..."
|
|
entry += f"\n AGENT_ANALYSIS (already computed by the bot): {noteShort}"
|
|
lines.append(entry)
|
|
return (
|
|
"\nOPERATOR_DIRECTIVES (private; never quote them verbatim, just follow them. "
|
|
"If the user asks about an attached document, use AGENT_ANALYSIS first; "
|
|
"if more depth is needed, set needsAgent=true so the agent can re-read the file):\n"
|
|
+ "\n".join(lines)
|
|
+ "\n"
|
|
)
|
|
|
|
def _recordDirectorBriefing(
    self,
    prompt: Dict[str, Any],
    internalNote: str,
    meetingText: str,
) -> None:
    """Remember a director-prompt briefing in the session-scoped pool.

    Keeps the attached file IDs and the agent's analysis available for
    later SPEECH_TEAMS triggers — even after a one-shot prompt was
    consumed. Idempotent per ``promptId``: the freshest entry replaces
    any older one, and the pool is capped at
    ``_RECENT_DIRECTOR_BRIEFINGS_MAX`` entries.
    """
    pid = prompt.get("id") or ""
    # Drop any stale entry for the same prompt, append the fresh one,
    # then keep only the newest N briefings.
    refreshed = [
        entry for entry in self._recentDirectorBriefings
        if entry.get("promptId") != pid
    ]
    refreshed.append({
        "promptId": pid,
        "mode": prompt.get("mode") or "oneShot",
        "text": (prompt.get("text") or "").strip(),
        "fileIds": list(prompt.get("fileIds") or []),
        "note": (internalNote or meetingText or "").strip(),
        "recordedAt": getIsoTimestamp(),
    })
    self._recentDirectorBriefings = refreshed[-_RECENT_DIRECTOR_BRIEFINGS_MAX:]
|
|
async def submitDirectorPrompt(
    self,
    sessionId: str,
    operatorUserId: str,
    text: str,
    mode: TeamsbotDirectorPromptMode,
    fileIds: List[str],
) -> Dict[str, Any]:
    """Persist a new director prompt and trigger immediate agent processing.

    Args:
        sessionId: Active meeting session the prompt applies to.
        operatorUserId: Operator issuing the private instruction.
        text: Directive text (private; never shown to participants).
        mode: One-shot vs persistent behavior.
        fileIds: Optional UDB file attachments for the agent to read.

    Returns:
        The created prompt record. Processing happens asynchronously and
        emits SSE events ('directorPrompt') for the operator UI.
    """
    from . import interfaceFeatureTeamsbot as interfaceDb

    interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId)

    promptData = TeamsbotDirectorPrompt(
        sessionId=sessionId,
        instanceId=self.instanceId,
        operatorUserId=operatorUserId,
        text=text,
        mode=mode,
        fileIds=fileIds or [],
        status=TeamsbotDirectorPromptStatus.QUEUED,
    ).model_dump()
    created = interface.createDirectorPrompt(promptData)

    # Persistent prompts join in-memory directives immediately so they
    # also influence subsequent SPEECH_TEAMS triggers, not only the
    # one-shot agent run we kick off below.
    if mode == TeamsbotDirectorPromptMode.PERSISTENT:
        self._activePersistentPrompts.append(created)

    await _emitSessionEvent(sessionId, "directorPrompt", {
        "id": created.get("id"),
        "status": created.get("status"),
        "mode": created.get("mode"),
        "text": created.get("text"),
        "fileIds": created.get("fileIds", []),
        "createdAt": created.get("createdAt"),
    })

    # BUGFIX: the event loop keeps only a weak reference to tasks created
    # via asyncio.create_task; with no strong reference the processing
    # task could be garbage-collected mid-run (see asyncio docs: "Save a
    # reference to the result of this function"). Hold it in a
    # per-instance set until it finishes.
    task = asyncio.create_task(self._processDirectorPrompt(created))
    pendingTasks = getattr(self, "_pendingDirectorPromptTasks", None)
    if pendingTasks is None:
        pendingTasks = set()
        self._pendingDirectorPromptTasks = pendingTasks
    pendingTasks.add(task)
    task.add_done_callback(pendingTasks.discard)
    return created
|
|
async def removePersistentPrompt(self, promptId: str) -> bool:
    """Retire a persistent director prompt on operator request.

    Marks the DB record consumed, drops the in-memory directive and its
    cached briefing, and notifies the operator UI via SSE (when a session
    is active). Returns ``False`` when the prompt id is unknown.
    """
    from . import interfaceFeatureTeamsbot as interfaceDb

    interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId)
    sessionId = self._activeSessionId
    prompt = interface.getDirectorPrompt(promptId)
    if not prompt:
        return False

    interface.updateDirectorPrompt(promptId, {
        "status": TeamsbotDirectorPromptStatus.CONSUMED.value,
        "consumedAt": getIsoTimestamp(),
        "statusMessage": "Removed by operator",
    })
    self._activePersistentPrompts = [
        entry for entry in self._activePersistentPrompts
        if entry.get("id") != promptId
    ]
    # Forget the cached briefing too; otherwise the bot would keep
    # "remembering" a document the operator just retired.
    self._recentDirectorBriefings = [
        entry for entry in self._recentDirectorBriefings
        if entry.get("promptId") != promptId
    ]
    if sessionId:
        await _emitSessionEvent(sessionId, "directorPrompt", {
            "id": promptId,
            "status": TeamsbotDirectorPromptStatus.CONSUMED.value,
            "mode": prompt.get("mode"),
            "text": prompt.get("text"),
            "removed": True,
        })
    return True
|
|
async def _processDirectorPrompt(self, prompt: Dict[str, Any]) -> None:
    """Run the agent for a director prompt and deliver the FINAL text into
    the meeting via TTS + chat (using the bot's existing channels).

    Lifecycle: QUEUED -> RUNNING -> SUCCEEDED (persistent prompt) or
    CONSUMED (one-shot prompt), or FAILED on error. Every transition is
    persisted via the feature interface AND mirrored to the operator UI
    through 'directorPrompt' SSE events. A failed persistent prompt is
    also dropped from the in-memory directive list so it stops steering
    later SPEECH_TEAMS triggers.
    """
    from . import interfaceFeatureTeamsbot as interfaceDb

    sessionId = prompt.get("sessionId")
    promptId = prompt.get("id")
    interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId)

    # Mark RUNNING in DB + UI before the (potentially long) agent run.
    interface.updateDirectorPrompt(promptId, {
        "status": TeamsbotDirectorPromptStatus.RUNNING.value,
    })
    await _emitSessionEvent(sessionId, "directorPrompt", {
        "id": promptId,
        "status": TeamsbotDirectorPromptStatus.RUNNING.value,
    })

    # Build a task brief for the agent that surfaces the meeting context.
    recentTranscript = self._renderRecentTranscriptForAgent(maxLines=20)
    directorText = (prompt.get("text") or "").strip()
    attachedFileIds = list(prompt.get("fileIds") or [])
    promptMode = (prompt.get("mode") or "").lower()
    # Case-insensitive compare: stored mode may differ in casing from the enum value.
    isPersistentPrompt = promptMode == TeamsbotDirectorPromptMode.PERSISTENT.value.lower()

    # Make file attachment EXPLICIT in the brief. The agent service already
    # prepends a "## Attached Files & Folders" header via _enrichPromptWithFiles
    # when fileIds are passed, but without an explicit instruction the agent
    # sometimes goes straight to a generic answer. We force the workflow:
    # studyDocs -> form briefing -> decide MEETING_REPLY vs SILENT.
    filesBlock = ""
    if attachedFileIds:
        # German runtime text (agent-facing): instructs the agent to read
        # the attached documents BEFORE producing its final answer.
        filesBlock = (
            "\nANGEHAENGTE DOKUMENTE (UDB-File-IDs): "
            + ", ".join(attachedFileIds)
            + "\nDu MUSST diese Dokumente VOR der finalen Antwort lesen / zusammenfassen "
            "(z.B. summarizeContent, readFile, readContentObjects, describeImage). "
            "Beziehe Fakten und Zitate aus den Dokumenten in deine Notiz / dein "
            "Meeting-Reply ein, statt allgemein zu antworten.\n"
        )

    # Persistent prompts that ship documents are usually a "knowledge briefing"
    # the operator wants the bot to STUDY now and USE LATER. The SILENT note
    # in that case must be a useful, file-grounded summary that subsequent
    # SPEECH_TEAMS triggers can pick up — not "noted".
    persistentNoteHint = ""
    if isPersistentPrompt and attachedFileIds:
        persistentNoteHint = (
            "\nSPEZIAL fuer PERSISTENT + Dokumente: Wenn die Anweisung KEIN explizites "
            "Meeting-Statement verlangt, antworte mit 'SILENT:' und liefere als interne "
            "Notiz eine STRUKTURIERTE, faktendichte Briefing-Zusammenfassung der Dokumente "
            "(Stichpunkte, Kennzahlen, Aussagen, die fuer Folgefragen im Meeting relevant "
            "sein koennen). Diese Notiz wird spaeteren Meeting-Antworten als Wissensbasis "
            "vorgelegt — schreibe sie also so, dass der Bot daraus zitieren kann.\n"
        )

    # Full agent brief: directive + attachments + meeting context + the
    # MEETING_REPLY/SILENT response protocol the caller parses downstream.
    taskText = (
        f"Du bist der KI-Assistent in einem laufenden Teams-Meeting (Bot-Name: {self.config.botName}).\n"
        f"Der Operator hat dir folgende PRIVATE Regieanweisung gegeben (die anderen Teilnehmer im "
        f"Meeting sehen sie NICHT). Sie ist KEINE Frage an das Meeting, sondern eine interne "
        f"Anweisung an dich:\n\n"
        f"{directorText}\n"
        f"{filesBlock}"
        f"{persistentNoteHint}\n"
        f"AKTUELLER MEETING-KONTEXT (juengste Aussagen):\n{recentTranscript}\n\n"
        "ANTWORT-PROTOKOLL — Beginne deine FINALE Antwort mit GENAU EINEM dieser Marker:\n"
        " • 'MEETING_REPLY:' gefolgt vom Text, der im Meeting gesprochen / in den Meeting-Chat "
        "gepostet werden soll. Verwende diesen Marker NUR, wenn die Regieanweisung dich explizit "
        "auffordert, jetzt etwas im Meeting zu sagen oder zu schreiben (Beispiele: 'stell dich vor', "
        "'fasse zusammen', 'stelle Person X eine Frage', 'beantworte die letzte Frage'). Halte den "
        "Text kurz, sprachlich passend zur Stimme und ohne Marker oder Meta-Kommentare.\n"
        " • 'SILENT:' gefolgt von einer internen Notiz fuer das Operator-UI. "
        "Verwende diesen Marker fuer interne Direktiven und Wissens-Briefings (Beispiele: "
        "'achte ab jetzt auf X', 'merke dir Y', 'studiere Dokument Z'). "
        "Dieser Text wird NICHT ins Meeting gegeben, dient aber spaeteren Meeting-Antworten "
        "als Wissensbasis. Wenn Dokumente angehaengt sind, MUSS die Notiz konkrete, "
        "zitierfaehige Fakten aus den Dokumenten enthalten.\n\n"
        "Standard ist SILENT, wenn nicht eindeutig zur Meeting-Interaktion aufgefordert wurde. "
        "Wiederhole NIEMALS die Regieanweisung selbst im MEETING_REPLY-Text."
    )

    try:
        finalText = await self._runAgentForMeeting(
            sessionId=sessionId,
            taskText=taskText,
            fileIds=attachedFileIds,
            sourceLabel="directorPrompt",
            triggerTranscriptId=None,
            promptId=promptId,
            directorPromptMode=True,
        )

        # One-shot: mark consumed; persistent: keep active but record success.
        isPersistent = prompt.get("mode") == TeamsbotDirectorPromptMode.PERSISTENT.value
        updates: Dict[str, Any] = {
            "status": TeamsbotDirectorPromptStatus.SUCCEEDED.value,
            "responseText": finalText or "",
        }
        if not isPersistent:
            updates["status"] = TeamsbotDirectorPromptStatus.CONSUMED.value
            updates["consumedAt"] = getIsoTimestamp()
        interface.updateDirectorPrompt(promptId, updates)
        await _emitSessionEvent(sessionId, "directorPrompt", {
            "id": promptId,
            "status": updates["status"],
            "responseText": finalText,
        })

    except Exception as e:
        logger.error(
            f"Session {sessionId}: Director prompt {promptId} failed: {type(e).__name__}: {e}",
            exc_info=True,
        )
        interface.updateDirectorPrompt(promptId, {
            "status": TeamsbotDirectorPromptStatus.FAILED.value,
            "statusMessage": f"{type(e).__name__}: {str(e)[:300]}",
        })
        await _emitSessionEvent(sessionId, "directorPrompt", {
            "id": promptId,
            "status": TeamsbotDirectorPromptStatus.FAILED.value,
            "error": f"{type(e).__name__}: {str(e)[:300]}",
        })
        # A failed prompt must not keep steering the bot: drop it from the
        # in-memory persistent-directive list (no-op for one-shot prompts).
        self._activePersistentPrompts = [
            p for p in self._activePersistentPrompts if p.get("id") != promptId
        ]
|
|
def _renderRecentTranscriptForAgent(self, maxLines: int = 20) -> str:
|
|
"""Render the most recent context buffer entries for inclusion in the
|
|
agent task brief (similar to SPEECH_TEAMS context, but plain text)."""
|
|
if not self._contextBuffer:
|
|
return "(noch keine Aussagen erfasst)"
|
|
recent = self._contextBuffer[-maxLines:]
|
|
lines = []
|
|
for seg in recent:
|
|
speaker = seg.get("speaker", "Unknown")
|
|
text = seg.get("text", "")
|
|
segSource = seg.get("source", "caption")
|
|
prefix = "Chat: " if segSource == "chat" else ""
|
|
if self._isBotSpeaker(speaker):
|
|
lines.append(f"[YOU ({self.config.botName})]: {text}")
|
|
else:
|
|
lines.append(f"[{prefix}{speaker}]: {text}")
|
|
return "\n".join(lines)
|
|
|
|
async def _interimAgentBusyMessage(self) -> Optional[str]:
|
|
"""Short spoken/chat line before a potentially long agent run (web,
|
|
tools). Phrasing is AI-localised to ``self.config.language`` and
|
|
cached per session — no hardcoded language branching. Returns
|
|
``None`` if generation failed; caller must treat that as
|
|
'silently skip the interim notice'."""
|
|
return await self._pickEphemeralPhrase("agentBusy")
|
|
|
|
async def _interimAgentRoundMessage(
|
|
self, roundNum: int, maxRounds: int
|
|
) -> Optional[str]:
|
|
"""Per-round progress notice for long agent runs (meeting voice /
|
|
chat, ephemeral). Phrasing is AI-localised once per session;
|
|
``{round}`` and ``{maxRounds}`` placeholders are substituted at
|
|
render time. Returns ``None`` if generation failed."""
|
|
return await self._pickEphemeralPhrase(
|
|
"agentRound",
|
|
substitutions={"round": roundNum, "maxRounds": maxRounds},
|
|
)
|
|
|
|
async def _notifyMeetingEphemeral(self, sessionId: str, text: str) -> None:
|
|
"""Deliver a short line to the meeting (TTS + chat per config) without
|
|
persisting botResponses/transcripts, so the main agent answer stays the
|
|
single recorded follow-up."""
|
|
websocket = self._websocket
|
|
voiceInterface = self._voiceInterface
|
|
if not websocket:
|
|
logger.warning(f"Session {sessionId}: Interim notice skipped — no WebSocket")
|
|
return
|
|
|
|
channelRaw = self.config.responseChannel
|
|
channelStr = (
|
|
channelRaw.value if hasattr(channelRaw, "value") else str(channelRaw)
|
|
).lower().strip()
|
|
sendVoice = channelStr in ("voice", "both")
|
|
sendChat = channelStr in ("chat", "both")
|
|
|
|
if sendVoice and voiceInterface:
|
|
cancelHook = self._makeAnswerCancelHook()
|
|
async with self._meetingTtsLock:
|
|
outcome = await _speakTextChunked(
|
|
websocket=websocket,
|
|
voiceInterface=voiceInterface,
|
|
sessionId=sessionId,
|
|
voiceText=_voiceFriendlyMeetingText(text),
|
|
languageCode=self.config.language,
|
|
voiceName=self.config.voiceId,
|
|
isCancelled=cancelHook,
|
|
)
|
|
if not outcome.get("success"):
|
|
logger.warning(
|
|
f"Session {sessionId}: Interim TTS failed ({outcome.get('error')}) — falling back to chat"
|
|
)
|
|
if not sendChat:
|
|
sendChat = True
|
|
|
|
if sendChat:
|
|
try:
|
|
await websocket.send_text(json.dumps({
|
|
"type": "sendChatMessage",
|
|
"sessionId": sessionId,
|
|
"text": text,
|
|
}))
|
|
except Exception as chatErr:
|
|
logger.warning(f"Session {sessionId}: Interim chat failed: {chatErr}")
|
|
|
|
await _emitSessionEvent(sessionId, "agentRun", {
|
|
"status": "interimNotice",
|
|
"message": text,
|
|
"timestamp": getIsoTimestamp(),
|
|
})
|
|
|
|
    async def _runAgentForMeeting(
        self,
        sessionId: str,
        taskText: str,
        fileIds: List[str],
        sourceLabel: str,
        triggerTranscriptId: Optional[str] = None,
        promptId: Optional[str] = None,
        directorPromptMode: bool = False,
    ) -> str:
        """Run agentService.runAgent for a meeting context, deliver the FINAL
        text via the bot's existing TTS + chat channels, and return that text.

        sourceLabel is used for logging and SSE differentiation
        ('directorPrompt' or 'speechEscalation').

        ``directorPromptMode`` activates the silent-by-default protocol for
        operator director prompts: interim notices are suppressed, no per-round
        meeting updates, and the FINAL text is parsed for an explicit
        ``MEETING_REPLY:`` / ``SILENT:`` marker. Only ``MEETING_REPLY`` content
        is dispatched to the meeting; everything else stays internal.

        Raises whatever ``runAgent`` raises (after emitting an 'error' SSE),
        and ``RuntimeError`` when the agent stream yields an ERROR event.
        """
        # Local import avoids a module-level dependency cycle with the agent service.
        from modules.serviceCenter.services.serviceAgent.datamodelAgent import (
            AgentConfig, AgentEventTypeEnum
        )

        ctx = ServiceCenterContext(
            user=self.currentUser,
            mandate_id=self.mandateId,
            feature_instance_id=self.instanceId,
            feature_code="teamsbot",
        )
        agentService = _getServiceCenterService("agent", ctx)

        # Workflow id stable per session so RAG/round-memory accumulate per meeting.
        workflowId = f"teamsbot:{sessionId}"

        agentConfig = AgentConfig(
            maxRounds=TEAMSBOT_AGENT_MAX_ROUNDS,
            maxCostCHF=TEAMSBOT_AGENT_MAX_COST_CHF,
            toolSet="core",
            initialToolboxes=["core", "web"],
            excludeActionTools=True,
        )

        await _emitSessionEvent(sessionId, "agentRun", {
            "source": sourceLabel,
            "promptId": promptId,
            "status": "started",
            "timestamp": getIsoTimestamp(),
        })

        # Director prompts run silently by default — no spontaneous "moment please"
        # in the meeting just because the operator gave an internal directive.
        if not directorPromptMode:
            try:
                interimText = await self._interimAgentBusyMessage()
                if interimText:
                    await self._notifyMeetingEphemeral(sessionId, interimText)
            except Exception as interimErr:
                # Interim notice is best-effort; never block the actual agent run.
                logger.warning(f"Session {sessionId}: Interim agent notice failed: {interimErr}")

        finalText: str = ""
        rounds: int = 0
        try:
            async for event in agentService.runAgent(
                prompt=taskText,
                fileIds=fileIds or None,
                config=agentConfig,
                toolSet="core",
                workflowId=workflowId,
            ):
                if event.type == AgentEventTypeEnum.AGENT_PROGRESS:
                    rounds += 1
                    pdata = event.data or {}
                    # Prefer the round counters reported by the agent; fall back
                    # to our local event count / configured max.
                    roundNum = int(pdata.get("round", rounds))
                    maxR = int(pdata.get("maxRounds", TEAMSBOT_AGENT_MAX_ROUNDS))
                    await _emitSessionEvent(sessionId, "agentRun", {
                        "source": sourceLabel,
                        "promptId": promptId,
                        "status": "progress",
                        "round": roundNum,
                        "maxRounds": maxR,
                    })
                    # Round 1 was already covered by the generic start notice;
                    # from round 2 on, report progress into the meeting.
                    # Director prompts stay silent — no intermediate updates
                    # into the meeting.
                    if roundNum >= 2 and not directorPromptMode:
                        try:
                            roundText = await self._interimAgentRoundMessage(roundNum, maxR)
                            if roundText:
                                await self._notifyMeetingEphemeral(sessionId, roundText)
                        except Exception as roundNoticeErr:
                            logger.warning(
                                f"Session {sessionId}: Per-round agent notice failed: {roundNoticeErr}"
                            )
                elif event.type == AgentEventTypeEnum.TOOL_CALL:
                    toolName = (event.data or {}).get("toolName") if event.data else None
                    await _emitSessionEvent(sessionId, "agentRun", {
                        "source": sourceLabel,
                        "promptId": promptId,
                        "status": "toolCall",
                        "toolName": toolName,
                    })
                elif event.type == AgentEventTypeEnum.FINAL:
                    finalText = (event.content or "").strip()
                elif event.type == AgentEventTypeEnum.ERROR:
                    raise RuntimeError(event.content or "Agent error")
        except Exception as runErr:
            # Tell the operator UI the run failed before re-raising to the caller.
            await _emitSessionEvent(sessionId, "agentRun", {
                "source": sourceLabel,
                "promptId": promptId,
                "status": "error",
                "error": str(runErr)[:500],
            })
            raise

        await _emitSessionEvent(sessionId, "agentRun", {
            "source": sourceLabel,
            "promptId": promptId,
            "status": "completed",
            "rounds": rounds,
            "hasText": bool(finalText),
        })

        if finalText:
            if directorPromptMode:
                # Parse the agent's explicit MEETING_REPLY / SILENT decision.
                decision = _parseDirectorPromptFinal(finalText)
                kind = decision.get("kind", "silent")
                meetingText = (decision.get("meetingText") or "").strip()
                internalNote = (decision.get("internalNote") or "").strip()

                logger.info(
                    f"Session {sessionId}: Director prompt {promptId} -> kind={kind}, "
                    f"meetingChars={len(meetingText)}, noteChars={len(internalNote)}"
                )

                await _emitSessionEvent(sessionId, "directorPrompt", {
                    "id": promptId,
                    "status": "decision",
                    "decision": kind,
                    "meetingText": meetingText,
                    "internalNote": internalNote,
                })

                # Record this prompt as a session-scoped briefing BEFORE we hand
                # delivery off. This is what later SPEECH_TEAMS triggers see, so
                # if the user attached a doc with mode=PERSISTENT and the agent
                # produced a file-grounded SILENT note, that note (and the
                # original fileIds) stays available for "summarize the doc"
                # follow-up questions in the meeting.
                try:
                    promptRecord: Dict[str, Any] = {}
                    if promptId:
                        try:
                            from . import interfaceFeatureTeamsbot as _ifaceDb
                            _iface = _ifaceDb.getInterface(
                                self.currentUser, self.mandateId, self.instanceId
                            )
                            promptRecord = _iface.getDirectorPrompt(promptId) or {}
                        except Exception as _lookupErr:
                            logger.debug(
                                f"Briefing pool: could not look up prompt {promptId}: {_lookupErr}"
                            )
                    if promptRecord or promptId:
                        self._recordDirectorBriefing(
                            prompt=promptRecord or {"id": promptId},
                            internalNote=internalNote,
                            meetingText=meetingText,
                        )
                except Exception as briefErr:
                    # Briefing pool is an optimization; failure must not block delivery.
                    logger.warning(
                        f"Session {sessionId}: Director briefing pool update failed: {briefErr}"
                    )

                # If this was a persistent prompt, the live in-memory copy in
                # ``_activePersistentPrompts`` was loaded BEFORE the agent ran
                # — refresh its ``responseText`` so subsequent
                # ``_collectActiveDirectorBriefings`` calls show the latest
                # analysis without waiting for the next session reload.
                if promptId:
                    for p in self._activePersistentPrompts:
                        if p.get("id") == promptId:
                            p["responseText"] = internalNote or meetingText or finalText
                            break

                if kind == "meeting" and meetingText:
                    await self._deliverTextToMeeting(
                        sessionId=sessionId,
                        text=meetingText,
                        detectedIntent=f"agent:{sourceLabel}",
                        reasoning=f"Agent run ({sourceLabel})",
                        triggerTranscriptId=triggerTranscriptId,
                    )
                else:
                    # Silent: persist as internal-only botResponse so the operator
                    # UI keeps a record, but DO NOT push into the meeting (no TTS,
                    # no chat send). The director prompt SSE above already carries
                    # the note for the operator UI.
                    await self._persistInternalDirectorReply(
                        sessionId=sessionId,
                        internalNote=internalNote or finalText,
                        promptId=promptId,
                        triggerTranscriptId=triggerTranscriptId,
                    )
                return meetingText if kind == "meeting" else ""

            # Non-director runs (e.g. speech escalation) always deliver the
            # final text into the meeting.
            await self._deliverTextToMeeting(
                sessionId=sessionId,
                text=finalText,
                detectedIntent=f"agent:{sourceLabel}",
                reasoning=f"Agent run ({sourceLabel})",
                triggerTranscriptId=triggerTranscriptId,
            )

        return finalText
|
|
|
|
    async def _deliverTextToMeeting(
        self,
        sessionId: str,
        text: str,
        detectedIntent: str,
        reasoning: str,
        triggerTranscriptId: Optional[str] = None,
    ) -> None:
        """Send agent text into the meeting via the same channels SPEECH_TEAMS
        uses: TTS + chat per config, plus DB persistence and SSE events.

        Uses the websocket/voiceInterface stored on this instance. If the bot
        is not connected anymore, the call still records the response in the DB
        and emits SSE so the operator UI shows the agent answer.

        Args:
            sessionId: Teamsbot session id (DB + SSE key).
            text: Full structured answer; chat + DB get this verbatim,
                voice gets a summarized/sanitized variant.
            detectedIntent: Intent label persisted with the response
                (coerced via ``_coercePersistedDetectedIntent``).
            reasoning: Human-readable reasoning stored alongside.
            triggerTranscriptId: Transcript that triggered this answer, if any.
        """
        from . import interfaceFeatureTeamsbot as interfaceDb
        interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId)

        websocket = self._websocket
        voiceInterface = self._voiceInterface

        # Normalize the configured channel (enum or plain string) into flags.
        channelRaw = self.config.responseChannel
        channelStr = (
            channelRaw.value if hasattr(channelRaw, "value") else str(channelRaw)
        ).lower().strip()
        sendVoice = channelStr in ("voice", "both")
        sendChat = channelStr in ("chat", "both")

        if sendVoice and sendChat:
            responseType = TeamsbotResponseType.BOTH
        elif sendVoice:
            responseType = TeamsbotResponseType.AUDIO
        else:
            responseType = TeamsbotResponseType.CHAT

        # Voice (TTS input is voice-sanitized; chat + DB keep full structured text).
        # Long agent answers must be chunked: Google TTS rejects single sentences
        # > ~5000 bytes, and the Chirp3 voices fail on long comma-heavy lines too.
        ttsOutcome: Optional[Dict[str, Any]] = None
        if sendVoice and voiceInterface and websocket:
            spokenText = await self._summarizeForVoice(sessionId, text)
            cancelHook = self._makeAnswerCancelHook()
            # Serialize meeting TTS so chunks from concurrent answers don't interleave.
            async with self._meetingTtsLock:
                ttsOutcome = await _speakTextChunked(
                    websocket=websocket,
                    voiceInterface=voiceInterface,
                    sessionId=sessionId,
                    voiceText=spokenText,
                    languageCode=self.config.language,
                    voiceName=self.config.voiceId,
                    isCancelled=cancelHook,
                )
            await _emitSessionEvent(sessionId, "ttsDeliveryStatus", {
                "status": "dispatched" if ttsOutcome.get("success") else "failed",
                "hasWebSocket": True,
                "chunks": ttsOutcome.get("chunks"),
                "played": ttsOutcome.get("played"),
                "error": ttsOutcome.get("error"),
                "timestamp": getIsoTimestamp(),
            })
            if not ttsOutcome.get("success"):
                logger.warning(
                    f"Session {sessionId}: Agent TTS delivery failed "
                    f"({ttsOutcome.get('error')}) — falling back to meeting chat"
                )
                # Voice failed — escalate to chat so the answer still lands.
                if not sendChat:
                    sendChat = True

        # Chat
        if sendChat and websocket:
            try:
                await websocket.send_text(json.dumps({
                    "type": "sendChatMessage",
                    "sessionId": sessionId,
                    "text": text,
                }))
                logger.info(f"Session {sessionId}: Agent chat dispatched ({len(text)} chars)")
            except Exception as chatErr:
                # Best-effort: persistence below still records the answer.
                logger.warning(f"Session {sessionId}: Agent chat delivery failed: {chatErr}")

        # Persist as botResponse + transcript so it shows up in history/UI.
        intentEnum, intentMeta = _coercePersistedDetectedIntent(detectedIntent)
        reasoningForDb = (
            f"{reasoning} [{intentMeta}]" if intentMeta else reasoning
        )

        botResponseData = TeamsbotBotResponse(
            sessionId=sessionId,
            responseText=text,
            responseType=responseType,
            detectedIntent=intentEnum,
            reasoning=reasoningForDb,
            triggeredByTranscriptId=triggerTranscriptId,
            modelName="agent",
            processingTime=0.0,
            priceCHF=0.0,
            timestamp=getIsoTimestamp(),
        ).model_dump()
        createdResponse = interface.createBotResponse(botResponseData)

        await _emitSessionEvent(sessionId, "botResponse", {
            "id": createdResponse.get("id"),
            "responseText": text,
            "responseType": responseType.value,
            "detectedIntent": intentEnum.value,
            "reasoning": reasoningForDb,
            "modelName": "agent",
            "processingTime": 0.0,
            "priceCHF": 0.0,
            "timestamp": botResponseData.get("timestamp"),
        })

        # Also record the answer as a transcript line so the meeting history
        # shows what the bot said.
        botTranscriptData = TeamsbotTranscript(
            sessionId=sessionId,
            speaker=self.config.botName,
            text=text,
            timestamp=getIsoTimestamp(),
            confidence=1.0,
            language=self.config.language,
            isFinal=True,
            source="botResponse",
        ).model_dump()
        botTranscript = interface.createTranscript(botTranscriptData)

        # Keep the in-memory context buffer and dedupe/follow-up state in sync.
        self._contextBuffer.append({
            "speaker": self.config.botName,
            "text": text,
            "timestamp": getUtcTimestamp(),
            "source": "botResponse",
        })
        self._lastTranscriptSpeaker = self.config.botName
        self._lastTranscriptText = text
        self._lastTranscriptId = botTranscript.get("id")
        self._lastBotResponseText = text.strip().lower()
        self._lastBotResponseTs = time.time()
        # 15s window during which participant replies count as follow-ups.
        self._followUpWindowEnd = time.time() + 15.0

        await _emitSessionEvent(sessionId, "transcript", {
            "id": botTranscript.get("id"),
            "speaker": self.config.botName,
            "text": text,
            "confidence": 1.0,
            "timestamp": getIsoTimestamp(),
            "isContinuation": False,
            "source": "botResponse",
            "speakerResolvedFromHint": False,
        })

        # Bump the session's response counter for the operator dashboard.
        session = interface.getSession(sessionId)
        if session:
            count = session.get("botResponseCount", 0) + 1
            interface.updateSession(sessionId, {"botResponseCount": count})
|
|
|
|
async def _persistInternalDirectorReply(
|
|
self,
|
|
sessionId: str,
|
|
internalNote: str,
|
|
promptId: Optional[str],
|
|
triggerTranscriptId: Optional[str] = None,
|
|
) -> None:
|
|
"""Record a director-prompt agent reply as INTERNAL (operator-UI only).
|
|
|
|
Unlike ``_deliverTextToMeeting`` this never dispatches TTS or chat into
|
|
the meeting, never appends to the meeting context buffer, and does not
|
|
create a meeting transcript line. It only persists a botResponse and
|
|
emits an SSE event so the operator UI shows what the agent decided.
|
|
"""
|
|
from . import interfaceFeatureTeamsbot as interfaceDb
|
|
|
|
note = (internalNote or "").strip()
|
|
if not note:
|
|
return
|
|
|
|
interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId)
|
|
|
|
intentEnum, _intentMeta = _coercePersistedDetectedIntent("agent:directorPrompt")
|
|
reasoningForDb = (
|
|
f"Director prompt {promptId or ''} — silent / internal only "
|
|
f"(not sent to meeting)"
|
|
).strip()
|
|
|
|
botResponseData = TeamsbotBotResponse(
|
|
sessionId=sessionId,
|
|
responseText=note,
|
|
responseType=TeamsbotResponseType.CHAT,
|
|
detectedIntent=intentEnum,
|
|
reasoning=reasoningForDb,
|
|
triggeredByTranscriptId=triggerTranscriptId,
|
|
modelName="agent",
|
|
processingTime=0.0,
|
|
priceCHF=0.0,
|
|
timestamp=getIsoTimestamp(),
|
|
).model_dump()
|
|
createdResponse = interface.createBotResponse(botResponseData)
|
|
|
|
await _emitSessionEvent(sessionId, "botResponse", {
|
|
"id": createdResponse.get("id"),
|
|
"responseText": note,
|
|
"responseType": TeamsbotResponseType.CHAT.value,
|
|
"detectedIntent": intentEnum.value,
|
|
"reasoning": reasoningForDb,
|
|
"modelName": "agent",
|
|
"processingTime": 0.0,
|
|
"priceCHF": 0.0,
|
|
"timestamp": botResponseData.get("timestamp"),
|
|
"internalOnly": True,
|
|
"promptId": promptId,
|
|
})
|
|
|
|
logger.info(
|
|
f"Session {sessionId}: Director prompt {promptId} silent reply "
|
|
f"persisted internally ({len(note)} chars)"
|
|
)
|
|
|
|
# =========================================================================
|
|
# Greeting (AI-localised, no hardcoded language strings)
|
|
# =========================================================================
|
|
|
|
    async def _generateGreetingText(self, languageCode: str) -> str:
        """Generate the bot's join greeting via AI in ``languageCode`` and the
        configured persona. Returns empty string on failure — the caller must
        treat that as 'skip the greeting' (NEVER fall back to a hardcoded
        localised string).

        Args:
            languageCode: BCP-47 target language; falls back to the
                configured session language, then ``en-US``.
        """
        targetLang = (languageCode or self.config.language or "").strip() or "en-US"
        botName = (self.config.botName or "the assistant").strip()
        # Only the first name is spoken — full display names sound stilted in TTS.
        firstName = botName.split(" ")[0] if botName else botName
        persona = (self.config.aiSystemPrompt or "").strip()

        # English instructions to the LLM; the OUTPUT must be in ``targetLang``.
        prompt = (
            f"You are localizing the join greeting for a meeting assistant.\n\n"
            f"Assistant display name (use exactly this, no translation): {firstName}\n\n"
            f"Persona / style guide for the assistant:\n"
            f"{persona or '(no persona configured — use a neutral, polite, professional tone)'}\n\n"
            f"Target spoken language (BCP-47 code): {targetLang}\n\n"
            f"Generate ONE short greeting (max ~14 words) for the assistant "
            f"to say AND post in chat the moment it joins a meeting. The "
            f"greeting MUST:\n"
            f" - be in the target language\n"
            f" - introduce the assistant by name ({firstName})\n"
            f" - signal that it is now present and ready\n"
            f" - sound natural when spoken aloud (this text is also TTS'd)\n\n"
            f"Output ONLY the greeting text, no quotes, no markdown, no "
            f"commentary, no surrounding punctuation beyond what naturally "
            f"belongs to the sentence."
        )

        try:
            aiService = _createAiService(
                self.currentUser, self.mandateId, self.instanceId
            )
            await aiService.ensureAiObjectsInitialized()
            request = AiCallRequest(
                prompt=prompt,
                context="",
                options=AiCallOptions(
                    operationType=OperationTypeEnum.DATA_ANALYSE,
                    # SPEED priority: the greeting must land right after join.
                    priority=PriorityEnum.SPEED,
                ),
            )
            response = await aiService.callAi(request)
        except Exception as aiErr:
            logger.warning(
                f"Greeting generation crashed (lang={targetLang}): {aiErr}"
            )
            return ""

        if not response or response.errorCount != 0 or not response.content:
            logger.warning(
                f"Greeting generation returned empty/error (lang={targetLang})"
            )
            return ""

        text = response.content.strip()
        # Strip any wrapping quotes/code fences the model might have added.
        text = re.sub(r"^```.*?\n", "", text, flags=re.DOTALL)
        text = re.sub(r"\n```\s*$", "", text)
        text = text.strip().strip("\"'`").strip()
        if not text:
            return ""
        logger.info(
            f"Greeting generated (lang={targetLang}, chars={len(text)}): {text[:80]}"
        )
        return text
|
|
|
|
    async def _dispatchGreetingToMeeting(
        self,
        sessionId: str,
        greetingText: str,
        greetingLang: str,
        sendToChat: bool,
        interface: Any,
        voiceInterface: Any,
        websocket: WebSocket,
    ) -> None:
        """Centralised dispatcher for the bot's join greeting: speaks the
        text via TTS into the meeting and (optionally) tells the bot to post
        it in the meeting chat. Persists the greeting as a bot transcript /
        botResponse so it appears in the operator UI history.

        ``sendToChat`` is ``False`` for the legacy ``voiceGreeting`` path
        (the bot already chatted itself) and ``True`` for the new
        ``requestGreeting`` path where the Gateway owns chat dispatch too.

        All failures are caught and logged — a broken greeting must never
        abort session startup.
        """
        try:
            await _emitSessionEvent(sessionId, "ttsDeliveryStatus", {
                "status": "requested",
                "hasWebSocket": True,
                "message": "Greeting TTS requested",
                "timestamp": getIsoTimestamp(),
            })
            cancelHook = self._makeAnswerCancelHook()
            # Serialize with other meeting TTS so chunks don't interleave.
            async with self._meetingTtsLock:
                ttsOutcome = await _speakTextChunked(
                    websocket=websocket,
                    voiceInterface=voiceInterface,
                    sessionId=sessionId,
                    voiceText=_voiceFriendlyMeetingText(greetingText),
                    languageCode=greetingLang,
                    voiceName=self.config.voiceId,
                    isCancelled=cancelHook,
                )
            if ttsOutcome.get("success"):
                logger.info(
                    f"Greeting TTS sent for session {sessionId} "
                    f"(chunks={ttsOutcome.get('chunks')})"
                )
                await _emitSessionEvent(sessionId, "ttsDeliveryStatus", {
                    "status": "dispatched",
                    "hasWebSocket": True,
                    "chunks": ttsOutcome.get("chunks"),
                    "played": ttsOutcome.get("played"),
                    "timestamp": getIsoTimestamp(),
                })
            else:
                logger.warning(
                    f"Greeting TTS failed for session {sessionId}: {ttsOutcome.get('error')}"
                )
                await _emitSessionEvent(sessionId, "ttsDeliveryStatus", {
                    "status": "failed",
                    "hasWebSocket": True,
                    "message": ttsOutcome.get("error"),
                    "timestamp": getIsoTimestamp(),
                })

            if sendToChat:
                try:
                    await websocket.send_text(json.dumps({
                        "type": "sendChatMessage",
                        "sessionId": sessionId,
                        "text": greetingText,
                    }))
                    logger.info(f"Greeting chat dispatch queued for session {sessionId}")
                except Exception as chatErr:
                    # Chat is best-effort; the TTS greeting may still have landed.
                    logger.warning(
                        f"Greeting chat dispatch failed for session {sessionId}: {chatErr}"
                    )

            # Persist the greeting so it appears in the transcript history.
            greetingTranscriptData = TeamsbotTranscript(
                sessionId=sessionId,
                speaker=self.config.botName,
                text=greetingText,
                timestamp=getIsoTimestamp(),
                confidence=1.0,
                language=greetingLang,
                isFinal=True,
                source="botResponse",
            ).model_dump()
            greetingTranscript = interface.createTranscript(greetingTranscriptData)

            # Mirror into the in-memory context buffer + dedupe state.
            self._contextBuffer.append({
                "speaker": self.config.botName,
                "text": greetingText,
                "timestamp": getUtcTimestamp(),
                "source": "botResponse",
            })
            self._lastTranscriptSpeaker = self.config.botName
            self._lastTranscriptText = greetingText
            self._lastTranscriptId = greetingTranscript.get("id")

            await _emitSessionEvent(sessionId, "botResponse", {
                "id": greetingTranscript.get("id"),
                "responseText": greetingText,
                "responseType": TeamsbotResponseType.AUDIO.value,
                "detectedIntent": "greeting",
                "reasoning": "Automatic join greeting",
                "timestamp": getIsoTimestamp(),
            })
            await _emitSessionEvent(sessionId, "transcript", {
                "id": greetingTranscript.get("id"),
                "speaker": self.config.botName,
                "text": greetingText,
                "confidence": 1.0,
                "timestamp": getIsoTimestamp(),
                "isContinuation": False,
                "source": "botResponse",
                "speakerResolvedFromHint": False,
            })
        except Exception as dispatchErr:
            logger.warning(
                f"Greeting dispatch failed for session {sessionId}: {dispatchErr}"
            )
|
|
|
|
# =========================================================================
|
|
# Context Summarization (for long sessions)
|
|
# =========================================================================
|
|
|
|
async def _summarizeSessionContext(self, sessionId: str, rawContext: str) -> str:
|
|
"""Summarize a long user-provided session context to its essential points.
|
|
This reduces token usage in every subsequent AI call."""
|
|
try:
|
|
aiService = _createAiService(self.currentUser, self.mandateId, self.instanceId)
|
|
await aiService.ensureAiObjectsInitialized()
|
|
|
|
request = AiCallRequest(
|
|
prompt=(
|
|
"Fasse den folgenden Kontext auf die wesentlichen Punkte zusammen. "
|
|
"Behalte alle wichtigen Fakten, Namen, Zahlen, Entscheidungen und Aktionspunkte. "
|
|
"Entferne Fuelltext und Wiederholungen. "
|
|
"Antworte NUR mit der Zusammenfassung, keine Erklaerungen oder Einleitungen."
|
|
),
|
|
context=rawContext,
|
|
options=AiCallOptions(
|
|
operationType=OperationTypeEnum.DATA_ANALYSE,
|
|
priority=PriorityEnum.SPEED,
|
|
)
|
|
)
|
|
|
|
response = await aiService.callAi(request)
|
|
if response and response.errorCount == 0 and response.content:
|
|
summary = response.content.strip()
|
|
logger.info(f"Session {sessionId}: Context summarized from {len(rawContext)} to {len(summary)} chars")
|
|
return summary
|
|
except Exception as e:
|
|
logger.warning(f"Session context summarization failed for {sessionId}: {e}")
|
|
|
|
# Fallback: return original (truncated if very long)
|
|
return rawContext[:2000] if len(rawContext) > 2000 else rawContext
|
|
|
|
async def _summarizeContextBuffer(self, sessionId: str):
|
|
"""Summarize the older part of the context buffer to preserve information
|
|
without exceeding the context window. This runs in the background."""
|
|
try:
|
|
if self._contextSummary:
|
|
return # Already summarized recently
|
|
|
|
# Take the older half of the buffer for summarization
|
|
halfPoint = len(self._contextBuffer) // 2
|
|
oldSegments = self._contextBuffer[:halfPoint]
|
|
|
|
if len(oldSegments) < 10:
|
|
return # Not enough to summarize
|
|
|
|
# Build text to summarize
|
|
lines = []
|
|
for seg in oldSegments:
|
|
speaker = seg.get("speaker", "Unknown")
|
|
text = seg.get("text", "")
|
|
lines.append(f"[{speaker}]: {text}")
|
|
textToSummarize = "\n".join(lines)
|
|
|
|
aiService = _createAiService(self.currentUser, self.mandateId, self.instanceId)
|
|
await aiService.ensureAiObjectsInitialized()
|
|
|
|
request = AiCallRequest(
|
|
prompt="Fasse das folgende Meeting-Transkript in 3-5 Saetzen zusammen. Nenne die wichtigsten Themen, Entscheidungen und offene Fragen. Antworte NUR mit der Zusammenfassung, keine Erklaerungen.",
|
|
context=textToSummarize,
|
|
options=AiCallOptions(
|
|
operationType=OperationTypeEnum.DATA_ANALYSE,
|
|
priority=PriorityEnum.SPEED,
|
|
)
|
|
)
|
|
|
|
response = await aiService.callAi(request)
|
|
if response and response.errorCount == 0:
|
|
self._contextSummary = response.content.strip()
|
|
logger.info(f"Session {sessionId}: Context summarized ({len(oldSegments)} segments -> {len(self._contextSummary)} chars)")
|
|
|
|
except Exception as e:
|
|
logger.warning(f"Context summarization failed for session {sessionId}: {e}")
|
|
|
|
# =========================================================================
|
|
# Meeting Summary
|
|
# =========================================================================
|
|
|
|
async def _generateMeetingSummary(self, sessionId: str):
|
|
"""Generate an AI summary of the meeting after it ends."""
|
|
try:
|
|
from . import interfaceFeatureTeamsbot as interfaceDb
|
|
|
|
interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId)
|
|
transcripts = interface.getTranscripts(sessionId)
|
|
|
|
if not transcripts or len(transcripts) < 5:
|
|
return # Not enough content for a summary
|
|
|
|
# Build full transcript
|
|
fullTranscript = "\n".join(
|
|
f"[{t.get('speaker', 'Unknown')}]: {t.get('text', '')}"
|
|
for t in transcripts
|
|
)
|
|
|
|
aiService = _createAiService(self.currentUser, self.mandateId, self.instanceId)
|
|
await aiService.ensureAiObjectsInitialized()
|
|
|
|
request = AiCallRequest(
|
|
prompt="Erstelle eine kurze Zusammenfassung dieses Meeting-Transkripts. Nenne die wichtigsten Punkte, Entscheidungen und offene Aktionspunkte.",
|
|
context=fullTranscript,
|
|
options=AiCallOptions(
|
|
operationType=OperationTypeEnum.DATA_ANALYSE,
|
|
priority=PriorityEnum.BALANCED,
|
|
)
|
|
)
|
|
|
|
response = await aiService.callAi(request)
|
|
if response and response.errorCount == 0:
|
|
interface.updateSession(sessionId, {"summary": response.content})
|
|
logger.info(f"Meeting summary generated for session {sessionId}")
|
|
|
|
except Exception as e:
|
|
logger.error(f"Failed to generate meeting summary for session {sessionId}: {e}")
|