1319 lines
60 KiB
Python
1319 lines
60 KiB
Python
# Copyright (c) 2025 Patrick Motsch
|
|
# All rights reserved.
|
|
"""
|
|
Teamsbot Service - Pipeline Orchestrator.
|
|
Manages the audio processing pipeline: STT -> Context Buffer -> SPEECH_TEAMS -> TTS -> Bridge.
|
|
"""
|
|
|
|
import logging
|
|
import json
|
|
import asyncio
|
|
import time
|
|
import base64
|
|
from typing import Optional, Dict, Any, List
|
|
|
|
from fastapi import WebSocket
|
|
|
|
from modules.datamodels.datamodelUam import User
|
|
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum, PriorityEnum
|
|
from modules.shared.timeUtils import getUtcTimestamp, getIsoTimestamp
|
|
|
|
from .datamodelTeamsbot import (
|
|
TeamsbotSessionStatus,
|
|
TeamsbotTranscript,
|
|
TeamsbotBotResponse,
|
|
TeamsbotResponseType,
|
|
TeamsbotConfig,
|
|
TeamsbotResponseMode,
|
|
TeamsbotResponseChannel,
|
|
SpeechTeamsResponse,
|
|
TeamsbotCommand,
|
|
)
|
|
from .browserBotConnector import BrowserBotConnector
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# =========================================================================
|
|
# Minimal Service Context (for AI billing in bridge callbacks)
|
|
# =========================================================================
|
|
|
|
class _ServiceContext:
|
|
"""Minimal context providing user/mandate info for AiService billing.
|
|
Used by bridge callbacks where a full Services instance is not available."""
|
|
|
|
def __init__(self, user, mandateId, featureInstanceId=None):
|
|
self.user = user
|
|
self.mandateId = mandateId
|
|
self.featureInstanceId = featureInstanceId
|
|
self.featureCode = "teamsbot"
|
|
|
|
|
|
# =========================================================================
|
|
# Session Event Queues (for SSE streaming to frontend)
|
|
# =========================================================================
|
|
_sessionEvents: Dict[str, asyncio.Queue] = {}


async def _emitSessionEvent(sessionId: str, eventType: str, data: Any):
    """Push an event onto the session's SSE queue.

    The queue is created lazily when missing, so an event emitted before
    (or after) explicit queue setup is never silently dropped.
    """
    queue = _sessionEvents.get(sessionId)
    if queue is None:
        queue = asyncio.Queue()
        _sessionEvents[sessionId] = queue
    await queue.put({"type": eventType, "data": data, "timestamp": getIsoTimestamp()})
|
|
|
|
|
|
class TeamsbotService:
|
|
"""
|
|
Pipeline Orchestrator for Teams Bot sessions.
|
|
Coordinates VoiceObjects (STT/TTS), AiService (SPEECH_TEAMS), and Bridge communication.
|
|
"""
|
|
|
|
def __init__(self, currentUser: User, mandateId: str, instanceId: str, config: TeamsbotConfig):
|
|
self.currentUser = currentUser
|
|
self.mandateId = mandateId
|
|
self.instanceId = instanceId
|
|
self.config = config
|
|
self.browserBotConnector = BrowserBotConnector(config._getEffectiveBrowserBotUrl())
|
|
|
|
# State
|
|
self._lastAiCallTime: float = 0.0
|
|
self._contextBuffer: List[Dict[str, Any]] = []
|
|
self._sessionContext: Optional[str] = None # User-provided background context
|
|
self._contextSummary: Optional[str] = None # AI-generated summary of long context
|
|
|
|
# Differential transcript tracking: only write new text, update existing
|
|
# record when the same speaker continues speaking
|
|
self._lastTranscriptSpeaker: Optional[str] = None
|
|
self._lastTranscriptText: Optional[str] = None
|
|
self._lastTranscriptId: Optional[str] = None
|
|
self._recentSpeakerHints: List[Dict[str, Any]] = []
|
|
self._lastBotResponseText: Optional[str] = None
|
|
self._lastBotResponseTs: float = 0.0
|
|
|
|
# =========================================================================
|
|
# Session Lifecycle
|
|
# =========================================================================
|
|
|
|
    async def joinMeeting(
        self,
        sessionId: str,
        meetingLink: str,
        connectionId: Optional[str] = None,
        gatewayBaseUrl: str = "",
        botAccountEmail: Optional[str] = None,
        botAccountPassword: Optional[str] = None,
    ):
        """Send join command to the Browser Bot service.

        The browser bot will:
        1. Launch browser (headful if credentials provided, headless otherwise)
        2. Navigate to Teams web app
        3. Authenticate if credentials provided, otherwise join as anonymous guest
        4. Enable captions/audio capture and start scraping
        5. Connect back via WebSocket to send transcripts

        Args:
            sessionId: Existing session record to join for; must exist in the DB.
            meetingLink: Teams meeting URL the bot should navigate to.
            connectionId: NOTE(review): not referenced anywhere in this method —
                confirm whether callers still need the parameter.
            gatewayBaseUrl: Base URL of this gateway (derived from request.base_url);
                used to build the WebSocket callback URL for the bot.
            botAccountEmail: Optional Teams account email for an authenticated join.
            botAccountPassword: Optional password paired with botAccountEmail.

        On any failure the session is marked ERROR and an SSE statusChange event
        is emitted; exceptions are not propagated to the caller.
        """
        # Local import (presumably avoids a circular import at module load — confirm).
        from . import interfaceFeatureTeamsbot as interfaceDb

        interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId)

        # Initialize SSE event queue (replaces any stale queue from a prior attempt)
        _sessionEvents[sessionId] = asyncio.Queue()

        try:
            # Update status to JOINING
            interface.updateSession(sessionId, {"status": TeamsbotSessionStatus.JOINING.value})
            await _emitSessionEvent(sessionId, "statusChange", {"status": "joining"})

            # Send join command to browser bot
            session = interface.getSession(sessionId)
            if not session:
                raise ValueError(f"Session {sessionId} not found")

            # Build the full WebSocket URL for the bot to connect back to this gateway instance
            # gatewayBaseUrl is passed from the route handler (derived from request.base_url)
            wsScheme = "wss" if gatewayBaseUrl.startswith("https") else "ws"
            gatewayHost = gatewayBaseUrl.replace("https://", "").replace("http://", "").rstrip("/")
            fullGatewayWsUrl = f"{wsScheme}://{gatewayHost}/api/teamsbot/{self.instanceId}/bot/ws/{sessionId}"

            # Both credentials must be present for an authenticated join.
            hasAuth = bool(botAccountEmail and botAccountPassword)
            logger.info(f"Joining meeting for session {sessionId}: auth={hasAuth}, email={botAccountEmail or 'N/A'}, transferMode={self.config.transferMode}")

            result = await self.browserBotConnector.joinMeeting(
                sessionId=sessionId,
                meetingUrl=meetingLink,
                botName=session.get("botName", self.config.botName),
                instanceId=self.instanceId,
                gatewayWsUrl=fullGatewayWsUrl,
                language=self.config.language,
                botAccountEmail=botAccountEmail,
                botAccountPassword=botAccountPassword,
                # hasattr guards keep compatibility with configs predating these fields
                transferMode=self.config.transferMode if hasattr(self.config, 'transferMode') else "auto",
                debugMode=self.config.debugMode if hasattr(self.config, 'debugMode') else False,
            )

            if result.get("success"):
                interface.updateSession(sessionId, {
                    "status": TeamsbotSessionStatus.JOINING.value,  # Will become ACTIVE when bot connects via WS
                })
                logger.info(f"Browser bot deployment started for session {sessionId}")
            else:
                errorMsg = result.get("error", "Unknown error joining meeting")
                interface.updateSession(sessionId, {
                    "status": TeamsbotSessionStatus.ERROR.value,
                    "errorMessage": errorMsg,
                })
                await _emitSessionEvent(sessionId, "statusChange", {"status": "error", "errorMessage": errorMsg})
                logger.error(f"Failed to deploy browser bot for session {sessionId}: {errorMsg}")

        except Exception as e:
            logger.error(f"Error joining meeting for session {sessionId}: {e}")
            interface.updateSession(sessionId, {
                "status": TeamsbotSessionStatus.ERROR.value,
                "errorMessage": str(e),
            })
            await _emitSessionEvent(sessionId, "statusChange", {"status": "error", "errorMessage": str(e)})
|
|
|
|
    async def leaveMeeting(self, sessionId: str):
        """Send leave command to the Browser Bot service.

        Marks the session LEAVING → ENDED (or ERROR on failure), emits SSE
        status events, schedules background summary generation, and finally
        removes the session's SSE event queue.
        """
        # Local import (presumably avoids a circular import at module load — confirm).
        from . import interfaceFeatureTeamsbot as interfaceDb

        interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId)

        try:
            interface.updateSession(sessionId, {"status": TeamsbotSessionStatus.LEAVING.value})
            await _emitSessionEvent(sessionId, "statusChange", {"status": "leaving"})

            await self.browserBotConnector.leaveMeeting(sessionId)

            interface.updateSession(sessionId, {
                "status": TeamsbotSessionStatus.ENDED.value,
                "endedAt": getIsoTimestamp(),
            })
            await _emitSessionEvent(sessionId, "statusChange", {"status": "ended"})

            # Generate meeting summary in background
            asyncio.create_task(self._generateMeetingSummary(sessionId))

            logger.info(f"Bot left meeting for session {sessionId}")

        except Exception as e:
            logger.error(f"Error leaving meeting for session {sessionId}: {e}")
            interface.updateSession(sessionId, {
                "status": TeamsbotSessionStatus.ERROR.value,
                "errorMessage": str(e),
                "endedAt": getIsoTimestamp(),
            })

        # Cleanup event queue (runs on both the success and error paths).
        # NOTE(review): the summary task started above may still emit events
        # afterwards; _emitSessionEvent recreates the queue on demand, which
        # would leave an orphaned queue behind — confirm that is intended.
        _sessionEvents.pop(sessionId, None)
|
|
|
|
# =========================================================================
|
|
# Browser Bot WebSocket Communication
|
|
# =========================================================================
|
|
|
|
    async def handleBotWebSocket(self, websocket: WebSocket, sessionId: str):
        """
        Main WebSocket handler for Browser Bot communication.

        Runs a receive loop until the socket disconnects, dispatching on the
        JSON message "type" field:

        Receives:
        - transcript: Caption text scraped from Teams (name resolution; STT is the transcript source)
        - chatMessage: Teams chat message (live or pre-join history)
        - status: Bot state changes (joined, in_lobby, left, error)
        - audioChunk: Base64 PCM audio from WebRTC capture (fed to STT)
        - voiceGreeting: Text the bot should speak on join (converted via TTS)
        - ping: Keepalive (answered with pong)
        - ttsPlaybackAck: Bot-side playback status, forwarded to the SSE stream

        Sends:
        - playAudio: TTS audio for the bot to play in the meeting
        - pong: Keepalive reply
        """
        # Local imports (presumably avoid circular imports at module load — confirm).
        from . import interfaceFeatureTeamsbot as interfaceDb
        from modules.interfaces.interfaceVoiceObjects import getVoiceInterface

        interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId)
        voiceInterface = getVoiceInterface(self.currentUser, self.mandateId)

        # Load session context (user-provided background knowledge)
        # If the context is long (>500 chars), summarize it to reduce token usage
        session = interface.getSession(sessionId)
        if session:
            rawContext = session.get("sessionContext")
            if rawContext and len(rawContext) > 500:
                logger.info(f"Session {sessionId}: Summarizing long session context ({len(rawContext)} chars)...")
                self._sessionContext = await self._summarizeSessionContext(sessionId, rawContext)
            elif rawContext:
                self._sessionContext = rawContext
            if self._sessionContext:
                logger.info(f"Session {sessionId}: Session context ready ({len(self._sessionContext)} chars)")

        # Resolve system bot email for speaker detection (prevents bot from triggering AI on own speech)
        try:
            systemBot = interface.getActiveSystemBot(self.mandateId)
            self._botAccountEmail = systemBot.get("email") if systemBot else None
            if self._botAccountEmail:
                logger.info(f"Session {sessionId}: Bot account email resolved: {self._botAccountEmail}")
        except Exception:
            # Best-effort: without an email, _isBotSpeaker falls back to the bot name.
            self._botAccountEmail = None

        logger.info(f"[WS] Handler started for session {sessionId}")

        try:
            msgCount = 0
            while True:
                data = await websocket.receive()
                msgCount += 1

                # Only text frames carry the JSON protocol; skip anything else.
                if "text" not in data:
                    logger.debug(f"[WS] session={sessionId} msg #{msgCount}: non-text data (keys: {list(data.keys())})")
                    continue

                message = json.loads(data["text"])
                msgType = message.get("type")
                # High-frequency message types are excluded from info-level logging.
                if msgType not in ("audioChunk", "ping"):
                    logger.info(f"[WS] session={sessionId} msg #{msgCount}: type={msgType}")

                if msgType == "transcript":
                    transcript = message.get("transcript", {})
                    source = transcript.get("source", "caption")
                    speaker = transcript.get("speaker", "Unknown")
                    textPreview = (transcript.get("text", "") or "")[:60]
                    # Caption/speakerHint: name resolution only; transcript comes from STT
                    logger.info(f"[WS] Transcript (source={source}, speaker={speaker}): {textPreview}...")
                    await self._processTranscript(
                        sessionId=sessionId,
                        speaker=transcript.get("speaker", "Unknown"),
                        text=transcript.get("text", ""),
                        isFinal=transcript.get("isFinal", True),
                        interface=interface,
                        voiceInterface=voiceInterface,
                        websocket=websocket,
                        source=source,
                    )

                elif msgType == "chatMessage":
                    chat = message.get("chat", {})
                    # isHistory marks messages sent before the bot joined.
                    isHistory = chat.get("isHistory", False)
                    source = "chatHistory" if isHistory else "chat"
                    logger.info(
                        f"[WS] Chat{'[HISTORY]' if isHistory else ''}: "
                        f"speaker={chat.get('speaker')}, text={chat.get('text', '')[:60]}..."
                    )
                    await self._processTranscript(
                        sessionId=sessionId,
                        speaker=chat.get("speaker", "Unknown"),
                        text=chat.get("text", ""),
                        isFinal=True,
                        interface=interface,
                        voiceInterface=voiceInterface,
                        websocket=websocket,
                        source=source,
                    )

                elif msgType == "status":
                    status = message.get("status")
                    errorMessage = message.get("message")
                    logger.info(f"[WS] Status: status={status}, message={errorMessage}")
                    await self._handleBotStatus(sessionId, status, errorMessage, interface)

                elif msgType == "audioChunk":
                    audioData = message.get("audio", {})
                    audioBase64 = audioData.get("data", "")
                    sampleRate = audioData.get("sampleRate", 16000)
                    captureDiagnostics = audioData.get("captureDiagnostics") or {}
                    if audioBase64:
                        await self._processAudioChunk(
                            sessionId=sessionId,
                            audioBase64=audioBase64,
                            sampleRate=sampleRate,
                            captureDiagnostics=captureDiagnostics,
                            interface=interface,
                            voiceInterface=voiceInterface,
                            websocket=websocket,
                        )

                elif msgType == "voiceGreeting":
                    # Bot asks the gateway to synthesize a spoken greeting.
                    greetingText = message.get("text", "")
                    greetingLang = message.get("language", self.config.language)
                    logger.info(f"[WS] Voice greeting: text={greetingText[:60]}..., language={greetingLang}")
                    if greetingText and voiceInterface:
                        try:
                            ttsResult = await voiceInterface.textToSpeech(
                                text=greetingText,
                                languageCode=greetingLang,
                                voiceName=self.config.voiceId
                            )
                            if ttsResult and isinstance(ttsResult, dict):
                                audioContent = ttsResult.get("audioContent")
                                if audioContent:
                                    # audioContent may be bytes or str; normalize to bytes before base64.
                                    await websocket.send_text(json.dumps({
                                        "type": "playAudio",
                                        "sessionId": sessionId,
                                        "audio": {
                                            "data": base64.b64encode(audioContent if isinstance(audioContent, bytes) else audioContent.encode()).decode(),
                                            "format": "mp3",
                                        }
                                    }))
                                    logger.info(f"Voice greeting TTS sent for session {sessionId}")
                        except Exception as ttsErr:
                            # A failed greeting must not terminate the WS loop.
                            logger.warning(f"Voice greeting TTS failed for session {sessionId}: {ttsErr}")

                elif msgType == "ping":
                    await websocket.send_text(json.dumps({"type": "pong"}))

                elif msgType == "ttsPlaybackAck":
                    # Bot reports whether it managed to play the TTS audio; forward to SSE.
                    playback = message.get("playback", {}) or {}
                    status = playback.get("status", "unknown")
                    ackMessage = playback.get("message") or "Bot playback status update"
                    logger.info(
                        f"[WS] TTS playback ack: status={status}, format={playback.get('format')}, "
                        f"bytesBase64={playback.get('bytesBase64')}"
                    )
                    await _emitSessionEvent(sessionId, "ttsDeliveryStatus", {
                        "status": f"playback_{status}",
                        "hasWebSocket": True,
                        "message": ackMessage,
                        "timestamp": playback.get("timestamp") or getIsoTimestamp(),
                        "format": playback.get("format"),
                        "bytesBase64": playback.get("bytesBase64"),
                    })

        except Exception as e:
            # WebSocket disconnects surface as exceptions; only log real errors.
            if "disconnect" not in str(e).lower():
                logger.error(f"[WS] Error for session {sessionId}: {type(e).__name__}: {e}")

        logger.info(f"[WS] Handler ended for session {sessionId} after {msgCount} messages")
|
|
|
|
async def _handleBotStatus(
|
|
self,
|
|
sessionId: str,
|
|
status: str,
|
|
errorMessage: Optional[str],
|
|
interface,
|
|
):
|
|
"""Handle status updates from the browser bot."""
|
|
logger.info(f"Bot status update for session {sessionId}: {status}")
|
|
|
|
statusMap = {
|
|
"connecting": TeamsbotSessionStatus.JOINING.value,
|
|
"launching": TeamsbotSessionStatus.JOINING.value,
|
|
"navigating": TeamsbotSessionStatus.JOINING.value,
|
|
"in_lobby": TeamsbotSessionStatus.JOINING.value,
|
|
"joined": TeamsbotSessionStatus.ACTIVE.value,
|
|
"in_meeting": TeamsbotSessionStatus.ACTIVE.value,
|
|
"left": TeamsbotSessionStatus.ENDED.value,
|
|
"error": TeamsbotSessionStatus.ERROR.value,
|
|
}
|
|
|
|
dbStatus = statusMap.get(status, TeamsbotSessionStatus.ACTIVE.value)
|
|
|
|
updates = {"status": dbStatus}
|
|
if errorMessage:
|
|
updates["errorMessage"] = errorMessage
|
|
if dbStatus == TeamsbotSessionStatus.ACTIVE.value:
|
|
updates["startedAt"] = getIsoTimestamp()
|
|
elif dbStatus in [TeamsbotSessionStatus.ENDED.value, TeamsbotSessionStatus.ERROR.value]:
|
|
updates["endedAt"] = getIsoTimestamp()
|
|
|
|
interface.updateSession(sessionId, updates)
|
|
await _emitSessionEvent(sessionId, "statusChange", {"status": status, "errorMessage": errorMessage})
|
|
|
|
# Generate summary when session ends
|
|
if dbStatus == TeamsbotSessionStatus.ENDED.value:
|
|
asyncio.create_task(self._generateMeetingSummary(sessionId))
|
|
|
|
async def _processAudioChunk(
|
|
self,
|
|
sessionId: str,
|
|
audioBase64: str,
|
|
sampleRate: int,
|
|
captureDiagnostics: Optional[Dict[str, Any]],
|
|
interface,
|
|
voiceInterface,
|
|
websocket: WebSocket,
|
|
):
|
|
"""Process an audio chunk from WebRTC capture — run STT and feed into transcript pipeline."""
|
|
import base64
|
|
try:
|
|
audioBytes = base64.b64decode(audioBase64)
|
|
if len(audioBytes) < 1000:
|
|
return
|
|
|
|
if captureDiagnostics:
|
|
trackId = captureDiagnostics.get("trackId")
|
|
readyState = captureDiagnostics.get("readyState")
|
|
rms = captureDiagnostics.get("rms")
|
|
nativeSampleRate = captureDiagnostics.get("nativeSampleRate")
|
|
logger.debug(
|
|
f"[AudioChunk] diagnostics: track={trackId}, readyState={readyState}, "
|
|
f"rms={rms}, nativeRate={nativeSampleRate}, bytes={len(audioBytes)}"
|
|
)
|
|
|
|
# Use RMS from capture diagnostics to skip real silence.
|
|
# Byte-variation heuristics produced false positives and dropped valid speech.
|
|
if captureDiagnostics and captureDiagnostics.get("rms") is not None:
|
|
try:
|
|
rmsVal = float(captureDiagnostics.get("rms"))
|
|
if rmsVal < 0.0015:
|
|
logger.debug(f"[AudioChunk] Skipping silent audio ({len(audioBytes)} bytes, rms={rmsVal:.6f})")
|
|
return
|
|
except Exception:
|
|
pass
|
|
|
|
if not voiceInterface:
|
|
logger.warning(f"[AudioChunk] No voice interface available for session {sessionId}")
|
|
return
|
|
|
|
# Treat sampleRate=0 as unknown (triggers auto-detection)
|
|
effectiveSampleRate = sampleRate if sampleRate and sampleRate > 0 else None
|
|
|
|
sttResult = await voiceInterface.speechToText(
|
|
audioContent=audioBytes,
|
|
language=self.config.language or "de-DE",
|
|
sampleRate=effectiveSampleRate,
|
|
channels=1,
|
|
skipFallbacks=True,
|
|
)
|
|
|
|
if sttResult and sttResult.get("success") and sttResult.get("text"):
|
|
text = sttResult["text"].strip()
|
|
if text:
|
|
resolvedSpeaker = self._resolveSpeakerForAudioCapture()
|
|
fromCaption = resolvedSpeaker.get("speakerResolvedFromHint", False)
|
|
logger.info(
|
|
f"[AudioChunk] STT result: speaker={resolvedSpeaker.get('speaker', 'Meeting Audio')} "
|
|
f"(fromCaption={fromCaption}), text={text[:80]}..."
|
|
)
|
|
await self._processTranscript(
|
|
sessionId=sessionId,
|
|
speaker=resolvedSpeaker["speaker"],
|
|
text=text,
|
|
isFinal=True,
|
|
interface=interface,
|
|
voiceInterface=voiceInterface,
|
|
websocket=websocket,
|
|
source="audioCapture",
|
|
speakerResolvedFromHint=resolvedSpeaker["speakerResolvedFromHint"],
|
|
)
|
|
except Exception as e:
|
|
logger.error(f"[AudioChunk] STT error for session {sessionId}: {type(e).__name__}: {e}")
|
|
|
|
def _registerSpeakerHint(self, speaker: str, text: str):
|
|
"""Store recent speaker hints from captions for audio-mode speaker attribution."""
|
|
if not speaker:
|
|
return
|
|
normalizedSpeaker = speaker.strip()
|
|
if not normalizedSpeaker or self._isBotSpeaker(normalizedSpeaker):
|
|
return
|
|
|
|
self._recentSpeakerHints.append({
|
|
"speaker": normalizedSpeaker,
|
|
"text": (text or "").strip(),
|
|
"timestamp": time.time(),
|
|
})
|
|
|
|
# Keep only the latest 20 hints
|
|
if len(self._recentSpeakerHints) > 20:
|
|
self._recentSpeakerHints = self._recentSpeakerHints[-20:]
|
|
|
|
def _resolveSpeakerForAudioCapture(self) -> Dict[str, Any]:
|
|
"""Best-effort speaker name for audio chunks using recent caption hints."""
|
|
if not self._recentSpeakerHints:
|
|
return {"speaker": "Meeting Audio", "speakerResolvedFromHint": False}
|
|
|
|
nowTs = time.time()
|
|
# Prefer very recent hints to reduce wrong attribution
|
|
for hint in reversed(self._recentSpeakerHints):
|
|
hintAge = nowTs - hint.get("timestamp", 0)
|
|
if hintAge <= 15:
|
|
return {
|
|
"speaker": hint.get("speaker", "Meeting Audio"),
|
|
"speakerResolvedFromHint": True,
|
|
}
|
|
|
|
return {"speaker": "Meeting Audio", "speakerResolvedFromHint": False}
|
|
|
|
    async def _processTranscript(
        self,
        sessionId: str,
        speaker: str,
        text: str,
        isFinal: bool,
        interface,
        voiceInterface,
        websocket: WebSocket,
        source: str = "caption",
        speakerResolvedFromHint: Optional[bool] = None,
    ):
        """Process a transcript segment from captions or chat messages.

        Source handling:
        - "caption"/"speakerHint": name-resolution signals only, never persisted
          as transcript; a speakerHint may still trigger AI address detection.
        - "chatHistory": persisted + buffered, but never triggers AI.
        - "chat"/"audioCapture": persisted (with differential writing), emitted
          to the UI, and may trigger AI analysis.

        Differential writing: When the same speaker continues (text grows
        incrementally as captions stream), we UPDATE the existing DB record
        instead of creating a cascade of near-duplicate rows. A new record
        is only created when the speaker changes or the text is not a
        continuation of the previous segment.
        """

        text = text.strip()
        if not text:
            return

        # Speaker hints are lightweight caption-derived signals used for
        # speaker attribution only. Caption text is NOT used as transcript
        # (transcript comes from STT/audioCapture or chat).
        # For address detection we still allow transient analysis from
        # speaker hints (without DB write), otherwise direct calls like
        # "Nyla, hörst du mich?" can be missed when audio capture is silent.
        if source in ("caption", "speakerHint"):
            self._registerSpeakerHint(speaker, text)
            # Do NOT emit caption text as transcript to UI; caption is for name resolution only.

            if (
                source == "speakerHint"
                and isFinal
                and self.config.responseMode != TeamsbotResponseMode.TRANSCRIBE_ONLY
            ):
                # Keep hint text only in volatile context (not persisted).
                self._contextBuffer.append({
                    "speaker": speaker or "Unknown",
                    "text": text,
                    "timestamp": getUtcTimestamp(),
                    "source": "speakerHint",
                })
                maxSegments = self.config.contextWindowSegments
                if len(self._contextBuffer) > maxSegments:
                    self._contextBuffer = self._contextBuffer[-maxSegments:]

                # allowPeriodic=False: a hint may only trigger on a direct address.
                shouldTriggerFromHint = self._shouldTriggerAnalysis(text, allowPeriodic=False)
                logger.debug(
                    f"Session {sessionId}: speakerHint shouldTriggerAnalysis={shouldTriggerFromHint}, "
                    f"bufferSize={len(self._contextBuffer)}"
                )
                if shouldTriggerFromHint:
                    logger.info(
                        f"Session {sessionId}: Triggering AI analysis from speakerHint "
                        f"(buffer: {len(self._contextBuffer)} segments)"
                    )
                    await self._analyzeAndRespond(
                        sessionId,
                        interface,
                        voiceInterface,
                        websocket,
                        # id=None: there is no persisted transcript row for a hint.
                        {"id": None, "speaker": speaker, "text": text, "source": source},
                    )
            return

        # Chat history: messages sent before the bot joined the meeting.
        # Stored in DB and context (available if someone refers to chat history)
        # but never used to trigger AI responses.
        if source == "chatHistory":
            transcriptData = TeamsbotTranscript(
                sessionId=sessionId,
                speaker=speaker,
                text=text,
                timestamp=getIsoTimestamp(),
                confidence=1.0,
                language=self.config.language,
                isFinal=True,
            ).model_dump()
            createdTranscript = interface.createTranscript(transcriptData)

            self._contextBuffer.append({
                "speaker": speaker or "Unknown",
                "text": text,
                "timestamp": getUtcTimestamp(),
                "source": "chatHistory",
            })
            maxSegments = self.config.contextWindowSegments
            if len(self._contextBuffer) > maxSegments:
                self._contextBuffer = self._contextBuffer[-maxSegments:]

            await _emitSessionEvent(sessionId, "transcript", {
                "id": createdTranscript.get("id"),
                "speaker": speaker,
                "text": text,
                "confidence": 1.0,
                "timestamp": getIsoTimestamp(),
                "isContinuation": False,
                "source": "chatHistory",
                "isHistory": True,
            })
            logger.debug(f"Session {sessionId}: Chat history stored (no AI trigger): [{speaker}] {text[:60]}")
            return

        # Filter out the bot's own speech entirely — captions of the bot's
        # own voice come back as garbled text (e.g. German TTS → English caption)
        # which pollutes the context buffer and confuses AI analysis.
        isBotSpeaker = self._isBotSpeaker(speaker)
        if isBotSpeaker:
            logger.debug(f"Session {sessionId}: Ignoring own bot caption from: [{speaker}] {text[:80]}...")
            return

        # Differential transcript writing:
        # If the same speaker is still talking and the new text is a
        # continuation (starts with the previous text), UPDATE the existing
        # record instead of creating a new one. This avoids cascading rows like:
        #   "Der AHV"
        #   "Der AHV Fonds"
        #   "Der AHV Fonds hat 2025"
        # and instead keeps a single row that grows until the speaker changes.
        # NOTE(review): "caption" segments return early above, so listing
        # "caption" here looks like a legacy leftover — confirm.
        isContinuation = (
            self._lastTranscriptSpeaker == speaker
            and self._lastTranscriptText
            and self._lastTranscriptId
            and text.startswith(self._lastTranscriptText)
            and source in ("caption", "audioCapture")
        )

        if isContinuation:
            interface.updateTranscript(self._lastTranscriptId, {
                "text": text,
                "isFinal": isFinal,
            })
            self._lastTranscriptText = text
            createdTranscript = {"id": self._lastTranscriptId}

            # Update context buffer: replace last entry for same speaker
            if self._contextBuffer and self._contextBuffer[-1].get("speaker") == speaker:
                self._contextBuffer[-1]["text"] = text
        else:
            # New speaker or non-continuation → create a new record
            transcriptData = TeamsbotTranscript(
                sessionId=sessionId,
                speaker=speaker,
                text=text,
                timestamp=getIsoTimestamp(),
                confidence=1.0,
                language=self.config.language,
                isFinal=isFinal,
            ).model_dump()

            createdTranscript = interface.createTranscript(transcriptData)

            # Track for differential writing
            self._lastTranscriptSpeaker = speaker
            self._lastTranscriptText = text
            self._lastTranscriptId = createdTranscript.get("id")

            # Append to context buffer
            self._contextBuffer.append({
                "speaker": speaker or "Unknown",
                "text": text,
                "timestamp": getUtcTimestamp(),
                "source": source,
            })

            # Keep only last N segments
            maxSegments = self.config.contextWindowSegments
            if len(self._contextBuffer) > maxSegments:
                # Summarize the overflowing history once before trimming.
                if not self._contextSummary and len(self._contextBuffer) > maxSegments * 1.5:
                    asyncio.create_task(self._summarizeContextBuffer(sessionId))
                self._contextBuffer = self._contextBuffer[-maxSegments:]

            # Update session transcript count (only for new records)
            session = interface.getSession(sessionId)
            if session:
                count = session.get("transcriptSegmentCount", 0) + 1
                interface.updateSession(sessionId, {"transcriptSegmentCount": count})

        # Emit SSE event for live transcript (always, for UI updates)
        await _emitSessionEvent(sessionId, "transcript", {
            "id": createdTranscript.get("id"),
            "speaker": speaker,
            "text": text,
            "confidence": 1.0,
            "timestamp": getIsoTimestamp(),
            "isContinuation": isContinuation,
            "source": source,
            "speakerResolvedFromHint": (
                speakerResolvedFromHint
                if speakerResolvedFromHint is not None
                else False
            ),
        })

        # Check if AI analysis should be triggered (only for final transcripts)
        if not isFinal:
            return

        if self.config.responseMode == TeamsbotResponseMode.TRANSCRIBE_ONLY:
            logger.debug(f"Session {sessionId}: responseMode=TRANSCRIBE_ONLY, skipping AI analysis")
            return

        shouldTrigger = self._shouldTriggerAnalysis(text)
        logger.debug(f"Session {sessionId}: shouldTriggerAnalysis={shouldTrigger}, bufferSize={len(self._contextBuffer)}, responseMode={self.config.responseMode}")

        if not shouldTrigger:
            return

        # SPEECH_TEAMS AI analysis
        logger.info(f"Session {sessionId}: Triggering AI analysis (buffer: {len(self._contextBuffer)} segments)")
        await self._analyzeAndRespond(sessionId, interface, voiceInterface, websocket, createdTranscript)
|
|
|
|
def _isBotSpeaker(self, speaker: str) -> bool:
|
|
"""Check if a transcript speaker is the bot itself.
|
|
|
|
Teams captions show the bot as e.g. "BotName (Unverified)" or
|
|
"Nyla Larsson" depending on auth/anonymous join. We match against:
|
|
- The configured/derived bot name
|
|
- The bot account display name if authenticated
|
|
"""
|
|
if not speaker:
|
|
return False
|
|
|
|
speakerLower = speaker.lower().strip()
|
|
|
|
# Match against configured bot name
|
|
botName = self.config.botName.lower().strip()
|
|
if botName and botName in speakerLower:
|
|
return True
|
|
|
|
# Match against bot account email prefix (e.g. "nyla.larsson" from "nyla.larsson@poweron.swiss")
|
|
botAccountEmail = getattr(self, '_botAccountEmail', None) or getattr(self.config, 'botAccountEmail', None)
|
|
if botAccountEmail:
|
|
emailPrefix = botAccountEmail.split("@")[0].lower().replace(".", " ")
|
|
if emailPrefix in speakerLower:
|
|
return True
|
|
|
|
return False
|
|
|
|
def _shouldTriggerAnalysis(self, transcriptText: str, allowPeriodic: bool = True) -> bool:
|
|
"""
|
|
Decide whether to trigger AI analysis based on the latest transcript.
|
|
Triggers:
|
|
- Bot name mentioned (immediate)
|
|
- Periodic interval elapsed
|
|
- Cooldown respected
|
|
"""
|
|
now = time.time()
|
|
timeSinceLastCall = now - self._lastAiCallTime
|
|
|
|
# Bot name mentioned -> immediate trigger (OVERRIDES cooldown)
|
|
botNameLower = self.config.botName.lower()
|
|
transcriptLower = transcriptText.lower()
|
|
if botNameLower in transcriptLower:
|
|
logger.info(f"Trigger: Bot name '{self.config.botName}' detected in transcript (overrides cooldown): '{transcriptText[:60]}...'")
|
|
return True
|
|
|
|
# Also check first name and phonetically similar words (speech recognition artifacts)
|
|
botFirstName = botNameLower.split()[0] if " " in botNameLower else botNameLower
|
|
if len(botFirstName) >= 3:
|
|
for word in transcriptLower.split():
|
|
# Strip punctuation from word
|
|
cleanWord = word.strip(".,!?:;\"'()[]")
|
|
if not cleanWord or len(cleanWord) < 3:
|
|
continue
|
|
# Exact first name match
|
|
if cleanWord == botFirstName:
|
|
logger.info(f"Trigger: Bot first name '{botFirstName}' detected: '{transcriptText[:60]}...'")
|
|
return True
|
|
# Simple phonetic similarity: same first letter, similar length, high character overlap
|
|
if cleanWord[0] == botFirstName[0] and abs(len(cleanWord) - len(botFirstName)) <= 2:
|
|
common = sum(1 for c in set(botFirstName) if c in cleanWord)
|
|
similarity = common / max(len(set(botFirstName)), len(set(cleanWord)))
|
|
if similarity >= 0.6:
|
|
logger.info(f"Trigger: Phonetically similar to '{botFirstName}' -> '{cleanWord}' (sim={similarity:.2f}): '{transcriptText[:60]}...'")
|
|
return True
|
|
|
|
# Cooldown check (only for non-name triggers)
|
|
if timeSinceLastCall < self.config.triggerCooldownSeconds:
|
|
logger.debug(f"Trigger: Cooldown active ({timeSinceLastCall:.1f}s < {self.config.triggerCooldownSeconds}s)")
|
|
return False
|
|
|
|
# Periodic trigger
|
|
if allowPeriodic and timeSinceLastCall >= self.config.triggerIntervalSeconds:
|
|
logger.info(f"Trigger: Periodic interval ({self.config.triggerIntervalSeconds}s) elapsed ({timeSinceLastCall:.1f}s since last call)")
|
|
return True
|
|
|
|
logger.debug(f"Trigger: No trigger ({timeSinceLastCall:.1f}s / {self.config.triggerIntervalSeconds}s interval)")
|
|
return False
|
|
|
|
async def _analyzeAndRespond(
    self,
    sessionId: str,
    interface,
    voiceInterface,
    websocket: WebSocket,
    triggerTranscript: Dict[str, Any],
):
    """Run SPEECH_TEAMS AI analysis and respond if needed.

    Steps:
      1. Build a transcript context string from the in-memory buffer,
         marking the bot's own utterances and chat messages, and
         prepending the optional user-provided session context and an
         earlier-conversation summary.
      2. Call the SPEECH_TEAMS operation via AiService and parse its
         structured JSON reply (with a markdown-code-fence fallback).
      3. Emit an 'analysis' SSE event (always, for debug/UI).
      4. Act on the result: stop audio, suggest only (manual mode), or
         deliver the response via voice (TTS over the bot websocket)
         and/or meeting chat, then persist it and bump the session's
         response counter.
      5. Forward any structured AI commands to _executeCommands().

    Args:
        sessionId: Teamsbot session identifier.
        interface: Feature DB interface (createBotResponse / getSession /
            updateSession) -- project type.
        voiceInterface: Voice interface exposing textToSpeech() -- project
            type.
        websocket: Browser-bot control websocket; may be None (fallback
            mode), in which case audio/chat cannot be delivered.
        triggerTranscript: The transcript segment (dict) that triggered
            this analysis; its "id" is stored with the bot response.
    """
    # Stamp the call time so cooldown/interval trigger checks restart.
    self._lastAiCallTime = time.time()

    # Build transcript context from buffer.
    # Mark bot's own utterances and chat messages for the AI.
    contextLines = []
    for segment in self._contextBuffer:
        speaker = segment.get("speaker", "Unknown")
        text = segment.get("text", "")
        segSource = segment.get("source", "caption")
        # Chat messages get a "Chat:" speaker prefix; captions get none.
        prefix = "Chat" if segSource == "chat" else ""
        if self._isBotSpeaker(speaker):
            contextLines.append(f"[YOU ({self.config.botName})]: {text}")
        elif prefix:
            contextLines.append(f"[{prefix}: {speaker}]: {text}")
        else:
            contextLines.append(f"[{speaker}]: {text}")

    # Include session context if provided by the user at session start
    sessionContextStr = ""
    if self._sessionContext:
        sessionContextStr = f"\nSESSION_CONTEXT (background knowledge provided by the user):\n{self._sessionContext}\n"

    # Include summary of earlier conversation if available
    summaryStr = ""
    if self._contextSummary:
        summaryStr = f"\nEARLIER_CONVERSATION_SUMMARY:\n{self._contextSummary}\n"

    transcriptContext = f"BOT_NAME:{self.config.botName}{sessionContextStr}{summaryStr}\nRECENT_TRANSCRIPT:\n" + "\n".join(contextLines)

    # Call SPEECH_TEAMS
    try:
        # Lazy import -- presumably avoids a module-level import cycle
        # with the AI service; confirm.
        from modules.services.serviceAi.mainServiceAi import AiService

        # Create minimal service context for AI billing
        serviceContext = _ServiceContext(self.currentUser, self.mandateId, self.instanceId)
        aiService = AiService(serviceCenter=serviceContext)
        await aiService.ensureAiObjectsInitialized()

        request = AiCallRequest(
            prompt=self.config.aiSystemPrompt,
            context=transcriptContext,
            options=AiCallOptions(
                operationType=OperationTypeEnum.SPEECH_TEAMS,
                # SPEED: low latency matters more than model quality here.
                priority=PriorityEnum.SPEED,
            )
        )

        response = await aiService.callAi(request)

        # Parse structured response
        try:
            speechResult = SpeechTeamsResponse.model_validate_json(response.content)
        except Exception:
            # Try to extract JSON from response content
            # (models often wrap JSON in markdown code fences).
            try:
                jsonStr = response.content
                if "```json" in jsonStr:
                    jsonStr = jsonStr.split("```json")[1].split("```")[0]
                elif "```" in jsonStr:
                    jsonStr = jsonStr.split("```")[1].split("```")[0]
                speechResult = SpeechTeamsResponse.model_validate_json(jsonStr.strip())
            except Exception as parseErr:
                logger.warning(f"Failed to parse SPEECH_TEAMS response: {parseErr}")
                # Fall back to a no-op result so the pipeline keeps running.
                speechResult = SpeechTeamsResponse(
                    shouldRespond=False,
                    reasoning=f"Parse error: {str(parseErr)[:100]}",
                    detectedIntent="none"
                )

        logger.info(
            f"SPEECH_TEAMS result: shouldRespond={speechResult.shouldRespond}, "
            f"intent={speechResult.detectedIntent}, "
            f"reasoning={speechResult.reasoning[:80]}..."
        )

        # Emit analysis event (always, for debug/UI)
        await _emitSessionEvent(sessionId, "analysis", {
            "shouldRespond": speechResult.shouldRespond,
            "detectedIntent": speechResult.detectedIntent,
            "reasoning": speechResult.reasoning,
            "modelName": response.modelName,
            "processingTime": response.processingTime,
            "priceCHF": response.priceCHF,
        })

        # Step 4a: Handle STOP intent -- stop audio immediately
        if speechResult.detectedIntent == "stop":
            logger.info(f"Session {sessionId}: AI detected STOP intent: {speechResult.reasoning}")
            if websocket:
                try:
                    await websocket.send_text(json.dumps({
                        "type": "stopAudio",
                        "sessionId": sessionId,
                    }))
                except Exception as stopErr:
                    logger.warning(f"Failed to send stop command: {stopErr}")
            # A stop intent never produces a response of its own.
            return

        # Step 4b: Respond if AI decided to
        if speechResult.shouldRespond and speechResult.responseText:

            if self.config.responseMode == TeamsbotResponseMode.MANUAL:
                # In manual mode, suggest but don't send
                await _emitSessionEvent(sessionId, "suggestedResponse", {
                    "responseText": speechResult.responseText,
                    "detectedIntent": speechResult.detectedIntent,
                    "reasoning": speechResult.reasoning,
                })
                return

            # Determine response channel (voice, chat, or both)
            # Extract the raw value: enum.value gives "voice", str(enum) gives "TeamsbotResponseChannel.voice"
            channelRaw = self.config.responseChannel
            channelStr = (channelRaw.value if hasattr(channelRaw, 'value') else str(channelRaw)).lower().strip()
            logger.info(f"Response channel: '{channelStr}' (raw={channelRaw!r})")

            sendVoice = channelStr in ("voice", "both")
            sendChat = channelStr in ("chat", "both")

            if sendVoice and sendChat:
                responseType = TeamsbotResponseType.BOTH
            elif sendVoice:
                responseType = TeamsbotResponseType.AUDIO
            else:
                responseType = TeamsbotResponseType.CHAT

            # Suppress duplicate responses in short windows ("repeat loop" protection).
            normalizedResponse = (speechResult.responseText or "").strip().lower()
            nowTs = time.time()
            if (
                normalizedResponse
                and self._lastBotResponseText == normalizedResponse
                and (nowTs - self._lastBotResponseTs) < 90
            ):
                logger.info(f"Session {sessionId}: Suppressing duplicate bot response within 90s window")
                # Report the suppression through the same 'analysis' channel.
                await _emitSessionEvent(sessionId, "analysis", {
                    "shouldRespond": False,
                    "detectedIntent": speechResult.detectedIntent,
                    "reasoning": "Suppressed duplicate response within 90s",
                    "modelName": response.modelName,
                    "processingTime": response.processingTime,
                    "priceCHF": response.priceCHF,
                })
                return

            # 4a: Voice response (TTS -> Audio to bot)
            if sendVoice:
                try:
                    await _emitSessionEvent(sessionId, "ttsDeliveryStatus", {
                        "status": "requested",
                        "hasWebSocket": websocket is not None,
                        "message": "TTS generation requested",
                        "timestamp": getIsoTimestamp(),
                    })
                    logger.info(
                        f"Session {sessionId}: TTS requested (websocket_available={websocket is not None})"
                    )
                    ttsResult = await voiceInterface.textToSpeech(
                        text=speechResult.responseText,
                        languageCode=self.config.language,
                        voiceName=self.config.voiceId
                    )

                    if not ttsResult or not isinstance(ttsResult, dict):
                        raise RuntimeError("TTS returned invalid result payload")

                    if ttsResult.get("success") is False:
                        raise RuntimeError(f"TTS backend error: {ttsResult.get('error', 'unknown')}")

                    audioContent = ttsResult.get("audioContent")
                    if not audioContent:
                        raise RuntimeError("TTS returned no audioContent")

                    if websocket:
                        # NOTE(review): if audioContent were already a
                        # base64 string, .encode() + b64encode would
                        # double-encode -- assumes raw bytes/str audio;
                        # confirm against voiceInterface.textToSpeech().
                        await websocket.send_text(json.dumps({
                            "type": "playAudio",
                            "sessionId": sessionId,
                            "audio": {
                                "data": base64.b64encode(audioContent if isinstance(audioContent, bytes) else audioContent.encode()).decode(),
                                "format": "mp3",
                            },
                        }))
                        logger.info(f"Session {sessionId}: TTS audio dispatched to bot")
                        await _emitSessionEvent(sessionId, "ttsDeliveryStatus", {
                            "status": "dispatched",
                            "hasWebSocket": True,
                            "message": "TTS audio dispatched to bot",
                            "timestamp": getIsoTimestamp(),
                        })
                    else:
                        logger.warning(
                            f"Session {sessionId}: TTS audio generated but cannot be played (bot websocket unavailable, likely fallback mode)"
                        )
                        await _emitSessionEvent(sessionId, "ttsDeliveryStatus", {
                            "status": "unavailable",
                            "hasWebSocket": False,
                            "message": "TTS audio generated but bot websocket unavailable",
                            "timestamp": getIsoTimestamp(),
                        })
                except Exception as ttsErr:
                    logger.warning(f"TTS failed for session {sessionId}: {ttsErr}")
                    await _emitSessionEvent(sessionId, "ttsDeliveryStatus", {
                        "status": "failed",
                        "hasWebSocket": websocket is not None,
                        "message": str(ttsErr),
                        "timestamp": getIsoTimestamp(),
                    })
                    if not sendChat:
                        sendChat = True  # Fallback to chat if voice-only and TTS failed

            # 4b: Chat response (send text message to meeting chat)
            if sendChat:
                try:
                    if websocket:
                        await websocket.send_text(json.dumps({
                            "type": "sendChatMessage",
                            "sessionId": sessionId,
                            "text": speechResult.responseText,
                        }))
                        logger.info(f"Chat response sent for session {sessionId}")
                except Exception as chatErr:
                    logger.warning(f"Chat message send failed for session {sessionId}: {chatErr}")

            # 4c: Store bot response
            botResponseData = TeamsbotBotResponse(
                sessionId=sessionId,
                responseText=speechResult.responseText,
                responseType=responseType,
                detectedIntent=speechResult.detectedIntent,
                reasoning=speechResult.reasoning,
                triggeredByTranscriptId=triggerTranscript.get("id"),
                modelName=response.modelName,
                processingTime=response.processingTime,
                priceCHF=response.priceCHF,
                timestamp=getIsoTimestamp(),
            ).model_dump()

            createdResponse = interface.createBotResponse(botResponseData)

            # 4d: Emit SSE event
            await _emitSessionEvent(sessionId, "botResponse", {
                "id": createdResponse.get("id"),
                "responseText": speechResult.responseText,
                "responseType": responseType.value,
                "detectedIntent": speechResult.detectedIntent,
                "reasoning": speechResult.reasoning,
                "modelName": response.modelName,
                "processingTime": response.processingTime,
                "priceCHF": response.priceCHF,
                "timestamp": botResponseData.get("timestamp"),
            })

            # Update session response count
            session = interface.getSession(sessionId)
            if session:
                count = session.get("botResponseCount", 0) + 1
                interface.updateSession(sessionId, {"botResponseCount": count})

            # Remember last response for the duplicate-suppression window.
            self._lastBotResponseText = normalizedResponse
            self._lastBotResponseTs = nowTs
            logger.info(f"Bot responded in session {sessionId}: intent={speechResult.detectedIntent}")

            # Step 5: Execute AI-issued commands (if any)
            # NOTE(review): nesting reconstructed from a whitespace-mangled
            # source -- confirm whether commands should also execute when
            # shouldRespond is False.
            if speechResult.commands:
                await self._executeCommands(sessionId, speechResult.commands, voiceInterface, websocket)

    except Exception as e:
        logger.error(f"SPEECH_TEAMS analysis failed for session {sessionId}: {type(e).__name__}: {e}", exc_info=True)
        await _emitSessionEvent(sessionId, "error", {"message": f"AI analysis failed: {type(e).__name__}: {str(e)}"})
|
|
|
|
# =========================================================================
|
|
# AI Command Execution
|
|
# =========================================================================
|
|
|
|
async def _executeCommands(
    self,
    sessionId: str,
    commands: List[TeamsbotCommand],
    voiceInterface,
    websocket: WebSocket,
):
    """Dispatch AI-issued structured commands to the browser bot.

    Commands are forwarded over the bot WebSocket; the bot-side
    TeamsActionsService performs the actual Teams UI interaction
    (state checks, toggling, etc.). A failing command is logged and
    does not abort the remaining ones.
    """

    async def dispatch(payload):
        # Serialize and push one control message to the browser bot.
        await websocket.send_text(json.dumps(payload))

    for command in commands:
        action = command.action
        params = command.params or {}
        logger.info(f"Session {sessionId}: Executing command '{action}' with params {params}")

        try:
            if action == "toggleTranscript":
                if websocket:
                    await dispatch({
                        "type": "botCommand",
                        "sessionId": sessionId,
                        "command": "toggleTranscript",
                        "params": {"enable": params.get("enable", True)},
                    })

            elif action == "sendChat":
                messageText = params.get("text", "")
                if messageText and websocket:
                    await dispatch({
                        "type": "sendChatMessage",
                        "sessionId": sessionId,
                        "text": messageText,
                    })

            elif action == "readAloud":
                speechText = params.get("text", "")
                if speechText and voiceInterface:
                    ttsResult = await voiceInterface.textToSpeech(
                        text=speechText,
                        languageCode=self.config.language,
                        voiceName=self.config.voiceId,
                    )
                    if ttsResult and isinstance(ttsResult, dict):
                        audioContent = ttsResult.get("audioContent")
                        if audioContent and websocket:
                            rawAudio = audioContent if isinstance(audioContent, bytes) else audioContent.encode()
                            await dispatch({
                                "type": "playAudio",
                                "sessionId": sessionId,
                                "audio": {
                                    "data": base64.b64encode(rawAudio).decode(),
                                    "format": "mp3",
                                },
                            })

            elif action == "changeLanguage":
                targetLanguage = params.get("language", "")
                if targetLanguage:
                    # Swap in an updated immutable config copy.
                    self.config = self.config.model_copy(update={"language": targetLanguage})
                    logger.info(f"Session {sessionId}: Language changed to '{targetLanguage}'")
                    await _emitSessionEvent(sessionId, "languageChanged", {"language": targetLanguage})

            elif action in {"toggleMic", "toggleCamera"}:
                if websocket:
                    await dispatch({
                        "type": "botCommand",
                        "sessionId": sessionId,
                        "command": action,
                        "params": params,
                    })

            else:
                logger.warning(f"Session {sessionId}: Unknown command '{action}'")

        except Exception as cmdErr:
            logger.warning(f"Session {sessionId}: Command '{action}' failed: {cmdErr}")
|
|
|
|
# =========================================================================
|
|
# Context Summarization (for long sessions)
|
|
# =========================================================================
|
|
|
|
async def _summarizeSessionContext(self, sessionId: str, rawContext: str) -> str:
    """Condense a long user-supplied session context to its key points.

    Summarization cuts token usage for every later AI call in the
    session. On any failure the original context is returned, capped at
    2000 characters.
    """
    try:
        from modules.services.serviceAi.mainServiceAi import AiService
        from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum, PriorityEnum

        billingContext = _ServiceContext(self.currentUser, self.mandateId, self.instanceId)
        ai = AiService(serviceCenter=billingContext)
        await ai.ensureAiObjectsInitialized()

        summaryPrompt = (
            "Fasse den folgenden Kontext auf die wesentlichen Punkte zusammen. "
            "Behalte alle wichtigen Fakten, Namen, Zahlen, Entscheidungen und Aktionspunkte. "
            "Entferne Fuelltext und Wiederholungen. "
            "Antworte NUR mit der Zusammenfassung, keine Erklaerungen oder Einleitungen."
        )
        aiResponse = await ai.callAi(AiCallRequest(
            prompt=summaryPrompt,
            context=rawContext,
            options=AiCallOptions(
                operationType=OperationTypeEnum.DATA_ANALYSE,
                priority=PriorityEnum.SPEED,
            )
        ))

        if aiResponse and aiResponse.errorCount == 0 and aiResponse.content:
            condensed = aiResponse.content.strip()
            logger.info(f"Session {sessionId}: Context summarized from {len(rawContext)} to {len(condensed)} chars")
            return condensed
    except Exception as e:
        logger.warning(f"Session context summarization failed for {sessionId}: {e}")

    # Fallback: hand back the original, truncated if very long
    if len(rawContext) > 2000:
        return rawContext[:2000]
    return rawContext
|
|
|
|
async def _summarizeContextBuffer(self, sessionId: str):
    """Fold the older half of the context buffer into a short AI summary.

    Preserves earlier conversation information without exceeding the
    context window; intended to run in the background. Errors are logged
    and swallowed (best-effort).
    """
    try:
        # A stored summary means the older segments are already covered.
        if self._contextSummary:
            return

        # Only the older half is summarized; the newer half stays verbatim.
        midpoint = len(self._contextBuffer) // 2
        olderHalf = self._contextBuffer[:midpoint]

        # Too little material -- skip summarization entirely.
        if len(olderHalf) < 10:
            return

        transcriptText = "\n".join(
            f"[{seg.get('speaker', 'Unknown')}]: {seg.get('text', '')}"
            for seg in olderHalf
        )

        from modules.services.serviceAi.mainServiceAi import AiService
        from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum, PriorityEnum

        billingContext = _ServiceContext(self.currentUser, self.mandateId, self.instanceId)
        ai = AiService(serviceCenter=billingContext)
        await ai.ensureAiObjectsInitialized()

        aiResponse = await ai.callAi(AiCallRequest(
            prompt="Fasse das folgende Meeting-Transkript in 3-5 Saetzen zusammen. Nenne die wichtigsten Themen, Entscheidungen und offene Fragen. Antworte NUR mit der Zusammenfassung, keine Erklaerungen.",
            context=transcriptText,
            options=AiCallOptions(
                operationType=OperationTypeEnum.DATA_ANALYSE,
                priority=PriorityEnum.SPEED,
            )
        ))

        if aiResponse and aiResponse.errorCount == 0:
            self._contextSummary = aiResponse.content.strip()
            logger.info(f"Session {sessionId}: Context summarized ({len(olderHalf)} segments -> {len(self._contextSummary)} chars)")

    except Exception as e:
        logger.warning(f"Context summarization failed for session {sessionId}: {e}")
|
|
|
|
# =========================================================================
|
|
# Meeting Summary
|
|
# =========================================================================
|
|
|
|
async def _generateMeetingSummary(self, sessionId: str):
    """Produce and persist an AI meeting summary once a session ends.

    Sessions with fewer than 5 transcript segments are skipped; any
    failure is logged and swallowed (summary generation is best-effort).
    """
    try:
        from . import interfaceFeatureTeamsbot as interfaceDb

        dbInterface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId)
        transcripts = dbInterface.getTranscripts(sessionId)

        # Require a minimum amount of content before summarizing.
        if not transcripts or len(transcripts) < 5:
            return

        # Flatten all segments into one speaker-annotated transcript.
        transcriptLines = [
            f"[{t.get('speaker', 'Unknown')}]: {t.get('text', '')}"
            for t in transcripts
        ]
        fullTranscript = "\n".join(transcriptLines)

        from modules.services.serviceAi.mainServiceAi import AiService

        billingContext = _ServiceContext(self.currentUser, self.mandateId, self.instanceId)
        ai = AiService(serviceCenter=billingContext)
        await ai.ensureAiObjectsInitialized()

        aiResponse = await ai.callAi(AiCallRequest(
            prompt="Erstelle eine kurze Zusammenfassung dieses Meeting-Transkripts. Nenne die wichtigsten Punkte, Entscheidungen und offene Aktionspunkte.",
            context=fullTranscript,
            options=AiCallOptions(
                operationType=OperationTypeEnum.DATA_ANALYSE,
                priority=PriorityEnum.BALANCED,
            )
        ))

        if aiResponse and aiResponse.errorCount == 0:
            dbInterface.updateSession(sessionId, {"summary": aiResponse.content})
            logger.info(f"Meeting summary generated for session {sessionId}")

    except Exception as e:
        logger.error(f"Failed to generate meeting summary for session {sessionId}: {e}")
|