feat: differential transcript, fix bot name in WS handler, AI commands support
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
parent
6963719499
commit
1227324703
5 changed files with 220 additions and 39 deletions
|
|
@ -234,12 +234,19 @@ class TeamsbotConfigUpdateRequest(BaseModel):
|
||||||
# SPEECH_TEAMS AI Response Model
|
# SPEECH_TEAMS AI Response Model
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
|
|
||||||
|
class TeamsbotCommand(BaseModel):
    """One structured instruction the AI may emit to steer a Teams meeting action.

    ``action`` names the command and ``params`` optionally carries its
    arguments. Neither field is validated beyond its type here.
    """

    # Name of the action to perform; the known names are listed in the description.
    action: str = Field(
        description="Command action: toggleTranscript, sendChat, readAloud, changeLanguage, toggleMic, toggleCamera",
    )
    # Arguments for the action; defaults to None when none are supplied.
    params: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Action-specific parameters",
    )
|
||||||
|
|
||||||
|
|
||||||
class SpeechTeamsResponse(BaseModel):
|
class SpeechTeamsResponse(BaseModel):
|
||||||
"""Structured response from the SPEECH_TEAMS AI handler."""
|
"""Structured response from the SPEECH_TEAMS AI handler."""
|
||||||
shouldRespond: bool = Field(description="Whether the bot should respond")
|
shouldRespond: bool = Field(description="Whether the bot should respond")
|
||||||
responseText: Optional[str] = Field(default=None, description="The bot's response text (only if shouldRespond=True)")
|
responseText: Optional[str] = Field(default=None, description="The bot's response text (only if shouldRespond=True)")
|
||||||
reasoning: str = Field(default="", description="Reasoning for the decision (for logging/debug)")
|
reasoning: str = Field(default="", description="Reasoning for the decision (for logging/debug)")
|
||||||
detectedIntent: str = Field(default="none", description="Detected intent: addressed, question, proactive, none")
|
detectedIntent: str = Field(default="none", description="Detected intent: addressed, question, proactive, stop, none")
|
||||||
|
commands: Optional[List[TeamsbotCommand]] = Field(default=None, description="Optional list of commands to execute (e.g. toggle transcript, send chat, change language)")
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
|
|
|
||||||
|
|
@ -148,6 +148,10 @@ class TeamsbotObjects:
|
||||||
transcriptData["creationDate"] = getIsoTimestamp()
|
transcriptData["creationDate"] = getIsoTimestamp()
|
||||||
return self.db.recordCreate(TeamsbotTranscript, transcriptData)
|
return self.db.recordCreate(TeamsbotTranscript, transcriptData)
|
||||||
|
|
||||||
|
def updateTranscript(self, transcriptId: str, updates: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Apply a partial update to a stored transcript segment.

    Used by differential transcript writing, where a growing caption
    overwrites its existing row instead of creating a new one.

    Args:
        transcriptId: Identifier of the transcript record to modify.
        updates: Field names mapped to their new values.

    Returns:
        Whatever ``self.db.recordModify`` returns for the record
        (presumably ``None`` when no record matches; confirm against the
        database connector).
    """
    modifiedRecord = self.db.recordModify(TeamsbotTranscript, transcriptId, updates)
    return modifiedRecord
|
||||||
|
|
||||||
def _deleteTranscriptsBySession(self, sessionId: str) -> int:
|
def _deleteTranscriptsBySession(self, sessionId: str) -> int:
|
||||||
"""Delete all transcripts for a session."""
|
"""Delete all transcripts for a session."""
|
||||||
records = self.db.getRecordset(TeamsbotTranscript, recordFilter={"sessionId": sessionId})
|
records = self.db.getRecordset(TeamsbotTranscript, recordFilter={"sessionId": sessionId})
|
||||||
|
|
|
||||||
|
|
@ -1110,8 +1110,33 @@ async def botWebsocket(
|
||||||
logger.warning(f"Could not load original user {startedByUserId}, falling back to system user")
|
logger.warning(f"Could not load original user {startedByUserId}, falling back to system user")
|
||||||
originalUser = systemUser
|
originalUser = systemUser
|
||||||
|
|
||||||
|
# Build effective config with the session's actual bot name.
|
||||||
|
# The session stores the resolved bot name (from system bot or user override).
|
||||||
|
# Without this, the default config botName (e.g. "AI Assistant") is used,
|
||||||
|
# which is wrong for registered system bots.
|
||||||
|
sessionBotName = session.get("botName") if session else None
|
||||||
|
if sessionBotName:
|
||||||
|
config = config.model_copy(update={"botName": sessionBotName})
|
||||||
|
logger.info(f"Browser Bot WebSocket: Using session botName '{sessionBotName}' (not default '{_getInstanceConfig(instanceId).botName}')")
|
||||||
|
|
||||||
|
# Also merge user-specific settings if available
|
||||||
|
if startedByUserId:
|
||||||
|
interface = interfaceDb.getInterface(originalUser, mandateId=mandateId, featureInstanceId=instanceId)
|
||||||
|
userSettings = interface.getUserSettings(startedByUserId, instanceId)
|
||||||
|
if userSettings:
|
||||||
|
overrides = {}
|
||||||
|
for field in ["aiSystemPrompt", "responseMode", "responseChannel", "transferMode",
|
||||||
|
"language", "voiceId", "triggerIntervalSeconds", "triggerCooldownSeconds",
|
||||||
|
"contextWindowSegments"]:
|
||||||
|
value = userSettings.get(field)
|
||||||
|
if value is not None:
|
||||||
|
overrides[field] = value
|
||||||
|
if overrides:
|
||||||
|
config = config.model_copy(update=overrides)
|
||||||
|
logger.info(f"Browser Bot WebSocket: Applied user settings overrides: {list(overrides.keys())}")
|
||||||
|
|
||||||
service = TeamsbotService(originalUser, mandateId, instanceId, config)
|
service = TeamsbotService(originalUser, mandateId, instanceId, config)
|
||||||
logger.info(f"Browser Bot WebSocket service created: session={sessionId}, mandateId={mandateId}, user={originalUser.id}")
|
logger.info(f"Browser Bot WebSocket service created: session={sessionId}, mandateId={mandateId}, user={originalUser.id}, botName={config.botName}")
|
||||||
|
|
||||||
await service.handleBotWebSocket(websocket, sessionId)
|
await service.handleBotWebSocket(websocket, sessionId)
|
||||||
except WebSocketDisconnect:
|
except WebSocketDisconnect:
|
||||||
|
|
|
||||||
|
|
@ -27,6 +27,7 @@ from .datamodelTeamsbot import (
|
||||||
TeamsbotResponseMode,
|
TeamsbotResponseMode,
|
||||||
TeamsbotResponseChannel,
|
TeamsbotResponseChannel,
|
||||||
SpeechTeamsResponse,
|
SpeechTeamsResponse,
|
||||||
|
TeamsbotCommand,
|
||||||
)
|
)
|
||||||
from .browserBotConnector import BrowserBotConnector
|
from .browserBotConnector import BrowserBotConnector
|
||||||
|
|
||||||
|
|
@ -81,6 +82,12 @@ class TeamsbotService:
|
||||||
self._sessionContext: Optional[str] = None # User-provided background context
|
self._sessionContext: Optional[str] = None # User-provided background context
|
||||||
self._contextSummary: Optional[str] = None # AI-generated summary of long context
|
self._contextSummary: Optional[str] = None # AI-generated summary of long context
|
||||||
|
|
||||||
|
# Differential transcript tracking: only write new text, update existing
|
||||||
|
# record when the same speaker continues speaking
|
||||||
|
self._lastTranscriptSpeaker: Optional[str] = None
|
||||||
|
self._lastTranscriptText: Optional[str] = None
|
||||||
|
self._lastTranscriptId: Optional[str] = None
|
||||||
|
|
||||||
# =========================================================================
|
# =========================================================================
|
||||||
# Session Lifecycle
|
# Session Lifecycle
|
||||||
# =========================================================================
|
# =========================================================================
|
||||||
|
|
@ -438,61 +445,99 @@ class TeamsbotService:
|
||||||
websocket: WebSocket,
|
websocket: WebSocket,
|
||||||
source: str = "caption",
|
source: str = "caption",
|
||||||
):
|
):
|
||||||
"""Process a transcript segment from captions or chat messages."""
|
"""Process a transcript segment from captions or chat messages.
|
||||||
|
|
||||||
|
Differential writing: When the same speaker continues (text grows
|
||||||
|
incrementally as captions stream), we UPDATE the existing DB record
|
||||||
|
instead of creating a cascade of near-duplicate rows. A new record
|
||||||
|
is only created when the speaker changes or the text is not a
|
||||||
|
continuation of the previous segment.
|
||||||
|
"""
|
||||||
|
|
||||||
text = text.strip()
|
text = text.strip()
|
||||||
if not text:
|
if not text:
|
||||||
return
|
return
|
||||||
|
|
||||||
# Filter out the bot's own speech from AI triggering.
|
# Filter out the bot's own speech from AI triggering.
|
||||||
# The bot hears itself via captions — these should be stored in the
|
|
||||||
# transcript for the record, but must NOT trigger AI analysis (feedback loop).
|
|
||||||
isBotSpeaker = self._isBotSpeaker(speaker)
|
isBotSpeaker = self._isBotSpeaker(speaker)
|
||||||
|
|
||||||
# Store transcript segment
|
# Differential transcript writing:
|
||||||
|
# If the same speaker is still talking and the new text is a
|
||||||
|
# continuation (starts with the previous text), UPDATE the existing
|
||||||
|
# record instead of creating a new one. This avoids cascading rows like:
|
||||||
|
# "Der AHV"
|
||||||
|
# "Der AHV Fonds"
|
||||||
|
# "Der AHV Fonds hat 2025"
|
||||||
|
# and instead keeps a single row that grows until the speaker changes.
|
||||||
|
isContinuation = (
|
||||||
|
self._lastTranscriptSpeaker == speaker
|
||||||
|
and self._lastTranscriptText
|
||||||
|
and self._lastTranscriptId
|
||||||
|
and text.startswith(self._lastTranscriptText)
|
||||||
|
and source == "caption" # only for captions, chat messages are always new
|
||||||
|
)
|
||||||
|
|
||||||
|
if isContinuation:
|
||||||
|
interface.updateTranscript(self._lastTranscriptId, {
|
||||||
|
"text": text,
|
||||||
|
"isFinal": isFinal,
|
||||||
|
})
|
||||||
|
self._lastTranscriptText = text
|
||||||
|
createdTranscript = {"id": self._lastTranscriptId}
|
||||||
|
|
||||||
|
# Update context buffer: replace last entry for same speaker
|
||||||
|
if self._contextBuffer and self._contextBuffer[-1].get("speaker") == speaker:
|
||||||
|
self._contextBuffer[-1]["text"] = text
|
||||||
|
else:
|
||||||
|
# New speaker or non-continuation → create a new record
|
||||||
transcriptData = TeamsbotTranscript(
|
transcriptData = TeamsbotTranscript(
|
||||||
sessionId=sessionId,
|
sessionId=sessionId,
|
||||||
speaker=speaker,
|
speaker=speaker,
|
||||||
text=text,
|
text=text,
|
||||||
timestamp=getIsoTimestamp(),
|
timestamp=getIsoTimestamp(),
|
||||||
confidence=1.0, # Captions don't have confidence scores
|
confidence=1.0,
|
||||||
language=self.config.language,
|
language=self.config.language,
|
||||||
isFinal=isFinal,
|
isFinal=isFinal,
|
||||||
).model_dump()
|
).model_dump()
|
||||||
|
|
||||||
createdTranscript = interface.createTranscript(transcriptData)
|
createdTranscript = interface.createTranscript(transcriptData)
|
||||||
|
|
||||||
# Update context buffer (mark source for chat messages)
|
# Track for differential writing
|
||||||
|
self._lastTranscriptSpeaker = speaker
|
||||||
|
self._lastTranscriptText = text
|
||||||
|
self._lastTranscriptId = createdTranscript.get("id")
|
||||||
|
|
||||||
|
# Append to context buffer
|
||||||
self._contextBuffer.append({
|
self._contextBuffer.append({
|
||||||
"speaker": speaker or "Unknown",
|
"speaker": speaker or "Unknown",
|
||||||
"text": text,
|
"text": text,
|
||||||
"timestamp": getUtcTimestamp(),
|
"timestamp": getUtcTimestamp(),
|
||||||
"source": source,
|
"source": source,
|
||||||
})
|
})
|
||||||
|
|
||||||
# Keep only last N segments
|
# Keep only last N segments
|
||||||
maxSegments = self.config.contextWindowSegments
|
maxSegments = self.config.contextWindowSegments
|
||||||
if len(self._contextBuffer) > maxSegments:
|
if len(self._contextBuffer) > maxSegments:
|
||||||
# When buffer overflows, summarize the older half to preserve context
|
|
||||||
# without losing information. The summary replaces the old segments.
|
|
||||||
if not self._contextSummary and len(self._contextBuffer) > maxSegments * 1.5:
|
if not self._contextSummary and len(self._contextBuffer) > maxSegments * 1.5:
|
||||||
asyncio.create_task(self._summarizeContextBuffer(sessionId))
|
asyncio.create_task(self._summarizeContextBuffer(sessionId))
|
||||||
self._contextBuffer = self._contextBuffer[-maxSegments:]
|
self._contextBuffer = self._contextBuffer[-maxSegments:]
|
||||||
|
|
||||||
# Emit SSE event for live transcript
|
# Update session transcript count (only for new records)
|
||||||
|
session = interface.getSession(sessionId)
|
||||||
|
if session:
|
||||||
|
count = session.get("transcriptSegmentCount", 0) + 1
|
||||||
|
interface.updateSession(sessionId, {"transcriptSegmentCount": count})
|
||||||
|
|
||||||
|
# Emit SSE event for live transcript (always, for UI updates)
|
||||||
await _emitSessionEvent(sessionId, "transcript", {
|
await _emitSessionEvent(sessionId, "transcript", {
|
||||||
"id": createdTranscript.get("id"),
|
"id": createdTranscript.get("id"),
|
||||||
"speaker": speaker,
|
"speaker": speaker,
|
||||||
"text": text,
|
"text": text,
|
||||||
"confidence": 1.0,
|
"confidence": 1.0,
|
||||||
"timestamp": getIsoTimestamp(),
|
"timestamp": getIsoTimestamp(),
|
||||||
|
"isContinuation": isContinuation,
|
||||||
})
|
})
|
||||||
|
|
||||||
# Update session transcript count
|
|
||||||
session = interface.getSession(sessionId)
|
|
||||||
if session:
|
|
||||||
count = session.get("transcriptSegmentCount", 0) + 1
|
|
||||||
interface.updateSession(sessionId, {"transcriptSegmentCount": count})
|
|
||||||
|
|
||||||
# Skip AI analysis for bot's own speech (prevents feedback loop)
|
# Skip AI analysis for bot's own speech (prevents feedback loop)
|
||||||
if isBotSpeaker:
|
if isBotSpeaker:
|
||||||
logger.debug(f"Session {sessionId}: Skipping AI trigger for bot's own speech: [{speaker}] {text[:60]}...")
|
logger.debug(f"Session {sessionId}: Skipping AI trigger for bot's own speech: [{speaker}] {text[:60]}...")
|
||||||
|
|
@ -805,10 +850,100 @@ class TeamsbotService:
|
||||||
|
|
||||||
logger.info(f"Bot responded in session {sessionId}: intent={speechResult.detectedIntent}")
|
logger.info(f"Bot responded in session {sessionId}: intent={speechResult.detectedIntent}")
|
||||||
|
|
||||||
|
# Step 5: Execute AI-issued commands (if any)
|
||||||
|
if speechResult.commands:
|
||||||
|
await self._executeCommands(sessionId, speechResult.commands, voiceInterface, websocket)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"SPEECH_TEAMS analysis failed for session {sessionId}: {type(e).__name__}: {e}", exc_info=True)
|
logger.error(f"SPEECH_TEAMS analysis failed for session {sessionId}: {type(e).__name__}: {e}", exc_info=True)
|
||||||
await _emitSessionEvent(sessionId, "error", {"message": f"AI analysis failed: {type(e).__name__}: {str(e)}"})
|
await _emitSessionEvent(sessionId, "error", {"message": f"AI analysis failed: {type(e).__name__}: {str(e)}"})
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# AI Command Execution
|
||||||
|
# =========================================================================
|
||||||
|
|
||||||
|
async def _executeCommands(
    self,
    sessionId: str,
    commands: List[TeamsbotCommand],
    voiceInterface,
    websocket: WebSocket,
):
    """Execute structured commands returned by the AI.

    Commands are processed in order. Most are forwarded to the browser bot
    as a JSON text message over ``websocket``:

    - ``toggleTranscript`` / ``toggleMic`` / ``toggleCamera`` -> ``botCommand``
    - ``sendChat`` -> ``sendChatMessage``
    - ``readAloud`` -> synthesized via ``voiceInterface.textToSpeech`` and
      sent as ``playAudio`` with base64-encoded audio
    - ``changeLanguage`` -> replaces ``self.config`` with a copy carrying the
      new language and emits a ``languageChanged`` session event

    Execution is best-effort: a failing command is logged (with traceback)
    and the remaining commands still run. A command that cannot be carried
    out (no websocket, no voice interface, missing text, no audio) is logged
    as skipped instead of being dropped silently.

    Args:
        sessionId: Session the commands belong to; included in every message.
        commands: Commands parsed from the AI response.
        voiceInterface: Object providing ``textToSpeech``; may be falsy.
        websocket: Connection to the browser bot; may be falsy.
    """
    for cmd in commands:
        action = cmd.action
        params = cmd.params or {}
        logger.info(f"Session {sessionId}: Executing command '{action}' with params {params}")

        try:
            if action == "toggleTranscript":
                # A missing "enable" flag is treated as a request to enable.
                await self._sendBotMessage(websocket, sessionId, action, {
                    "type": "botCommand",
                    "sessionId": sessionId,
                    "command": "toggleTranscript",
                    "params": {"enable": params.get("enable", True)},
                })

            elif action == "sendChat":
                chatText = params.get("text", "")
                if not chatText:
                    logger.warning(f"Session {sessionId}: Command '{action}' skipped: no text provided")
                    continue
                await self._sendBotMessage(websocket, sessionId, action, {
                    "type": "sendChatMessage",
                    "sessionId": sessionId,
                    "text": chatText,
                })

            elif action == "readAloud":
                readText = params.get("text", "")
                if not readText or not voiceInterface:
                    logger.warning(
                        f"Session {sessionId}: Command '{action}' skipped: missing text or voice interface"
                    )
                    continue

                ttsResult = await voiceInterface.textToSpeech(
                    text=readText,
                    languageCode=self.config.language,
                    voiceName=self.config.voiceId,
                )
                audioContent = ttsResult.get("audioContent") if isinstance(ttsResult, dict) else None
                if not audioContent:
                    logger.warning(f"Session {sessionId}: Command '{action}' skipped: TTS returned no audio")
                    continue

                # NOTE(review): a str result is UTF-8 encoded before base64;
                # if the TTS already returns base64 text this double-encodes. Confirm.
                rawAudio = audioContent if isinstance(audioContent, bytes) else audioContent.encode()
                await self._sendBotMessage(websocket, sessionId, action, {
                    "type": "playAudio",
                    "sessionId": sessionId,
                    "audio": {
                        "data": base64.b64encode(rawAudio).decode(),
                        # NOTE(review): format is hard-coded; presumably matches the TTS output. Confirm.
                        "format": "mp3",
                    },
                })

            elif action == "changeLanguage":
                newLang = params.get("language", "")
                if not newLang:
                    logger.warning(f"Session {sessionId}: Command '{action}' skipped: no language provided")
                    continue
                # model_copy(update=...) swaps the field without re-running validation.
                self.config = self.config.model_copy(update={"language": newLang})
                logger.info(f"Session {sessionId}: Language changed to '{newLang}'")
                await _emitSessionEvent(sessionId, "languageChanged", {"language": newLang})

            elif action in ("toggleMic", "toggleCamera"):
                await self._sendBotMessage(websocket, sessionId, action, {
                    "type": "botCommand",
                    "sessionId": sessionId,
                    "command": action,
                    "params": params,
                })

            else:
                logger.warning(f"Session {sessionId}: Unknown command '{action}'")

        except Exception as cmdErr:
            # Best-effort: one failing command must not abort the rest.
            logger.warning(f"Session {sessionId}: Command '{action}' failed: {cmdErr}", exc_info=True)

async def _sendBotMessage(self, websocket, sessionId: str, action: str, message: dict) -> None:
    """Send ``message`` as JSON text to the browser bot, or log why it was skipped."""
    if not websocket:
        logger.warning(f"Session {sessionId}: Command '{action}' skipped: no websocket connection")
        return
    await websocket.send_text(json.dumps(message))
|
||||||
|
|
||||||
# =========================================================================
|
# =========================================================================
|
||||||
# Context Summarization (for long sessions)
|
# Context Summarization (for long sessions)
|
||||||
# =========================================================================
|
# =========================================================================
|
||||||
|
|
|
||||||
|
|
@ -379,12 +379,22 @@ shouldRespond auf false. Du musst NICHT antworten wenn jemand dich stoppt."""
|
||||||
|
|
||||||
basePrompt += f"""
|
basePrompt += f"""
|
||||||
|
|
||||||
|
KOMMANDOS: Du kannst optionale Aktions-Kommandos ausfuehren lassen.
|
||||||
|
Verfuegbare Kommandos (im "commands" Array):
|
||||||
|
- {{"action": "toggleTranscript", "params": {{"enable": true/false}}}} — Transkription ein-/ausschalten
|
||||||
|
- {{"action": "sendChat", "params": {{"text": "Nachricht"}}}} — Zusaetzliche Nachricht in den Chat schreiben (unabhaengig von responseText)
|
||||||
|
- {{"action": "readAloud", "params": {{"text": "Text zum Vorlesen"}}}} — Einen bestimmten Text vorlesen (unabhaengig von responseText)
|
||||||
|
- {{"action": "changeLanguage", "params": {{"language": "en-US"}}}} — Kommunikationssprache aendern (z.B. "de-DE", "en-US", "fr-FR")
|
||||||
|
|
||||||
|
Verwende Kommandos NUR wenn explizit darum gebeten wird (z.B. "schalte die Transkription ein", "schreib das in den Chat", "lies das vor", "sprich Englisch").
|
||||||
|
|
||||||
WICHTIG: Antworte IMMER als valides JSON in exakt diesem Format:
|
WICHTIG: Antworte IMMER als valides JSON in exakt diesem Format:
|
||||||
{{
|
{{
|
||||||
"shouldRespond": true/false,
|
"shouldRespond": true/false,
|
||||||
"responseText": "Deine Antwort hier" oder null,
|
"responseText": "Deine Antwort hier" oder null,
|
||||||
"reasoning": "Kurze Begruendung deiner Entscheidung",
|
"reasoning": "Kurze Begruendung deiner Entscheidung",
|
||||||
"detectedIntent": "addressed" | "question" | "proactive" | "stop" | "none"
|
"detectedIntent": "addressed" | "question" | "proactive" | "stop" | "none",
|
||||||
|
"commands": [] oder null
|
||||||
}}
|
}}
|
||||||
|
|
||||||
detectedIntent-Werte:
|
detectedIntent-Werte:
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue