Fix: add missing Automation2Workflow/Automation2WorkflowRun imports to interfaceFeatureGraphicalEditor.py (caused scheduler crash on boot) Refactor: gdprDeletion via onUserDelete lifecycle hooks Refactor: i18nBootSync accounting labels via app.py parameter injection Refactor: serviceHub moved to serviceCenter/serviceHub.py Split: teamsbot/service.py, realEstate/main, routeTrustee, routeBilling Cleanup: remove obsolete methodTrustee, serviceExceptions shim Co-authored-by: Cursor <cursoragent@cursor.com>
996 lines
39 KiB
Python
996 lines
39 KiB
Python
# Copyright (c) 2025 Patrick Motsch
|
|
# All rights reserved.
|
|
"""
|
|
Teamsbot Service — Conversation & AI analysis logic.
|
|
|
|
Extracted from service.py. All functions accept `service` (a TeamsbotService
|
|
instance) as the first parameter so the class can delegate to them.
|
|
"""
|
|
|
|
import logging
|
|
import json
|
|
import re
|
|
import asyncio
|
|
import time
|
|
from typing import Optional, Dict, Any, List
|
|
|
|
from fastapi import WebSocket
|
|
|
|
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum, PriorityEnum
|
|
from modules.shared.timeUtils import getUtcTimestamp
|
|
|
|
from .datamodelTeamsbot import (
|
|
TeamsbotTranscript,
|
|
TeamsbotBotResponse,
|
|
TeamsbotResponseType,
|
|
TeamsbotResponseMode,
|
|
TeamsbotResponseChannel,
|
|
SpeechTeamsResponse,
|
|
TeamsbotDirectorPromptMode,
|
|
TeamsbotDirectorPromptStatus,
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
async def _analyzeAndRespond(
|
|
service,
|
|
sessionId: str,
|
|
interface,
|
|
voiceInterface,
|
|
websocket: WebSocket,
|
|
triggerTranscript: Dict[str, Any],
|
|
):
|
|
"""Run SPEECH_TEAMS AI analysis and respond if needed."""
|
|
from .service import (
|
|
_emitSessionEvent, createAiService, _speakTextChunked,
|
|
_voiceFriendlyMeetingText,
|
|
TEAMSBOT_AGENT_MAX_ROUNDS, TEAMSBOT_AGENT_MAX_COST_CHF,
|
|
)
|
|
|
|
if service._aiAnalysisInProgress:
|
|
logger.info(f"Session {sessionId}: AI analysis already in progress, skipping duplicate trigger")
|
|
return
|
|
if service._agentEscalationInFlight:
|
|
logger.info(
|
|
f"Session {sessionId}: Agent escalation still in flight — "
|
|
f"skipping new SPEECH_TEAMS trigger to prevent overlapping replies"
|
|
)
|
|
return
|
|
service._aiAnalysisInProgress = True
|
|
service._lastAiCallTime = time.time()
|
|
|
|
contextLines = []
|
|
for segment in service._contextBuffer:
|
|
speaker = segment.get("speaker", "Unknown")
|
|
text = segment.get("text", "")
|
|
segSource = segment.get("source", "caption")
|
|
prefix = "Chat" if segSource == "chat" else ""
|
|
if service._isBotSpeaker(speaker):
|
|
contextLines.append(f"[YOU ({service.config.botName})]: {text}")
|
|
elif prefix:
|
|
contextLines.append(f"[{prefix}: {speaker}]: {text}")
|
|
else:
|
|
contextLines.append(f"[{speaker}]: {text}")
|
|
|
|
sessionContextStr = ""
|
|
if service._sessionContext:
|
|
sessionContextStr = f"\nSESSION_CONTEXT (background knowledge provided by the user):\n{service._sessionContext}\n"
|
|
|
|
summaryStr = ""
|
|
if service._contextSummary:
|
|
summaryStr = f"\nEARLIER_CONVERSATION_SUMMARY:\n{service._contextSummary}\n"
|
|
|
|
directorStr = service._buildPersistentDirectorContext()
|
|
|
|
transcriptContext = f"BOT_NAME:{service.config.botName}{sessionContextStr}{summaryStr}{directorStr}\nRECENT_TRANSCRIPT:\n" + "\n".join(contextLines)
|
|
|
|
try:
|
|
aiService = createAiService(service.currentUser, service.mandateId, service.instanceId)
|
|
await aiService.ensureAiObjectsInitialized()
|
|
|
|
request = AiCallRequest(
|
|
prompt=service.config.aiSystemPrompt,
|
|
context=transcriptContext,
|
|
options=AiCallOptions(
|
|
operationType=OperationTypeEnum.SPEECH_TEAMS,
|
|
priority=PriorityEnum.SPEED,
|
|
)
|
|
)
|
|
|
|
response = await aiService.callAi(request)
|
|
|
|
try:
|
|
speechResult = SpeechTeamsResponse.model_validate_json(response.content)
|
|
except Exception:
|
|
try:
|
|
jsonStr = response.content
|
|
if "```json" in jsonStr:
|
|
jsonStr = jsonStr.split("```json")[1].split("```")[0]
|
|
elif "```" in jsonStr:
|
|
jsonStr = jsonStr.split("```")[1].split("```")[0]
|
|
speechResult = SpeechTeamsResponse.model_validate_json(jsonStr.strip())
|
|
except Exception as parseErr:
|
|
logger.warning(f"Failed to parse SPEECH_TEAMS response: {parseErr}")
|
|
speechResult = SpeechTeamsResponse(
|
|
shouldRespond=False,
|
|
reasoning=f"Parse error: {str(parseErr)[:100]}",
|
|
detectedIntent="none"
|
|
)
|
|
|
|
logger.info(
|
|
f"SPEECH_TEAMS result: shouldRespond={speechResult.shouldRespond}, "
|
|
f"intent={speechResult.detectedIntent}, "
|
|
f"reasoning={speechResult.reasoning[:80]}..."
|
|
)
|
|
|
|
await _emitSessionEvent(sessionId, "analysis", {
|
|
"shouldRespond": speechResult.shouldRespond,
|
|
"detectedIntent": speechResult.detectedIntent,
|
|
"reasoning": speechResult.reasoning,
|
|
"modelName": response.modelName,
|
|
"processingTime": response.processingTime,
|
|
"priceCHF": response.priceCHF,
|
|
"needsAgent": speechResult.needsAgent,
|
|
"agentReason": speechResult.agentReason,
|
|
})
|
|
|
|
if speechResult.needsAgent:
|
|
briefings = service._collectActiveDirectorBriefings()
|
|
briefingFileIds = service._collectDirectorFileIds()
|
|
briefingBlock = ""
|
|
if briefings:
|
|
parts = []
|
|
for b in briefings:
|
|
seg = f"- ({b.get('mode')}) {b.get('text', '')}".rstrip()
|
|
if b.get("fileIds"):
|
|
seg += f"\n attachedFileIds: {', '.join(b['fileIds'])}"
|
|
if b.get("note"):
|
|
note = b["note"]
|
|
seg += (
|
|
"\n priorAgentAnalysis: "
|
|
+ (note if len(note) <= 800 else note[:800] + "...")
|
|
)
|
|
parts.append(seg)
|
|
briefingBlock = (
|
|
"\n\nACTIVE_OPERATOR_BRIEFINGS (private; you may read the "
|
|
"attached files via summarizeContent / readFile / "
|
|
"readContentObjects to answer the user precisely; do NOT "
|
|
"quote the directive text itself):\n" + "\n".join(parts)
|
|
)
|
|
logger.info(
|
|
f"Session {sessionId}: SPEECH_TEAMS escalates to agent. "
|
|
f"Reason: {speechResult.agentReason or speechResult.reasoning} | "
|
|
f"briefings={len(briefings)}, fileIds={len(briefingFileIds)}"
|
|
)
|
|
taskBrief = (
|
|
(speechResult.agentReason
|
|
or speechResult.responseText
|
|
or "Verarbeite die juengste Sprecheranfrage und antworte ins Meeting.")
|
|
+ briefingBlock
|
|
)
|
|
service._agentEscalationInFlight = True
|
|
service._currentEscalationTask = asyncio.create_task(
|
|
_runEscalationAndRelease(
|
|
service,
|
|
sessionId=sessionId,
|
|
taskBrief=taskBrief,
|
|
briefingFileIds=briefingFileIds,
|
|
triggerTranscriptId=triggerTranscript.get("id"),
|
|
)
|
|
)
|
|
return
|
|
|
|
if speechResult.detectedIntent == "stop":
|
|
logger.info(f"Session {sessionId}: AI detected STOP intent: {speechResult.reasoning}")
|
|
if websocket:
|
|
try:
|
|
await websocket.send_text(json.dumps({
|
|
"type": "stopAudio",
|
|
"sessionId": sessionId,
|
|
}))
|
|
except Exception as stopErr:
|
|
logger.warning(f"Failed to send stop command: {stopErr}")
|
|
return
|
|
|
|
if speechResult.shouldRespond and speechResult.responseText:
|
|
|
|
if service.config.responseMode == TeamsbotResponseMode.MANUAL:
|
|
await _emitSessionEvent(sessionId, "suggestedResponse", {
|
|
"responseText": speechResult.responseText,
|
|
"detectedIntent": speechResult.detectedIntent,
|
|
"reasoning": speechResult.reasoning,
|
|
})
|
|
return
|
|
|
|
channels = speechResult.responseChannels
|
|
if channels and isinstance(channels, list):
|
|
channelStr = ",".join(str(c).lower().strip() for c in channels)
|
|
sendVoice = "voice" in channelStr
|
|
sendChat = "chat" in channelStr
|
|
logger.info(f"Response channel (from AI): voice={sendVoice}, chat={sendChat}")
|
|
else:
|
|
channelRaw = service.config.responseChannel
|
|
channelStr = (channelRaw.value if hasattr(channelRaw, 'value') else str(channelRaw)).lower().strip()
|
|
sendVoice = channelStr in ("voice", "both")
|
|
sendChat = channelStr in ("chat", "both")
|
|
logger.info(f"Response channel (from config): '{channelStr}'")
|
|
|
|
if sendVoice and sendChat:
|
|
responseType = TeamsbotResponseType.BOTH
|
|
elif sendVoice:
|
|
responseType = TeamsbotResponseType.AUDIO
|
|
else:
|
|
responseType = TeamsbotResponseType.CHAT
|
|
|
|
canonicalText = (
|
|
speechResult.responseText
|
|
or speechResult.responseTextForVoice
|
|
or speechResult.responseTextForChat
|
|
or ""
|
|
)
|
|
normalizedResponse = (canonicalText or "").strip().lower()
|
|
nowTs = time.time()
|
|
if (
|
|
normalizedResponse
|
|
and service._lastBotResponseText == normalizedResponse
|
|
and (nowTs - service._lastBotResponseTs) < 90
|
|
):
|
|
logger.info(f"Session {sessionId}: Suppressing duplicate bot response within 90s window")
|
|
await _emitSessionEvent(sessionId, "analysis", {
|
|
"shouldRespond": False,
|
|
"detectedIntent": speechResult.detectedIntent,
|
|
"reasoning": "Suppressed duplicate response within 90s",
|
|
"modelName": response.modelName,
|
|
"processingTime": response.processingTime,
|
|
"priceCHF": response.priceCHF,
|
|
})
|
|
return
|
|
|
|
textForVoice = speechResult.responseTextForVoice or speechResult.responseText
|
|
textForChat = speechResult.responseTextForChat or speechResult.responseText
|
|
storedText = textForChat or textForVoice or speechResult.responseText
|
|
|
|
if sendVoice and textForVoice:
|
|
await _emitSessionEvent(sessionId, "ttsDeliveryStatus", {
|
|
"status": "requested",
|
|
"hasWebSocket": websocket is not None,
|
|
"message": "TTS generation requested",
|
|
"timestamp": getUtcTimestamp(),
|
|
})
|
|
logger.info(
|
|
f"Session {sessionId}: TTS requested (websocket_available={websocket is not None})"
|
|
)
|
|
if not websocket:
|
|
logger.warning(
|
|
f"Session {sessionId}: TTS skipped (bot websocket unavailable, likely fallback mode)"
|
|
)
|
|
await _emitSessionEvent(sessionId, "ttsDeliveryStatus", {
|
|
"status": "unavailable",
|
|
"hasWebSocket": False,
|
|
"message": "TTS skipped — bot websocket unavailable",
|
|
"timestamp": getUtcTimestamp(),
|
|
})
|
|
if not sendChat:
|
|
sendChat = True
|
|
else:
|
|
spokenText = await _summarizeForVoice(service, sessionId, textForVoice)
|
|
cancelHook = service._makeAnswerCancelHook()
|
|
async with service._meetingTtsLock:
|
|
ttsOutcome = await _speakTextChunked(
|
|
websocket=websocket,
|
|
voiceInterface=voiceInterface,
|
|
sessionId=sessionId,
|
|
voiceText=spokenText,
|
|
languageCode=service.config.language,
|
|
voiceName=service.config.voiceId,
|
|
isCancelled=cancelHook,
|
|
)
|
|
if ttsOutcome.get("success"):
|
|
logger.info(
|
|
f"Session {sessionId}: TTS audio dispatched to bot "
|
|
f"(chunks={ttsOutcome.get('chunks')}, played={ttsOutcome.get('played')})"
|
|
)
|
|
await _emitSessionEvent(sessionId, "ttsDeliveryStatus", {
|
|
"status": "dispatched",
|
|
"hasWebSocket": True,
|
|
"chunks": ttsOutcome.get("chunks"),
|
|
"played": ttsOutcome.get("played"),
|
|
"timestamp": getUtcTimestamp(),
|
|
})
|
|
else:
|
|
logger.warning(
|
|
f"TTS failed for session {sessionId}: {ttsOutcome.get('error')}"
|
|
)
|
|
await _emitSessionEvent(sessionId, "ttsDeliveryStatus", {
|
|
"status": "failed",
|
|
"hasWebSocket": True,
|
|
"chunks": ttsOutcome.get("chunks"),
|
|
"played": ttsOutcome.get("played"),
|
|
"message": ttsOutcome.get("error"),
|
|
"timestamp": getUtcTimestamp(),
|
|
})
|
|
if not sendChat:
|
|
sendChat = True
|
|
|
|
if sendChat and textForChat:
|
|
try:
|
|
if websocket:
|
|
await websocket.send_text(json.dumps({
|
|
"type": "sendChatMessage",
|
|
"sessionId": sessionId,
|
|
"text": textForChat,
|
|
}))
|
|
logger.info(f"Chat response sent for session {sessionId}")
|
|
except Exception as chatErr:
|
|
logger.warning(f"Chat message send failed for session {sessionId}: {chatErr}")
|
|
|
|
botResponseData = TeamsbotBotResponse(
|
|
sessionId=sessionId,
|
|
responseText=storedText,
|
|
responseType=responseType,
|
|
detectedIntent=speechResult.detectedIntent,
|
|
reasoning=speechResult.reasoning,
|
|
triggeredByTranscriptId=triggerTranscript.get("id"),
|
|
modelName=response.modelName,
|
|
processingTime=response.processingTime,
|
|
priceCHF=response.priceCHF,
|
|
timestamp=getUtcTimestamp(),
|
|
).model_dump()
|
|
|
|
createdResponse = interface.createBotResponse(botResponseData)
|
|
|
|
await _emitSessionEvent(sessionId, "botResponse", {
|
|
"id": createdResponse.get("id"),
|
|
"responseText": storedText,
|
|
"responseType": responseType.value,
|
|
"detectedIntent": speechResult.detectedIntent,
|
|
"reasoning": speechResult.reasoning,
|
|
"modelName": response.modelName,
|
|
"processingTime": response.processingTime,
|
|
"priceCHF": response.priceCHF,
|
|
"timestamp": botResponseData.get("timestamp"),
|
|
})
|
|
|
|
session = interface.getSession(sessionId)
|
|
if session:
|
|
count = session.get("botResponseCount", 0) + 1
|
|
interface.updateSession(sessionId, {"botResponseCount": count})
|
|
|
|
service._lastBotResponseText = normalizedResponse
|
|
service._lastBotResponseTs = nowTs
|
|
|
|
botTranscriptData = TeamsbotTranscript(
|
|
sessionId=sessionId,
|
|
speaker=service.config.botName,
|
|
text=storedText,
|
|
timestamp=getUtcTimestamp(),
|
|
confidence=1.0,
|
|
language=service.config.language,
|
|
isFinal=True,
|
|
).model_dump()
|
|
botTranscript = interface.createTranscript(botTranscriptData)
|
|
|
|
service._contextBuffer.append({
|
|
"speaker": service.config.botName,
|
|
"text": storedText,
|
|
"timestamp": getUtcTimestamp(),
|
|
"source": "botResponse",
|
|
})
|
|
|
|
await _emitSessionEvent(sessionId, "transcript", {
|
|
"id": botTranscript.get("id"),
|
|
"speaker": service.config.botName,
|
|
"text": storedText,
|
|
"confidence": 1.0,
|
|
"timestamp": getUtcTimestamp(),
|
|
"isContinuation": False,
|
|
"source": "botResponse",
|
|
"speakerResolvedFromHint": False,
|
|
})
|
|
|
|
service._lastTranscriptSpeaker = service.config.botName
|
|
service._lastTranscriptText = storedText
|
|
service._lastTranscriptId = botTranscript.get("id")
|
|
|
|
service._followUpWindowEnd = time.time() + 15.0
|
|
logger.info(f"Bot responded in session {sessionId}: intent={speechResult.detectedIntent}, follow-up window open for 15s")
|
|
|
|
if speechResult.commands:
|
|
from .serviceCommands import _executeCommands
|
|
await _executeCommands(service, sessionId, speechResult.commands, voiceInterface, websocket)
|
|
|
|
if speechResult.shouldRespond and not speechResult.responseText:
|
|
cmdTexts = [
|
|
c.params.get("text", "") for c in speechResult.commands
|
|
if c.action == "sendChat" and c.params and c.params.get("text")
|
|
]
|
|
combinedText = " ".join(cmdTexts) if cmdTexts else None
|
|
if combinedText:
|
|
botResponseData = TeamsbotBotResponse(
|
|
sessionId=sessionId,
|
|
responseText=combinedText,
|
|
responseType=TeamsbotResponseType.CHAT,
|
|
detectedIntent=speechResult.detectedIntent,
|
|
reasoning=speechResult.reasoning,
|
|
triggeredByTranscriptId=triggerTranscript.get("id"),
|
|
modelName=response.modelName,
|
|
processingTime=response.processingTime,
|
|
priceCHF=response.priceCHF,
|
|
timestamp=getUtcTimestamp(),
|
|
).model_dump()
|
|
createdResponse = interface.createBotResponse(botResponseData)
|
|
await _emitSessionEvent(sessionId, "botResponse", {
|
|
"id": createdResponse.get("id"),
|
|
"responseText": combinedText,
|
|
"responseType": TeamsbotResponseType.CHAT.value,
|
|
"detectedIntent": speechResult.detectedIntent,
|
|
"reasoning": speechResult.reasoning,
|
|
"modelName": response.modelName,
|
|
"processingTime": response.processingTime,
|
|
"priceCHF": response.priceCHF,
|
|
"timestamp": botResponseData.get("timestamp"),
|
|
})
|
|
|
|
session = interface.getSession(sessionId)
|
|
if session:
|
|
count = session.get("botResponseCount", 0) + 1
|
|
interface.updateSession(sessionId, {"botResponseCount": count})
|
|
|
|
service._followUpWindowEnd = time.time() + 15.0
|
|
logger.info(
|
|
f"Bot responded via commands in session {sessionId}: "
|
|
f"intent={speechResult.detectedIntent}, follow-up window open for 15s"
|
|
)
|
|
|
|
except Exception as e:
|
|
logger.error(f"SPEECH_TEAMS analysis failed for session {sessionId}: {type(e).__name__}: {e}", exc_info=True)
|
|
await _emitSessionEvent(sessionId, "error", {"message": f"AI analysis failed: {type(e).__name__}: {str(e)}"})
|
|
finally:
|
|
service._aiAnalysisInProgress = False
|
|
|
|
|
|
async def _processTranscript(
|
|
service,
|
|
sessionId: str,
|
|
speaker: str,
|
|
text: str,
|
|
isFinal: bool,
|
|
interface,
|
|
voiceInterface,
|
|
websocket: WebSocket,
|
|
source: str = "caption",
|
|
speakerResolvedFromHint: Optional[bool] = None,
|
|
):
|
|
"""Process a transcript segment from captions or chat messages."""
|
|
from .service import _emitSessionEvent
|
|
|
|
text = text.strip()
|
|
if not text:
|
|
return
|
|
|
|
if source in ("caption", "speakerHint"):
|
|
service._registerSpeakerHint(speaker, text, sessionId)
|
|
|
|
if (
|
|
source == "speakerHint"
|
|
and isFinal
|
|
and not service._isBotSpeaker(speaker)
|
|
and service.config.responseMode != TeamsbotResponseMode.TRANSCRIBE_ONLY
|
|
and service._detectBotName(text)
|
|
):
|
|
triggerTranscript = {"id": None, "speaker": speaker, "text": text, "source": source}
|
|
isNew = service._setPendingNameTrigger(sessionId, interface, voiceInterface, websocket, triggerTranscript)
|
|
if isNew:
|
|
logger.info(f"Session {sessionId}: Bot name in caption, debounce trigger started")
|
|
asyncio.create_task(_checkPendingNameTrigger(service))
|
|
service._currentQuickAckTask = asyncio.create_task(
|
|
_runQuickAck(service, sessionId)
|
|
)
|
|
return
|
|
|
|
if source == "chatHistory":
|
|
transcriptData = TeamsbotTranscript(
|
|
sessionId=sessionId,
|
|
speaker=speaker,
|
|
text=text,
|
|
timestamp=getUtcTimestamp(),
|
|
confidence=1.0,
|
|
language=service.config.language,
|
|
isFinal=True,
|
|
source="chatHistory",
|
|
).model_dump()
|
|
createdTranscript = interface.createTranscript(transcriptData)
|
|
|
|
await _emitSessionEvent(sessionId, "transcript", {
|
|
"id": createdTranscript.get("id"),
|
|
"speaker": speaker,
|
|
"text": text,
|
|
"confidence": 1.0,
|
|
"timestamp": getUtcTimestamp(),
|
|
"isContinuation": False,
|
|
"source": "chatHistory",
|
|
"isHistory": True,
|
|
})
|
|
logger.debug(f"Session {sessionId}: Chat history stored (no AI trigger): [{speaker}] {text[:60]}")
|
|
return
|
|
|
|
isBotSpeaker = service._isBotSpeaker(speaker)
|
|
if isBotSpeaker and source != "chat":
|
|
logger.debug(f"Session {sessionId}: Ignoring own bot caption from: [{speaker}] {text[:80]}...")
|
|
return
|
|
|
|
sttPauseThreshold = 5.0
|
|
isMerge = (
|
|
source == "audioCapture"
|
|
and service._lastTranscriptSpeaker == speaker
|
|
and service._lastTranscriptText is not None
|
|
and service._lastTranscriptId is not None
|
|
and (time.time() - service._lastSttTime) < sttPauseThreshold
|
|
)
|
|
|
|
if isMerge:
|
|
mergedText = f"{service._lastTranscriptText} {text}"
|
|
interface.updateTranscript(service._lastTranscriptId, {
|
|
"text": mergedText,
|
|
"isFinal": isFinal,
|
|
})
|
|
service._lastTranscriptText = mergedText
|
|
createdTranscript = {"id": service._lastTranscriptId}
|
|
|
|
if service._contextBuffer and service._contextBuffer[-1].get("speaker") == speaker:
|
|
service._contextBuffer[-1]["text"] = mergedText
|
|
else:
|
|
transcriptData = TeamsbotTranscript(
|
|
sessionId=sessionId,
|
|
speaker=speaker,
|
|
text=text,
|
|
timestamp=getUtcTimestamp(),
|
|
confidence=1.0,
|
|
language=service.config.language,
|
|
isFinal=isFinal,
|
|
source=source,
|
|
).model_dump()
|
|
|
|
createdTranscript = interface.createTranscript(transcriptData)
|
|
|
|
service._lastTranscriptSpeaker = speaker
|
|
service._lastTranscriptText = text
|
|
service._lastTranscriptId = createdTranscript.get("id")
|
|
|
|
if source == "audioCapture" and speaker == "Unknown":
|
|
service._unattributedTranscriptIds.append(createdTranscript.get("id"))
|
|
|
|
service._contextBuffer.append({
|
|
"speaker": speaker or "Unknown",
|
|
"text": text,
|
|
"timestamp": getUtcTimestamp(),
|
|
"source": source,
|
|
})
|
|
|
|
maxSegments = service.config.contextWindowSegments
|
|
if len(service._contextBuffer) > maxSegments:
|
|
if not service._contextSummary and len(service._contextBuffer) > maxSegments * 1.5:
|
|
asyncio.create_task(service._summarizeContextBuffer(sessionId))
|
|
service._contextBuffer = service._contextBuffer[-maxSegments:]
|
|
|
|
session = interface.getSession(sessionId)
|
|
if session:
|
|
count = session.get("transcriptSegmentCount", 0) + 1
|
|
interface.updateSession(sessionId, {"transcriptSegmentCount": count})
|
|
|
|
if source == "audioCapture":
|
|
service._lastSttTime = time.time()
|
|
|
|
displayText = service._lastTranscriptText if isMerge else text
|
|
await _emitSessionEvent(sessionId, "transcript", {
|
|
"id": createdTranscript.get("id"),
|
|
"speaker": speaker,
|
|
"text": displayText,
|
|
"confidence": 1.0,
|
|
"timestamp": getUtcTimestamp(),
|
|
"isContinuation": isMerge,
|
|
"source": source,
|
|
"speakerResolvedFromHint": (
|
|
speakerResolvedFromHint
|
|
if speakerResolvedFromHint is not None
|
|
else False
|
|
),
|
|
})
|
|
|
|
if not isFinal:
|
|
return
|
|
|
|
if service.config.responseMode == TeamsbotResponseMode.TRANSCRIBE_ONLY:
|
|
return
|
|
|
|
if source == "chat" and isBotSpeaker:
|
|
return
|
|
|
|
if service._isStopPhrase(text):
|
|
logger.info(
|
|
f"Session {sessionId}: Stop phrase detected ('{text.strip()[:60]}'), "
|
|
f"hard-cancelling in-flight speech immediately"
|
|
)
|
|
from .serviceWebSocket import _cancelInFlightSpeech
|
|
await _cancelInFlightSpeech(
|
|
service,
|
|
sessionId=sessionId,
|
|
websocket=websocket,
|
|
reason="userStopPhrase",
|
|
)
|
|
return
|
|
|
|
if service._pendingNameTrigger:
|
|
service._pendingNameTrigger["lastActivity"] = time.time()
|
|
|
|
if service._detectBotName(text):
|
|
isNew = service._setPendingNameTrigger(sessionId, interface, voiceInterface, websocket, createdTranscript)
|
|
if isNew:
|
|
asyncio.create_task(_checkPendingNameTrigger(service))
|
|
service._currentQuickAckTask = asyncio.create_task(
|
|
_runQuickAck(service, sessionId)
|
|
)
|
|
return
|
|
|
|
if (
|
|
source == "audioCapture"
|
|
and not service._isBotSpeaker(speaker)
|
|
and time.time() < service._followUpWindowEnd
|
|
and not service._pendingNameTrigger
|
|
):
|
|
isNew = service._setPendingNameTrigger(sessionId, interface, voiceInterface, websocket, createdTranscript)
|
|
if isNew:
|
|
logger.info(f"Session {sessionId}: Follow-up window trigger (no name needed)")
|
|
asyncio.create_task(_checkPendingNameTrigger(service))
|
|
return
|
|
|
|
if not service._pendingNameTrigger:
|
|
shouldTrigger = service._shouldTriggerAnalysis(text)
|
|
if shouldTrigger:
|
|
logger.info(f"Session {sessionId}: Periodic trigger (buffer: {len(service._contextBuffer)} segments)")
|
|
await _analyzeAndRespond(service, sessionId, interface, voiceInterface, websocket, createdTranscript)
|
|
|
|
|
|
async def _summarizeForVoice(
|
|
service,
|
|
sessionId: str,
|
|
rawAnswer: str,
|
|
) -> str:
|
|
"""Return a SHORT, naturally-spoken paraphrase of ``rawAnswer`` for TTS."""
|
|
from .service import _voiceFriendlyMeetingText, createAiService
|
|
|
|
if not rawAnswer or not rawAnswer.strip():
|
|
return ""
|
|
|
|
sanitised = _voiceFriendlyMeetingText(rawAnswer)
|
|
if (
|
|
len(sanitised) <= service._VOICE_DIRECT_MAX_CHARS
|
|
and not service._looksLikeStructuredText(rawAnswer)
|
|
):
|
|
return sanitised
|
|
|
|
targetLang = (service.config.language or "de-DE").strip()
|
|
botName = (service.config.botName or "").strip() or "the assistant"
|
|
persona = (service.config.aiSystemPrompt or "").strip()
|
|
personaBlock = (
|
|
f"\n\nBOT PERSONA / TONE:\n{persona}\n"
|
|
if persona else ""
|
|
)
|
|
|
|
prompt = (
|
|
f"You are condensing a long written answer into a SHORT spoken "
|
|
f"paraphrase that the assistant '{botName}' will say out loud "
|
|
f"into a Microsoft Teams meeting. The full written answer is "
|
|
f"already in the meeting chat — your job is to summarise it for "
|
|
f"the EAR, not the eye.\n\n"
|
|
f"STRICT REQUIREMENTS:\n"
|
|
f"1. Output language: BCP-47 '{targetLang}'. No other language.\n"
|
|
f"2. 1 to 3 sentences, max ~{service._VOICE_SUMMARY_MAX_CHARS} characters total.\n"
|
|
f"3. Natural spoken style — no headings, no bullet points, no "
|
|
f"tables, no markdown, no emojis, no enumerations like 'Erstens... "
|
|
f"Zweitens...' unless that genuinely flows in speech.\n"
|
|
f"4. Capture the essence and the most important conclusion. Do "
|
|
f"NOT try to fit every detail. Listeners can read the chat for "
|
|
f"the full version.\n"
|
|
f"5. End by gently pointing the audience to the chat for details, "
|
|
f"e.g. 'Details stehen im Chat.' (adapted to the target language).\n"
|
|
f"6. Output ONLY the spoken text. No JSON, no quotes around it, "
|
|
f"no preamble like 'Here is the summary:'.\n"
|
|
f"{personaBlock}\n"
|
|
f"FULL WRITTEN ANSWER (markdown-formatted, sometimes long):\n"
|
|
f"---\n{rawAnswer.strip()[:6000]}\n---\n"
|
|
)
|
|
|
|
try:
|
|
aiService = createAiService(
|
|
service.currentUser, service.mandateId, service.instanceId
|
|
)
|
|
await aiService.ensureAiObjectsInitialized()
|
|
request = AiCallRequest(
|
|
prompt=prompt,
|
|
context="",
|
|
options=AiCallOptions(
|
|
operationType=OperationTypeEnum.DATA_ANALYSE,
|
|
priority=PriorityEnum.SPEED,
|
|
),
|
|
)
|
|
response = await aiService.callAi(request)
|
|
except Exception as aiErr:
|
|
logger.warning(
|
|
f"Session {sessionId}: Voice summary AI call failed: {aiErr}"
|
|
)
|
|
return sanitised[: service._VOICE_DIRECT_MAX_CHARS]
|
|
|
|
if not response or response.errorCount != 0 or not response.content:
|
|
logger.warning(
|
|
f"Session {sessionId}: Voice summary returned empty/error"
|
|
)
|
|
return sanitised[: service._VOICE_DIRECT_MAX_CHARS]
|
|
|
|
spoken = response.content.strip()
|
|
spoken = _voiceFriendlyMeetingText(spoken)
|
|
if not spoken:
|
|
return sanitised[: service._VOICE_DIRECT_MAX_CHARS]
|
|
|
|
logger.info(
|
|
f"Session {sessionId}: Voice summary generated "
|
|
f"(orig={len(rawAnswer)} chars, sanitised={len(sanitised)}, "
|
|
f"spoken={len(spoken)})"
|
|
)
|
|
return spoken
|
|
|
|
|
|
async def _pickQuickAckText(service) -> Optional[str]:
|
|
"""Return a short ack text in the bot's configured language."""
|
|
return await _pickEphemeralPhrase(service, "quickAck")
|
|
|
|
|
|
async def _pickEphemeralPhrase(
|
|
service,
|
|
kind: str,
|
|
substitutions: Optional[Dict[str, Any]] = None,
|
|
) -> Optional[str]:
|
|
"""Round-robin selector over the cached phrase pool for ``kind``."""
|
|
variants = await _getEphemeralPhrases(service, kind)
|
|
if not variants:
|
|
return None
|
|
idx = service._phrasePoolIdx.get(kind, 0) % len(variants)
|
|
service._phrasePoolIdx[kind] = (idx + 1) % len(variants)
|
|
chosen = variants[idx]
|
|
if substitutions:
|
|
try:
|
|
chosen = chosen.format(**substitutions)
|
|
except (KeyError, IndexError, ValueError) as fmtErr:
|
|
logger.debug(
|
|
f"Ephemeral phrase substitution failed for kind={kind}: {fmtErr}"
|
|
)
|
|
return chosen
|
|
|
|
|
|
async def _getEphemeralPhrases(service, kind: str) -> List[str]:
|
|
"""Return the cached pool of AI-generated variants for ``kind``."""
|
|
cached = service._phrasePool.get(kind)
|
|
if cached:
|
|
return cached
|
|
async with service._phrasePoolLock:
|
|
cached = service._phrasePool.get(kind)
|
|
if cached:
|
|
return cached
|
|
phrases = await _generateEphemeralPhrases(service, kind, 4)
|
|
if phrases:
|
|
service._phrasePool[kind] = phrases
|
|
return phrases
|
|
|
|
|
|
async def _generateEphemeralPhrases(
|
|
service, kind: str, count: int
|
|
) -> List[str]:
|
|
"""Ask the AI to produce ``count`` short utterances for ``kind``."""
|
|
from .service import createAiService, _EPHEMERAL_PHRASE_INTENTS
|
|
|
|
intent = _EPHEMERAL_PHRASE_INTENTS.get(kind)
|
|
if not intent:
|
|
logger.warning(f"Unknown ephemeral phrase kind requested: {kind}")
|
|
return []
|
|
|
|
targetLang = (service.config.language or "").strip() or "en-US"
|
|
botName = (service.config.botName or "the assistant").strip()
|
|
persona = (service.config.aiSystemPrompt or "").strip()
|
|
|
|
prompt = (
|
|
f"You are localizing short SPOKEN-LANGUAGE utterances for a "
|
|
f"meeting assistant named '{botName}'.\n\n"
|
|
f"Persona / style guide for the assistant:\n"
|
|
f"{persona or '(no persona configured — use a neutral, polite, professional tone)'}\n\n"
|
|
f"Target spoken language (BCP-47 code): {targetLang}\n\n"
|
|
f"Utterance intent:\n{intent}\n\n"
|
|
f"Generate {count} DIFFERENT variants matching this intent, in "
|
|
f"the target language. Variants should feel natural when spoken "
|
|
f"aloud, not robotic. Do NOT include the assistant's name in "
|
|
f"the variants.\n\n"
|
|
f"Output STRICTLY a JSON array of {count} plain-text strings, "
|
|
f"with no markdown fences, no commentary, no surrounding "
|
|
f"quotation marks beyond the JSON syntax itself. Example "
|
|
f"format: [\"...\", \"...\", \"...\", \"...\"]"
|
|
)
|
|
|
|
try:
|
|
aiService = createAiService(
|
|
service.currentUser, service.mandateId, service.instanceId
|
|
)
|
|
await aiService.ensureAiObjectsInitialized()
|
|
request = AiCallRequest(
|
|
prompt=prompt,
|
|
context="",
|
|
options=AiCallOptions(
|
|
operationType=OperationTypeEnum.DATA_ANALYSE,
|
|
priority=PriorityEnum.SPEED,
|
|
),
|
|
)
|
|
response = await aiService.callAi(request)
|
|
except Exception as aiErr:
|
|
logger.warning(
|
|
f"Ephemeral phrase generation failed (kind={kind}, lang={targetLang}): {aiErr}"
|
|
)
|
|
return []
|
|
|
|
if not response or response.errorCount != 0 or not response.content:
|
|
logger.warning(
|
|
f"Ephemeral phrase generation returned empty/error "
|
|
f"(kind={kind}, lang={targetLang})"
|
|
)
|
|
return []
|
|
|
|
raw = response.content.strip()
|
|
raw = re.sub(r"^```(?:json)?\s*", "", raw)
|
|
raw = re.sub(r"\s*```\s*$", "", raw)
|
|
try:
|
|
arr = json.loads(raw)
|
|
except json.JSONDecodeError as parseErr:
|
|
logger.warning(
|
|
f"Ephemeral phrase generation: could not parse JSON "
|
|
f"(kind={kind}, lang={targetLang}): {parseErr} "
|
|
f"raw={raw[:200]}"
|
|
)
|
|
return []
|
|
if not isinstance(arr, list):
|
|
return []
|
|
cleaned = [
|
|
str(v).strip()
|
|
for v in arr
|
|
if isinstance(v, str) and str(v).strip()
|
|
]
|
|
cleaned = cleaned[:count]
|
|
if cleaned:
|
|
logger.info(
|
|
f"Ephemeral phrase pool generated (kind={kind}, "
|
|
f"lang={targetLang}, count={len(cleaned)})"
|
|
)
|
|
return cleaned
|
|
|
|
|
|
async def _runQuickAck(service, sessionId: str) -> None:
|
|
"""Background task: speak a short ack into the meeting via TTS."""
|
|
from .service import _emitSessionEvent, _speakTextChunked
|
|
|
|
websocket = service._websocket
|
|
voiceInterface = service._voiceInterface
|
|
if websocket is None or voiceInterface is None:
|
|
return
|
|
if not service._shouldFireQuickAck():
|
|
return
|
|
ackText = await _pickQuickAckText(service)
|
|
if not ackText:
|
|
return
|
|
service._lastQuickAckTs = time.time()
|
|
try:
|
|
await _emitSessionEvent(sessionId, "quickAck", {
|
|
"text": ackText,
|
|
"timestamp": getUtcTimestamp(),
|
|
})
|
|
cancelHook = service._makeAnswerCancelHook()
|
|
async with service._meetingTtsLock:
|
|
outcome = await _speakTextChunked(
|
|
websocket=websocket,
|
|
voiceInterface=voiceInterface,
|
|
sessionId=sessionId,
|
|
voiceText=ackText,
|
|
languageCode=service.config.language,
|
|
voiceName=service.config.voiceId,
|
|
isCancelled=cancelHook,
|
|
)
|
|
if not outcome.get("success"):
|
|
logger.info(
|
|
f"Session {sessionId}: Quick ack TTS failed silently "
|
|
f"({outcome.get('error')}) — main response will still go through"
|
|
)
|
|
except asyncio.CancelledError:
|
|
logger.info(f"Session {sessionId}: Quick ack cancelled by stop signal")
|
|
except Exception as ackErr:
|
|
logger.warning(f"Session {sessionId}: Quick ack failed: {ackErr}")
|
|
finally:
|
|
service._currentQuickAckTask = None
|
|
|
|
|
|
async def _checkPendingNameTrigger(service, delaySec: float = 3.0):
|
|
"""Async loop: fire the pending name trigger once the speaker is quiet."""
|
|
await asyncio.sleep(delaySec)
|
|
if not service._pendingNameTrigger:
|
|
return
|
|
|
|
now = time.time()
|
|
lastActivity = service._pendingNameTrigger.get("lastActivity", 0)
|
|
detectedAt = service._pendingNameTrigger.get("detectedAt", 0)
|
|
quietSec = now - lastActivity
|
|
totalWaitSec = now - detectedAt
|
|
|
|
if quietSec >= 3.0 or totalWaitSec >= 15.0:
|
|
trigger = service._pendingNameTrigger
|
|
service._pendingNameTrigger = None
|
|
logger.info(
|
|
f"Session {trigger['sessionId']}: Debounced name trigger fires "
|
|
f"(quiet={quietSec:.1f}s, totalWait={totalWaitSec:.1f}s)"
|
|
)
|
|
await _analyzeAndRespond(
|
|
service,
|
|
trigger["sessionId"],
|
|
trigger["interface"],
|
|
trigger["voiceInterface"],
|
|
trigger["websocket"],
|
|
trigger["triggerTranscript"],
|
|
)
|
|
else:
|
|
remaining = max(0.5, 3.0 - quietSec)
|
|
asyncio.create_task(_checkPendingNameTrigger(service, remaining))
|
|
|
|
|
|
async def _warmEphemeralPhrasePool(service, sessionId: str) -> None:
|
|
"""Fire-and-forget: generate ephemeral phrase pool for all kinds."""
|
|
from .service import _EPHEMERAL_PHRASE_INTENTS
|
|
|
|
try:
|
|
for kind in _EPHEMERAL_PHRASE_INTENTS:
|
|
try:
|
|
await _getEphemeralPhrases(service, kind)
|
|
except Exception as innerErr:
|
|
logger.warning(
|
|
f"Session {sessionId}: Phrase pool warmup failed for "
|
|
f"kind={kind}: {innerErr}"
|
|
)
|
|
except Exception as warmErr:
|
|
logger.warning(
|
|
f"Session {sessionId}: Phrase pool warmup task crashed: {warmErr}"
|
|
)
|
|
|
|
|
|
async def _runEscalationAndRelease(
|
|
service,
|
|
sessionId: str,
|
|
taskBrief: str,
|
|
briefingFileIds: List[str],
|
|
triggerTranscriptId: Optional[str],
|
|
) -> None:
|
|
"""Background wrapper for ``_runAgentForMeeting`` that holds the
|
|
``_agentEscalationInFlight`` flag for the duration of the agent run."""
|
|
try:
|
|
await service._runAgentForMeeting(
|
|
sessionId=sessionId,
|
|
taskText=taskBrief,
|
|
fileIds=briefingFileIds,
|
|
sourceLabel="speechEscalation",
|
|
triggerTranscriptId=triggerTranscriptId,
|
|
)
|
|
except asyncio.CancelledError:
|
|
logger.info(
|
|
f"Session {sessionId}: Escalation agent task cancelled by stop signal"
|
|
)
|
|
except Exception as escErr:
|
|
logger.error(
|
|
f"Session {sessionId}: Escalation agent task failed: "
|
|
f"{type(escErr).__name__}: {escErr}",
|
|
exc_info=True,
|
|
)
|
|
finally:
|
|
service._agentEscalationInFlight = False
|
|
service._currentEscalationTask = None
|