gateway/modules/features/teamsbot/service.py

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Teamsbot Service - Pipeline Orchestrator.
Manages the audio processing pipeline: STT -> Context Buffer -> SPEECH_TEAMS -> TTS -> Bridge.
"""

import logging
import json
import asyncio
import time
import base64
from typing import Optional, Dict, Any, List

from fastapi import WebSocket

from modules.datamodels.datamodelUam import User
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum, PriorityEnum
from modules.shared.timeUtils import getUtcTimestamp, getIsoTimestamp

from .datamodelTeamsbot import (
    TeamsbotSessionStatus,
    TeamsbotTranscript,
    TeamsbotBotResponse,
    TeamsbotResponseType,
    TeamsbotConfig,
    TeamsbotResponseMode,
    TeamsbotResponseChannel,
    SpeechTeamsResponse,
    TeamsbotCommand,
)
from .browserBotConnector import BrowserBotConnector

logger = logging.getLogger(__name__)


# =========================================================================
# Minimal Service Context (for AI billing in bridge callbacks)
# =========================================================================

class _ServiceContext:
    """Minimal context providing user/mandate info for AiService billing.
    Used by bridge callbacks where a full Services instance is not available."""

    def __init__(self, user, mandateId, featureInstanceId=None):
        self.user = user
        self.mandateId = mandateId
        self.featureInstanceId = featureInstanceId
        self.featureCode = "teamsbot"


# =========================================================================
# Session Event Queues (for SSE streaming to frontend)
# =========================================================================
_sessionEvents: Dict[str, asyncio.Queue] = {}


async def _emitSessionEvent(sessionId: str, eventType: str, data: Any):
    """Emit an event to the session's SSE stream.
    Creates the queue on-demand so events are never silently dropped."""
    if sessionId not in _sessionEvents:
        _sessionEvents[sessionId] = asyncio.Queue()
    await _sessionEvents[sessionId].put({"type": eventType, "data": data, "timestamp": getIsoTimestamp()})


class TeamsbotService:
    """
    Pipeline Orchestrator for Teams Bot sessions.
    Coordinates VoiceObjects (STT/TTS), AiService (SPEECH_TEAMS), and Bridge communication.
    """

    def __init__(self, currentUser: User, mandateId: str, instanceId: str, config: TeamsbotConfig):
        self.currentUser = currentUser
        self.mandateId = mandateId
        self.instanceId = instanceId
        self.config = config
        self.browserBotConnector = BrowserBotConnector(config._getEffectiveBrowserBotUrl())

        # State
        self._lastAiCallTime: float = 0.0
        self._contextBuffer: List[Dict[str, Any]] = []
        self._sessionContext: Optional[str] = None  # User-provided background context
        self._contextSummary: Optional[str] = None  # AI-generated summary of long context

        # Differential transcript tracking: only write new text, update existing
        # record when the same speaker continues speaking
        self._lastTranscriptSpeaker: Optional[str] = None
        self._lastTranscriptText: Optional[str] = None
        self._lastTranscriptId: Optional[str] = None

    # =========================================================================
    # Session Lifecycle
    # =========================================================================

    async def joinMeeting(
        self,
        sessionId: str,
        meetingLink: str,
        connectionId: Optional[str] = None,
        gatewayBaseUrl: str = "",
        botAccountEmail: Optional[str] = None,
        botAccountPassword: Optional[str] = None,
    ):
        """Send join command to the Browser Bot service.

        The browser bot will:
        1. Launch browser (headful if credentials provided, headless otherwise)
        2. Navigate to Teams web app
        3. Authenticate if credentials provided, otherwise join as anonymous guest
        4. Enable captions/audio capture and start scraping
        5. Connect back via WebSocket to send transcripts
        """
        from . import interfaceFeatureTeamsbot as interfaceDb

        interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId)

        # Initialize SSE event queue
        _sessionEvents[sessionId] = asyncio.Queue()

        try:
            # Update status to JOINING
            interface.updateSession(sessionId, {"status": TeamsbotSessionStatus.JOINING.value})
            await _emitSessionEvent(sessionId, "statusChange", {"status": "joining"})

            # Send join command to browser bot
            session = interface.getSession(sessionId)
            if not session:
                raise ValueError(f"Session {sessionId} not found")

            # Build the full WebSocket URL for the bot to connect back to this gateway instance
            # gatewayBaseUrl is passed from the route handler (derived from request.base_url)
            wsScheme = "wss" if gatewayBaseUrl.startswith("https") else "ws"
            gatewayHost = gatewayBaseUrl.replace("https://", "").replace("http://", "").rstrip("/")
            fullGatewayWsUrl = f"{wsScheme}://{gatewayHost}/api/teamsbot/{self.instanceId}/bot/ws/{sessionId}"

            hasAuth = bool(botAccountEmail and botAccountPassword)
            logger.info(f"Joining meeting for session {sessionId}: auth={hasAuth}, email={botAccountEmail or 'N/A'}, transferMode={self.config.transferMode}")

            result = await self.browserBotConnector.joinMeeting(
                sessionId=sessionId,
                meetingUrl=meetingLink,
                botName=session.get("botName", self.config.botName),
                instanceId=self.instanceId,
                gatewayWsUrl=fullGatewayWsUrl,
                language=self.config.language,
                botAccountEmail=botAccountEmail,
                botAccountPassword=botAccountPassword,
                transferMode=self.config.transferMode if hasattr(self.config, 'transferMode') else "auto",
                debugMode=self.config.debugMode if hasattr(self.config, 'debugMode') else False,
            )

            if result.get("success"):
                interface.updateSession(sessionId, {
                    "status": TeamsbotSessionStatus.JOINING.value,  # Will become ACTIVE when bot connects via WS
                })
                logger.info(f"Browser bot deployment started for session {sessionId}")
            else:
                errorMsg = result.get("error", "Unknown error joining meeting")
                interface.updateSession(sessionId, {
                    "status": TeamsbotSessionStatus.ERROR.value,
                    "errorMessage": errorMsg,
                })
                await _emitSessionEvent(sessionId, "statusChange", {"status": "error", "errorMessage": errorMsg})
                logger.error(f"Failed to deploy browser bot for session {sessionId}: {errorMsg}")

        except Exception as e:
            logger.error(f"Error joining meeting for session {sessionId}: {e}")
            interface.updateSession(sessionId, {
                "status": TeamsbotSessionStatus.ERROR.value,
                "errorMessage": str(e),
            })
            await _emitSessionEvent(sessionId, "statusChange", {"status": "error", "errorMessage": str(e)})

    async def leaveMeeting(self, sessionId: str):
        """Send leave command to the Browser Bot service."""
        from . import interfaceFeatureTeamsbot as interfaceDb

        interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId)

        try:
            interface.updateSession(sessionId, {"status": TeamsbotSessionStatus.LEAVING.value})
            await _emitSessionEvent(sessionId, "statusChange", {"status": "leaving"})

            await self.browserBotConnector.leaveMeeting(sessionId)

            interface.updateSession(sessionId, {
                "status": TeamsbotSessionStatus.ENDED.value,
                "endedAt": getIsoTimestamp(),
            })
            await _emitSessionEvent(sessionId, "statusChange", {"status": "ended"})

            # Generate meeting summary in background
            asyncio.create_task(self._generateMeetingSummary(sessionId))

            logger.info(f"Bot left meeting for session {sessionId}")

        except Exception as e:
            logger.error(f"Error leaving meeting for session {sessionId}: {e}")
            interface.updateSession(sessionId, {
                "status": TeamsbotSessionStatus.ERROR.value,
                "errorMessage": str(e),
                "endedAt": getIsoTimestamp(),
            })

        # Cleanup event queue
        _sessionEvents.pop(sessionId, None)

    # =========================================================================
    # Browser Bot WebSocket Communication
    # =========================================================================

    async def handleBotWebSocket(self, websocket: WebSocket, sessionId: str):
        """
        Main WebSocket handler for Browser Bot communication.

        Receives:
        - transcript: Caption text scraped from Teams
        - status: Bot state changes (joined, in_lobby, left, error)

        Sends:
        - playAudio: TTS audio for the bot to play in the meeting
        """
        from . import interfaceFeatureTeamsbot as interfaceDb
        from modules.interfaces.interfaceVoiceObjects import getVoiceInterface

        interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId)
        voiceInterface = getVoiceInterface(self.currentUser, self.mandateId)

        # Load session context (user-provided background knowledge)
        # If the context is long (>500 chars), summarize it to reduce token usage
        session = interface.getSession(sessionId)
        if session:
            rawContext = session.get("sessionContext")
            if rawContext and len(rawContext) > 500:
                logger.info(f"Session {sessionId}: Summarizing long session context ({len(rawContext)} chars)...")
                self._sessionContext = await self._summarizeSessionContext(sessionId, rawContext)
            elif rawContext:
                self._sessionContext = rawContext
            if self._sessionContext:
                logger.info(f"Session {sessionId}: Session context ready ({len(self._sessionContext)} chars)")

        # Resolve system bot email for speaker detection (prevents bot from triggering AI on own speech)
        try:
            systemBot = interface.getActiveSystemBot(self.mandateId)
            self._botAccountEmail = systemBot.get("email") if systemBot else None
            if self._botAccountEmail:
                logger.info(f"Session {sessionId}: Bot account email resolved: {self._botAccountEmail}")
        except Exception:
            self._botAccountEmail = None

        logger.info(f"[WS-DEBUG] WebSocket handler started for session {sessionId}")

        try:
            msgCount = 0
            while True:
                data = await websocket.receive()
                msgCount += 1

                if "text" not in data:
                    logger.debug(f"[WS-DEBUG] session={sessionId} msg #{msgCount}: non-text data received (keys: {list(data.keys())})")
                    continue

                message = json.loads(data["text"])
                msgType = message.get("type")
                logger.info(f"[WS-DEBUG] session={sessionId} msg #{msgCount}: type={msgType}")

                if msgType == "transcript":
                    transcript = message.get("transcript", {})
                    logger.info(f"[WS-DEBUG] Transcript received: speaker={transcript.get('speaker')}, text={transcript.get('text', '')[:60]}...")
                    await self._processTranscript(
                        sessionId=sessionId,
                        speaker=transcript.get("speaker", "Unknown"),
                        text=transcript.get("text", ""),
                        isFinal=transcript.get("isFinal", True),
                        interface=interface,
                        voiceInterface=voiceInterface,
                        websocket=websocket,
                    )

                elif msgType == "chatMessage":
                    chat = message.get("chat", {})
                    logger.info(f"[WS-DEBUG] Chat message received: speaker={chat.get('speaker')}, text={chat.get('text', '')[:60]}...")
                    await self._processTranscript(
                        sessionId=sessionId,
                        speaker=chat.get("speaker", "Unknown"),
                        text=chat.get("text", ""),
                        isFinal=True,
                        interface=interface,
                        voiceInterface=voiceInterface,
                        websocket=websocket,
                        source="chat",
                    )

                elif msgType == "status":
                    status = message.get("status")
                    errorMessage = message.get("message")
                    logger.info(f"[WS-DEBUG] Status received: status={status}, message={errorMessage}")
                    await self._handleBotStatus(sessionId, status, errorMessage, interface)

                elif msgType == "audioChunk":
                    audioData = message.get("audio", {})
                    audioBase64 = audioData.get("data", "")
                    sampleRate = audioData.get("sampleRate", 16000)
                    if audioBase64:
                        await self._processAudioChunk(
                            sessionId=sessionId,
                            audioBase64=audioBase64,
                            sampleRate=sampleRate,
                            interface=interface,
                            voiceInterface=voiceInterface,
                            websocket=websocket,
                        )

                elif msgType == "voiceGreeting":
                    greetingText = message.get("text", "")
                    greetingLang = message.get("language", self.config.language)
                    logger.info(f"[WS-DEBUG] Voice greeting request: text={greetingText[:60]}..., language={greetingLang}")
                    if greetingText and voiceInterface:
                        try:
                            ttsResult = await voiceInterface.textToSpeech(
                                text=greetingText,
                                languageCode=greetingLang,
                                voiceName=self.config.voiceId
                            )
                            if ttsResult and isinstance(ttsResult, dict):
                                audioContent = ttsResult.get("audioContent")
                                if audioContent:
                                    await websocket.send_text(json.dumps({
                                        "type": "playAudio",
                                        "sessionId": sessionId,
                                        "audio": {
                                            "data": base64.b64encode(audioContent if isinstance(audioContent, bytes) else audioContent.encode()).decode(),
                                            "format": "mp3",
                                        }
                                    }))
                                    logger.info(f"Voice greeting TTS sent for session {sessionId}")
                        except Exception as ttsErr:
                            logger.warning(f"Voice greeting TTS failed for session {sessionId}: {ttsErr}")

                elif msgType == "ping":
                    await websocket.send_text(json.dumps({"type": "pong"}))

        except Exception as e:
            if "disconnect" not in str(e).lower():
                logger.error(f"[WS-DEBUG] WebSocket error for session {sessionId}: {type(e).__name__}: {e}")

        logger.info(f"[WS-DEBUG] WebSocket handler ended for session {sessionId} after {msgCount} messages")

    async def _handleBotStatus(
        self,
        sessionId: str,
        status: str,
        errorMessage: Optional[str],
        interface,
    ):
        """Handle status updates from the browser bot."""
        logger.info(f"Bot status update for session {sessionId}: {status}")

        statusMap = {
            "connecting": TeamsbotSessionStatus.JOINING.value,
            "launching": TeamsbotSessionStatus.JOINING.value,
            "navigating": TeamsbotSessionStatus.JOINING.value,
            "in_lobby": TeamsbotSessionStatus.JOINING.value,
            "joined": TeamsbotSessionStatus.ACTIVE.value,
            "in_meeting": TeamsbotSessionStatus.ACTIVE.value,
            "left": TeamsbotSessionStatus.ENDED.value,
            "error": TeamsbotSessionStatus.ERROR.value,
        }

        dbStatus = statusMap.get(status, TeamsbotSessionStatus.ACTIVE.value)

        updates = {"status": dbStatus}
        if errorMessage:
            updates["errorMessage"] = errorMessage
        if dbStatus == TeamsbotSessionStatus.ACTIVE.value:
            updates["startedAt"] = getIsoTimestamp()
        elif dbStatus in [TeamsbotSessionStatus.ENDED.value, TeamsbotSessionStatus.ERROR.value]:
            updates["endedAt"] = getIsoTimestamp()

        interface.updateSession(sessionId, updates)
        await _emitSessionEvent(sessionId, "statusChange", {"status": status, "errorMessage": errorMessage})

        # Generate summary when session ends
        if dbStatus == TeamsbotSessionStatus.ENDED.value:
            asyncio.create_task(self._generateMeetingSummary(sessionId))

    async def _processAudioChunk(
        self,
        sessionId: str,
        audioBase64: str,
        sampleRate: int,
        interface,
        voiceInterface,
        websocket: WebSocket,
    ):
        """Process an audio chunk from WebRTC capture — run STT and feed into transcript pipeline."""
        import base64
        try:
            audioBytes = base64.b64decode(audioBase64)
            if len(audioBytes) < 1000:
                return

            # Use the existing Google Cloud Speech connector for STT
            speechConnector = voiceInterface.getSpeechConnector() if voiceInterface else None
            if not speechConnector or not hasattr(speechConnector, 'speech_client'):
                logger.warning(f"[AudioChunk] No speech client available for session {sessionId}")
                return

            from google.cloud import speech
            config = speech.RecognitionConfig(
                encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
                sample_rate_hertz=sampleRate,
                language_code=self.config.language or "de-DE",
                enable_automatic_punctuation=True,
            )
            audio = speech.RecognitionAudio(content=audioBytes)

            response = speechConnector.speech_client.recognize(config=config, audio=audio)

            for result in response.results:
                if result.alternatives:
                    text = result.alternatives[0].transcript.strip()
                    if text:
                        logger.info(f"[AudioChunk] STT result: {text[:80]}...")
                        await self._processTranscript(
                            sessionId=sessionId,
                            speaker="Meeting Audio",
                            text=text,
                            isFinal=True,
                            interface=interface,
                            voiceInterface=voiceInterface,
                            websocket=websocket,
                            source="audioCapture",
                        )
        except Exception as e:
            logger.error(f"[AudioChunk] STT error for session {sessionId}: {type(e).__name__}: {e}")

    async def _processTranscript(
        self,
        sessionId: str,
        speaker: str,
        text: str,
        isFinal: bool,
        interface,
        voiceInterface,
        websocket: WebSocket,
        source: str = "caption",
    ):
        """Process a transcript segment from captions or chat messages.

        Differential writing: When the same speaker continues (text grows
        incrementally as captions stream), we UPDATE the existing DB record
        instead of creating a cascade of near-duplicate rows. A new record
        is only created when the speaker changes or the text is not a
        continuation of the previous segment.
        """

        text = text.strip()
        if not text:
            return

        # Filter out the bot's own speech entirely — captions of the bot's
        # own voice come back as garbled text (e.g. German TTS → English caption)
        # which pollutes the context buffer and confuses AI analysis.
        isBotSpeaker = self._isBotSpeaker(speaker)
        if isBotSpeaker:
            logger.debug(f"Session {sessionId}: Ignoring own bot caption from: [{speaker}] {text[:80]}...")
            return

        # Differential transcript writing:
        # If the same speaker is still talking and the new text is a
        # continuation (starts with the previous text), UPDATE the existing
        # record instead of creating a new one. This avoids cascading rows like:
        #   "Der AHV"
        #   "Der AHV Fonds"
        #   "Der AHV Fonds hat 2025"
        # and instead keeps a single row that grows until the speaker changes.
        isContinuation = (
            self._lastTranscriptSpeaker == speaker
            and self._lastTranscriptText
            and self._lastTranscriptId
            and text.startswith(self._lastTranscriptText)
            and source == "caption"  # only for captions, chat messages are always new
        )

        if isContinuation:
            interface.updateTranscript(self._lastTranscriptId, {
                "text": text,
                "isFinal": isFinal,
            })
            self._lastTranscriptText = text
            createdTranscript = {"id": self._lastTranscriptId}

            # Update context buffer: replace last entry for same speaker
            if self._contextBuffer and self._contextBuffer[-1].get("speaker") == speaker:
                self._contextBuffer[-1]["text"] = text
        else:
            # New speaker or non-continuation → create a new record
            transcriptData = TeamsbotTranscript(
                sessionId=sessionId,
                speaker=speaker,
                text=text,
                timestamp=getIsoTimestamp(),
                confidence=1.0,
                language=self.config.language,
                isFinal=isFinal,
            ).model_dump()

            createdTranscript = interface.createTranscript(transcriptData)

            # Track for differential writing
            self._lastTranscriptSpeaker = speaker
            self._lastTranscriptText = text
            self._lastTranscriptId = createdTranscript.get("id")

            # Append to context buffer
            self._contextBuffer.append({
                "speaker": speaker or "Unknown",
                "text": text,
                "timestamp": getUtcTimestamp(),
                "source": source,
            })

            # Keep only last N segments
            maxSegments = self.config.contextWindowSegments
            if len(self._contextBuffer) > maxSegments:
                if not self._contextSummary and len(self._contextBuffer) > maxSegments * 1.5:
                    asyncio.create_task(self._summarizeContextBuffer(sessionId))
                self._contextBuffer = self._contextBuffer[-maxSegments:]

            # Update session transcript count (only for new records)
            session = interface.getSession(sessionId)
            if session:
                count = session.get("transcriptSegmentCount", 0) + 1
                interface.updateSession(sessionId, {"transcriptSegmentCount": count})

        # Emit SSE event for live transcript (always, for UI updates)
        await _emitSessionEvent(sessionId, "transcript", {
            "id": createdTranscript.get("id"),
            "speaker": speaker,
            "text": text,
            "confidence": 1.0,
            "timestamp": getIsoTimestamp(),
            "isContinuation": isContinuation,
        })

        # Check if AI analysis should be triggered (only for final transcripts)
        if not isFinal:
            return

        if self.config.responseMode == TeamsbotResponseMode.TRANSCRIBE_ONLY:
            logger.debug(f"Session {sessionId}: responseMode=TRANSCRIBE_ONLY, skipping AI analysis")
            return

        shouldTrigger = self._shouldTriggerAnalysis(text)
        logger.debug(f"Session {sessionId}: shouldTriggerAnalysis={shouldTrigger}, bufferSize={len(self._contextBuffer)}, responseMode={self.config.responseMode}")

        if not shouldTrigger:
            return

        # SPEECH_TEAMS AI analysis
        logger.info(f"Session {sessionId}: Triggering AI analysis (buffer: {len(self._contextBuffer)} segments)")
        await self._analyzeAndRespond(sessionId, interface, voiceInterface, websocket, createdTranscript)

    def _isBotSpeaker(self, speaker: str) -> bool:
        """Check if a transcript speaker is the bot itself.

        Teams captions show the bot as e.g. "BotName (Unverified)" or
        "Nyla Larsson" depending on auth/anonymous join. We match against:
        - The configured/derived bot name
        - The bot account display name if authenticated
        """
        if not speaker:
            return False

        speakerLower = speaker.lower().strip()

        # Match against configured bot name
        botName = self.config.botName.lower().strip()
        if botName and botName in speakerLower:
            return True

        # Match against bot account email prefix (e.g. "nyla.larsson" from "nyla.larsson@poweron.swiss")
        botAccountEmail = getattr(self, '_botAccountEmail', None) or getattr(self.config, 'botAccountEmail', None)
        if botAccountEmail:
            emailPrefix = botAccountEmail.split("@")[0].lower().replace(".", " ")
            if emailPrefix in speakerLower:
                return True

        return False

    def _shouldTriggerAnalysis(self, transcriptText: str) -> bool:
        """
        Decide whether to trigger AI analysis based on the latest transcript.
        Triggers:
        - Bot name mentioned (immediate)
        - Periodic interval elapsed
        - Cooldown respected
        """
        now = time.time()
        timeSinceLastCall = now - self._lastAiCallTime

        # Bot name mentioned -> immediate trigger (OVERRIDES cooldown)
        botNameLower = self.config.botName.lower()
        transcriptLower = transcriptText.lower()
        if botNameLower in transcriptLower:
            logger.info(f"Trigger: Bot name '{self.config.botName}' detected in transcript (overrides cooldown): '{transcriptText[:60]}...'")
            return True

        # Also check first name and phonetically similar words (speech recognition artifacts)
        botFirstName = botNameLower.split()[0] if " " in botNameLower else botNameLower
        if len(botFirstName) >= 3:
            for word in transcriptLower.split():
                # Strip punctuation from word
                cleanWord = word.strip(".,!?:;\"'()[]")
                if not cleanWord or len(cleanWord) < 3:
                    continue
                # Exact first name match
                if cleanWord == botFirstName:
                    logger.info(f"Trigger: Bot first name '{botFirstName}' detected: '{transcriptText[:60]}...'")
                    return True
                # Simple phonetic similarity: same first letter, similar length, high character overlap
                if cleanWord[0] == botFirstName[0] and abs(len(cleanWord) - len(botFirstName)) <= 2:
                    common = sum(1 for c in set(botFirstName) if c in cleanWord)
                    similarity = common / max(len(set(botFirstName)), len(set(cleanWord)))
                    if similarity >= 0.6:
                        logger.info(f"Trigger: Phonetically similar to '{botFirstName}' -> '{cleanWord}' (sim={similarity:.2f}): '{transcriptText[:60]}...'")
                        return True

        # Cooldown check (only for non-name triggers)
        if timeSinceLastCall < self.config.triggerCooldownSeconds:
            logger.debug(f"Trigger: Cooldown active ({timeSinceLastCall:.1f}s < {self.config.triggerCooldownSeconds}s)")
            return False

        # Periodic trigger
        if timeSinceLastCall >= self.config.triggerIntervalSeconds:
            logger.info(f"Trigger: Periodic interval ({self.config.triggerIntervalSeconds}s) elapsed ({timeSinceLastCall:.1f}s since last call)")
            return True

        logger.debug(f"Trigger: No trigger ({timeSinceLastCall:.1f}s / {self.config.triggerIntervalSeconds}s interval)")
        return False

    async def _analyzeAndRespond(
        self,
        sessionId: str,
        interface,
        voiceInterface,
        websocket: WebSocket,
        triggerTranscript: Dict[str, Any],
    ):
        """Run SPEECH_TEAMS AI analysis and respond if needed."""
        self._lastAiCallTime = time.time()

        # Build transcript context from buffer.
        # Mark bot's own utterances and chat messages for the AI.
        contextLines = []
        for segment in self._contextBuffer:
            speaker = segment.get("speaker", "Unknown")
            text = segment.get("text", "")
            segSource = segment.get("source", "caption")
            prefix = "Chat" if segSource == "chat" else ""
            if self._isBotSpeaker(speaker):
                contextLines.append(f"[YOU ({self.config.botName})]: {text}")
            elif prefix:
                contextLines.append(f"[{prefix}: {speaker}]: {text}")
            else:
                contextLines.append(f"[{speaker}]: {text}")

        # Include session context if provided by the user at session start
        sessionContextStr = ""
        if self._sessionContext:
            sessionContextStr = f"\nSESSION_CONTEXT (background knowledge provided by the user):\n{self._sessionContext}\n"

        # Include summary of earlier conversation if available
        summaryStr = ""
        if self._contextSummary:
            summaryStr = f"\nEARLIER_CONVERSATION_SUMMARY:\n{self._contextSummary}\n"

        transcriptContext = f"BOT_NAME:{self.config.botName}{sessionContextStr}{summaryStr}\nRECENT_TRANSCRIPT:\n" + "\n".join(contextLines)

        # Call SPEECH_TEAMS
        try:
            from modules.services.serviceAi.mainServiceAi import AiService

            # Create minimal service context for AI billing
            serviceContext = _ServiceContext(self.currentUser, self.mandateId, self.instanceId)
            aiService = AiService(serviceCenter=serviceContext)
            await aiService.ensureAiObjectsInitialized()

            request = AiCallRequest(
                prompt=self.config.aiSystemPrompt,
                context=transcriptContext,
                options=AiCallOptions(
                    operationType=OperationTypeEnum.SPEECH_TEAMS,
                    priority=PriorityEnum.SPEED,
                )
            )

            response = await aiService.callAi(request)

            # Parse structured response
            try:
                speechResult = SpeechTeamsResponse.model_validate_json(response.content)
            except Exception:
                # Try to extract JSON from response content
                try:
                    jsonStr = response.content
                    if "```json" in jsonStr:
                        jsonStr = jsonStr.split("```json")[1].split("```")[0]
                    elif "```" in jsonStr:
                        jsonStr = jsonStr.split("```")[1].split("```")[0]
                    speechResult = SpeechTeamsResponse.model_validate_json(jsonStr.strip())
                except Exception as parseErr:
                    logger.warning(f"Failed to parse SPEECH_TEAMS response: {parseErr}")
                    speechResult = SpeechTeamsResponse(
                        shouldRespond=False,
                        reasoning=f"Parse error: {str(parseErr)[:100]}",
                        detectedIntent="none"
                    )

            logger.info(
                f"SPEECH_TEAMS result: shouldRespond={speechResult.shouldRespond}, "
                f"intent={speechResult.detectedIntent}, "
                f"reasoning={speechResult.reasoning[:80]}..."
            )

            # Emit analysis event (always, for debug/UI)
            await _emitSessionEvent(sessionId, "analysis", {
                "shouldRespond": speechResult.shouldRespond,
                "detectedIntent": speechResult.detectedIntent,
                "reasoning": speechResult.reasoning,
                "modelName": response.modelName,
                "processingTime": response.processingTime,
                "priceCHF": response.priceCHF,
            })

            # Step 4a: Handle STOP intent -- stop audio immediately
            if speechResult.detectedIntent == "stop":
                logger.info(f"Session {sessionId}: AI detected STOP intent: {speechResult.reasoning}")
                if websocket:
                    try:
                        await websocket.send_text(json.dumps({
                            "type": "stopAudio",
                            "sessionId": sessionId,
                        }))
                    except Exception as stopErr:
                        logger.warning(f"Failed to send stop command: {stopErr}")
                return

            # Step 4b: Respond if AI decided to
            if speechResult.shouldRespond and speechResult.responseText:

                if self.config.responseMode == TeamsbotResponseMode.MANUAL:
                    # In manual mode, suggest but don't send
                    await _emitSessionEvent(sessionId, "suggestedResponse", {
                        "responseText": speechResult.responseText,
                        "detectedIntent": speechResult.detectedIntent,
                        "reasoning": speechResult.reasoning,
                    })
                    return

                # Determine response channel (voice, chat, or both)
                # Extract the raw value: enum.value gives "voice", str(enum) gives "TeamsbotResponseChannel.voice"
                channelRaw = self.config.responseChannel
                channelStr = (channelRaw.value if hasattr(channelRaw, 'value') else str(channelRaw)).lower().strip()
                logger.info(f"Response channel: '{channelStr}' (raw={channelRaw!r})")

                sendVoice = channelStr in ("voice", "both")
                sendChat = channelStr in ("chat", "both")

                if sendVoice and sendChat:
                    responseType = TeamsbotResponseType.BOTH
                elif sendVoice:
                    responseType = TeamsbotResponseType.AUDIO
                else:
                    responseType = TeamsbotResponseType.CHAT

                # 4a: Voice response (TTS -> Audio to bot)
                if sendVoice:
                    try:
                        ttsResult = await voiceInterface.textToSpeech(
                            text=speechResult.responseText,
                            languageCode=self.config.language,
                            voiceName=self.config.voiceId
                        )

                        if ttsResult and isinstance(ttsResult, dict):
                            audioContent = ttsResult.get("audioContent")
                            if audioContent and websocket:
                                await websocket.send_text(json.dumps({
                                    "type": "playAudio",
                                    "sessionId": sessionId,
                                    "audio": {
                                        "data": base64.b64encode(audioContent if isinstance(audioContent, bytes) else audioContent.encode()).decode(),
                                        "format": "mp3",
                                    },
                                }))
                            elif audioContent and not websocket:
                                logger.info(f"TTS audio generated for session {sessionId} (HTTP mode - no WebSocket for playback)")
                    except Exception as ttsErr:
                        logger.warning(f"TTS failed for session {sessionId}: {ttsErr}")
                        if not sendChat:
                            sendChat = True  # Fallback to chat if voice-only and TTS failed

                # 4b: Chat response (send text message to meeting chat)
                if sendChat:
                    try:
                        if websocket:
                            await websocket.send_text(json.dumps({
                                "type": "sendChatMessage",
                                "sessionId": sessionId,
                                "text": speechResult.responseText,
                            }))
                            logger.info(f"Chat response sent for session {sessionId}")
                    except Exception as chatErr:
                        logger.warning(f"Chat message send failed for session {sessionId}: {chatErr}")

                # 4b: Store bot response
                botResponseData = TeamsbotBotResponse(
                    sessionId=sessionId,
                    responseText=speechResult.responseText,
                    responseType=responseType,
                    detectedIntent=speechResult.detectedIntent,
                    reasoning=speechResult.reasoning,
                    triggeredByTranscriptId=triggerTranscript.get("id"),
                    modelName=response.modelName,
                    processingTime=response.processingTime,
                    priceCHF=response.priceCHF,
                    timestamp=getIsoTimestamp(),
                ).model_dump()

                createdResponse = interface.createBotResponse(botResponseData)

                # 4c: Emit SSE event
                await _emitSessionEvent(sessionId, "botResponse", {
                    "id": createdResponse.get("id"),
                    "responseText": speechResult.responseText,
                    "responseType": responseType.value,
                    "detectedIntent": speechResult.detectedIntent,
                    "reasoning": speechResult.reasoning,
                    "modelName": response.modelName,
                    "processingTime": response.processingTime,
                    "priceCHF": response.priceCHF,
                })

                # Update session response count
                session = interface.getSession(sessionId)
                if session:
                    count = session.get("botResponseCount", 0) + 1
                    interface.updateSession(sessionId, {"botResponseCount": count})

                logger.info(f"Bot responded in session {sessionId}: intent={speechResult.detectedIntent}")

            # Step 5: Execute AI-issued commands (if any)
            if speechResult.commands:
                await self._executeCommands(sessionId, speechResult.commands, voiceInterface, websocket)

        except Exception as e:
            logger.error(f"SPEECH_TEAMS analysis failed for session {sessionId}: {type(e).__name__}: {e}", exc_info=True)
            await _emitSessionEvent(sessionId, "error", {"message": f"AI analysis failed: {type(e).__name__}: {str(e)}"})

    # =========================================================================
    # AI Command Execution
    # =========================================================================

    async def _executeCommands(
        self,
        sessionId: str,
        commands: List[TeamsbotCommand],
        voiceInterface,
        websocket: WebSocket,
    ):
        """Execute structured commands returned by the AI.

        Each command is sent to the browser bot via WebSocket as a
        'botCommand' message. The bot's TeamsActionsService handles
        the actual Teams UI interaction (checking state, toggling, etc.).
        """
        for cmd in commands:
            action = cmd.action
            params = cmd.params or {}
            logger.info(f"Session {sessionId}: Executing command '{action}' with params {params}")

            try:
                if action == "toggleTranscript":
                    enable = params.get("enable", True)
                    if websocket:
                        await websocket.send_text(json.dumps({
                            "type": "botCommand",
                            "sessionId": sessionId,
                            "command": "toggleTranscript",
                            "params": {"enable": enable},
                        }))

                elif action == "sendChat":
                    chatText = params.get("text", "")
                    if chatText and websocket:
                        await websocket.send_text(json.dumps({
                            "type": "sendChatMessage",
                            "sessionId": sessionId,
                            "text": chatText,
                        }))

                elif action == "readAloud":
                    readText = params.get("text", "")
                    if readText and voiceInterface:
                        ttsResult = await voiceInterface.textToSpeech(
                            text=readText,
                            languageCode=self.config.language,
                            voiceName=self.config.voiceId,
                        )
                        if ttsResult and isinstance(ttsResult, dict):
                            audioContent = ttsResult.get("audioContent")
                            if audioContent and websocket:
                                await websocket.send_text(json.dumps({
                                    "type": "playAudio",
                                    "sessionId": sessionId,
                                    "audio": {
                                        "data": base64.b64encode(
                                            audioContent if isinstance(audioContent, bytes) else audioContent.encode()
                                        ).decode(),
                                        "format": "mp3",
                                    },
                                }))

                elif action == "changeLanguage":
                    newLang = params.get("language", "")
                    if newLang:
                        self.config = self.config.model_copy(update={"language": newLang})
                        logger.info(f"Session {sessionId}: Language changed to '{newLang}'")
                        await _emitSessionEvent(sessionId, "languageChanged", {"language": newLang})

                elif action in ("toggleMic", "toggleCamera"):
                    if websocket:
                        await websocket.send_text(json.dumps({
                            "type": "botCommand",
                            "sessionId": sessionId,
                            "command": action,
                            "params": params,
                        }))

                else:
                    logger.warning(f"Session {sessionId}: Unknown command '{action}'")

            except Exception as cmdErr:
                logger.warning(f"Session {sessionId}: Command '{action}' failed: {cmdErr}")

    # =========================================================================
    # Context Summarization (for long sessions)
    # =========================================================================

    async def _summarizeSessionContext(self, sessionId: str, rawContext: str) -> str:
        """Summarize a long user-provided session context to its essential points.
        This reduces token usage in every subsequent AI call."""
        try:
            from modules.services.serviceAi.mainServiceAi import AiService
            from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum, PriorityEnum

            serviceContext = _ServiceContext(self.currentUser, self.mandateId, self.instanceId)
            aiService = AiService(serviceCenter=serviceContext)
            await aiService.ensureAiObjectsInitialized()

            request = AiCallRequest(
                prompt=(
                    "Fasse den folgenden Kontext auf die wesentlichen Punkte zusammen. "
                    "Behalte alle wichtigen Fakten, Namen, Zahlen, Entscheidungen und Aktionspunkte. "
                    "Entferne Fuelltext und Wiederholungen. "
                    "Antworte NUR mit der Zusammenfassung, keine Erklaerungen oder Einleitungen."
                ),
                context=rawContext,
                options=AiCallOptions(
                    operationType=OperationTypeEnum.DATA_ANALYSE,
                    priority=PriorityEnum.SPEED,
                )
            )

            response = await aiService.callAi(request)
            if response and response.errorCount == 0 and response.content:
                summary = response.content.strip()
                logger.info(f"Session {sessionId}: Context summarized from {len(rawContext)} to {len(summary)} chars")
                return summary
        except Exception as e:
            logger.warning(f"Session context summarization failed for {sessionId}: {e}")

        # Fallback: return original (truncated if very long)
        return rawContext[:2000] if len(rawContext) > 2000 else rawContext

    async def _summarizeContextBuffer(self, sessionId: str):
        """Summarize the older part of the context buffer to preserve information
        without exceeding the context window. This runs in the background."""
        try:
            if self._contextSummary:
                return  # Already summarized recently

            # Take the older half of the buffer for summarization
            halfPoint = len(self._contextBuffer) // 2
            oldSegments = self._contextBuffer[:halfPoint]

            if len(oldSegments) < 10:
                return  # Not enough to summarize

            # Build text to summarize
            lines = []
            for seg in oldSegments:
                speaker = seg.get("speaker", "Unknown")
                text = seg.get("text", "")
                lines.append(f"[{speaker}]: {text}")
            textToSummarize = "\n".join(lines)

            from modules.services.serviceAi.mainServiceAi import AiService
            from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum, PriorityEnum

            serviceContext = _ServiceContext(self.currentUser, self.mandateId, self.instanceId)
            aiService = AiService(serviceCenter=serviceContext)
            await aiService.ensureAiObjectsInitialized()

            request = AiCallRequest(
                prompt="Fasse das folgende Meeting-Transkript in 3-5 Saetzen zusammen. Nenne die wichtigsten Themen, Entscheidungen und offene Fragen. Antworte NUR mit der Zusammenfassung, keine Erklaerungen.",
                context=textToSummarize,
                options=AiCallOptions(
                    operationType=OperationTypeEnum.DATA_ANALYSE,
                    priority=PriorityEnum.SPEED,
                )
            )

            response = await aiService.callAi(request)
            if response and response.errorCount == 0:
                self._contextSummary = response.content.strip()
                logger.info(f"Session {sessionId}: Context summarized ({len(oldSegments)} segments -> {len(self._contextSummary)} chars)")

        except Exception as e:
            logger.warning(f"Context summarization failed for session {sessionId}: {e}")

    # =========================================================================
    # Meeting Summary
    # =========================================================================

    async def _generateMeetingSummary(self, sessionId: str):
        """Generate an AI summary of the meeting after it ends."""
        try:
            from . import interfaceFeatureTeamsbot as interfaceDb

            interface = interfaceDb.getInterface(self.currentUser, self.mandateId, self.instanceId)
            transcripts = interface.getTranscripts(sessionId)

            if not transcripts or len(transcripts) < 5:
                return  # Not enough content for a summary

            # Build full transcript
            fullTranscript = "\n".join(
                f"[{t.get('speaker', 'Unknown')}]: {t.get('text', '')}"
                for t in transcripts
            )

            from modules.services.serviceAi.mainServiceAi import AiService

            serviceContext = _ServiceContext(self.currentUser, self.mandateId, self.instanceId)
            aiService = AiService(serviceCenter=serviceContext)
            await aiService.ensureAiObjectsInitialized()

            request = AiCallRequest(
                prompt="Erstelle eine kurze Zusammenfassung dieses Meeting-Transkripts. Nenne die wichtigsten Punkte, Entscheidungen und offene Aktionspunkte.",
                context=fullTranscript,
                options=AiCallOptions(
                    operationType=OperationTypeEnum.DATA_ANALYSE,
                    priority=PriorityEnum.BALANCED,
                )
            )

            response = await aiService.callAi(request)
            if response and response.errorCount == 0:
                interface.updateSession(sessionId, {"summary": response.content})
                logger.info(f"Meeting summary generated for session {sessionId}")

        except Exception as e:
            logger.error(f"Failed to generate meeting summary for session {sessionId}: {e}")