Teamsbot: STT phrase hints, multi-lang, follow-up window, known speakers collection
Made-with: Cursor
This commit is contained in:
parent
b9c3ad38fb
commit
1dd354794b
3 changed files with 52 additions and 11 deletions
|
|
@ -9,7 +9,7 @@ import json
|
||||||
import html
|
import html
|
||||||
import asyncio
|
import asyncio
|
||||||
import logging
|
import logging
|
||||||
from typing import Dict, Optional, Any
|
from typing import Dict, Optional, Any, List
|
||||||
from google.cloud import speech
|
from google.cloud import speech
|
||||||
from google.cloud import translate_v2 as translate
|
from google.cloud import translate_v2 as translate
|
||||||
from google.cloud import texttospeech
|
from google.cloud import texttospeech
|
||||||
|
|
@ -60,7 +60,9 @@ class ConnectorGoogleSpeech:
|
||||||
|
|
||||||
async def speechToText(self, audioContent: bytes, language: str = "de-DE",
|
async def speechToText(self, audioContent: bytes, language: str = "de-DE",
|
||||||
sampleRate: int = None, channels: int = None,
|
sampleRate: int = None, channels: int = None,
|
||||||
skipFallbacks: bool = False) -> Dict:
|
skipFallbacks: bool = False,
|
||||||
|
phraseHints: Optional[list] = None,
|
||||||
|
alternativeLanguages: Optional[list] = None) -> Dict:
|
||||||
"""
|
"""
|
||||||
Convert speech to text using Google Cloud Speech-to-Text API.
|
Convert speech to text using Google Cloud Speech-to-Text API.
|
||||||
|
|
||||||
|
|
@ -149,12 +151,21 @@ class ConnectorGoogleSpeech:
|
||||||
"audio_channel_count": channels,
|
"audio_channel_count": channels,
|
||||||
"language_code": language,
|
"language_code": language,
|
||||||
"enable_automatic_punctuation": True,
|
"enable_automatic_punctuation": True,
|
||||||
"model": "latest_long", # Try latest_long model for better recognition
|
"model": "latest_long",
|
||||||
"enable_word_time_offsets": True, # Enable word-level timing
|
"enable_word_time_offsets": True,
|
||||||
"enable_word_confidence": True, # Enable word-level confidence
|
"enable_word_confidence": True,
|
||||||
"max_alternatives": 3, # Try more alternatives
|
"max_alternatives": 3,
|
||||||
"use_enhanced": True # Use enhanced model if available
|
"use_enhanced": True,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if phraseHints:
|
||||||
|
configParams["speech_contexts"] = [speech.SpeechContext(
|
||||||
|
phrases=phraseHints,
|
||||||
|
boost=15.0,
|
||||||
|
)]
|
||||||
|
|
||||||
|
if alternativeLanguages:
|
||||||
|
configParams["alternative_language_codes"] = alternativeLanguages
|
||||||
|
|
||||||
# Only add sample_rate_hertz if needed (not for WEBM_OPUS)
|
# Only add sample_rate_hertz if needed (not for WEBM_OPUS)
|
||||||
if useSampleRate:
|
if useSampleRate:
|
||||||
|
|
|
||||||
|
|
@ -94,9 +94,11 @@ class TeamsbotService:
|
||||||
# Speaker attribution: simple last-caption-speaker model
|
# Speaker attribution: simple last-caption-speaker model
|
||||||
self._lastCaptionSpeaker: Optional[str] = None
|
self._lastCaptionSpeaker: Optional[str] = None
|
||||||
self._unattributedTranscriptIds: List[str] = []
|
self._unattributedTranscriptIds: List[str] = []
|
||||||
|
self._knownSpeakers: set = set()
|
||||||
|
|
||||||
# Debounced name trigger: wait for speaker to finish before AI analysis
|
# Debounced name trigger: wait for speaker to finish before AI analysis
|
||||||
self._pendingNameTrigger: Optional[Dict[str, Any]] = None
|
self._pendingNameTrigger: Optional[Dict[str, Any]] = None
|
||||||
|
self._followUpWindowEnd: float = 0.0
|
||||||
|
|
||||||
# =========================================================================
|
# =========================================================================
|
||||||
# Session Lifecycle
|
# Session Lifecycle
|
||||||
|
|
@ -469,12 +471,18 @@ class TeamsbotService:
|
||||||
# Treat sampleRate=0 as unknown (triggers auto-detection)
|
# Treat sampleRate=0 as unknown (triggers auto-detection)
|
||||||
effectiveSampleRate = sampleRate if sampleRate and sampleRate > 0 else None
|
effectiveSampleRate = sampleRate if sampleRate and sampleRate > 0 else None
|
||||||
|
|
||||||
|
phraseHints = list(self._knownSpeakers)
|
||||||
|
if self.config.botName:
|
||||||
|
phraseHints.append(self.config.botName)
|
||||||
|
|
||||||
sttResult = await voiceInterface.speechToText(
|
sttResult = await voiceInterface.speechToText(
|
||||||
audioContent=audioBytes,
|
audioContent=audioBytes,
|
||||||
language=self.config.language or "de-DE",
|
language=self.config.language or "de-DE",
|
||||||
sampleRate=effectiveSampleRate,
|
sampleRate=effectiveSampleRate,
|
||||||
channels=1,
|
channels=1,
|
||||||
skipFallbacks=True,
|
skipFallbacks=True,
|
||||||
|
phraseHints=phraseHints if phraseHints else None,
|
||||||
|
alternativeLanguages=["en-US"],
|
||||||
)
|
)
|
||||||
|
|
||||||
if sttResult and sttResult.get("success") and sttResult.get("text"):
|
if sttResult and sttResult.get("success") and sttResult.get("text"):
|
||||||
|
|
@ -512,6 +520,7 @@ class TeamsbotService:
|
||||||
|
|
||||||
prevSpeaker = self._lastCaptionSpeaker
|
prevSpeaker = self._lastCaptionSpeaker
|
||||||
self._lastCaptionSpeaker = normalizedSpeaker
|
self._lastCaptionSpeaker = normalizedSpeaker
|
||||||
|
self._knownSpeakers.add(normalizedSpeaker)
|
||||||
|
|
||||||
if prevSpeaker is None and self._unattributedTranscriptIds:
|
if prevSpeaker is None and self._unattributedTranscriptIds:
|
||||||
from . import interfaceFeatureTeamsbot as interfaceDb
|
from . import interfaceFeatureTeamsbot as interfaceDb
|
||||||
|
|
@ -719,6 +728,20 @@ class TeamsbotService:
|
||||||
asyncio.create_task(self._checkPendingNameTrigger())
|
asyncio.create_task(self._checkPendingNameTrigger())
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# Follow-up window: after a bot response, trigger AI for any human speech
|
||||||
|
# without requiring the bot name — the AI decides via shouldRespond
|
||||||
|
if (
|
||||||
|
source == "audioCapture"
|
||||||
|
and not self._isBotSpeaker(speaker)
|
||||||
|
and time.time() < self._followUpWindowEnd
|
||||||
|
and not self._pendingNameTrigger
|
||||||
|
):
|
||||||
|
isNew = self._setPendingNameTrigger(sessionId, interface, voiceInterface, websocket, createdTranscript)
|
||||||
|
if isNew:
|
||||||
|
logger.info(f"Session {sessionId}: Follow-up window trigger (no name needed)")
|
||||||
|
asyncio.create_task(self._checkPendingNameTrigger())
|
||||||
|
return
|
||||||
|
|
||||||
# Periodic trigger (only when no debounce pending)
|
# Periodic trigger (only when no debounce pending)
|
||||||
if not self._pendingNameTrigger:
|
if not self._pendingNameTrigger:
|
||||||
shouldTrigger = self._shouldTriggerAnalysis(text)
|
shouldTrigger = self._shouldTriggerAnalysis(text)
|
||||||
|
|
@ -1149,7 +1172,8 @@ class TeamsbotService:
|
||||||
self._lastTranscriptText = speechResult.responseText
|
self._lastTranscriptText = speechResult.responseText
|
||||||
self._lastTranscriptId = botTranscript.get("id")
|
self._lastTranscriptId = botTranscript.get("id")
|
||||||
|
|
||||||
logger.info(f"Bot responded in session {sessionId}: intent={speechResult.detectedIntent}")
|
self._followUpWindowEnd = time.time() + 15.0
|
||||||
|
logger.info(f"Bot responded in session {sessionId}: intent={speechResult.detectedIntent}, follow-up window open for 15s")
|
||||||
|
|
||||||
# Step 5: Execute AI-issued commands (if any)
|
# Step 5: Execute AI-issued commands (if any)
|
||||||
if speechResult.commands:
|
if speechResult.commands:
|
||||||
|
|
|
||||||
|
|
@ -67,7 +67,9 @@ class VoiceObjects:
|
||||||
|
|
||||||
async def speechToText(self, audioContent: bytes, language: str = "de-DE",
|
async def speechToText(self, audioContent: bytes, language: str = "de-DE",
|
||||||
sampleRate: int = None, channels: int = None,
|
sampleRate: int = None, channels: int = None,
|
||||||
skipFallbacks: bool = False) -> Dict[str, Any]:
|
skipFallbacks: bool = False,
|
||||||
|
phraseHints: list = None,
|
||||||
|
alternativeLanguages: list = None) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Convert speech to text using Google Cloud Speech-to-Text API.
|
Convert speech to text using Google Cloud Speech-to-Text API.
|
||||||
|
|
||||||
|
|
@ -77,12 +79,14 @@ class VoiceObjects:
|
||||||
sampleRate: Audio sample rate (auto-detected if None)
|
sampleRate: Audio sample rate (auto-detected if None)
|
||||||
channels: Number of audio channels (auto-detected if None)
|
channels: Number of audio channels (auto-detected if None)
|
||||||
skipFallbacks: If True, skip fallback attempts (use when audio format is known)
|
skipFallbacks: If True, skip fallback attempts (use when audio format is known)
|
||||||
|
phraseHints: Optional list of phrases to boost recognition (names, terms)
|
||||||
|
alternativeLanguages: Optional list of additional language codes for multi-language recognition
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Dict containing transcribed text, confidence, and metadata
|
Dict containing transcribed text, confidence, and metadata
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
logger.info(f"🎤 Speech-to-text request: {len(audioContent)} bytes, language: {language}")
|
logger.info(f"Speech-to-text request: {len(audioContent)} bytes, language: {language}")
|
||||||
|
|
||||||
connector = self._getGoogleSpeechConnector()
|
connector = self._getGoogleSpeechConnector()
|
||||||
result = await connector.speechToText(
|
result = await connector.speechToText(
|
||||||
|
|
@ -90,7 +94,9 @@ class VoiceObjects:
|
||||||
language=language,
|
language=language,
|
||||||
sampleRate=sampleRate,
|
sampleRate=sampleRate,
|
||||||
channels=channels,
|
channels=channels,
|
||||||
skipFallbacks=skipFallbacks
|
skipFallbacks=skipFallbacks,
|
||||||
|
phraseHints=phraseHints,
|
||||||
|
alternativeLanguages=alternativeLanguages,
|
||||||
)
|
)
|
||||||
|
|
||||||
if result["success"]:
|
if result["success"]:
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue