fix: skip STT fallbacks for teamsbot, run audio processing in background
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
parent
77151df0f4
commit
ae4dc9fa48
3 changed files with 43 additions and 13 deletions
|
|
@@ -58,7 +58,8 @@ class ConnectorGoogleSpeech:
|
||||||
raise
|
raise
|
||||||
|
|
||||||
async def speechToText(self, audioContent: bytes, language: str = "de-DE",
|
async def speechToText(self, audioContent: bytes, language: str = "de-DE",
|
||||||
sampleRate: int = None, channels: int = None) -> Dict:
|
sampleRate: int = None, channels: int = None,
|
||||||
|
skipFallbacks: bool = False) -> Dict:
|
||||||
"""
|
"""
|
||||||
Convert speech to text using Google Cloud Speech-to-Text API.
|
Convert speech to text using Google Cloud Speech-to-Text API.
|
||||||
|
|
||||||
|
|
@@ -234,6 +235,15 @@ class ConnectorGoogleSpeech:
|
||||||
"error": f"Google Cloud error: {response.error}"
|
"error": f"Google Cloud error: {response.error}"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Skip fallbacks when format is known (e.g. teamsbot with explicit LINEAR16 16kHz)
|
||||||
|
if skipFallbacks:
|
||||||
|
return {
|
||||||
|
"success": False,
|
||||||
|
"text": "",
|
||||||
|
"confidence": 0.0,
|
||||||
|
"error": "No recognition results (silence or unclear audio)"
|
||||||
|
}
|
||||||
|
|
||||||
# Try multiple fallback approaches
|
# Try multiple fallback approaches
|
||||||
fallback_configs = []
|
fallback_configs = []
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@@ -198,12 +198,15 @@ class TeamsbotService:
|
||||||
|
|
||||||
audioBuffer = bytearray()
|
audioBuffer = bytearray()
|
||||||
bufferDurationMs = 0
|
bufferDurationMs = 0
|
||||||
targetBufferMs = 1500 # Buffer 1.5 seconds of audio before STT
|
targetBufferMs = 3000 # Buffer 3 seconds of audio before STT
|
||||||
|
|
||||||
# PCM16 at 16kHz mono = 32000 bytes/second
|
# PCM16 at 16kHz mono = 32000 bytes/second
|
||||||
bytesPerSecond = 32000
|
bytesPerSecond = 32000
|
||||||
bytesPerMs = bytesPerSecond / 1000
|
bytesPerMs = bytesPerSecond / 1000
|
||||||
|
|
||||||
|
# Track background STT/AI tasks so they don't block the WebSocket loop
|
||||||
|
backgroundTasks: list[asyncio.Task] = []
|
||||||
|
|
||||||
logger.info(f"Audio processing started for session {sessionId}")
|
logger.info(f"Audio processing started for session {sessionId}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
@@ -235,18 +238,26 @@ class TeamsbotService:
|
||||||
audioBuffer.extend(audioChunk)
|
audioBuffer.extend(audioChunk)
|
||||||
bufferDurationMs = len(audioBuffer) / bytesPerMs
|
bufferDurationMs = len(audioBuffer) / bytesPerMs
|
||||||
|
|
||||||
# Process when buffer has enough audio
|
# Process when buffer has enough audio - run in background to not block WebSocket
|
||||||
if bufferDurationMs >= targetBufferMs:
|
if bufferDurationMs >= targetBufferMs:
|
||||||
await self._processAudioBuffer(
|
chunkBytes = bytes(audioBuffer)
|
||||||
bytes(audioBuffer),
|
|
||||||
sessionId,
|
|
||||||
interface,
|
|
||||||
voiceInterface,
|
|
||||||
websocket,
|
|
||||||
)
|
|
||||||
audioBuffer.clear()
|
audioBuffer.clear()
|
||||||
bufferDurationMs = 0
|
bufferDurationMs = 0
|
||||||
|
|
||||||
|
task = asyncio.create_task(
|
||||||
|
self._processAudioBuffer(
|
||||||
|
chunkBytes,
|
||||||
|
sessionId,
|
||||||
|
interface,
|
||||||
|
voiceInterface,
|
||||||
|
websocket,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
backgroundTasks.append(task)
|
||||||
|
|
||||||
|
# Clean up completed tasks
|
||||||
|
backgroundTasks = [t for t in backgroundTasks if not t.done()]
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if "disconnect" not in str(e).lower():
|
if "disconnect" not in str(e).lower():
|
||||||
logger.error(f"Audio stream error for session {sessionId}: {e}")
|
logger.error(f"Audio stream error for session {sessionId}: {e}")
|
||||||
|
|
@@ -261,6 +272,10 @@ class TeamsbotService:
|
||||||
websocket,
|
websocket,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Wait for any remaining background tasks
|
||||||
|
if backgroundTasks:
|
||||||
|
await asyncio.gather(*backgroundTasks, return_exceptions=True)
|
||||||
|
|
||||||
logger.info(f"Audio processing ended for session {sessionId}")
|
logger.info(f"Audio processing ended for session {sessionId}")
|
||||||
|
|
||||||
async def _processAudioBuffer(
|
async def _processAudioBuffer(
|
||||||
|
|
@@ -274,12 +289,14 @@ class TeamsbotService:
|
||||||
"""Process a buffered audio chunk through the STT -> AI -> TTS pipeline."""
|
"""Process a buffered audio chunk through the STT -> AI -> TTS pipeline."""
|
||||||
|
|
||||||
# Step 1: STT -- convert audio to text
|
# Step 1: STT -- convert audio to text
|
||||||
|
# skipFallbacks=True because we know the exact format (LINEAR16, 16kHz, mono from Teams)
|
||||||
try:
|
try:
|
||||||
sttResult = await voiceInterface.speechToText(
|
sttResult = await voiceInterface.speechToText(
|
||||||
audioContent=audioBytes,
|
audioContent=audioBytes,
|
||||||
language=self.config.language,
|
language=self.config.language,
|
||||||
sampleRate=16000,
|
sampleRate=16000,
|
||||||
channels=1
|
channels=1,
|
||||||
|
skipFallbacks=True
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"STT failed for session {sessionId}: {e}")
|
logger.warning(f"STT failed for session {sessionId}: {e}")
|
||||||
|
|
|
||||||
|
|
@@ -66,7 +66,8 @@ class VoiceObjects:
|
||||||
# Speech-to-Text Operations
|
# Speech-to-Text Operations
|
||||||
|
|
||||||
async def speechToText(self, audioContent: bytes, language: str = "de-DE",
|
async def speechToText(self, audioContent: bytes, language: str = "de-DE",
|
||||||
sampleRate: int = None, channels: int = None) -> Dict[str, Any]:
|
sampleRate: int = None, channels: int = None,
|
||||||
|
skipFallbacks: bool = False) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Convert speech to text using Google Cloud Speech-to-Text API.
|
Convert speech to text using Google Cloud Speech-to-Text API.
|
||||||
|
|
||||||
|
|
@@ -75,6 +76,7 @@ class VoiceObjects:
|
||||||
language: Language code (e.g., 'de-DE', 'en-US')
|
language: Language code (e.g., 'de-DE', 'en-US')
|
||||||
sampleRate: Audio sample rate (auto-detected if None)
|
sampleRate: Audio sample rate (auto-detected if None)
|
||||||
channels: Number of audio channels (auto-detected if None)
|
channels: Number of audio channels (auto-detected if None)
|
||||||
|
skipFallbacks: If True, skip fallback attempts (use when audio format is known)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Dict containing transcribed text, confidence, and metadata
|
Dict containing transcribed text, confidence, and metadata
|
||||||
|
|
@@ -87,7 +89,8 @@ class VoiceObjects:
|
||||||
audioContent=audioContent,
|
audioContent=audioContent,
|
||||||
language=language,
|
language=language,
|
||||||
sampleRate=sampleRate,
|
sampleRate=sampleRate,
|
||||||
channels=channels
|
channels=channels,
|
||||||
|
skipFallbacks=skipFallbacks
|
||||||
)
|
)
|
||||||
|
|
||||||
if result["success"]:
|
if result["success"]:
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue