diff --git a/modules/connectors/connectorVoiceGoogle.py b/modules/connectors/connectorVoiceGoogle.py index c32b7fa4..85a19bf7 100644 --- a/modules/connectors/connectorVoiceGoogle.py +++ b/modules/connectors/connectorVoiceGoogle.py @@ -58,7 +58,8 @@ class ConnectorGoogleSpeech: raise async def speechToText(self, audioContent: bytes, language: str = "de-DE", - sampleRate: int = None, channels: int = None) -> Dict: + sampleRate: int = None, channels: int = None, + skipFallbacks: bool = False) -> Dict: """ Convert speech to text using Google Cloud Speech-to-Text API. @@ -234,6 +235,15 @@ class ConnectorGoogleSpeech: "error": f"Google Cloud error: {response.error}" } + # Skip fallbacks when format is known (e.g. teamsbot with explicit LINEAR16 16kHz) + if skipFallbacks: + return { + "success": False, + "text": "", + "confidence": 0.0, + "error": "No recognition results (silence or unclear audio)" + } + # Try multiple fallback approaches fallback_configs = [] diff --git a/modules/features/teamsbot/service.py b/modules/features/teamsbot/service.py index a7a266ec..ea10fddd 100644 --- a/modules/features/teamsbot/service.py +++ b/modules/features/teamsbot/service.py @@ -198,12 +198,15 @@ class TeamsbotService: audioBuffer = bytearray() bufferDurationMs = 0 - targetBufferMs = 1500 # Buffer 1.5 seconds of audio before STT + targetBufferMs = 3000 # Buffer 3 seconds of audio before STT # PCM16 at 16kHz mono = 32000 bytes/second bytesPerSecond = 32000 bytesPerMs = bytesPerSecond / 1000 + # Track background STT/AI tasks so they don't block the WebSocket loop + backgroundTasks: list[asyncio.Task] = [] + logger.info(f"Audio processing started for session {sessionId}") try: @@ -235,18 +238,26 @@ class TeamsbotService: audioBuffer.extend(audioChunk) bufferDurationMs = len(audioBuffer) / bytesPerMs - # Process when buffer has enough audio + # Process when buffer has enough audio - run in background to not block WebSocket if bufferDurationMs >= targetBufferMs: - await self._processAudioBuffer( - bytes(audioBuffer), - sessionId, - interface, - voiceInterface, - websocket, - ) + chunkBytes = bytes(audioBuffer) audioBuffer.clear() bufferDurationMs = 0 + task = asyncio.create_task( + self._processAudioBuffer( + chunkBytes, + sessionId, + interface, + voiceInterface, + websocket, + ) + ) + backgroundTasks.append(task) + + # Clean up completed tasks + backgroundTasks = [t for t in backgroundTasks if not t.done()] + except Exception as e: if "disconnect" not in str(e).lower(): logger.error(f"Audio stream error for session {sessionId}: {e}") @@ -261,6 +272,10 @@ class TeamsbotService: websocket, ) + # Wait for any remaining background tasks + if backgroundTasks: + await asyncio.gather(*backgroundTasks, return_exceptions=True) + logger.info(f"Audio processing ended for session {sessionId}") async def _processAudioBuffer( @@ -274,12 +289,14 @@ class TeamsbotService: """Process a buffered audio chunk through the STT -> AI -> TTS pipeline.""" # Step 1: STT -- convert audio to text + # skipFallbacks=True because we know the exact format (LINEAR16, 16kHz, mono from Teams) try: sttResult = await voiceInterface.speechToText( audioContent=audioBytes, language=self.config.language, sampleRate=16000, - channels=1 + channels=1, + skipFallbacks=True ) except Exception as e: logger.warning(f"STT failed for session {sessionId}: {e}") diff --git a/modules/interfaces/interfaceVoiceObjects.py b/modules/interfaces/interfaceVoiceObjects.py index 0c28f81d..cccebce4 100644 --- a/modules/interfaces/interfaceVoiceObjects.py +++ b/modules/interfaces/interfaceVoiceObjects.py @@ -66,7 +66,8 @@ class VoiceObjects: # Speech-to-Text Operations async def speechToText(self, audioContent: bytes, language: str = "de-DE", - sampleRate: int = None, channels: int = None) -> Dict[str, Any]: + sampleRate: int = None, channels: int = None, + skipFallbacks: bool = False) -> Dict[str, Any]: """ Convert speech to text using Google Cloud Speech-to-Text API. @@ -75,6 +76,7 @@ class VoiceObjects: language: Language code (e.g., 'de-DE', 'en-US') sampleRate: Audio sample rate (auto-detected if None) channels: Number of audio channels (auto-detected if None) + skipFallbacks: If True, skip fallback attempts (use when audio format is known) Returns: Dict containing transcribed text, confidence, and metadata @@ -87,7 +89,8 @@ class VoiceObjects: audioContent=audioContent, language=language, sampleRate=sampleRate, - channels=channels + channels=channels, + skipFallbacks=skipFallbacks ) if result["success"]: