gateway/modules/routes/routeVoiceWebSocket.py
2025-09-13 01:57:53 +02:00

494 lines
21 KiB
Python

"""
Azure Voice Services WebSocket Routes
Provides real-time WebSocket endpoints for:
- Live microphone audio streaming
- Real-time speech-to-text
- Real-time translation
- Real-time text-to-speech
"""
import logging
import asyncio
import json
from typing import Dict, Any, Optional, List
from fastapi import APIRouter, WebSocket, WebSocketDisconnect, Depends, Query
from fastapi.websockets import WebSocketState
import base64
import io
from datetime import datetime
from modules.interfaces.interfaceAppObjects import getRootInterface
from modules.interfaces.interfaceAppModel import UserInDB
from modules.security.auth import getCurrentUser
from modules.connectors.connectorAzureSpeech import ConnectorAzureSpeech
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/voice/ws", tags=["voice-websocket"])
class ConnectionManager:
"""Manages WebSocket connections for real-time voice services."""
def __init__(self):
self.active_connections: Dict[str, WebSocket] = {}
self.user_connections: Dict[str, str] = {} # user_id -> connection_id
self.connection_metadata: Dict[str, Dict] = {}
async def connect(self, websocket: WebSocket, connection_id: str, user_id: str, metadata: Dict = None):
"""Accept a WebSocket connection."""
await websocket.accept()
self.active_connections[connection_id] = websocket
self.user_connections[user_id] = connection_id
self.connection_metadata[connection_id] = metadata or {}
logger.info(f"WebSocket connected: {connection_id} for user {user_id}")
def disconnect(self, connection_id: str):
"""Remove a WebSocket connection."""
if connection_id in self.active_connections:
del self.active_connections[connection_id]
# Remove from user mapping
user_id = None
for uid, cid in self.user_connections.items():
if cid == connection_id:
user_id = uid
break
if user_id:
del self.user_connections[user_id]
if connection_id in self.connection_metadata:
del self.connection_metadata[connection_id]
logger.info(f"WebSocket disconnected: {connection_id}")
async def send_message(self, connection_id: str, message: Dict):
"""Send a message to a specific connection."""
if connection_id in self.active_connections:
websocket = self.active_connections[connection_id]
try:
if websocket.client_state == WebSocketState.CONNECTED:
await websocket.send_text(json.dumps(message))
else:
self.disconnect(connection_id)
except Exception as e:
logger.error(f"Error sending message to {connection_id}: {str(e)}")
self.disconnect(connection_id)
async def send_error(self, connection_id: str, error: str, error_code: str = "GENERAL_ERROR"):
"""Send an error message to a connection."""
await self.send_message(connection_id, {
"type": "error",
"error": error,
"error_code": error_code,
"timestamp": datetime.now().isoformat()
})
def get_connection_metadata(self, connection_id: str) -> Dict:
"""Get metadata for a connection."""
return self.connection_metadata.get(connection_id, {})
# Global connection manager
manager = ConnectionManager()
async def get_azure_speech_connector_ws(user_id: str) -> Optional[ConnectorAzureSpeech]:
"""Get Azure Speech connector for WebSocket user."""
try:
root_interface = getRootInterface()
# Get user by ID
user = root_interface.getUser(user_id)
if not user:
logger.error(f"User with ID {user_id} not found")
return None
# Get user connections
user_connections = root_interface.getUserConnections(user_id)
azure_connection = None
# Find first Azure connection
for connection in user_connections:
if connection.authority == "msft":
azure_connection = connection
break
if not azure_connection:
logger.error(f"No Azure connection found for user {user_id}")
return None
# Get connection token
connection_token = root_interface.getConnectionToken(azure_connection.id)
if not connection_token:
logger.error(f"No connection token found for user {user_id}")
return None
# Use Azure Speech Services subscription key from configuration
from modules.shared.configuration import APP_CONFIG
subscription_key = APP_CONFIG.get("Connector_AzureSpeech_SUBSCRIPTION_KEY")
if not subscription_key or subscription_key == "your-azure-speech-subscription-key-here":
logger.error("Azure Speech Services subscription key not configured")
return None
# Get region from configuration
region = APP_CONFIG.get("Connector_AzureSpeech_REGION", "westeurope")
# Create Azure Speech connector
connector = ConnectorAzureSpeech(subscription_key=subscription_key, region=region)
return connector
except Exception as e:
logger.error(f"Error getting Azure Speech connector for WebSocket: {str(e)}")
return None
@router.websocket("/realtime-interpreter")
async def websocket_realtime_interpreter(
websocket: WebSocket,
user_id: str = Query(...),
from_language: str = Query("de-DE"),
to_language: str = Query("en-US"),
audio_format: str = Query("wav")
):
"""WebSocket endpoint for real-time interpreter with live audio streaming."""
connection_id = f"interpreter_{user_id}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
try:
# Connect WebSocket
await manager.connect(websocket, connection_id, user_id, {
"service": "realtime_interpreter",
"from_language": from_language,
"to_language": to_language,
"audio_format": audio_format
})
# Get Azure Speech connector
connector = await get_azure_speech_connector_ws(user_id)
if not connector:
await manager.send_error(connection_id, "Azure Speech Services not configured", "NO_CONNECTOR")
return
# Send connection confirmation
await manager.send_message(connection_id, {
"type": "connected",
"connection_id": connection_id,
"service": "realtime_interpreter",
"from_language": from_language,
"to_language": to_language,
"timestamp": datetime.now().isoformat()
})
# Process incoming audio chunks
audio_buffer = io.BytesIO()
chunk_count = 0
while True:
try:
# Receive message from client
data = await websocket.receive_text()
message = json.loads(data)
if message.get("type") == "audio_chunk":
# Decode base64 audio data
audio_data = base64.b64decode(message.get("data", ""))
audio_buffer.write(audio_data)
chunk_count += 1
# Process every 10 chunks or when buffer is large enough
if chunk_count % 10 == 0 or len(audio_data) > 8192:
audio_buffer.seek(0)
audio_content = audio_buffer.read()
if len(audio_content) > 0:
try:
# Convert speech to text
stt_result = await connector.speech_to_text(
audio_content=audio_content,
language=from_language,
format="detailed",
audio_format=audio_format
)
original_text = stt_result.get("text", "")
# Translate if different languages
translated_text = original_text
if from_language != to_language and original_text.strip():
try:
translated_text = await connector.translate_text(
text=original_text,
from_language=from_language,
to_language=to_language
)
except Exception as e:
logger.warning(f"Translation failed: {str(e)}")
translated_text = original_text
# Send result back to client
await manager.send_message(connection_id, {
"type": "translation_result",
"original_text": original_text,
"translated_text": translated_text,
"from_language": from_language,
"to_language": to_language,
"confidence": stt_result.get("confidence", 0.0),
"timestamp": datetime.now().isoformat()
})
except Exception as e:
logger.error(f"Error processing audio chunk: {str(e)}")
await manager.send_error(connection_id, f"Audio processing failed: {str(e)}", "PROCESSING_ERROR")
# Reset buffer
audio_buffer = io.BytesIO()
chunk_count = 0
elif message.get("type") == "ping":
# Respond to ping with pong
await manager.send_message(connection_id, {
"type": "pong",
"timestamp": datetime.now().isoformat()
})
elif message.get("type") == "disconnect":
# Client requested disconnect
break
except WebSocketDisconnect:
break
except json.JSONDecodeError:
await manager.send_error(connection_id, "Invalid JSON message", "INVALID_JSON")
except Exception as e:
logger.error(f"Error processing WebSocket message: {str(e)}")
await manager.send_error(connection_id, f"Message processing failed: {str(e)}", "MESSAGE_ERROR")
except WebSocketDisconnect:
pass
except Exception as e:
logger.error(f"WebSocket error: {str(e)}")
try:
await manager.send_error(connection_id, f"Connection error: {str(e)}", "CONNECTION_ERROR")
except:
pass
finally:
manager.disconnect(connection_id)
@router.websocket("/speech-to-text")
async def websocket_speech_to_text(
websocket: WebSocket,
user_id: str = Query(...),
language: str = Query("de-DE"),
audio_format: str = Query("wav")
):
"""WebSocket endpoint for real-time speech-to-text with live audio streaming."""
connection_id = f"stt_{user_id}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
try:
# Connect WebSocket
await manager.connect(websocket, connection_id, user_id, {
"service": "speech_to_text",
"language": language,
"audio_format": audio_format
})
# Get Azure Speech connector
connector = await get_azure_speech_connector_ws(user_id)
if not connector:
await manager.send_error(connection_id, "Azure Speech Services not configured", "NO_CONNECTOR")
return
# Send connection confirmation
await manager.send_message(connection_id, {
"type": "connected",
"connection_id": connection_id,
"service": "speech_to_text",
"language": language,
"timestamp": datetime.now().isoformat()
})
# Process incoming audio chunks
audio_buffer = io.BytesIO()
chunk_count = 0
while True:
try:
# Receive message from client
data = await websocket.receive_text()
message = json.loads(data)
if message.get("type") == "audio_chunk":
# Decode base64 audio data
audio_data = base64.b64decode(message.get("data", ""))
audio_buffer.write(audio_data)
chunk_count += 1
# Process every 10 chunks or when buffer is large enough
if chunk_count % 10 == 0 or len(audio_data) > 8192:
audio_buffer.seek(0)
audio_content = audio_buffer.read()
if len(audio_content) > 0:
try:
# Convert speech to text
stt_result = await connector.speech_to_text(
audio_content=audio_content,
language=language,
format="detailed",
audio_format=audio_format
)
# Send result back to client
await manager.send_message(connection_id, {
"type": "transcription_result",
"text": stt_result.get("text", ""),
"confidence": stt_result.get("confidence", 0.0),
"language": stt_result.get("language", language),
"is_final": stt_result.get("RecognitionStatus") == "Success",
"timestamp": datetime.now().isoformat()
})
except Exception as e:
logger.error(f"Error processing audio chunk: {str(e)}")
await manager.send_error(connection_id, f"Audio processing failed: {str(e)}", "PROCESSING_ERROR")
# Reset buffer
audio_buffer = io.BytesIO()
chunk_count = 0
elif message.get("type") == "ping":
# Respond to ping with pong
await manager.send_message(connection_id, {
"type": "pong",
"timestamp": datetime.now().isoformat()
})
elif message.get("type") == "disconnect":
# Client requested disconnect
break
except WebSocketDisconnect:
break
except json.JSONDecodeError:
await manager.send_error(connection_id, "Invalid JSON message", "INVALID_JSON")
except Exception as e:
logger.error(f"Error processing WebSocket message: {str(e)}")
await manager.send_error(connection_id, f"Message processing failed: {str(e)}", "MESSAGE_ERROR")
except WebSocketDisconnect:
pass
except Exception as e:
logger.error(f"WebSocket error: {str(e)}")
try:
await manager.send_error(connection_id, f"Connection error: {str(e)}", "CONNECTION_ERROR")
except:
pass
finally:
manager.disconnect(connection_id)
@router.websocket("/text-to-speech")
async def websocket_text_to_speech(
websocket: WebSocket,
user_id: str = Query(...),
language: str = Query("de-DE"),
voice: str = Query("de-DE-KatjaNeural"),
audio_format: str = Query("audio-16khz-128kbitrate-mono-mp3")
):
"""WebSocket endpoint for real-time text-to-speech streaming."""
connection_id = f"tts_{user_id}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
try:
# Connect WebSocket
await manager.connect(websocket, connection_id, user_id, {
"service": "text_to_speech",
"language": language,
"voice": voice,
"audio_format": audio_format
})
# Get Azure Speech connector
connector = await get_azure_speech_connector_ws(user_id)
if not connector:
await manager.send_error(connection_id, "Azure Speech Services not configured", "NO_CONNECTOR")
return
# Send connection confirmation
await manager.send_message(connection_id, {
"type": "connected",
"connection_id": connection_id,
"service": "text_to_speech",
"language": language,
"voice": voice,
"timestamp": datetime.now().isoformat()
})
while True:
try:
# Receive message from client
data = await websocket.receive_text()
message = json.loads(data)
if message.get("type") == "text_to_speak":
text = message.get("text", "")
if text.strip():
try:
# Convert text to speech
audio_data = await connector.text_to_speech(
text=text,
language=language,
voice=voice,
format=audio_format
)
# Send audio data back to client
audio_base64 = base64.b64encode(audio_data).decode('utf-8')
await manager.send_message(connection_id, {
"type": "audio_data",
"audio_data": audio_base64,
"format": audio_format,
"voice": voice,
"text": text,
"timestamp": datetime.now().isoformat()
})
except Exception as e:
logger.error(f"Error converting text to speech: {str(e)}")
await manager.send_error(connection_id, f"TTS failed: {str(e)}", "TTS_ERROR")
elif message.get("type") == "ping":
# Respond to ping with pong
await manager.send_message(connection_id, {
"type": "pong",
"timestamp": datetime.now().isoformat()
})
elif message.get("type") == "disconnect":
# Client requested disconnect
break
except WebSocketDisconnect:
break
except json.JSONDecodeError:
await manager.send_error(connection_id, "Invalid JSON message", "INVALID_JSON")
except Exception as e:
logger.error(f"Error processing WebSocket message: {str(e)}")
await manager.send_error(connection_id, f"Message processing failed: {str(e)}", "MESSAGE_ERROR")
except WebSocketDisconnect:
pass
except Exception as e:
logger.error(f"WebSocket error: {str(e)}")
try:
await manager.send_error(connection_id, f"Connection error: {str(e)}", "CONNECTION_ERROR")
except:
pass
finally:
manager.disconnect(connection_id)
@router.get("/status")
async def websocket_status():
"""Get WebSocket connection status."""
return {
"active_connections": len(manager.active_connections),
"connected_users": len(manager.user_connections),
"services": {
"realtime_interpreter": len([c for c in manager.connection_metadata.values() if c.get("service") == "realtime_interpreter"]),
"speech_to_text": len([c for c in manager.connection_metadata.values() if c.get("service") == "speech_to_text"]),
"text_to_speech": len([c for c in manager.connection_metadata.values() if c.get("service") == "text_to_speech"])
}
}