"""
Azure Voice Services WebSocket Routes

Provides real-time WebSocket endpoints for:
- Live microphone audio streaming
- Real-time speech-to-text
- Real-time translation
- Real-time text-to-speech
"""
|
|
|
|
import logging
|
|
import asyncio
|
|
import json
|
|
from typing import Dict, Any, Optional, List
|
|
from fastapi import APIRouter, WebSocket, WebSocketDisconnect, Depends, Query
|
|
from fastapi.websockets import WebSocketState
|
|
import base64
|
|
import io
|
|
from datetime import datetime
|
|
|
|
from modules.interfaces.interfaceAppObjects import getRootInterface
|
|
from modules.interfaces.interfaceAppModel import UserInDB
|
|
from modules.security.auth import getCurrentUser
|
|
from modules.connectors.connectorAzureSpeech import ConnectorAzureSpeech
|
|
|
|
# Module-level logger for all voice WebSocket routes.
logger = logging.getLogger(__name__)

# All endpoints below are mounted under /api/voice/ws.
router = APIRouter(prefix="/api/voice/ws", tags=["voice-websocket"])
|
|
|
|
class ConnectionManager:
    """Manages WebSocket connections for real-time voice services.

    Keeps three parallel maps:
      - active_connections: connection_id -> WebSocket
      - user_connections:   user_id -> connection_id (one connection per user;
        a newer connection from the same user overwrites the mapping)
      - connection_metadata: connection_id -> per-connection info dict
    """

    def __init__(self):
        self.active_connections: Dict[str, WebSocket] = {}
        self.user_connections: Dict[str, str] = {}  # user_id -> connection_id
        self.connection_metadata: Dict[str, Dict] = {}

    async def connect(self, websocket: WebSocket, connection_id: str, user_id: str, metadata: Dict = None):
        """Accept the WebSocket handshake and register the connection.

        Args:
            websocket: the incoming FastAPI WebSocket.
            connection_id: unique identifier for this connection.
            user_id: id of the user that owns the connection.
            metadata: optional per-connection info (service name, languages, ...).
        """
        await websocket.accept()
        self.active_connections[connection_id] = websocket
        self.user_connections[user_id] = connection_id
        self.connection_metadata[connection_id] = metadata or {}
        logger.info(f"WebSocket connected: {connection_id} for user {user_id}")

    def disconnect(self, connection_id: str):
        """Remove a WebSocket connection and all bookkeeping for it.

        Safe to call more than once for the same id (subsequent calls no-op).
        """
        if connection_id in self.active_connections:
            del self.active_connections[connection_id]
            # Reverse-lookup the owning user and drop the user -> connection mapping.
            user_id = next(
                (uid for uid, cid in self.user_connections.items() if cid == connection_id),
                None,
            )
            if user_id:
                del self.user_connections[user_id]
            self.connection_metadata.pop(connection_id, None)
            logger.info(f"WebSocket disconnected: {connection_id}")

    async def send_message(self, connection_id: str, message: Dict):
        """JSON-encode and send a message to one connection.

        Connections that are no longer CONNECTED, or that raise while sending,
        are cleaned up via disconnect().
        """
        websocket = self.active_connections.get(connection_id)
        if websocket is None:
            return
        try:
            if websocket.client_state == WebSocketState.CONNECTED:
                await websocket.send_text(json.dumps(message))
            else:
                self.disconnect(connection_id)
        except Exception as e:
            logger.error(f"Error sending message to {connection_id}: {str(e)}")
            self.disconnect(connection_id)

    async def send_error(self, connection_id: str, error: str, error_code: str = "GENERAL_ERROR"):
        """Send a structured error payload to a connection."""
        await self.send_message(connection_id, {
            "type": "error",
            "error": error,
            "error_code": error_code,
            "timestamp": datetime.now().isoformat()
        })

    def get_connection_metadata(self, connection_id: str) -> Dict:
        """Return the metadata registered for a connection ({} if unknown)."""
        return self.connection_metadata.get(connection_id, {})
|
|
|
|
# Global connection manager shared by every WebSocket endpoint in this module.
manager = ConnectionManager()
|
|
|
|
async def get_azure_speech_connector_ws(user_id: str) -> Optional[ConnectorAzureSpeech]:
    """Build an Azure Speech connector for a WebSocket user, or None on failure.

    Requirements for success: the user exists, has at least one Microsoft
    ("msft") connection with a stored token, and the Azure Speech subscription
    key is configured. Failures are logged and reported as None.
    """
    try:
        root_interface = getRootInterface()

        # The user must exist.
        if not root_interface.getUser(user_id):
            logger.error(f"User with ID {user_id} not found")
            return None

        # Pick the user's first Microsoft connection, if any.
        azure_connection = next(
            (c for c in root_interface.getUserConnections(user_id) if c.authority == "msft"),
            None,
        )
        if azure_connection is None:
            logger.error(f"No Azure connection found for user {user_id}")
            return None

        # A stored token is required as a gate; the connector itself
        # authenticates with the subscription key below.
        if not root_interface.getConnectionToken(azure_connection.id):
            logger.error(f"No connection token found for user {user_id}")
            return None

        # Azure Speech Services credentials come from the app configuration.
        from modules.shared.configuration import APP_CONFIG

        subscription_key = APP_CONFIG.get("Connector_AzureSpeech_SUBSCRIPTION_KEY")
        if not subscription_key or subscription_key == "your-azure-speech-subscription-key-here":
            logger.error("Azure Speech Services subscription key not configured")
            return None

        region = APP_CONFIG.get("Connector_AzureSpeech_REGION", "westeurope")
        return ConnectorAzureSpeech(subscription_key=subscription_key, region=region)

    except Exception as e:
        logger.error(f"Error getting Azure Speech connector for WebSocket: {str(e)}")
        return None
|
|
|
|
@router.websocket("/realtime-interpreter")
async def websocket_realtime_interpreter(
    websocket: WebSocket,
    user_id: str = Query(...),
    from_language: str = Query("de-DE"),
    to_language: str = Query("en-US"),
    audio_format: str = Query("wav")
):
    """WebSocket endpoint for real-time interpreter with live audio streaming.

    Protocol (JSON messages):
      client -> server: {"type": "audio_chunk", "data": <base64 audio>}
                        {"type": "ping"} | {"type": "disconnect"}
      server -> client: "connected", "translation_result", "pong", "error"

    Buffered audio is transcribed in ``from_language`` and, when the languages
    differ, translated to ``to_language``.
    """
    connection_id = f"interpreter_{user_id}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

    try:
        await manager.connect(websocket, connection_id, user_id, {
            "service": "realtime_interpreter",
            "from_language": from_language,
            "to_language": to_language,
            "audio_format": audio_format
        })

        connector = await get_azure_speech_connector_ws(user_id)
        if not connector:
            await manager.send_error(connection_id, "Azure Speech Services not configured", "NO_CONNECTOR")
            return

        # Confirm the connection to the client before streaming starts.
        await manager.send_message(connection_id, {
            "type": "connected",
            "connection_id": connection_id,
            "service": "realtime_interpreter",
            "from_language": from_language,
            "to_language": to_language,
            "timestamp": datetime.now().isoformat()
        })

        # Accumulate incoming chunks; flush every 10 chunks or once the
        # *buffered* audio exceeds 8 KiB. (The previous check measured only the
        # size of the latest chunk, so a buffer built from many small chunks
        # was never flushed early.)
        audio_buffer = io.BytesIO()
        chunk_count = 0

        while True:
            try:
                data = await websocket.receive_text()
                message = json.loads(data)

                if message.get("type") == "audio_chunk":
                    audio_data = base64.b64decode(message.get("data", ""))
                    audio_buffer.write(audio_data)
                    chunk_count += 1

                    if chunk_count % 10 == 0 or audio_buffer.tell() > 8192:
                        audio_content = audio_buffer.getvalue()

                        if len(audio_content) > 0:
                            try:
                                # Transcribe the buffered audio.
                                stt_result = await connector.speech_to_text(
                                    audio_content=audio_content,
                                    language=from_language,
                                    format="detailed",
                                    audio_format=audio_format
                                )
                                original_text = stt_result.get("text", "")

                                # Translate only when languages differ and
                                # something was actually recognized.
                                translated_text = original_text
                                if from_language != to_language and original_text.strip():
                                    try:
                                        translated_text = await connector.translate_text(
                                            text=original_text,
                                            from_language=from_language,
                                            to_language=to_language
                                        )
                                    except Exception as e:
                                        # Best effort: fall back to the untranslated text.
                                        logger.warning(f"Translation failed: {str(e)}")
                                        translated_text = original_text

                                await manager.send_message(connection_id, {
                                    "type": "translation_result",
                                    "original_text": original_text,
                                    "translated_text": translated_text,
                                    "from_language": from_language,
                                    "to_language": to_language,
                                    "confidence": stt_result.get("confidence", 0.0),
                                    "timestamp": datetime.now().isoformat()
                                })

                            except Exception as e:
                                logger.error(f"Error processing audio chunk: {str(e)}")
                                await manager.send_error(connection_id, f"Audio processing failed: {str(e)}", "PROCESSING_ERROR")

                        # Start a fresh buffer for the next batch.
                        audio_buffer = io.BytesIO()
                        chunk_count = 0

                elif message.get("type") == "ping":
                    await manager.send_message(connection_id, {
                        "type": "pong",
                        "timestamp": datetime.now().isoformat()
                    })

                elif message.get("type") == "disconnect":
                    # Client requested an orderly shutdown.
                    break

            except WebSocketDisconnect:
                break
            except json.JSONDecodeError:
                await manager.send_error(connection_id, "Invalid JSON message", "INVALID_JSON")
            except Exception as e:
                logger.error(f"Error processing WebSocket message: {str(e)}")
                await manager.send_error(connection_id, f"Message processing failed: {str(e)}", "MESSAGE_ERROR")

    except WebSocketDisconnect:
        pass
    except Exception as e:
        logger.error(f"WebSocket error: {str(e)}")
        try:
            await manager.send_error(connection_id, f"Connection error: {str(e)}", "CONNECTION_ERROR")
        except Exception:
            # Connection is already gone; nothing more to report.
            pass
    finally:
        manager.disconnect(connection_id)
|
|
|
|
@router.websocket("/speech-to-text")
async def websocket_speech_to_text(
    websocket: WebSocket,
    user_id: str = Query(...),
    language: str = Query("de-DE"),
    audio_format: str = Query("wav")
):
    """WebSocket endpoint for real-time speech-to-text with live audio streaming.

    Protocol (JSON messages):
      client -> server: {"type": "audio_chunk", "data": <base64 audio>}
                        {"type": "ping"} | {"type": "disconnect"}
      server -> client: "connected", "transcription_result", "pong", "error"
    """
    connection_id = f"stt_{user_id}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

    try:
        await manager.connect(websocket, connection_id, user_id, {
            "service": "speech_to_text",
            "language": language,
            "audio_format": audio_format
        })

        connector = await get_azure_speech_connector_ws(user_id)
        if not connector:
            await manager.send_error(connection_id, "Azure Speech Services not configured", "NO_CONNECTOR")
            return

        # Confirm the connection to the client before streaming starts.
        await manager.send_message(connection_id, {
            "type": "connected",
            "connection_id": connection_id,
            "service": "speech_to_text",
            "language": language,
            "timestamp": datetime.now().isoformat()
        })

        # Accumulate incoming chunks; flush every 10 chunks or once the
        # *buffered* audio exceeds 8 KiB. (The previous check measured only the
        # size of the latest chunk, so a buffer built from many small chunks
        # was never flushed early.)
        audio_buffer = io.BytesIO()
        chunk_count = 0

        while True:
            try:
                data = await websocket.receive_text()
                message = json.loads(data)

                if message.get("type") == "audio_chunk":
                    audio_data = base64.b64decode(message.get("data", ""))
                    audio_buffer.write(audio_data)
                    chunk_count += 1

                    if chunk_count % 10 == 0 or audio_buffer.tell() > 8192:
                        audio_content = audio_buffer.getvalue()

                        if len(audio_content) > 0:
                            try:
                                # Transcribe the buffered audio.
                                stt_result = await connector.speech_to_text(
                                    audio_content=audio_content,
                                    language=language,
                                    format="detailed",
                                    audio_format=audio_format
                                )

                                await manager.send_message(connection_id, {
                                    "type": "transcription_result",
                                    "text": stt_result.get("text", ""),
                                    "confidence": stt_result.get("confidence", 0.0),
                                    "language": stt_result.get("language", language),
                                    "is_final": stt_result.get("RecognitionStatus") == "Success",
                                    "timestamp": datetime.now().isoformat()
                                })

                            except Exception as e:
                                logger.error(f"Error processing audio chunk: {str(e)}")
                                await manager.send_error(connection_id, f"Audio processing failed: {str(e)}", "PROCESSING_ERROR")

                        # Start a fresh buffer for the next batch.
                        audio_buffer = io.BytesIO()
                        chunk_count = 0

                elif message.get("type") == "ping":
                    await manager.send_message(connection_id, {
                        "type": "pong",
                        "timestamp": datetime.now().isoformat()
                    })

                elif message.get("type") == "disconnect":
                    # Client requested an orderly shutdown.
                    break

            except WebSocketDisconnect:
                break
            except json.JSONDecodeError:
                await manager.send_error(connection_id, "Invalid JSON message", "INVALID_JSON")
            except Exception as e:
                logger.error(f"Error processing WebSocket message: {str(e)}")
                await manager.send_error(connection_id, f"Message processing failed: {str(e)}", "MESSAGE_ERROR")

    except WebSocketDisconnect:
        pass
    except Exception as e:
        logger.error(f"WebSocket error: {str(e)}")
        try:
            await manager.send_error(connection_id, f"Connection error: {str(e)}", "CONNECTION_ERROR")
        except Exception:
            # Connection is already gone; nothing more to report.
            pass
    finally:
        manager.disconnect(connection_id)
|
|
|
|
@router.websocket("/text-to-speech")
async def websocket_text_to_speech(
    websocket: WebSocket,
    user_id: str = Query(...),
    language: str = Query("de-DE"),
    voice: str = Query("de-DE-KatjaNeural"),
    audio_format: str = Query("audio-16khz-128kbitrate-mono-mp3")
):
    """WebSocket endpoint for real-time text-to-speech streaming.

    Protocol (JSON messages):
      client -> server: {"type": "text_to_speak", "text": "..."}
                        {"type": "ping"} | {"type": "disconnect"}
      server -> client: "connected", "audio_data" (base64 audio), "pong", "error"
    """
    connection_id = f"tts_{user_id}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

    try:
        await manager.connect(websocket, connection_id, user_id, {
            "service": "text_to_speech",
            "language": language,
            "voice": voice,
            "audio_format": audio_format
        })

        connector = await get_azure_speech_connector_ws(user_id)
        if not connector:
            await manager.send_error(connection_id, "Azure Speech Services not configured", "NO_CONNECTOR")
            return

        # Confirm the connection to the client before accepting text.
        await manager.send_message(connection_id, {
            "type": "connected",
            "connection_id": connection_id,
            "service": "text_to_speech",
            "language": language,
            "voice": voice,
            "timestamp": datetime.now().isoformat()
        })

        while True:
            try:
                data = await websocket.receive_text()
                message = json.loads(data)

                if message.get("type") == "text_to_speak":
                    text = message.get("text", "")

                    # Skip empty / whitespace-only requests.
                    if text.strip():
                        try:
                            audio_data = await connector.text_to_speech(
                                text=text,
                                language=language,
                                voice=voice,
                                format=audio_format
                            )

                            # Audio is returned base64-encoded so it fits in a
                            # JSON text frame.
                            audio_base64 = base64.b64encode(audio_data).decode('utf-8')
                            await manager.send_message(connection_id, {
                                "type": "audio_data",
                                "audio_data": audio_base64,
                                "format": audio_format,
                                "voice": voice,
                                "text": text,
                                "timestamp": datetime.now().isoformat()
                            })

                        except Exception as e:
                            logger.error(f"Error converting text to speech: {str(e)}")
                            await manager.send_error(connection_id, f"TTS failed: {str(e)}", "TTS_ERROR")

                elif message.get("type") == "ping":
                    await manager.send_message(connection_id, {
                        "type": "pong",
                        "timestamp": datetime.now().isoformat()
                    })

                elif message.get("type") == "disconnect":
                    # Client requested an orderly shutdown.
                    break

            except WebSocketDisconnect:
                break
            except json.JSONDecodeError:
                await manager.send_error(connection_id, "Invalid JSON message", "INVALID_JSON")
            except Exception as e:
                logger.error(f"Error processing WebSocket message: {str(e)}")
                await manager.send_error(connection_id, f"Message processing failed: {str(e)}", "MESSAGE_ERROR")

    except WebSocketDisconnect:
        pass
    except Exception as e:
        logger.error(f"WebSocket error: {str(e)}")
        try:
            await manager.send_error(connection_id, f"Connection error: {str(e)}", "CONNECTION_ERROR")
        except Exception:
            # Was a bare `except:` (also swallowed SystemExit/KeyboardInterrupt);
            # the connection is already gone, nothing more to report.
            pass
    finally:
        manager.disconnect(connection_id)
|
|
|
|
@router.get("/status")
async def websocket_status():
    """Report the number of active WebSocket connections, broken down by service."""
    service_counts = {
        "realtime_interpreter": 0,
        "speech_to_text": 0,
        "text_to_speech": 0,
    }
    # One pass over the metadata instead of one list comprehension per service.
    for meta in manager.connection_metadata.values():
        service = meta.get("service")
        if service in service_counts:
            service_counts[service] += 1
    return {
        "active_connections": len(manager.active_connections),
        "connected_users": len(manager.user_connections),
        "services": service_counts,
    }
|