231 lines
8.5 KiB
Python
231 lines
8.5 KiB
Python
"""
Voice Streaming WebSocket Routes

Provides real-time audio streaming for voice services.
"""
|
|
|
|
from fastapi import APIRouter, WebSocket, WebSocketDisconnect, Depends
|
|
from fastapi.responses import JSONResponse
|
|
import logging
|
|
import json
|
|
import base64
|
|
import asyncio
|
|
from typing import Dict, List
|
|
|
|
from modules.shared.configuration import APP_CONFIG
|
|
from modules.connectors.connectorGoogleSpeech import ConnectorGoogleSpeech
|
|
|
|
# Logger named after this module.
logger = logging.getLogger(__name__)

# Live sockets keyed by connection id; written and cleared by ConnectionManager.
active_connections: Dict[str, WebSocket] = {}

# Every WebSocket route in this module is registered on this router.
router = APIRouter(
    prefix="/api/voice/ws",
    tags=["Voice Streaming"],
)
|
|
|
|
class ConnectionManager:
    """Track accepted WebSocket connections and send JSON messages to them.

    Each connection is recorded twice: in this instance's
    ``active_connections`` list and in the module-level
    ``active_connections`` dict, keyed by connection id.
    """

    def __init__(self):
        # Every socket accepted through this manager, in connection order.
        self.active_connections: List[WebSocket] = []

    async def connect(self, websocket: WebSocket, connection_id: str) -> None:
        """Accept *websocket* and register it under *connection_id*.

        If *connection_id* is already present in the module-level registry,
        its entry is overwritten with the new socket.
        """
        await websocket.accept()
        self.active_connections.append(websocket)
        active_connections[connection_id] = websocket
        logger.info(f"WebSocket connected: {connection_id}")

    def disconnect(self, websocket: WebSocket, connection_id: str) -> None:
        """Remove *websocket* from the instance list and the module registry.

        Safe to call for a socket that was never registered or was already
        removed.
        """
        if websocket in self.active_connections:
            self.active_connections.remove(websocket)

        # Ids are not guaranteed unique: connect() overwrites an existing
        # entry. Only drop the registry entry when it still refers to THIS
        # socket, so a stale disconnect cannot unregister a newer live
        # connection that reused the same id.
        if active_connections.get(connection_id) is websocket:
            del active_connections[connection_id]

        logger.info(f"WebSocket disconnected: {connection_id}")

    async def send_personal_message(self, message: dict, websocket: WebSocket) -> None:
        """Serialize *message* to JSON and send it to *websocket* as text.

        Best-effort: any exception raised while serializing or sending is
        logged and swallowed, so callers are not interrupted by send errors.
        """
        try:
            await websocket.send_text(json.dumps(message))
        except Exception as e:
            logger.error(f"Error sending message: {e}")


# Shared manager instance used by the WebSocket endpoints in this module.
manager = ConnectionManager()
|
|
|
|
@router.websocket("/realtime-interpreter")
async def websocket_realtime_interpreter(
    websocket: WebSocket,
    user_id: str = "default",
    from_language: str = "de-DE",
    to_language: str = "en-US"
):
    """WebSocket endpoint for real-time voice interpretation.

    Clients send JSON text frames carrying a ``"type"`` field:

    * ``"audio_chunk"`` - ``"data"`` holds base64-encoded audio. The chunk is
      decoded and acknowledged with an ``"audio_received"`` message; actual
      recognition/translation is not implemented yet.
    * ``"ping"`` - answered with ``"pong"`` echoing the optional
      ``"timestamp"``.

    Any other type is logged as a warning. A frame that is not a JSON object
    with a ``"type"`` field is answered with an ``"error"`` message and the
    connection stays open.

    Args:
        websocket: The client connection.
        user_id: Used only to build the connection id.
        from_language: Used only to build the connection id.
        to_language: Used only to build the connection id.
    """
    connection_id = f"realtime_{user_id}_{from_language}_{to_language}"

    try:
        await manager.connect(websocket, connection_id)

        # Send connection confirmation
        await manager.send_personal_message({
            "type": "connected",
            "connection_id": connection_id,
            "message": "Connected to real-time interpreter"
        }, websocket)

        # Initialize Google Speech connector.
        # NOTE(review): not used yet (processing below is a placeholder); the
        # instantiation is kept because its side effects are not visible here.
        google_speech = ConnectorGoogleSpeech()

        while True:
            # Receive message from client
            data = await websocket.receive_text()

            # A single malformed frame must not tear down the whole session:
            # report it to the client and keep listening.
            try:
                message = json.loads(data)
            except ValueError as e:
                logger.warning(f"Invalid JSON message: {e}")
                await manager.send_personal_message({
                    "type": "error",
                    "error": "Invalid message: not valid JSON"
                }, websocket)
                continue

            if not isinstance(message, dict) or "type" not in message:
                logger.warning("Invalid message: missing 'type' field")
                await manager.send_personal_message({
                    "type": "error",
                    "error": "Invalid message: expected a JSON object with a 'type' field"
                }, websocket)
                continue

            message_type = message["type"]

            if message_type == "audio_chunk":
                # Process audio chunk
                try:
                    # Decode base64 audio data
                    audio_data = base64.b64decode(message["data"])

                    # For now, just acknowledge receipt
                    # In a full implementation, this would:
                    # 1. Buffer audio chunks
                    # 2. Process with Google Cloud Speech-to-Text streaming
                    # 3. Send partial results back
                    # 4. Handle translation

                    await manager.send_personal_message({
                        "type": "audio_received",
                        "chunk_size": len(audio_data),
                        "timestamp": message.get("timestamp")
                    }, websocket)

                except Exception as e:
                    logger.error(f"Error processing audio chunk: {e}")
                    await manager.send_personal_message({
                        "type": "error",
                        "error": f"Failed to process audio: {str(e)}"
                    }, websocket)

            elif message_type == "ping":
                # Respond to ping
                await manager.send_personal_message({
                    "type": "pong",
                    "timestamp": message.get("timestamp")
                }, websocket)

            else:
                logger.warning(f"Unknown message type: {message_type}")

    except WebSocketDisconnect:
        logger.info(f"Client disconnected: {connection_id}")
    except Exception as e:
        logger.error(f"WebSocket error: {e}")
    finally:
        # Runs on every exit path, including task cancellation (which is not
        # an Exception subclass), so the registry entry is never leaked.
        manager.disconnect(websocket, connection_id)
|
|
|
|
@router.websocket("/speech-to-text")
async def websocket_speech_to_text(
    websocket: WebSocket,
    user_id: str = "default",
    language: str = "de-DE"
):
    """WebSocket endpoint for real-time speech-to-text.

    Clients send JSON text frames carrying a ``"type"`` field:

    * ``"audio_chunk"`` - ``"data"`` holds base64-encoded audio. The chunk is
      decoded and answered with a placeholder ``"transcription_result"``;
      real transcription is not implemented yet.
    * ``"ping"`` - answered with ``"pong"`` echoing the optional
      ``"timestamp"``.

    Any other type is logged as a warning. A frame that is not a JSON object
    with a ``"type"`` field is answered with an ``"error"`` message and the
    connection stays open.

    Args:
        websocket: The client connection.
        user_id: Used only to build the connection id.
        language: Used only to build the connection id.
    """
    connection_id = f"stt_{user_id}_{language}"

    try:
        await manager.connect(websocket, connection_id)

        await manager.send_personal_message({
            "type": "connected",
            "connection_id": connection_id,
            "message": "Connected to speech-to-text"
        }, websocket)

        # Initialize Google Speech connector.
        # NOTE(review): not used yet (processing below is a placeholder); the
        # instantiation is kept because its side effects are not visible here.
        google_speech = ConnectorGoogleSpeech()

        while True:
            data = await websocket.receive_text()

            # A single malformed frame must not tear down the whole session:
            # report it to the client and keep listening.
            try:
                message = json.loads(data)
            except ValueError as e:
                logger.warning(f"Invalid JSON message: {e}")
                await manager.send_personal_message({
                    "type": "error",
                    "error": "Invalid message: not valid JSON"
                }, websocket)
                continue

            if not isinstance(message, dict) or "type" not in message:
                logger.warning("Invalid message: missing 'type' field")
                await manager.send_personal_message({
                    "type": "error",
                    "error": "Invalid message: expected a JSON object with a 'type' field"
                }, websocket)
                continue

            message_type = message["type"]

            if message_type == "audio_chunk":
                try:
                    audio_data = base64.b64decode(message["data"])

                    # Process audio chunk
                    # This would integrate with Google Cloud Speech-to-Text streaming API

                    await manager.send_personal_message({
                        "type": "transcription_result",
                        "text": "Audio chunk received",  # Placeholder
                        "confidence": 0.95,
                        "is_final": False
                    }, websocket)

                except Exception as e:
                    logger.error(f"Error processing audio: {e}")
                    await manager.send_personal_message({
                        "type": "error",
                        "error": f"Failed to process audio: {str(e)}"
                    }, websocket)

            elif message_type == "ping":
                await manager.send_personal_message({
                    "type": "pong",
                    "timestamp": message.get("timestamp")
                }, websocket)

            else:
                # Surface unsupported types in the log instead of dropping
                # them silently.
                logger.warning(f"Unknown message type: {message_type}")

    except WebSocketDisconnect:
        # Normal client disconnect; cleanup happens in ``finally``.
        pass
    except Exception as e:
        logger.error(f"WebSocket error: {e}")
    finally:
        # Runs on every exit path, including task cancellation (which is not
        # an Exception subclass), so the registry entry is never leaked.
        manager.disconnect(websocket, connection_id)
|
|
|
|
@router.websocket("/text-to-speech")
async def websocket_text_to_speech(
    websocket: WebSocket,
    user_id: str = "default",
    language: str = "de-DE",
    voice: str = "de-DE-Wavenet-A"
):
    """WebSocket endpoint for real-time text-to-speech.

    Clients send JSON text frames carrying a ``"type"`` field:

    * ``"text_to_speak"`` - requires a ``"text"`` field. Answered with a
      placeholder ``"audio_data"`` message; real synthesis is not
      implemented yet.
    * ``"ping"`` - answered with ``"pong"`` echoing the optional
      ``"timestamp"``.

    Any other type is logged as a warning. A frame that is not a JSON object
    with a ``"type"`` field is answered with an ``"error"`` message and the
    connection stays open.

    Args:
        websocket: The client connection.
        user_id: Used only to build the connection id.
        language: Used only to build the connection id.
        voice: Used only to build the connection id.
    """
    connection_id = f"tts_{user_id}_{language}_{voice}"

    try:
        await manager.connect(websocket, connection_id)

        await manager.send_personal_message({
            "type": "connected",
            "connection_id": connection_id,
            "message": "Connected to text-to-speech"
        }, websocket)

        while True:
            data = await websocket.receive_text()

            # A single malformed frame must not tear down the whole session:
            # report it to the client and keep listening.
            try:
                message = json.loads(data)
            except ValueError as e:
                logger.warning(f"Invalid JSON message: {e}")
                await manager.send_personal_message({
                    "type": "error",
                    "error": "Invalid message: not valid JSON"
                }, websocket)
                continue

            if not isinstance(message, dict) or "type" not in message:
                logger.warning("Invalid message: missing 'type' field")
                await manager.send_personal_message({
                    "type": "error",
                    "error": "Invalid message: expected a JSON object with a 'type' field"
                }, websocket)
                continue

            message_type = message["type"]

            if message_type == "text_to_speak":
                try:
                    # The value is not used yet, but the lookup doubles as
                    # validation: a missing "text" key raises KeyError and is
                    # reported to the client by the handler below.
                    text = message["text"]

                    # Process text-to-speech
                    # This would integrate with Google Cloud Text-to-Speech API

                    # For now, send a placeholder response
                    await manager.send_personal_message({
                        "type": "audio_data",
                        "audio": "base64_encoded_audio_here",  # Placeholder
                        "format": "mp3"
                    }, websocket)

                except Exception as e:
                    logger.error(f"Error processing text-to-speech: {e}")
                    await manager.send_personal_message({
                        "type": "error",
                        "error": f"Failed to process text: {str(e)}"
                    }, websocket)

            elif message_type == "ping":
                await manager.send_personal_message({
                    "type": "pong",
                    "timestamp": message.get("timestamp")
                }, websocket)

            else:
                # Surface unsupported types in the log instead of dropping
                # them silently.
                logger.warning(f"Unknown message type: {message_type}")

    except WebSocketDisconnect:
        # Normal client disconnect; cleanup happens in ``finally``.
        pass
    except Exception as e:
        logger.error(f"WebSocket error: {e}")
    finally:
        # Runs on every exit path, including task cancellation (which is not
        # an Exception subclass), so the registry entry is never leaked.
        manager.disconnect(websocket, connection_id)
|