"""
Google Cloud Voice Services Routes

Replaces Azure voice services with Google Cloud Speech-to-Text and Translation.
Includes WebSocket support for real-time voice streaming.
"""

import os
import logging
import json
import base64
import asyncio
from typing import Optional, Dict, Any, List

from fastapi import APIRouter, File, Form, UploadFile, Depends, HTTPException, Body, WebSocket, WebSocketDisconnect
from fastapi.responses import Response

from modules.security.auth import getCurrentUser
from modules.datamodels.datamodelUam import User
from modules.interfaces.interfaceDbComponentObjects import getInterface
from modules.interfaces.interfaceVoiceObjects import getVoiceInterface, VoiceObjects

logger = logging.getLogger(__name__)

router = APIRouter(prefix="/voice-google", tags=["Voice Google"])

# Store active WebSocket connections, keyed by connection id
active_connections: Dict[str, WebSocket] = {}

# Defaults returned by GET /settings next to the user's stored settings.
DEFAULT_VOICE_SETTINGS: Dict[str, Any] = {
    "sttLanguage": "de-DE",
    "ttsLanguage": "de-DE",
    "ttsVoice": "de-DE-Wavenet-A",
    "translationEnabled": True,
    "targetLanguage": "en-US",
}


class ConnectionManager:
    """Track accepted WebSocket connections and send JSON messages to them."""

    def __init__(self):
        self.active_connections: List[WebSocket] = []

    async def connect(self, websocket: WebSocket, connection_id: str) -> None:
        """Accept the socket and register it under ``connection_id``.

        A later connection with the same id replaces the earlier one in the
        module-level ``active_connections`` registry.
        """
        await websocket.accept()
        self.active_connections.append(websocket)
        active_connections[connection_id] = websocket
        logger.info(f"WebSocket connected: {connection_id}")

    def disconnect(self, websocket: WebSocket, connection_id: str) -> None:
        """Forget ``websocket``; safe to call for a socket that was never registered."""
        if websocket in self.active_connections:
            self.active_connections.remove(websocket)
        # Only drop the registry entry if it still points at THIS socket.
        # Connection ids are derived from user/language only, so a second
        # client may have re-registered the same id in the meantime.
        if active_connections.get(connection_id) is websocket:
            del active_connections[connection_id]
        logger.info(f"WebSocket disconnected: {connection_id}")

    async def send_personal_message(self, message: dict, websocket: WebSocket) -> None:
        """Send ``message`` as JSON text; failures are logged, not raised."""
        try:
            await websocket.send_text(json.dumps(message))
        except Exception as e:
            logger.error(f"Error sending message: {e}")


manager = ConnectionManager()


def get_voice_interface(current_user: User) -> VoiceObjects:
    """Get voice interface instance with user context.

    Raises:
        HTTPException: 500 if the interface cannot be created.
    """
    try:
        return getVoiceInterface(current_user)
    except Exception as e:
        logger.error(f"Failed to initialize voice interface: {e}")
        raise HTTPException(
            status_code=500,
            detail=f"Failed to initialize voice interface: {str(e)}"
        ) from e


def _validate_audio(voice_interface: VoiceObjects, audio_content: bytes) -> Dict[str, Any]:
    """Run the interface's audio validation and return its result.

    Raises:
        HTTPException: 400 if the validation result is not marked valid.
    """
    validation = voice_interface.validateAudioFormat(audio_content)
    if not validation["valid"]:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid audio format: {validation.get('error', 'Unknown error')}"
        )
    return validation


def _build_audio_info(audio_content: bytes, validation: Dict[str, Any]) -> Dict[str, Any]:
    """Build the ``audio_info`` section shared by the audio upload responses."""
    return {
        "size": len(audio_content),
        "format": validation["format"],
        "estimated_duration": validation.get("estimated_duration", 0)
    }


@router.post("/speech-to-text")
async def speech_to_text(
    audio_file: UploadFile = File(...),
    language: str = Form("de-DE"),
    current_user: User = Depends(getCurrentUser)
):
    """Convert speech to text using Google Cloud Speech-to-Text API.

    Raises:
        HTTPException: 400 for invalid audio or a failed recognition,
            500 for any unexpected error.
    """
    try:
        logger.info(f"🎤 Speech-to-text request: {audio_file.filename}, language: {language}")

        # Read audio file
        audio_content = await audio_file.read()
        logger.info(f"📊 Audio file size: {len(audio_content)} bytes")

        voice_interface = get_voice_interface(current_user)
        validation = _validate_audio(voice_interface, audio_content)

        # Perform speech recognition
        result = await voice_interface.speechToText(
            audioContent=audio_content,
            language=language
        )

        if not result["success"]:
            raise HTTPException(
                status_code=400,
                detail=f"Speech recognition failed: {result.get('error', 'Unknown error')}"
            )

        return {
            "success": True,
            "text": result["text"],
            "confidence": result["confidence"],
            "language": result["language"],
            "audio_info": _build_audio_info(audio_content, validation)
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"❌ Speech-to-text error: {e}")
        raise HTTPException(
            status_code=500,
            detail=f"Speech-to-text processing failed: {str(e)}"
        ) from e


@router.post("/translate")
async def translate_text(
    text: str = Form(...),
    source_language: str = Form("de"),
    target_language: str = Form("en"),
    current_user: User = Depends(getCurrentUser)
):
    """Translate text using Google Cloud Translation API.

    Raises:
        HTTPException: 400 for blank text or a failed translation,
            500 for any unexpected error.
    """
    try:
        logger.info(f"🌐 Translation request: '{text}' ({source_language} -> {target_language})")

        if not text.strip():
            raise HTTPException(
                status_code=400,
                detail="Empty text provided for translation"
            )

        voice_interface = get_voice_interface(current_user)

        # Perform translation
        result = await voice_interface.translateText(
            text=text,
            sourceLanguage=source_language,
            targetLanguage=target_language
        )

        if not result["success"]:
            raise HTTPException(
                status_code=400,
                detail=f"Translation failed: {result.get('error', 'Unknown error')}"
            )

        return {
            "success": True,
            "original_text": result["original_text"],
            "translated_text": result["translated_text"],
            "source_language": result["source_language"],
            "target_language": result["target_language"]
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"❌ Translation error: {e}")
        raise HTTPException(
            status_code=500,
            detail=f"Translation processing failed: {str(e)}"
        ) from e


@router.post("/realtime-interpreter")
async def realtime_interpreter(
    audio_file: UploadFile = File(...),
    from_language: str = Form("de-DE"),
    to_language: str = Form("en-US"),
    connection_id: Optional[str] = Form(None),
    current_user: User = Depends(getCurrentUser)
):
    """Real-time interpreter: speech to translated text using Google Cloud APIs.

    ``connection_id`` is accepted for API compatibility but is not used here.

    Raises:
        HTTPException: 400 for invalid audio or a failed pipeline run,
            500 for any unexpected error.
    """
    try:
        logger.info(f"🔄 Real-time interpreter request: {audio_file.filename}")
        logger.info(f" From: {from_language} -> To: {to_language}")
        logger.info(f" MIME type: {audio_file.content_type}")

        # Read audio file
        audio_content = await audio_file.read()
        logger.info(f"📊 Audio file size: {len(audio_content)} bytes")

        voice_interface = get_voice_interface(current_user)
        validation = _validate_audio(voice_interface, audio_content)

        # Perform complete pipeline: Speech-to-Text + Translation
        result = await voice_interface.speechToTranslatedText(
            audioContent=audio_content,
            fromLanguage=from_language,
            toLanguage=to_language
        )

        if not result["success"]:
            raise HTTPException(
                status_code=400,
                detail=f"Real-time interpreter failed: {result.get('error', 'Unknown error')}"
            )

        logger.info("✅ Real-time interpreter successful:")
        logger.info(f" Original: '{result['original_text']}'")
        logger.info(f" Translated: '{result['translated_text']}'")

        return {
            "success": True,
            "original_text": result["original_text"],
            "translated_text": result["translated_text"],
            "confidence": result["confidence"],
            "source_language": result["source_language"],
            "target_language": result["target_language"],
            "audio_info": _build_audio_info(audio_content, validation)
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"❌ Real-time interpreter error: {e}")
        raise HTTPException(
            status_code=500,
            detail=f"Real-time interpreter processing failed: {str(e)}"
        ) from e


@router.post("/text-to-speech")
async def text_to_speech(
    text: str = Form(...),
    language: str = Form("de-DE"),
    voice: Optional[str] = Form(None),
    current_user: User = Depends(getCurrentUser)
):
    """Convert text to speech using Google Cloud Text-to-Speech.

    On success the audio bytes are returned as an ``audio/mpeg`` attachment,
    with the voice name and language code echoed in response headers.

    Raises:
        HTTPException: 400 for blank text or a failed synthesis,
            500 for any unexpected error.
    """
    try:
        logger.info(f"Text-to-Speech request: '{text[:50]}...' in {language}")

        if not text.strip():
            raise HTTPException(
                status_code=400,
                detail="Empty text provided for text-to-speech"
            )

        voice_interface = get_voice_interface(current_user)
        result = await voice_interface.textToSpeech(
            text=text,
            languageCode=language,
            voiceName=voice
        )

        if not result["success"]:
            raise HTTPException(
                status_code=400,
                detail=f"Text-to-Speech failed: {result.get('error', 'Unknown error')}"
            )

        return Response(
            content=result["audio_content"],
            media_type="audio/mpeg",
            headers={
                "Content-Disposition": "attachment; filename=speech.mp3",
                "X-Voice-Name": result["voice_name"],
                "X-Language-Code": result["language_code"]
            }
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Text-to-Speech error: {e}")
        raise HTTPException(
            status_code=500,
            detail=f"Text-to-Speech processing failed: {str(e)}"
        ) from e


@router.get("/languages")
async def get_available_languages(current_user: User = Depends(getCurrentUser)):
    """Get available languages from Google Cloud Text-to-Speech.

    Raises:
        HTTPException: 400 if the lookup reports failure,
            500 for any unexpected error.
    """
    try:
        logger.info("🌐 Getting available languages from Google Cloud TTS")

        voice_interface = get_voice_interface(current_user)
        result = await voice_interface.getAvailableLanguages()

        if not result["success"]:
            raise HTTPException(
                status_code=400,
                detail=f"Failed to get languages: {result.get('error', 'Unknown error')}"
            )

        return {
            "success": True,
            "languages": result["languages"]
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"❌ Get languages error: {e}")
        raise HTTPException(
            status_code=500,
            detail=f"Failed to get available languages: {str(e)}"
        ) from e


@router.get("/voices")
async def get_available_voices(
    language_code: Optional[str] = None,
    current_user: User = Depends(getCurrentUser)
):
    """Get available voices from Google Cloud Text-to-Speech.

    ``language_code`` is passed through as a filter and echoed back in the
    response as ``language_filter``.

    Raises:
        HTTPException: 400 if the lookup reports failure,
            500 for any unexpected error.
    """
    try:
        logger.info(f"🎤 Getting available voices, language filter: {language_code}")

        voice_interface = get_voice_interface(current_user)
        result = await voice_interface.getAvailableVoices(languageCode=language_code)

        if not result["success"]:
            raise HTTPException(
                status_code=400,
                detail=f"Failed to get voices: {result.get('error', 'Unknown error')}"
            )

        return {
            "success": True,
            "voices": result["voices"],
            "language_filter": language_code
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"❌ Get voices error: {e}")
        raise HTTPException(
            status_code=500,
            detail=f"Failed to get available voices: {str(e)}"
        ) from e


@router.get("/health")
async def health_check(current_user: User = Depends(getCurrentUser)):
    """Health check for Google Cloud voice services.

    Never raises: any failure (including interface initialization) is
    reported as an ``unhealthy`` payload instead.
    """
    try:
        voice_interface = get_voice_interface(current_user)
        return await voice_interface.healthCheck()

    except Exception as e:
        logger.error(f"❌ Health check failed: {e}")
        return {
            "status": "unhealthy",
            "error": str(e)
        }


@router.get("/settings")
async def get_voice_settings(current_user: User = Depends(getCurrentUser)):
    """Get voice settings for the current user.

    Returns the stored settings (or ``None`` when none could be obtained)
    together with the service defaults.

    Raises:
        HTTPException: 500 for any unexpected error.
    """
    try:
        logger.info(f"Getting voice settings for user: {current_user.id}")

        voice_interface = get_voice_interface(current_user)

        # Get or create voice settings for the user
        voice_settings = voice_interface.getOrCreateVoiceSettings(current_user.id)

        if voice_settings:
            user_settings = voice_settings.to_dict()
        else:
            # Fallback to default settings if database fails
            logger.warning("Failed to get voice settings from database, using defaults")
            user_settings = None

        return {
            "success": True,
            "data": {
                "user_settings": user_settings,
                "default_settings": dict(DEFAULT_VOICE_SETTINGS)
            }
        }

    except HTTPException:
        # Pass through as-is (e.g. from get_voice_interface) instead of
        # wrapping it in a second 500, matching the other routes.
        raise
    except Exception as e:
        logger.error(f"Error getting voice settings: {e}")
        raise HTTPException(
            status_code=500,
            detail=f"Failed to get voice settings: {str(e)}"
        ) from e


@router.post("/settings")
async def save_voice_settings(
    settings: Dict[str, Any] = Body(...),
    current_user: User = Depends(getCurrentUser)
):
    """Save voice settings for the current user.

    Updates the user's existing settings, or creates them when none exist.
    The returned ``data`` is the submitted settings with defaults filled in.

    Raises:
        HTTPException: 400 if a required field is missing,
            500 for any unexpected error.
    """
    try:
        logger.info(f"Saving voice settings for user: {current_user.id}")
        logger.info(f"Settings: {settings}")

        # Validate required settings
        for field in ("sttLanguage", "ttsLanguage", "ttsVoice"):
            if field not in settings:
                raise HTTPException(
                    status_code=400,
                    detail=f"Missing required field: {field}"
                )

        # Set default values for optional fields if not provided
        settings.setdefault("translationEnabled", True)
        settings.setdefault("targetLanguage", "en-US")

        voice_interface = get_voice_interface(current_user)

        # Check if settings already exist for this user
        existing_settings = voice_interface.getVoiceSettings(current_user.id)

        if existing_settings:
            logger.info(f"Updating existing voice settings for user {current_user.id}")
            updated_settings = voice_interface.updateVoiceSettings(current_user.id, settings)
            logger.info(f"Voice settings updated for user {current_user.id}: {updated_settings}")
        else:
            logger.info(f"Creating new voice settings for user {current_user.id}")
            # Add userId to settings
            settings["userId"] = current_user.id
            created_settings = voice_interface.createVoiceSettings(settings)
            logger.info(f"Voice settings created for user {current_user.id}: {created_settings}")

        return {
            "success": True,
            "message": "Voice settings saved successfully",
            "data": settings
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error saving voice settings: {e}")
        raise HTTPException(
            status_code=500,
            detail=f"Failed to save voice settings: {str(e)}"
        ) from e


# WebSocket endpoints for real-time voice streaming

async def _receive_message(websocket: WebSocket) -> Optional[Dict[str, Any]]:
    """Receive one text frame and parse it as a JSON object.

    Returns the parsed dict, or ``None`` when the frame was not a JSON object;
    in that case an ``error`` message has already been sent to the client, so
    one bad frame does not end the session. Disconnects raised by
    ``receive_text`` propagate to the caller.
    """
    raw = await websocket.receive_text()
    try:
        message = json.loads(raw)
    except json.JSONDecodeError as e:
        logger.warning(f"Invalid JSON message received: {e}")
        await manager.send_personal_message({
            "type": "error",
            "error": f"Invalid JSON message: {str(e)}"
        }, websocket)
        return None

    if not isinstance(message, dict):
        logger.warning("Invalid message received: expected a JSON object")
        await manager.send_personal_message({
            "type": "error",
            "error": "Invalid message: expected a JSON object"
        }, websocket)
        return None

    return message


async def _send_pong(websocket: WebSocket, message: Dict[str, Any]) -> None:
    """Answer a ``ping`` message, echoing back its ``timestamp`` (if any)."""
    await manager.send_personal_message({
        "type": "pong",
        "timestamp": message.get("timestamp")
    }, websocket)


@router.websocket("/ws/realtime-interpreter")
async def websocket_realtime_interpreter(
    websocket: WebSocket,
    user_id: str = "default",
    from_language: str = "de-DE",
    to_language: str = "en-US"
):
    """WebSocket endpoint for real-time voice interpretation.

    Handles ``audio_chunk`` (base64 payload in ``data``) and ``ping``
    messages. Audio chunks are currently only acknowledged.
    """
    connection_id = f"realtime_{user_id}_{from_language}_{to_language}"

    try:
        await manager.connect(websocket, connection_id)

        # Send connection confirmation
        await manager.send_personal_message({
            "type": "connected",
            "connection_id": connection_id,
            "message": "Connected to real-time interpreter"
        }, websocket)

        # NOTE(review): user_id comes straight from the query string and is
        # not authenticated here, unlike the HTTP routes which depend on
        # getCurrentUser — confirm this is intended before going live.
        # Not used yet; creating it up front surfaces setup errors early.
        voice_interface = get_voice_interface(User(id=user_id))

        while True:
            message = await _receive_message(websocket)
            if message is None:
                continue

            message_type = message.get("type")

            if message_type == "audio_chunk":
                try:
                    # Decode base64 audio data
                    audio_data = base64.b64decode(message["data"])

                    # For now, just acknowledge receipt.
                    # In a full implementation, this would:
                    # 1. Buffer audio chunks
                    # 2. Process with Google Cloud Speech-to-Text streaming
                    # 3. Send partial results back
                    # 4. Handle translation
                    await manager.send_personal_message({
                        "type": "audio_received",
                        "chunk_size": len(audio_data),
                        "timestamp": message.get("timestamp")
                    }, websocket)

                except Exception as e:
                    logger.error(f"Error processing audio chunk: {e}")
                    await manager.send_personal_message({
                        "type": "error",
                        "error": f"Failed to process audio: {str(e)}"
                    }, websocket)

            elif message_type == "ping":
                await _send_pong(websocket, message)

            else:
                logger.warning(f"Unknown message type: {message_type}")

    except WebSocketDisconnect:
        manager.disconnect(websocket, connection_id)
        logger.info(f"Client disconnected: {connection_id}")
    except Exception as e:
        logger.error(f"WebSocket error: {e}")
        manager.disconnect(websocket, connection_id)


@router.websocket("/ws/speech-to-text")
async def websocket_speech_to_text(
    websocket: WebSocket,
    user_id: str = "default",
    language: str = "de-DE"
):
    """WebSocket endpoint for real-time speech-to-text.

    Handles ``audio_chunk`` (base64 payload in ``data``) and ``ping``
    messages. Transcription results are currently placeholders.
    """
    connection_id = f"stt_{user_id}_{language}"

    try:
        await manager.connect(websocket, connection_id)

        await manager.send_personal_message({
            "type": "connected",
            "connection_id": connection_id,
            "message": "Connected to speech-to-text"
        }, websocket)

        # NOTE(review): user_id comes straight from the query string and is
        # not authenticated here — confirm this is intended.
        # Not used yet; creating it up front surfaces setup errors early.
        voice_interface = get_voice_interface(User(id=user_id))

        while True:
            message = await _receive_message(websocket)
            if message is None:
                continue

            message_type = message.get("type")

            if message_type == "audio_chunk":
                try:
                    # Decoded only to validate the payload for now.
                    audio_data = base64.b64decode(message["data"])

                    # Process audio chunk
                    # This would integrate with Google Cloud Speech-to-Text streaming API
                    await manager.send_personal_message({
                        "type": "transcription_result",
                        "text": "Audio chunk received",  # Placeholder
                        "confidence": 0.95,
                        "is_final": False
                    }, websocket)

                except Exception as e:
                    logger.error(f"Error processing audio: {e}")
                    await manager.send_personal_message({
                        "type": "error",
                        "error": f"Failed to process audio: {str(e)}"
                    }, websocket)

            elif message_type == "ping":
                await _send_pong(websocket, message)

            else:
                logger.warning(f"Unknown message type: {message_type}")

    except WebSocketDisconnect:
        manager.disconnect(websocket, connection_id)
    except Exception as e:
        logger.error(f"WebSocket error: {e}")
        manager.disconnect(websocket, connection_id)


@router.websocket("/ws/text-to-speech")
async def websocket_text_to_speech(
    websocket: WebSocket,
    user_id: str = "default",
    language: str = "de-DE",
    voice: str = "de-DE-Wavenet-A"
):
    """WebSocket endpoint for real-time text-to-speech.

    Handles ``text_to_speak`` (payload in ``text``) and ``ping`` messages.
    Audio responses are currently placeholders.
    """
    connection_id = f"tts_{user_id}_{language}_{voice}"

    try:
        await manager.connect(websocket, connection_id)

        await manager.send_personal_message({
            "type": "connected",
            "connection_id": connection_id,
            "message": "Connected to text-to-speech"
        }, websocket)

        while True:
            message = await _receive_message(websocket)
            if message is None:
                continue

            message_type = message.get("type")

            if message_type == "text_to_speak":
                try:
                    # Read only to validate the payload for now; a missing
                    # "text" key is reported back as an error message.
                    text = message["text"]

                    # Process text-to-speech
                    # This would integrate with Google Cloud Text-to-Speech API
                    # For now, send a placeholder response
                    await manager.send_personal_message({
                        "type": "audio_data",
                        "audio": "base64_encoded_audio_here",  # Placeholder
                        "format": "mp3"
                    }, websocket)

                except Exception as e:
                    logger.error(f"Error processing text-to-speech: {e}")
                    await manager.send_personal_message({
                        "type": "error",
                        "error": f"Failed to process text: {str(e)}"
                    }, websocket)

            elif message_type == "ping":
                await _send_pong(websocket, message)

            else:
                logger.warning(f"Unknown message type: {message_type}")

    except WebSocketDisconnect:
        manager.disconnect(websocket, connection_id)
    except Exception as e:
        logger.error(f"WebSocket error: {e}")
        manager.disconnect(websocket, connection_id)