Voice TTS beta test version

ValueOn AG 2025-09-15 00:36:26 +02:00
parent 7ed36283c5
commit cf1b302008
35 changed files with 1032 additions and 83 deletions

app.py

@@ -251,3 +251,6 @@ app.include_router(googleRouter)
from modules.routes.routeVoiceGoogle import router as voiceGoogleRouter
app.include_router(voiceGoogleRouter)
from modules.routes.routeVoiceStreaming import router as voiceStreamingRouter
app.include_router(voiceStreamingRouter)

config.ini

@@ -49,7 +49,19 @@ Service_GOOGLE_CLIENT_SECRET = <REDACTED>
Connector_WebTavily_API_KEY = <REDACTED>
# Google Cloud Speech Services configuration
# Set GOOGLE_APPLICATION_CREDENTIALS environment variable or place credentials file in project directory
Connector_GoogleSpeech_API_KEY = {
"type": "service_account",
"project_id": "poweronid",
"private_key_id": "88db66e4248326e9baeac4231bc196fd46a9a441",
"private_key": "-----BEGIN PRIVATE KEY-----\nMIIEvQIBADANBgkqhkiG9w0BAQEFAASCBKcwggSjAgEAAoIBAQDTnJuxA+xBL3LA\nPgFILYCsGuppkkdO6d153Q36f2jTj6zpH3OhKMVsaaTBknG2o2+D0Whlk6Yh5rOw\nkWzpMC3y81leRLm5kucERMkBUgd2GL4v16k6m+QGuC3BFlt/XeyuckJNW0V6v/Dy\n3+bSYM7/5o1ftPNWJeAIEWoE/V4wKCYde8RE4Vp1LO5YwhgcM4rRuPmF2OhekpA+\npteYwkY/8/gTTRpZIc8OTsBYRbaMwsjoDj5riuL3boVtkwZwKRb+ZLvupXeU7Ds7\n1305odTcZUwnImHiHfuq83ZJViQiLRNhUAFnQIXPrYLwEpCmzRBGzYHaRlb69ga/\nzqUbKnclAgMBAAECggEAH6W9qHehubioPMAJM7Y6bC2KU/JLNS4csBZd+idb52gG\nwBwIEFjR+H4ZjymhAA4+pe7c4h7MKyh0RI/l7eoFX98Cb+rEq/r1udm1BhGH3s2h\n2UiI8qRQh1YRjF2/nrN5VjhDBOFa6W9opaopZy/l8AzsT8f21zIgPen8z8o6GpFg\n64fJFcbqCGk2ykN2+x2pIOT04tmCszrfbXZP8LEs4xrUB/XwlHL1vT/M3EWIKbnj\njDaIMjw7q/KRgNUvmKS6SU9b3fnOLcQCz9f5cKdiWACKIU/UvuiWhWJ9ou6BWLWU\nva1A6Fi4XJjhW7s3po58/ioQfl0A9p/L92lGg4ST8QKBgQDx8LIM1g0dh9Ql6LmH\nBUGCOewNNXTs+y3ZznUfvVMoyyZK5w/pzeUvkmOwzbRGnZJ9WyCghq8aezyEpo2D\nPL7Odf988IeHmvhyZIM4PLJYgDvSwGXyf/gh6gJkf/4wpx+tx/yQYNBm3Rht7sA0\npSaLehK0E0kW1uyBzHGKgyQOhwKBgQDf6LiZ7hSQqh54vIU1XMDRth0UOo/s/HGi\nDoij29KjmHjLkm8vOlCo83e79X0WhcnyB5kM7nWFegwcM1PJ0Dl8gidUuTlOVDtM\n5u2AaxDoyXAUL457U5dGFAIW+R653ZDkzMfCglacP8HixXEyIpL1cTLqiCAgzszS\nLcSWwoAr8wKBgQC4CGm3X97sFpTmHSd6sCHLaDnJNl9xoAKZifUHpqCqCBVhpm8x\nXp+11vmj1GULzfJPDlE8Khbp4tH+6R39tOhC7fjgVaoSGWxgv1odHfZfYXOf9R/X\nHUZmrbUSM1XsNkPfkZ7pR+teQ1HA1Xo40WMHd1zgw0a2a9fNR/EZ9nUn4wKBgGaK\nUEgGNRrPHadTRnnaoV8o1IZYD2OLdIqvtzm7SOqsv90SkaKCRUAqR5InaYKwAHy7\nqAa5Cc73xqX/h4arujff7x0ouiq5/nJIa0ndPmAtKAvGf6zQ6j0ompBkxAKAioON\nmInmYL2roSI2I5G/LagDkDrB3lzH+Brk5NvZ9RKrAoGAGox462GGGb/NbGdDkahN\ndifzYYvq4FPiWFFo0ynKAulxCBWLXO/N45XNuAyen433d8eREcAYz1Dzax44+MdQ\nHo9dU7YcZvFyt6iZsYeQF8dluHui3vzMpUe0KbqpZC5KMOSw53ZdNIwzo8NTAK59\n+uv3dHGj7sS8fhDo3yCifzc=\n-----END PRIVATE KEY-----\n",
"client_email": "poweron-voice-services@poweronid.iam.gserviceaccount.com",
"client_id": "116641749406798186404",
"auth_uri": "https://accounts.google.com/o/oauth2/auth",
"token_uri": "https://oauth2.googleapis.com/token",
"auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
"client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/poweron-voice-services%40poweronid.iam.gserviceaccount.com",
"universe_domain": "googleapis.com"
}
# Web Search configuration
Web_Search_MAX_QUERY_LENGTH = 400

Binary file not shown.

Binary file not shown.

Binary file not shown.

modules/connectors/connectorGoogleSpeech.py

@@ -5,11 +5,15 @@ Replaces Azure Speech Services with Google Cloud APIs
import os
import io
import json
import html
import logging
import asyncio
from typing import Dict, Optional, Any
from google.cloud import speech
from google.cloud import translate_v2 as translate
from google.cloud import texttospeech
from modules.shared.configuration import APP_CONFIG
logger = logging.getLogger(__name__)
@@ -19,21 +23,34 @@ class ConnectorGoogleSpeech:
Handles audio processing, speech recognition, and translation.
"""
def __init__(self, credentials_path: Optional[str] = None):
def __init__(self):
"""
Initialize Google Cloud Speech and Translation clients.
Args:
credentials_path: Path to Google Cloud service account JSON file
Initialize Google Cloud Speech and Translation clients using config.ini.
"""
try:
# Set up authentication
if credentials_path:
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path
# Get JSON key from config.ini
api_key = APP_CONFIG.get("Connector_GoogleSpeech_API_KEY")
# Initialize clients
self.speech_client = speech.SpeechClient()
self.translate_client = translate.Client()
if not api_key or api_key == "YOUR_GOOGLE_SERVICE_ACCOUNT_JSON_KEY_HERE":
raise ValueError("Google Speech API key not configured. Please set Connector_GoogleSpeech_API_KEY in config.ini with the full service account JSON key")
# Parse the JSON key and set up authentication
try:
credentials_info = json.loads(api_key)
# Create credentials object directly (no file needed!)
from google.oauth2 import service_account
credentials = service_account.Credentials.from_service_account_info(credentials_info)
logger.info("✅ Using Google Speech credentials from config.ini")
except json.JSONDecodeError as e:
raise ValueError(f"Invalid JSON in Google Speech API key: {e}")
# Initialize clients with explicit credentials
self.speech_client = speech.SpeechClient(credentials=credentials)
self.translate_client = translate.Client(credentials=credentials)
self.tts_client = texttospeech.TextToSpeechClient(credentials=credentials)
logger.info("✅ Google Cloud Speech and Translation clients initialized successfully")
@@ -42,37 +59,134 @@ class ConnectorGoogleSpeech:
raise
async def speech_to_text(self, audio_content: bytes, language: str = "de-DE",
sample_rate: int = 16000, channels: int = 1) -> Dict:
sample_rate: int = None, channels: int = None) -> Dict:
"""
Convert speech to text using Google Cloud Speech-to-Text API.
Args:
audio_content: Raw audio data (PCM format)
audio_content: Raw audio data (various formats supported)
language: Language code (e.g., 'de-DE', 'en-US')
sample_rate: Audio sample rate (default: 16000 Hz)
channels: Number of audio channels (default: 1)
sample_rate: Audio sample rate (auto-detected if None)
channels: Number of audio channels (auto-detected if None)
Returns:
Dict containing transcribed text, confidence, and metadata
"""
try:
logger.info(f"🎤 Processing audio with Google Cloud Speech-to-Text")
logger.info(f"📊 Audio: {len(audio_content)} bytes, {sample_rate}Hz, {channels}ch")
# Auto-detect audio format if not provided
audio_format = "linear16" # default so the encoding branch below is defined when rate/channels are passed in
if sample_rate is None or channels is None:
validation = self.validate_audio_format(audio_content)
if not validation["valid"]:
return {
"success": False,
"text": "",
"confidence": 0.0,
"error": f"Invalid audio format: {validation.get('error', 'Unknown error')}"
}
sample_rate = validation["sample_rate"]
channels = validation["channels"]
audio_format = validation["format"]
logger.info(f"Auto-detected audio: {audio_format}, {sample_rate}Hz, {channels}ch")
logger.info(f"Processing audio with Google Cloud Speech-to-Text")
logger.info(f"Audio: {len(audio_content)} bytes, {sample_rate}Hz, {channels}ch")
# Configure audio settings
audio = speech.RecognitionAudio(content=audio_content)
config = speech.RecognitionConfig(
encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
sample_rate_hertz=sample_rate,
audio_channel_count=channels,
language_code=language,
enable_automatic_punctuation=True,
model="latest_long" # Use the latest model
)
# Determine encoding based on detected format
# Google Cloud Speech API has specific requirements for different formats
if audio_format == "webm_opus":
# For WEBM OPUS, we need to ensure proper format
encoding = speech.RecognitionConfig.AudioEncoding.WEBM_OPUS
# WEBM_OPUS requires specific sample rate handling - must match header
if sample_rate != 48000:
logger.warning(f"WEBM_OPUS detected but sample rate is {sample_rate}, adjusting to 48000")
sample_rate = 48000
# For WEBM_OPUS, don't specify sample_rate_hertz in config
# Google Cloud will read it from the WEBM header
use_sample_rate = False
elif audio_format == "linear16":
# For LINEAR16 format (PCM)
encoding = speech.RecognitionConfig.AudioEncoding.LINEAR16
# Ensure sample rate is reasonable
if sample_rate not in [8000, 16000, 22050, 24000, 32000, 44100, 48000]:
logger.warning(f"Unusual sample rate {sample_rate}, adjusting to 16000")
sample_rate = 16000
use_sample_rate = True
elif audio_format == "mp3":
# For MP3 format
encoding = speech.RecognitionConfig.AudioEncoding.MP3
use_sample_rate = True
elif audio_format == "flac":
# For FLAC format
encoding = speech.RecognitionConfig.AudioEncoding.FLAC
use_sample_rate = True
elif audio_format == "wav":
# For WAV format
encoding = speech.RecognitionConfig.AudioEncoding.LINEAR16
use_sample_rate = True
else:
# For unknown formats, try LINEAR16 as fallback
encoding = speech.RecognitionConfig.AudioEncoding.LINEAR16
sample_rate = 16000 # Use standard sample rate
channels = 1 # Use mono
use_sample_rate = True
logger.warning(f"Unknown audio format '{audio_format}', using LINEAR16 encoding with 16000Hz")
# Build config based on format requirements
config_params = {
"encoding": encoding,
"audio_channel_count": channels,
"language_code": language,
"enable_automatic_punctuation": True,
"model": "latest_long", # Try latest_long model for better recognition
"enable_word_time_offsets": True, # Enable word-level timing
"enable_word_confidence": True, # Enable word-level confidence
"max_alternatives": 3, # Try more alternatives
"use_enhanced": True # Use enhanced model if available
}
# Only add sample_rate_hertz if needed (not for WEBM_OPUS)
if use_sample_rate:
config_params["sample_rate_hertz"] = sample_rate
logger.debug(f"Recognition config: encoding={encoding}, sample_rate={sample_rate}, channels={channels}, language={language}")
else:
logger.debug(f"Recognition config: encoding={encoding}, sample_rate=auto (from header), channels={channels}, language={language}")
config = speech.RecognitionConfig(**config_params)
# Perform speech recognition
logger.info("🔄 Sending audio to Google Cloud Speech-to-Text...")
response = self.speech_client.recognize(config=config, audio=audio)
logger.info("Sending audio to Google Cloud Speech-to-Text...")
try:
# Use regular recognition for single audio files (not streaming)
logger.info("Using regular recognition for single audio file...")
response = self.speech_client.recognize(config=config, audio=audio)
logger.debug(f"Google Cloud response: {response}")
except Exception as api_error:
logger.error(f"Google Cloud API error: {api_error}")
# Try with different encoding as fallback
if encoding != speech.RecognitionConfig.AudioEncoding.LINEAR16:
logger.info("Trying fallback with LINEAR16 encoding...")
fallback_config = speech.RecognitionConfig(
encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
sample_rate_hertz=16000, # Use standard sample rate
audio_channel_count=1,
language_code=language,
enable_automatic_punctuation=True,
model="latest_long"
)
try:
response = self.speech_client.recognize(config=fallback_config, audio=audio)
logger.debug(f"Google Cloud fallback response: {response}")
except Exception as fallback_error:
logger.error(f"Google Cloud fallback error: {fallback_error}")
raise api_error
else:
raise api_error
# Process results
if response.results:
@@ -82,7 +196,7 @@ class ConnectorGoogleSpeech:
transcribed_text = alternative.transcript
confidence = alternative.confidence
logger.info(f"Transcription successful: '{transcribed_text}' (confidence: {confidence:.2f})")
logger.info(f"Transcription successful: '{transcribed_text}' (confidence: {confidence:.2f})")
return {
"success": True,
@@ -96,7 +210,8 @@
}
}
else:
logger.warning("⚠️ No transcription alternatives found")
logger.warning("No transcription alternatives found")
logger.debug(f"Result details: {result}")
return {
"success": False,
"text": "",
@@ -104,16 +219,102 @@
"error": "No transcription alternatives found"
}
else:
logger.warning("⚠️ No recognition results from Google Cloud")
logger.warning("No recognition results from Google Cloud")
logger.debug(f"Response details: {response}")
# Check if there are any error messages in the response
if hasattr(response, 'error') and response.error:
logger.error(f"Google Cloud error: {response.error}")
return {
"success": False,
"text": "",
"confidence": 0.0,
"error": f"Google Cloud error: {response.error}"
}
# Try multiple fallback approaches
fallback_configs = []
if encoding != speech.RecognitionConfig.AudioEncoding.LINEAR16:
# Try LINEAR16 with detected sample rate
fallback_configs.append({
"encoding": speech.RecognitionConfig.AudioEncoding.LINEAR16,
"sample_rate": sample_rate,
"channels": channels,
"use_sample_rate": True,
"description": f"LINEAR16 with {sample_rate}Hz"
})
# Try LINEAR16 with standard sample rates
for std_rate in [16000, 8000, 22050, 44100]:
if std_rate != sample_rate:
fallback_configs.append({
"encoding": speech.RecognitionConfig.AudioEncoding.LINEAR16,
"sample_rate": std_rate,
"channels": 1,
"use_sample_rate": True,
"description": f"LINEAR16 with {std_rate}Hz"
})
# Try with different models
models = ["latest_long", "phone_call", "latest_short"]
for fallback_config in fallback_configs:
for model in models:
try:
logger.info(f"Trying fallback: {fallback_config['description']} with {model} model...")
# Build fallback config with proper sample rate handling
fallback_config_params = {
"encoding": fallback_config["encoding"],
"audio_channel_count": fallback_config["channels"],
"language_code": language,
"enable_automatic_punctuation": True,
"model": model
}
# Only add sample_rate_hertz if needed
if fallback_config["use_sample_rate"]:
fallback_config_params["sample_rate_hertz"] = fallback_config["sample_rate"]
fallback_config_obj = speech.RecognitionConfig(**fallback_config_params)
fallback_response = self.speech_client.recognize(config=fallback_config_obj, audio=audio)
if fallback_response.results:
result = fallback_response.results[0]
if result.alternatives:
alternative = result.alternatives[0]
transcribed_text = alternative.transcript
confidence = alternative.confidence
logger.info(f"Fallback transcription successful: '{transcribed_text}' (confidence: {confidence:.2f})")
return {
"success": True,
"text": transcribed_text,
"confidence": confidence,
"language": language,
"raw_result": {
"transcript": transcribed_text,
"confidence": confidence,
"language_code": language
}
}
except Exception as e:
logger.debug(f"Fallback failed: {e}")
continue
logger.warning("All fallback attempts failed")
return {
"success": False,
"text": "",
"confidence": 0.0,
"error": "No recognition results"
"error": "No recognition results - audio may be too short, unclear, or in unsupported format"
}
except Exception as e:
logger.error(f"❌ Google Cloud Speech-to-Text error: {e}")
logger.error(f"Google Cloud Speech-to-Text error: {e}")
return {
"success": False,
"text": "",
@@ -155,6 +356,9 @@ class ConnectorGoogleSpeech:
translated_text = result['translatedText']
detected_language = result.get('detectedSourceLanguage', source_language)
# Decode HTML entities in translated text
translated_text = html.unescape(translated_text)
logger.info(f"✅ Translation successful: '{translated_text}'")
return {
@@ -256,26 +460,154 @@
Dict containing validation results
"""
try:
# Google Cloud Speech-to-Text supports various formats
# We'll do basic validation
# Basic validation
if len(audio_content) < 100:
return {
"valid": False,
"error": "Audio too short (less than 100 bytes)"
}
# Check if it looks like PCM audio (basic check)
if len(audio_content) % 2 != 0:
return {
"valid": False,
"error": "Audio data length is odd (not 16-bit PCM)"
}
# Detect audio format by checking file headers
audio_format = "unknown"
sample_rate = 16000 # Default fallback
channels = 1 # Default fallback
# Debug: Log first few bytes for format detection
logger.debug(f"Audio header bytes: {audio_content[:20].hex()}")
logger.debug(f"Audio content length: {len(audio_content)} bytes")
# Check for WEBM/OPUS format (common from web recordings)
if audio_content.startswith(b'\x1a\x45\xdf\xa3'):
audio_format = "webm_opus"
sample_rate = 48000 # WEBM OPUS typically uses 48kHz
channels = 1
logger.info(f"Detected WEBM OPUS format: {sample_rate}Hz, {channels}ch")
# Check for specific header patterns seen in logs (43c381...)
# This appears to be a different audio format or corrupted WEBM
elif audio_content.startswith(b'\x43\xc3\x81') and len(audio_content) > 1000:
# This might be a different format or corrupted audio
# Try to detect if it's actually WEBM by looking deeper
if b'webm' in audio_content[:200] or b'opus' in audio_content[:200]:
audio_format = "webm_opus"
sample_rate = 48000
channels = 1
logger.info(f"Detected WEBM format (deep scan): {sample_rate}Hz, {channels}ch")
else:
# Unknown format, try as LINEAR16
audio_format = "linear16"
sample_rate = 16000
channels = 1
logger.warning(f"Unknown audio format with header {audio_content[:8].hex()}, trying LINEAR16")
# Check for WEBM format (alternative detection)
elif b'webm' in audio_content[:100].lower() or b'opus' in audio_content[:100].lower():
audio_format = "webm_opus"
sample_rate = 48000 # WEBM OPUS typically uses 48kHz
channels = 1
logger.info(f"Detected WEBM format: {sample_rate}Hz, {channels}ch")
# (MediaRecorder WEBM chunks start with the same EBML header, so the first check above already covers browser recordings)
# Check for OPUS format by looking for OPUS magic bytes
elif audio_content.startswith(b'OpusHead') or b'OpusHead' in audio_content[:50]:
audio_format = "webm_opus"
sample_rate = 48000 # OPUS typically uses 48kHz
channels = 1
logger.info(f"Detected OPUS format: {sample_rate}Hz, {channels}ch")
# Check for OGG format (often contains OPUS)
elif audio_content.startswith(b'OggS'):
audio_format = "webm_opus"
sample_rate = 48000 # OGG OPUS typically uses 48kHz
channels = 1
logger.info(f"Detected OGG format: {sample_rate}Hz, {channels}ch")
# Check for WAV format
elif audio_content.startswith(b'RIFF') and b'WAVE' in audio_content[:12]:
audio_format = "wav"
# Try to extract sample rate from WAV header
try:
# WAV header sample rate is at offset 24-27 (little endian)
sample_rate = int.from_bytes(audio_content[24:28], 'little')
channels = int.from_bytes(audio_content[22:24], 'little')
logger.info(f"Detected WAV format: {sample_rate}Hz, {channels}ch")
except Exception:
sample_rate = 16000 # Fallback
channels = 1
# Check for MP3 format
elif audio_content.startswith(b'\xff\xfb') or audio_content.startswith(b'ID3'):
audio_format = "mp3"
sample_rate = 44100 # MP3 typically uses 44.1kHz
channels = 2 # Usually stereo
logger.info(f"Detected MP3 format: {sample_rate}Hz, {channels}ch")
# Check for FLAC format
elif audio_content.startswith(b'fLaC'):
audio_format = "flac"
sample_rate = 44100 # Common FLAC sample rate
channels = 2
logger.info(f"Detected FLAC format: {sample_rate}Hz, {channels}ch")
else:
# Unknown format, try WEBM_OPUS as it's most common for web recordings
audio_format = "webm_opus"
sample_rate = 48000 # Try 48kHz for web recordings
channels = 1
logger.warning(f"Unknown audio format, trying WEBM_OPUS: {sample_rate}Hz, {channels}ch")
# Calculate estimated duration
if audio_format == "webm_opus":
# WEBM OPUS duration is hard to calculate without decoding
estimated_duration = 3.0 # Assume 3 seconds for web recordings
else:
# Rough estimate for uncompressed audio
estimated_duration = len(audio_content) / (sample_rate * channels * 2) # 16-bit = 2 bytes per sample
# Check if audio is too short (less than 0.5 seconds)
if estimated_duration < 0.5:
logger.warning(f"Audio too short: {estimated_duration:.2f}s, may not be recognized")
# Log audio details for debugging
logger.info(f"Audio analysis: {len(audio_content)} bytes, {estimated_duration:.2f}s, {sample_rate}Hz, {channels}ch, format={audio_format}")
# Check audio levels (simple check for silence)
if audio_format == "webm_opus":
# For WEBM, we can't easily check levels, but log the first few bytes
logger.debug(f"Audio sample bytes: {audio_content[:20].hex()}")
# Check if audio has some variation (not all same bytes)
if len(audio_content) > 100:
sample_bytes = audio_content[100:200] # Skip header
if len(set(sample_bytes)) < 5: # Less than 5 different byte values
logger.warning("Audio may be silent or very quiet (low byte variation)")
else:
logger.debug(f"Audio has good byte variation: {len(set(sample_bytes))} unique values")
else:
# For PCM audio, check for silence
if len(audio_content) > 100:
# Convert first 100 bytes to check for silence
sample_bytes = audio_content[:100]
if all(b == 0 for b in sample_bytes):
logger.warning("Audio appears to be silent (all zeros)")
else:
logger.debug(f"Audio sample bytes: {sample_bytes[:20].hex()}")
# Check for low variation
if len(set(sample_bytes)) < 5:
logger.warning("Audio may be very quiet (low byte variation)")
return {
"valid": True,
"format": "pcm",
"format": audio_format,
"sample_rate": sample_rate,
"channels": channels,
"size": len(audio_content),
"estimated_duration": len(audio_content) / (16000 * 2) # Rough estimate for 16kHz, 16-bit
"estimated_duration": estimated_duration
}
except Exception as e:
@@ -283,3 +615,181 @@ class ConnectorGoogleSpeech:
"valid": False,
"error": f"Validation error: {e}"
}
async def text_to_speech(self, text: str, language_code: str = "de-DE", voice_name: str = None) -> Dict[str, Any]:
"""
Convert text to speech using Google Cloud Text-to-Speech.
Args:
text: Text to convert to speech
language_code: Language code (e.g., 'de-DE', 'en-US')
voice_name: Specific voice name (optional)
Returns:
Dict with success status and audio data
"""
try:
logger.info(f"Converting text to speech: '{text[:50]}...' in {language_code}")
# Set up the synthesis input
synthesis_input = texttospeech.SynthesisInput(text=text)
# Build the voice request
selected_voice = voice_name or self._get_default_voice(language_code)
logger.info(f"Using TTS voice: {selected_voice} for language: {language_code}")
voice = texttospeech.VoiceSelectionParams(
language_code=language_code,
name=selected_voice,
ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL
)
# Select the type of audio file to return
audio_config = texttospeech.AudioConfig(
audio_encoding=texttospeech.AudioEncoding.MP3
)
# Perform the text-to-speech request
response = self.tts_client.synthesize_speech(
input=synthesis_input,
voice=voice,
audio_config=audio_config
)
# Return the audio content
return {
"success": True,
"audio_content": response.audio_content,
"audio_format": "mp3",
"language_code": language_code,
"voice_name": voice.name
}
except Exception as e:
logger.error(f"Text-to-Speech error: {e}")
return {
"success": False,
"error": f"Text-to-Speech failed: {str(e)}"
}
def _get_default_voice(self, language_code: str) -> str:
"""
Get default voice name for a language code.
Maps each language to its Wavenet "-B" variant; voice availability and gender vary by language in Google Cloud TTS.
"""
voice_mapping = {
# European Languages
'de-DE': 'de-DE-Wavenet-B', # German, female
'en-US': 'en-US-Wavenet-B', # English US, female
'en-GB': 'en-GB-Wavenet-B', # English UK, female
'en-AU': 'en-AU-Wavenet-B', # English Australia, female
'en-CA': 'en-CA-Wavenet-B', # English Canada, female
'en-IN': 'en-IN-Wavenet-B', # English India, female
'fr-FR': 'fr-FR-Wavenet-B', # French, female
'fr-CA': 'fr-CA-Wavenet-B', # French Canada, female
'es-ES': 'es-ES-Wavenet-B', # Spanish Spain, female
'es-MX': 'es-MX-Wavenet-B', # Spanish Mexico, female
'es-AR': 'es-AR-Wavenet-B', # Spanish Argentina, female
'es-CO': 'es-CO-Wavenet-B', # Spanish Colombia, female
'es-PE': 'es-PE-Wavenet-B', # Spanish Peru, female
'es-VE': 'es-VE-Wavenet-B', # Spanish Venezuela, female
'es-CL': 'es-CL-Wavenet-B', # Spanish Chile, female
'es-UY': 'es-UY-Wavenet-B', # Spanish Uruguay, female
'es-BO': 'es-BO-Wavenet-B', # Spanish Bolivia, female
'es-CR': 'es-CR-Wavenet-B', # Spanish Costa Rica, female
'es-EC': 'es-EC-Wavenet-B', # Spanish Ecuador, female
'es-GT': 'es-GT-Wavenet-B', # Spanish Guatemala, female
'es-HN': 'es-HN-Wavenet-B', # Spanish Honduras, female
'es-NI': 'es-NI-Wavenet-B', # Spanish Nicaragua, female
'es-PA': 'es-PA-Wavenet-B', # Spanish Panama, female
'es-PY': 'es-PY-Wavenet-B', # Spanish Paraguay, female
'es-PR': 'es-PR-Wavenet-B', # Spanish Puerto Rico, female
'es-DO': 'es-DO-Wavenet-B', # Spanish Dominican Republic, female
'es-SV': 'es-SV-Wavenet-B', # Spanish El Salvador, female
'it-IT': 'it-IT-Wavenet-B', # Italian, female
'pt-PT': 'pt-PT-Wavenet-B', # Portuguese Portugal, female
'pt-BR': 'pt-BR-Wavenet-B', # Portuguese Brazil, female
'nl-NL': 'nl-NL-Wavenet-B', # Dutch, female
'pl-PL': 'pl-PL-Wavenet-B', # Polish, female
'ru-RU': 'ru-RU-Wavenet-B', # Russian, female
'uk-UA': 'uk-UA-Wavenet-B', # Ukrainian, female
'cs-CZ': 'cs-CZ-Wavenet-B', # Czech, female
'sk-SK': 'sk-SK-Wavenet-B', # Slovak, female
'hu-HU': 'hu-HU-Wavenet-B', # Hungarian, female
'ro-RO': 'ro-RO-Wavenet-B', # Romanian, female
'bg-BG': 'bg-BG-Wavenet-B', # Bulgarian, female
'hr-HR': 'hr-HR-Wavenet-B', # Croatian, female
'sr-RS': 'sr-RS-Wavenet-B', # Serbian, female
'sl-SI': 'sl-SI-Wavenet-B', # Slovenian, female
'et-EE': 'et-EE-Wavenet-B', # Estonian, female
'lv-LV': 'lv-LV-Wavenet-B', # Latvian, female
'lt-LT': 'lt-LT-Wavenet-B', # Lithuanian, female
'fi-FI': 'fi-FI-Wavenet-B', # Finnish, female
'sv-SE': 'sv-SE-Wavenet-B', # Swedish, female
'no-NO': 'no-NO-Wavenet-B', # Norwegian, female
'da-DK': 'da-DK-Wavenet-B', # Danish, female
'is-IS': 'is-IS-Wavenet-B', # Icelandic, female
'el-GR': 'el-GR-Wavenet-B', # Greek, female
'ca-ES': 'ca-ES-Wavenet-B', # Catalan, female
'eu-ES': 'eu-ES-Wavenet-B', # Basque, female
'gl-ES': 'gl-ES-Wavenet-B', # Galician, female
'cy-GB': 'cy-GB-Wavenet-B', # Welsh, female
'ga-IE': 'ga-IE-Wavenet-B', # Irish, female
'mt-MT': 'mt-MT-Wavenet-B', # Maltese, female
# Asian Languages
'ja-JP': 'ja-JP-Wavenet-B', # Japanese, female
'ko-KR': 'ko-KR-Wavenet-B', # Korean, female
'zh-CN': 'cmn-CN-Wavenet-B', # Chinese Mandarin, female
'zh-TW': 'cmn-TW-Wavenet-B', # Chinese Traditional, female
'zh-HK': 'cmn-HK-Wavenet-B', # Chinese Hong Kong, female
'hi-IN': 'hi-IN-Wavenet-B', # Hindi, female
'bn-IN': 'bn-IN-Wavenet-B', # Bengali, female
'te-IN': 'te-IN-Wavenet-B', # Telugu, female
'ta-IN': 'ta-IN-Wavenet-B', # Tamil, female
'gu-IN': 'gu-IN-Wavenet-B', # Gujarati, female
'kn-IN': 'kn-IN-Wavenet-B', # Kannada, female
'ml-IN': 'ml-IN-Wavenet-B', # Malayalam, female
'pa-IN': 'pa-IN-Wavenet-B', # Punjabi, female
'or-IN': 'or-IN-Wavenet-B', # Odia, female
'as-IN': 'as-IN-Wavenet-B', # Assamese, female
'ne-NP': 'ne-NP-Wavenet-B', # Nepali, female
'si-LK': 'si-LK-Wavenet-B', # Sinhala, female
'th-TH': 'th-TH-Wavenet-B', # Thai, female
'vi-VN': 'vi-VN-Wavenet-B', # Vietnamese, female
'id-ID': 'id-ID-Wavenet-B', # Indonesian, female
'ms-MY': 'ms-MY-Wavenet-B', # Malay, female
'tl-PH': 'fil-PH-Wavenet-B', # Filipino, female
'tr-TR': 'tr-TR-Wavenet-B', # Turkish, female
# Middle Eastern & African Languages
'ar-SA': 'ar-SA-Wavenet-B', # Arabic Saudi Arabia, female
'ar-EG': 'ar-EG-Wavenet-B', # Arabic Egypt, female
'ar-AE': 'ar-AE-Wavenet-B', # Arabic UAE, female
'ar-JO': 'ar-JO-Wavenet-B', # Arabic Jordan, female
'ar-KW': 'ar-KW-Wavenet-B', # Arabic Kuwait, female
'ar-LB': 'ar-LB-Wavenet-B', # Arabic Lebanon, female
'ar-QA': 'ar-QA-Wavenet-B', # Arabic Qatar, female
'ar-BH': 'ar-BH-Wavenet-B', # Arabic Bahrain, female
'ar-OM': 'ar-OM-Wavenet-B', # Arabic Oman, female
'ar-IQ': 'ar-IQ-Wavenet-B', # Arabic Iraq, female
'ar-PS': 'ar-PS-Wavenet-B', # Arabic Palestine, female
'ar-SY': 'ar-SY-Wavenet-B', # Arabic Syria, female
'ar-YE': 'ar-YE-Wavenet-B', # Arabic Yemen, female
'ar-MA': 'ar-MA-Wavenet-B', # Arabic Morocco, female
'ar-DZ': 'ar-DZ-Wavenet-B', # Arabic Algeria, female
'ar-TN': 'ar-TN-Wavenet-B', # Arabic Tunisia, female
'ar-LY': 'ar-LY-Wavenet-B', # Arabic Libya, female
'ar-SD': 'ar-SD-Wavenet-B', # Arabic Sudan, female
'he-IL': 'he-IL-Wavenet-B', # Hebrew, female
'fa-IR': 'fa-IR-Wavenet-B', # Persian, female
'ur-PK': 'ur-PK-Wavenet-B', # Urdu, female
'af-ZA': 'af-ZA-Wavenet-B', # Afrikaans, female
'sw-KE': 'sw-KE-Wavenet-B', # Swahili Kenya, female
'am-ET': 'am-ET-Wavenet-B', # Amharic, female
'sw-TZ': 'sw-TZ-Wavenet-B', # Swahili Tanzania, female
'zu-ZA': 'zu-ZA-Wavenet-B', # Zulu, female
'xh-ZA': 'xh-ZA-Wavenet-B', # Xhosa, female
}
return voice_mapping.get(language_code, 'en-US-Wavenet-B')

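For reference, a minimal usage sketch of the connector above — illustrative only, assuming a valid service-account JSON under Connector_GoogleSpeech_API_KEY in config.ini, with "sample.webm" standing in for any recorded audio file:

# Minimal sketch (not part of the commit): exercise STT and TTS end to end.
# Assumes config.ini carries a valid service-account JSON under
# Connector_GoogleSpeech_API_KEY; "sample.webm" is a hypothetical recording.
import asyncio

from modules.connectors.connectorGoogleSpeech import ConnectorGoogleSpeech

async def main():
    connector = ConnectorGoogleSpeech()
    with open("sample.webm", "rb") as f:
        audio = f.read()

    # Leaving sample_rate/channels as None triggers validate_audio_format()
    stt = await connector.speech_to_text(audio, language="de-DE")
    if not stt["success"]:
        raise RuntimeError(stt["error"])
    print("Transcript:", stt["text"], "| confidence:", stt["confidence"])

    # Synthesize the transcript back into MP3 audio
    tts = await connector.text_to_speech(stt["text"], language_code="de-DE")
    if tts["success"]:
        with open("reply.mp3", "wb") as f:
            f.write(tts["audio_content"])

asyncio.run(main())
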
modules/routes/routeVoiceGoogle.py

@@ -5,14 +5,16 @@ Replaces Azure voice services with Google Cloud Speech-to-Text and Translation
import os
import logging
from fastapi import APIRouter, File, Form, UploadFile, Depends, HTTPException
from typing import Optional
from fastapi import APIRouter, File, Form, UploadFile, Depends, HTTPException, Body
from fastapi.responses import Response
from typing import Optional, Dict, Any
from modules.connectors.connectorGoogleSpeech import ConnectorGoogleSpeech
from modules.security.auth import getCurrentUser
from modules.interfaces.interfaceAppModel import User
from modules.interfaces.interfaceComponentObjects import getInterface
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/voice-google", tags=["voice-google"])
router = APIRouter(prefix="/voice-google", tags=["Voice Google"])
# Global connector instance
_google_speech_connector = None
@@ -23,28 +25,7 @@ def get_google_speech_connector() -> ConnectorGoogleSpeech:
if _google_speech_connector is None:
try:
# Get credentials path from environment or config
credentials_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
if not credentials_path:
# Try to find credentials in common locations
possible_paths = [
"credentials/google-service-account.json",
"config/google-credentials.json",
"google-credentials.json"
]
for path in possible_paths:
if os.path.exists(path):
credentials_path = path
break
if not credentials_path:
raise HTTPException(
status_code=500,
detail="Google Cloud credentials not found. Please set GOOGLE_APPLICATION_CREDENTIALS environment variable or place credentials file in project directory."
)
_google_speech_connector = ConnectorGoogleSpeech(credentials_path)
_google_speech_connector = ConnectorGoogleSpeech()
logger.info("✅ Google Cloud Speech connector initialized")
except Exception as e:
@@ -173,13 +154,15 @@ async def realtime_interpreter(
try:
logger.info(f"🔄 Real-time interpreter request: {audio_file.filename}")
logger.info(f" From: {from_language} -> To: {to_language}")
logger.info(f" MIME type: {audio_file.content_type}")
# Read audio file
audio_content = await audio_file.read()
logger.info(f"📊 Audio file size: {len(audio_content)} bytes")
# Save audio file for debugging
debug_filename = f"debug_audio/audio_google_{audio_file.filename}"
# Save audio file for debugging with the correct extension
file_extension = "webm" if audio_file.filename.endswith('.webm') else "wav"
debug_filename = f"debug_audio/audio_google_{os.path.splitext(audio_file.filename)[0]}.{file_extension}"
os.makedirs("debug_audio", exist_ok=True)
with open(debug_filename, "wb") as f:
f.write(audio_content)
@@ -235,6 +218,56 @@
detail=f"Real-time interpreter processing failed: {str(e)}"
)
@router.post("/text-to-speech")
async def text_to_speech(
text: str = Form(...),
language: str = Form("de-DE"),
voice: str = Form(None),
current_user: User = Depends(getCurrentUser)
):
"""Convert text to speech using Google Cloud Text-to-Speech."""
try:
logger.info(f"Text-to-Speech request: '{text[:50]}...' in {language}")
if not text.strip():
raise HTTPException(
status_code=400,
detail="Empty text provided for text-to-speech"
)
connector = get_google_speech_connector()
result = await connector.text_to_speech(
text=text,
language_code=language,
voice_name=voice
)
if result["success"]:
return Response(
content=result["audio_content"],
media_type="audio/mpeg",
headers={
"Content-Disposition": "attachment; filename=speech.mp3",
"X-Voice-Name": result["voice_name"],
"X-Language-Code": result["language_code"]
}
)
else:
raise HTTPException(
status_code=400,
detail=f"Text-to-Speech failed: {result.get('error', 'Unknown error')}"
)
except HTTPException:
raise
except Exception as e:
logger.error(f"Text-to-Speech error: {e}")
raise HTTPException(
status_code=500,
detail=f"Text-to-Speech processing failed: {str(e)}"
)
@router.get("/health")
async def health_check(current_user: User = Depends(getCurrentUser)):
"""Health check for Google Cloud voice services."""
@@ -266,3 +299,113 @@
"status": "unhealthy",
"error": str(e)
}
@router.get("/settings")
async def get_voice_settings(current_user: User = Depends(getCurrentUser)):
"""Get voice settings for the current user."""
try:
logger.info(f"Getting voice settings for user: {current_user.id}")
# Get database interface with user context
interface = getInterface(current_user)
# Get or create voice settings for the user
voice_settings = interface.getOrCreateVoiceSettings(current_user.id)
if voice_settings:
# Return user settings
return {
"success": True,
"data": {
"user_settings": voice_settings.to_dict(),
"default_settings": {
"sttLanguage": "de-DE",
"ttsLanguage": "de-DE",
"ttsVoice": "de-DE-Wavenet-A",
"translationEnabled": True,
"targetLanguage": "en-US"
}
}
}
else:
# Fallback to default settings if database fails
logger.warning("Failed to get voice settings from database, using defaults")
return {
"success": True,
"data": {
"user_settings": None,
"default_settings": {
"sttLanguage": "de-DE",
"ttsLanguage": "de-DE",
"ttsVoice": "de-DE-Wavenet-A",
"translationEnabled": True,
"targetLanguage": "en-US"
}
}
}
except Exception as e:
logger.error(f"Error getting voice settings: {e}")
raise HTTPException(
status_code=500,
detail=f"Failed to get voice settings: {str(e)}"
)
@router.post("/settings")
async def save_voice_settings(
settings: Dict[str, Any] = Body(...),
current_user: User = Depends(getCurrentUser)
):
"""Save voice settings for the current user."""
try:
logger.info(f"Saving voice settings for user: {current_user.id}")
logger.info(f"Settings: {settings}")
# Validate required settings
required_fields = ["sttLanguage", "ttsLanguage", "ttsVoice"]
for field in required_fields:
if field not in settings:
raise HTTPException(
status_code=400,
detail=f"Missing required field: {field}"
)
# Set default values for optional fields if not provided
if "translationEnabled" not in settings:
settings["translationEnabled"] = True
if "targetLanguage" not in settings:
settings["targetLanguage"] = "en-US"
# Get database interface with user context
interface = getInterface(current_user)
# Check if settings already exist for this user
existing_settings = interface.getVoiceSettings(current_user.id)
if existing_settings:
# Update existing settings
logger.info(f"Updating existing voice settings for user {current_user.id}")
updated_settings = interface.updateVoiceSettings(current_user.id, settings)
logger.info(f"Voice settings updated for user {current_user.id}: {updated_settings}")
else:
# Create new settings
logger.info(f"Creating new voice settings for user {current_user.id}")
# Add userId to settings
settings["userId"] = current_user.id
created_settings = interface.createVoiceSettings(settings)
logger.info(f"Voice settings created for user {current_user.id}: {created_settings}")
return {
"success": True,
"message": "Voice settings saved successfully",
"data": settings
}
except HTTPException:
raise
except Exception as e:
logger.error(f"Error saving voice settings: {e}")
raise HTTPException(
status_code=500,
detail=f"Failed to save voice settings: {str(e)}"
)

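For completeness, a hedged client-side sketch against the new /voice-google/text-to-speech route above — the base URL and token are placeholders, auth is assumed to be a bearer scheme behind getCurrentUser, and the form fields mirror the route's Form(...) parameters:

# Sketch (placeholder URL/token): fetch synthesized speech as MP3 via httpx.
import httpx

BASE_URL = "http://localhost:8000"  # placeholder deployment address
TOKEN = "<access-token>"            # placeholder bearer token (assumption)

resp = httpx.post(
    f"{BASE_URL}/voice-google/text-to-speech",
    data={"text": "Hallo Welt", "language": "de-DE"},  # optional: "voice"
    headers={"Authorization": f"Bearer {TOKEN}"},
    timeout=30.0,
)
resp.raise_for_status()
print("Voice:", resp.headers.get("X-Voice-Name"))
with open("speech.mp3", "wb") as f:
    f.write(resp.content)  # audio/mpeg body returned by the route
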
modules/routes/routeVoiceStreaming.py

@@ -0,0 +1,231 @@
"""
Voice Streaming WebSocket Routes
Provides real-time audio streaming for voice services
"""
from fastapi import APIRouter, WebSocket, WebSocketDisconnect, Depends
from fastapi.responses import JSONResponse
import logging
import json
import base64
import asyncio
from typing import Dict, List
from modules.shared.configuration import APP_CONFIG
from modules.connectors.connectorGoogleSpeech import ConnectorGoogleSpeech
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/voice/ws", tags=["Voice Streaming"])
# Store active connections
active_connections: Dict[str, WebSocket] = {}
class ConnectionManager:
def __init__(self):
self.active_connections: List[WebSocket] = []
async def connect(self, websocket: WebSocket, connection_id: str):
await websocket.accept()
self.active_connections.append(websocket)
active_connections[connection_id] = websocket
logger.info(f"WebSocket connected: {connection_id}")
def disconnect(self, websocket: WebSocket, connection_id: str):
if websocket in self.active_connections:
self.active_connections.remove(websocket)
if connection_id in active_connections:
del active_connections[connection_id]
logger.info(f"WebSocket disconnected: {connection_id}")
async def send_personal_message(self, message: dict, websocket: WebSocket):
try:
await websocket.send_text(json.dumps(message))
except Exception as e:
logger.error(f"Error sending message: {e}")
manager = ConnectionManager()
@router.websocket("/realtime-interpreter")
async def websocket_realtime_interpreter(
websocket: WebSocket,
user_id: str = "default",
from_language: str = "de-DE",
to_language: str = "en-US"
):
"""WebSocket endpoint for real-time voice interpretation"""
connection_id = f"realtime_{user_id}_{from_language}_{to_language}"
try:
await manager.connect(websocket, connection_id)
# Send connection confirmation
await manager.send_personal_message({
"type": "connected",
"connection_id": connection_id,
"message": "Connected to real-time interpreter"
}, websocket)
# Initialize Google Speech connector
google_speech = ConnectorGoogleSpeech()
while True:
# Receive message from client
data = await websocket.receive_text()
message = json.loads(data)
if message["type"] == "audio_chunk":
# Process audio chunk
try:
# Decode base64 audio data
audio_data = base64.b64decode(message["data"])
# For now, just acknowledge receipt
# In a full implementation, this would:
# 1. Buffer audio chunks
# 2. Process with Google Cloud Speech-to-Text streaming
# 3. Send partial results back
# 4. Handle translation
await manager.send_personal_message({
"type": "audio_received",
"chunk_size": len(audio_data),
"timestamp": message.get("timestamp")
}, websocket)
except Exception as e:
logger.error(f"Error processing audio chunk: {e}")
await manager.send_personal_message({
"type": "error",
"error": f"Failed to process audio: {str(e)}"
}, websocket)
elif message["type"] == "ping":
# Respond to ping
await manager.send_personal_message({
"type": "pong",
"timestamp": message.get("timestamp")
}, websocket)
else:
logger.warning(f"Unknown message type: {message['type']}")
except WebSocketDisconnect:
manager.disconnect(websocket, connection_id)
logger.info(f"Client disconnected: {connection_id}")
except Exception as e:
logger.error(f"WebSocket error: {e}")
manager.disconnect(websocket, connection_id)
@router.websocket("/speech-to-text")
async def websocket_speech_to_text(
websocket: WebSocket,
user_id: str = "default",
language: str = "de-DE"
):
"""WebSocket endpoint for real-time speech-to-text"""
connection_id = f"stt_{user_id}_{language}"
try:
await manager.connect(websocket, connection_id)
await manager.send_personal_message({
"type": "connected",
"connection_id": connection_id,
"message": "Connected to speech-to-text"
}, websocket)
# Initialize Google Speech connector
google_speech = ConnectorGoogleSpeech()
while True:
data = await websocket.receive_text()
message = json.loads(data)
if message["type"] == "audio_chunk":
try:
audio_data = base64.b64decode(message["data"])
# Process audio chunk
# This would integrate with Google Cloud Speech-to-Text streaming API
await manager.send_personal_message({
"type": "transcription_result",
"text": "Audio chunk received", # Placeholder
"confidence": 0.95,
"is_final": False
}, websocket)
except Exception as e:
logger.error(f"Error processing audio: {e}")
await manager.send_personal_message({
"type": "error",
"error": f"Failed to process audio: {str(e)}"
}, websocket)
elif message["type"] == "ping":
await manager.send_personal_message({
"type": "pong",
"timestamp": message.get("timestamp")
}, websocket)
except WebSocketDisconnect:
manager.disconnect(websocket, connection_id)
except Exception as e:
logger.error(f"WebSocket error: {e}")
manager.disconnect(websocket, connection_id)
@router.websocket("/text-to-speech")
async def websocket_text_to_speech(
websocket: WebSocket,
user_id: str = "default",
language: str = "de-DE",
voice: str = "de-DE-Wavenet-A"
):
"""WebSocket endpoint for real-time text-to-speech"""
connection_id = f"tts_{user_id}_{language}_{voice}"
try:
await manager.connect(websocket, connection_id)
await manager.send_personal_message({
"type": "connected",
"connection_id": connection_id,
"message": "Connected to text-to-speech"
}, websocket)
while True:
data = await websocket.receive_text()
message = json.loads(data)
if message["type"] == "text_to_speak":
try:
text = message["text"]
# Process text-to-speech
# This would integrate with Google Cloud Text-to-Speech API
# For now, send a placeholder response
await manager.send_personal_message({
"type": "audio_data",
"audio": "base64_encoded_audio_here", # Placeholder
"format": "mp3"
}, websocket)
except Exception as e:
logger.error(f"Error processing text-to-speech: {e}")
await manager.send_personal_message({
"type": "error",
"error": f"Failed to process text: {str(e)}"
}, websocket)
elif message["type"] == "ping":
await manager.send_personal_message({
"type": "pong",
"timestamp": message.get("timestamp")
}, websocket)
except WebSocketDisconnect:
manager.disconnect(websocket, connection_id)
except Exception as e:
logger.error(f"WebSocket error: {e}")
manager.disconnect(websocket, connection_id)

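A minimal client sketch for the speech-to-text WebSocket above, using the websockets package pinned in requirements.txt; the host, user_id, and audio file are illustrative, and the message shapes mirror the handler ("audio_chunk" carrying base64 data, "ping"/"pong" for keepalive):

# Sketch (placeholder host/file): send one audio chunk over the STT WebSocket.
import asyncio
import base64
import json

import websockets

async def main():
    uri = "ws://localhost:8000/api/voice/ws/speech-to-text?user_id=demo&language=de-DE"
    async with websockets.connect(uri) as ws:
        print(json.loads(await ws.recv()))  # expect {"type": "connected", ...}

        with open("sample.webm", "rb") as f:
            payload = base64.b64encode(f.read()).decode("ascii")
        await ws.send(json.dumps({"type": "audio_chunk", "data": payload, "timestamp": 0}))
        print(json.loads(await ws.recv()))  # placeholder transcription_result

asyncio.run(main())
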
modules/shared/configuration.py

@@ -58,20 +58,43 @@ class Configuration:
try:
with open(configPath, 'r') as f:
for line in f:
line = line.strip()
# Skip empty lines and comments
if not line or line.startswith('#'):
continue
lines = f.readlines()
# Parse key-value pairs
if '=' in line:
key, value = line.split('=', 1)
key = key.strip()
value = value.strip()
i = 0
while i < len(lines):
line = lines[i].strip()
# Add directly to data dictionary
self._data[key] = value
# Skip empty lines and comments
if not line or line.startswith('#'):
i += 1
continue
# Parse key-value pairs
if '=' in line:
key, value = line.split('=', 1)
key = key.strip()
value = value.strip()
# Check if value starts with { (JSON object)
if value.startswith('{'):
# Collect all lines until we find the closing }
json_lines = [value]
i += 1
brace_count = value.count('{') - value.count('}')
while i < len(lines) and brace_count > 0:
json_lines.append(lines[i].rstrip('\n'))
brace_count += lines[i].count('{') - lines[i].count('}')
i += 1
# Join the collected lines into one raw JSON string (parsing is deferred to handleJsonSecret)
value = '\n'.join(json_lines)
i -= 1 # Step back to compensate for the i += 1 at the end of the loop body
# Add to data dictionary
self._data[key] = value
i += 1
except Exception as e:
@@ -144,6 +167,9 @@ class Configuration:
# Handle secrets (keys ending with _SECRET)
if key.endswith("_SECRET"):
return handleSecret(value)
# Handle JSON secrets (keys ending with _API_KEY that contain JSON)
elif key.endswith("_API_KEY") and value.startswith("{"):
return handleJsonSecret(value)
return value
return default
@@ -180,5 +206,27 @@ def handleSecret(value: str) -> str:
# In the future, this could be enhanced to decrypt values
return value
def handleJsonSecret(value: str) -> str:
"""
Handle JSON secret values (like Google service account keys).
Validates that the value is valid JSON.
Args:
value: The JSON secret value to handle
Returns:
str: Processed JSON secret value
Raises:
ValueError: If the value is not valid JSON
"""
import json
try:
# Validate that it's valid JSON
json.loads(value)
return value
except json.JSONDecodeError as e:
raise ValueError(f"Invalid JSON in secret value: {e}")
# Create the global APP_CONFIG instance
APP_CONFIG = Configuration()

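To illustrate the multi-line JSON handling above: an entry shaped like the placeholder below is collected into one value by the brace-counting loop, and the _API_KEY/leading-{ check routes it through handleJsonSecret on lookup. A small sketch:

# Sketch: a config.ini entry spanning multiple lines (placeholder values)...
#   Connector_GoogleSpeech_API_KEY = {
#     "type": "service_account",
#     "project_id": "my-project",
#     "private_key": "<REDACTED>",
#     "client_email": "svc@my-project.iam.gserviceaccount.com"
#   }
# ...comes back from APP_CONFIG as a single JSON string.
import json

from modules.shared.configuration import APP_CONFIG

raw = APP_CONFIG.get("Connector_GoogleSpeech_API_KEY")  # validated by handleJsonSecret
info = json.loads(raw)  # the value is still a string; parse it where needed
print(info["project_id"])
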
requirements.txt

@@ -1,5 +1,6 @@
## Web Framework & API
fastapi==0.104.1
websockets==12.0
uvicorn==0.23.2
python-multipart==0.0.6
httpx==0.25.0
@@ -62,6 +63,7 @@ sortedcontainers>=2.4.0 # Required by trio
## Google Cloud Integration
google-cloud-speech==2.21.0
google-cloud-translate==3.11.1
google-cloud-texttospeech==2.16.3
## MSFT Integration
msal==1.24.1