diff --git a/app.py b/app.py index 24da5c20..ad2d1309 100644 --- a/app.py +++ b/app.py @@ -250,4 +250,7 @@ from modules.routes.routeSecurityGoogle import router as googleRouter app.include_router(googleRouter) from modules.routes.routeVoiceGoogle import router as voiceGoogleRouter -app.include_router(voiceGoogleRouter) \ No newline at end of file +app.include_router(voiceGoogleRouter) + +from modules.routes.routeVoiceStreaming import router as voiceStreamingRouter +app.include_router(voiceStreamingRouter) \ No newline at end of file diff --git a/config.ini b/config.ini index a434bf8b..bc8aeb7f 100644 --- a/config.ini +++ b/config.ini @@ -49,7 +49,19 @@ Service_GOOGLE_CLIENT_SECRET = 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 Connector_WebTavily_API_KEY = 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 # Google Cloud Speech Services configuration -# Set GOOGLE_APPLICATION_CREDENTIALS environment variable or place credentials file in project directory +Connector_GoogleSpeech_API_KEY = { + "type": "service_account", + "project_id": "poweronid", + "private_key_id": "88db66e4248326e9baeac4231bc196fd46a9a441", + "private_key": "-----BEGIN PRIVATE 
KEY-----\nMIIEvQIBADANBgkqhkiG9w0BAQEFAASCBKcwggSjAgEAAoIBAQDTnJuxA+xBL3LA\nPgFILYCsGuppkkdO6d153Q36f2jTj6zpH3OhKMVsaaTBknG2o2+D0Whlk6Yh5rOw\nkWzpMC3y81leRLm5kucERMkBUgd2GL4v16k6m+QGuC3BFlt/XeyuckJNW0V6v/Dy\n3+bSYM7/5o1ftPNWJeAIEWoE/V4wKCYde8RE4Vp1LO5YwhgcM4rRuPmF2OhekpA+\npteYwkY/8/gTTRpZIc8OTsBYRbaMwsjoDj5riuL3boVtkwZwKRb+ZLvupXeU7Ds7\n1305odTcZUwnImHiHfuq83ZJViQiLRNhUAFnQIXPrYLwEpCmzRBGzYHaRlb69ga/\nzqUbKnclAgMBAAECggEAH6W9qHehubioPMAJM7Y6bC2KU/JLNS4csBZd+idb52gG\nwBwIEFjR+H4ZjymhAA4+pe7c4h7MKyh0RI/l7eoFX98Cb+rEq/r1udm1BhGH3s2h\n2UiI8qRQh1YRjF2/nrN5VjhDBOFa6W9opaopZy/l8AzsT8f21zIgPen8z8o6GpFg\n64fJFcbqCGk2ykN2+x2pIOT04tmCszrfbXZP8LEs4xrUB/XwlHL1vT/M3EWIKbnj\njDaIMjw7q/KRgNUvmKS6SU9b3fnOLcQCz9f5cKdiWACKIU/UvuiWhWJ9ou6BWLWU\nva1A6Fi4XJjhW7s3po58/ioQfl0A9p/L92lGg4ST8QKBgQDx8LIM1g0dh9Ql6LmH\nBUGCOewNNXTs+y3ZznUfvVMoyyZK5w/pzeUvkmOwzbRGnZJ9WyCghq8aezyEpo2D\nPL7Odf988IeHmvhyZIM4PLJYgDvSwGXyf/gh6gJkf/4wpx+tx/yQYNBm3Rht7sA0\npSaLehK0E0kW1uyBzHGKgyQOhwKBgQDf6LiZ7hSQqh54vIU1XMDRth0UOo/s/HGi\nDoij29KjmHjLkm8vOlCo83e79X0WhcnyB5kM7nWFegwcM1PJ0Dl8gidUuTlOVDtM\n5u2AaxDoyXAUL457U5dGFAIW+R653ZDkzMfCglacP8HixXEyIpL1cTLqiCAgzszS\nLcSWwoAr8wKBgQC4CGm3X97sFpTmHSd6sCHLaDnJNl9xoAKZifUHpqCqCBVhpm8x\nXp+11vmj1GULzfJPDlE8Khbp4tH+6R39tOhC7fjgVaoSGWxgv1odHfZfYXOf9R/X\nHUZmrbUSM1XsNkPfkZ7pR+teQ1HA1Xo40WMHd1zgw0a2a9fNR/EZ9nUn4wKBgGaK\nUEgGNRrPHadTRnnaoV8o1IZYD2OLdIqvtzm7SOqsv90SkaKCRUAqR5InaYKwAHy7\nqAa5Cc73xqX/h4arujff7x0ouiq5/nJIa0ndPmAtKAvGf6zQ6j0ompBkxAKAioON\nmInmYL2roSI2I5G/LagDkDrB3lzH+Brk5NvZ9RKrAoGAGox462GGGb/NbGdDkahN\ndifzYYvq4FPiWFFo0ynKAulxCBWLXO/N45XNuAyen433d8eREcAYz1Dzax44+MdQ\nHo9dU7YcZvFyt6iZsYeQF8dluHui3vzMpUe0KbqpZC5KMOSw53ZdNIwzo8NTAK59\n+uv3dHGj7sS8fhDo3yCifzc=\n-----END PRIVATE KEY-----\n", + "client_email": "poweron-voice-services@poweronid.iam.gserviceaccount.com", + "client_id": "116641749406798186404", + "auth_uri": "https://accounts.google.com/o/oauth2/auth", + "token_uri": "https://oauth2.googleapis.com/token", + "auth_provider_x509_cert_url": 
"https://www.googleapis.com/oauth2/v1/certs", + "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/poweron-voice-services%40poweronid.iam.gserviceaccount.com", + "universe_domain": "googleapis.com" +} # Web Search configuration Web_Search_MAX_QUERY_LENGTH = 400 diff --git a/debug_audio/audio_20250913_223438.wav b/debug_audio/audio_20250913_223438.wav deleted file mode 100644 index eb2196e3..00000000 Binary files a/debug_audio/audio_20250913_223438.wav and /dev/null differ diff --git a/debug_audio/audio_20250913_223658.wav b/debug_audio/audio_20250913_223658.wav deleted file mode 100644 index 5ecabd88..00000000 Binary files a/debug_audio/audio_20250913_223658.wav and /dev/null differ diff --git a/debug_audio/audio_20250913_224003.wav b/debug_audio/audio_20250913_224003.wav deleted file mode 100644 index ee47d364..00000000 Binary files a/debug_audio/audio_20250913_224003.wav and /dev/null differ diff --git a/debug_audio/audio_20250913_224258.wav b/debug_audio/audio_20250913_224258.wav deleted file mode 100644 index ea776be1..00000000 Binary files a/debug_audio/audio_20250913_224258.wav and /dev/null differ diff --git a/debug_audio/audio_20250913_224524.wav b/debug_audio/audio_20250913_224524.wav deleted file mode 100644 index c9f15fa1..00000000 Binary files a/debug_audio/audio_20250913_224524.wav and /dev/null differ diff --git a/debug_audio/audio_20250913_224801.wav b/debug_audio/audio_20250913_224801.wav deleted file mode 100644 index 8d63b2b6..00000000 Binary files a/debug_audio/audio_20250913_224801.wav and /dev/null differ diff --git a/debug_audio/audio_20250913_230817.wav b/debug_audio/audio_20250913_230817.wav deleted file mode 100644 index 44ee620e..00000000 Binary files a/debug_audio/audio_20250913_230817.wav and /dev/null differ diff --git a/debug_audio/audio_20250913_230927.wav b/debug_audio/audio_20250913_230927.wav deleted file mode 100644 index 70eff7a5..00000000 Binary files a/debug_audio/audio_20250913_230927.wav and /dev/null 
differ diff --git a/debug_audio/audio_20250913_231253.wav b/debug_audio/audio_20250913_231253.wav deleted file mode 100644 index b40442eb..00000000 Binary files a/debug_audio/audio_20250913_231253.wav and /dev/null differ diff --git a/debug_audio/audio_20250913_231321.wav b/debug_audio/audio_20250913_231321.wav deleted file mode 100644 index 241fb907..00000000 Binary files a/debug_audio/audio_20250913_231321.wav and /dev/null differ diff --git a/debug_audio/audio_20250913_231611.wav b/debug_audio/audio_20250913_231611.wav deleted file mode 100644 index 197a766e..00000000 Binary files a/debug_audio/audio_20250913_231611.wav and /dev/null differ diff --git a/debug_audio/audio_20250913_231935.wav b/debug_audio/audio_20250913_231935.wav deleted file mode 100644 index ee0da4f6..00000000 Binary files a/debug_audio/audio_20250913_231935.wav and /dev/null differ diff --git a/debug_audio/audio_20250913_232141.wav b/debug_audio/audio_20250913_232141.wav deleted file mode 100644 index 567219ea..00000000 Binary files a/debug_audio/audio_20250913_232141.wav and /dev/null differ diff --git a/debug_audio/audio_20250913_232309.wav b/debug_audio/audio_20250913_232309.wav deleted file mode 100644 index 4ff7c326..00000000 Binary files a/debug_audio/audio_20250913_232309.wav and /dev/null differ diff --git a/debug_audio/audio_20250913_232518.wav b/debug_audio/audio_20250913_232518.wav deleted file mode 100644 index 4330c519..00000000 Binary files a/debug_audio/audio_20250913_232518.wav and /dev/null differ diff --git a/debug_audio/audio_20250913_232659.wav b/debug_audio/audio_20250913_232659.wav deleted file mode 100644 index 17135101..00000000 Binary files a/debug_audio/audio_20250913_232659.wav and /dev/null differ diff --git a/debug_audio/audio_20250913_232941.wav b/debug_audio/audio_20250913_232941.wav deleted file mode 100644 index 0b5be013..00000000 Binary files a/debug_audio/audio_20250913_232941.wav and /dev/null differ diff --git a/debug_audio/audio_20250913_233053.wav 
b/debug_audio/audio_20250913_233053.wav deleted file mode 100644 index 2cfc44ee..00000000 Binary files a/debug_audio/audio_20250913_233053.wav and /dev/null differ diff --git a/debug_audio/audio_20250913_233155.wav b/debug_audio/audio_20250913_233155.wav deleted file mode 100644 index 0923cf6b..00000000 Binary files a/debug_audio/audio_20250913_233155.wav and /dev/null differ diff --git a/debug_audio/audio_20250913_233607.wav b/debug_audio/audio_20250913_233607.wav deleted file mode 100644 index 36b7918b..00000000 Binary files a/debug_audio/audio_20250913_233607.wav and /dev/null differ diff --git a/debug_audio/audio_20250913_234106.wav b/debug_audio/audio_20250913_234106.wav deleted file mode 100644 index 0bb11f95..00000000 Binary files a/debug_audio/audio_20250913_234106.wav and /dev/null differ diff --git a/debug_audio/audio_20250913_234245.wav b/debug_audio/audio_20250913_234245.wav deleted file mode 100644 index d1ce6299..00000000 Binary files a/debug_audio/audio_20250913_234245.wav and /dev/null differ diff --git a/debug_audio/audio_20250913_234843.wav b/debug_audio/audio_20250913_234843.wav deleted file mode 100644 index de817ca9..00000000 Binary files a/debug_audio/audio_20250913_234843.wav and /dev/null differ diff --git a/debug_audio/audio_20250913_235136.wav b/debug_audio/audio_20250913_235136.wav deleted file mode 100644 index 7ef75e0a..00000000 Binary files a/debug_audio/audio_20250913_235136.wav and /dev/null differ diff --git a/debug_audio/audio_20250913_235409.wav b/debug_audio/audio_20250913_235409.wav deleted file mode 100644 index 3248f9b3..00000000 Binary files a/debug_audio/audio_20250913_235409.wav and /dev/null differ diff --git a/debug_audio/audio_google_conversation_sliding.webm b/debug_audio/audio_google_conversation_sliding.webm new file mode 100644 index 00000000..de55163b Binary files /dev/null and b/debug_audio/audio_google_conversation_sliding.webm differ diff --git a/debug_audio/audio_google_interpreter_recording.webm 
b/debug_audio/audio_google_interpreter_recording.webm new file mode 100644 index 00000000..07c2b674 Binary files /dev/null and b/debug_audio/audio_google_interpreter_recording.webm differ diff --git a/debug_audio/audio_google_recording.webm b/debug_audio/audio_google_recording.webm new file mode 100644 index 00000000..c3bbabf2 Binary files /dev/null and b/debug_audio/audio_google_recording.webm differ diff --git a/modules/connectors/connectorGoogleSpeech.py b/modules/connectors/connectorGoogleSpeech.py index 002f88de..9b18c2ab 100644 --- a/modules/connectors/connectorGoogleSpeech.py +++ b/modules/connectors/connectorGoogleSpeech.py @@ -5,11 +5,15 @@ Replaces Azure Speech Services with Google Cloud APIs import os import io +import json +import html import logging import asyncio from typing import Dict, Optional, Any from google.cloud import speech from google.cloud import translate_v2 as translate +from google.cloud import texttospeech +from modules.shared.configuration import APP_CONFIG logger = logging.getLogger(__name__) @@ -19,21 +23,34 @@ class ConnectorGoogleSpeech: Handles audio processing, speech recognition, and translation. """ - def __init__(self, credentials_path: Optional[str] = None): + def __init__(self): """ - Initialize Google Cloud Speech and Translation clients. - - Args: - credentials_path: Path to Google Cloud service account JSON file + Initialize Google Cloud Speech and Translation clients using config.ini. """ try: - # Set up authentication - if credentials_path: - os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path + # Get JSON key from config.ini + api_key = APP_CONFIG.get("Connector_GoogleSpeech_API_KEY") - # Initialize clients - self.speech_client = speech.SpeechClient() - self.translate_client = translate.Client() + if not api_key or api_key == "YOUR_GOOGLE_SERVICE_ACCOUNT_JSON_KEY_HERE": + raise ValueError("Google Speech API key not configured. 
Please set Connector_GoogleSpeech_API_KEY in config.ini with the full service account JSON key") + + # Parse the JSON key and set up authentication + try: + credentials_info = json.loads(api_key) + + # Create credentials object directly (no file needed!) + from google.oauth2 import service_account + credentials = service_account.Credentials.from_service_account_info(credentials_info) + + logger.info("✅ Using Google Speech credentials from config.ini") + + except json.JSONDecodeError as e: + raise ValueError(f"Invalid JSON in Google Speech API key: {e}") + + # Initialize clients with explicit credentials + self.speech_client = speech.SpeechClient(credentials=credentials) + self.translate_client = translate.Client(credentials=credentials) + self.tts_client = texttospeech.TextToSpeechClient(credentials=credentials) logger.info("✅ Google Cloud Speech and Translation clients initialized successfully") @@ -42,37 +59,134 @@ class ConnectorGoogleSpeech: raise async def speech_to_text(self, audio_content: bytes, language: str = "de-DE", - sample_rate: int = 16000, channels: int = 1) -> Dict: + sample_rate: int = None, channels: int = None) -> Dict: """ Convert speech to text using Google Cloud Speech-to-Text API. 
Args: - audio_content: Raw audio data (PCM format) + audio_content: Raw audio data (various formats supported) language: Language code (e.g., 'de-DE', 'en-US') - sample_rate: Audio sample rate (default: 16000 Hz) - channels: Number of audio channels (default: 1) + sample_rate: Audio sample rate (auto-detected if None) + channels: Number of audio channels (auto-detected if None) Returns: Dict containing transcribed text, confidence, and metadata """ try: - logger.info(f"🎤 Processing audio with Google Cloud Speech-to-Text") - logger.info(f"📊 Audio: {len(audio_content)} bytes, {sample_rate}Hz, {channels}ch") + # Auto-detect audio format if not provided + if sample_rate is None or channels is None: + validation = self.validate_audio_format(audio_content) + if not validation["valid"]: + return { + "success": False, + "text": "", + "confidence": 0.0, + "error": f"Invalid audio format: {validation.get('error', 'Unknown error')}" + } + sample_rate = validation["sample_rate"] + channels = validation["channels"] + audio_format = validation["format"] + logger.info(f"Auto-detected audio: {audio_format}, {sample_rate}Hz, {channels}ch") + + logger.info(f"Processing audio with Google Cloud Speech-to-Text") + logger.info(f"Audio: {len(audio_content)} bytes, {sample_rate}Hz, {channels}ch") # Configure audio settings audio = speech.RecognitionAudio(content=audio_content) - config = speech.RecognitionConfig( - encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, - sample_rate_hertz=sample_rate, - audio_channel_count=channels, - language_code=language, - enable_automatic_punctuation=True, - model="latest_long" # Use the latest model - ) + + # Determine encoding based on detected format + # Google Cloud Speech API has specific requirements for different formats + if audio_format == "webm_opus": + # For WEBM OPUS, we need to ensure proper format + encoding = speech.RecognitionConfig.AudioEncoding.WEBM_OPUS + # WEBM_OPUS requires specific sample rate handling - must match header + 
if sample_rate != 48000: + logger.warning(f"WEBM_OPUS detected but sample rate is {sample_rate}, adjusting to 48000") + sample_rate = 48000 + # For WEBM_OPUS, don't specify sample_rate_hertz in config + # Google Cloud will read it from the WEBM header + use_sample_rate = False + elif audio_format == "linear16": + # For LINEAR16 format (PCM) + encoding = speech.RecognitionConfig.AudioEncoding.LINEAR16 + # Ensure sample rate is reasonable + if sample_rate not in [8000, 16000, 22050, 24000, 32000, 44100, 48000]: + logger.warning(f"Unusual sample rate {sample_rate}, adjusting to 16000") + sample_rate = 16000 + use_sample_rate = True + elif audio_format == "mp3": + # For MP3 format + encoding = speech.RecognitionConfig.AudioEncoding.MP3 + use_sample_rate = True + elif audio_format == "flac": + # For FLAC format + encoding = speech.RecognitionConfig.AudioEncoding.FLAC + use_sample_rate = True + elif audio_format == "wav": + # For WAV format + encoding = speech.RecognitionConfig.AudioEncoding.LINEAR16 + use_sample_rate = True + else: + # For unknown formats, try LINEAR16 as fallback + encoding = speech.RecognitionConfig.AudioEncoding.LINEAR16 + sample_rate = 16000 # Use standard sample rate + channels = 1 # Use mono + use_sample_rate = True + logger.warning(f"Unknown audio format '{audio_format}', using LINEAR16 encoding with 16000Hz") + + # Build config based on format requirements + config_params = { + "encoding": encoding, + "audio_channel_count": channels, + "language_code": language, + "enable_automatic_punctuation": True, + "model": "latest_long", # Try latest_long model for better recognition + "enable_word_time_offsets": True, # Enable word-level timing + "enable_word_confidence": True, # Enable word-level confidence + "max_alternatives": 3, # Try more alternatives + "use_enhanced": True # Use enhanced model if available + } + + # Only add sample_rate_hertz if needed (not for WEBM_OPUS) + if use_sample_rate: + config_params["sample_rate_hertz"] = sample_rate + 
logger.debug(f"Recognition config: encoding={encoding}, sample_rate={sample_rate}, channels={channels}, language={language}") + else: + logger.debug(f"Recognition config: encoding={encoding}, sample_rate=auto (from header), channels={channels}, language={language}") + + config = speech.RecognitionConfig(**config_params) # Perform speech recognition - logger.info("🔄 Sending audio to Google Cloud Speech-to-Text...") - response = self.speech_client.recognize(config=config, audio=audio) + logger.info("Sending audio to Google Cloud Speech-to-Text...") + + try: + # Use regular recognition for single audio files (not streaming) + logger.info("Using regular recognition for single audio file...") + response = self.speech_client.recognize(config=config, audio=audio) + logger.debug(f"Google Cloud response: {response}") + + except Exception as api_error: + logger.error(f"Google Cloud API error: {api_error}") + # Try with different encoding as fallback + if encoding != speech.RecognitionConfig.AudioEncoding.LINEAR16: + logger.info("Trying fallback with LINEAR16 encoding...") + fallback_config = speech.RecognitionConfig( + encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, + sample_rate_hertz=16000, # Use standard sample rate + audio_channel_count=1, + language_code=language, + enable_automatic_punctuation=True, + model="latest_long" + ) + + try: + response = self.speech_client.recognize(config=fallback_config, audio=audio) + logger.debug(f"Google Cloud fallback response: {response}") + except Exception as fallback_error: + logger.error(f"Google Cloud fallback error: {fallback_error}") + raise api_error + else: + raise api_error # Process results if response.results: @@ -82,7 +196,7 @@ class ConnectorGoogleSpeech: transcribed_text = alternative.transcript confidence = alternative.confidence - logger.info(f"✅ Transcription successful: '{transcribed_text}' (confidence: {confidence:.2f})") + logger.info(f"Transcription successful: '{transcribed_text}' (confidence: 
{confidence:.2f})") return { "success": True, @@ -96,7 +210,8 @@ class ConnectorGoogleSpeech: } } else: - logger.warning("⚠️ No transcription alternatives found") + logger.warning("No transcription alternatives found") + logger.debug(f"Result details: {result}") return { "success": False, "text": "", @@ -104,16 +219,102 @@ class ConnectorGoogleSpeech: "error": "No transcription alternatives found" } else: - logger.warning("⚠️ No recognition results from Google Cloud") + logger.warning("No recognition results from Google Cloud") + logger.debug(f"Response details: {response}") + + # Check if there are any error messages in the response + if hasattr(response, 'error') and response.error: + logger.error(f"Google Cloud error: {response.error}") + return { + "success": False, + "text": "", + "confidence": 0.0, + "error": f"Google Cloud error: {response.error}" + } + + # Try multiple fallback approaches + fallback_configs = [] + + if encoding != speech.RecognitionConfig.AudioEncoding.LINEAR16: + # Try LINEAR16 with detected sample rate + fallback_configs.append({ + "encoding": speech.RecognitionConfig.AudioEncoding.LINEAR16, + "sample_rate": sample_rate, + "channels": channels, + "use_sample_rate": True, + "description": f"LINEAR16 with {sample_rate}Hz" + }) + + # Try LINEAR16 with standard sample rates + for std_rate in [16000, 8000, 22050, 44100]: + if std_rate != sample_rate: + fallback_configs.append({ + "encoding": speech.RecognitionConfig.AudioEncoding.LINEAR16, + "sample_rate": std_rate, + "channels": 1, + "use_sample_rate": True, + "description": f"LINEAR16 with {std_rate}Hz" + }) + + # Try with different models + models = ["latest_long", "phone_call", "latest_short"] + + for fallback_config in fallback_configs: + for model in models: + try: + logger.info(f"Trying fallback: {fallback_config['description']} with {model} model...") + + # Build fallback config with proper sample rate handling + fallback_config_params = { + "encoding": fallback_config["encoding"], + 
"audio_channel_count": fallback_config["channels"], + "language_code": language, + "enable_automatic_punctuation": True, + "model": model + } + + # Only add sample_rate_hertz if needed + if fallback_config["use_sample_rate"]: + fallback_config_params["sample_rate_hertz"] = fallback_config["sample_rate"] + + fallback_config_obj = speech.RecognitionConfig(**fallback_config_params) + fallback_response = self.speech_client.recognize(config=fallback_config_obj, audio=audio) + + if fallback_response.results: + result = fallback_response.results[0] + if result.alternatives: + alternative = result.alternatives[0] + transcribed_text = alternative.transcript + confidence = alternative.confidence + + logger.info(f"Fallback transcription successful: '{transcribed_text}' (confidence: {confidence:.2f})") + + return { + "success": True, + "text": transcribed_text, + "confidence": confidence, + "language": language, + "raw_result": { + "transcript": transcribed_text, + "confidence": confidence, + "language_code": language + } + } + + except Exception as e: + logger.debug(f"Fallback failed: {e}") + continue + + logger.warning("All fallback attempts failed") return { "success": False, "text": "", "confidence": 0.0, - "error": "No recognition results" + "error": "No recognition results - audio may be too short, unclear, or in unsupported format" } except Exception as e: - logger.error(f"❌ Google Cloud Speech-to-Text error: {e}") + logger.error(f"Google Cloud Speech-to-Text error: {e}") return { "success": False, "text": "", @@ -155,6 +356,9 @@ class ConnectorGoogleSpeech: translated_text = result['translatedText'] detected_language = result.get('detectedSourceLanguage', source_language) + # Decode HTML entities in translated text + translated_text = html.unescape(translated_text) + logger.info(f"✅ Translation successful: '{translated_text}'") return { @@ -256,26 +460,154 @@ class ConnectorGoogleSpeech: Dict containing validation results """ try: - # Google Cloud Speech-to-Text 
supports various formats - # We'll do basic validation + # Basic validation if len(audio_content) < 100: return { "valid": False, "error": "Audio too short (less than 100 bytes)" } - # Check if it looks like PCM audio (basic check) - if len(audio_content) % 2 != 0: - return { - "valid": False, - "error": "Audio data length is odd (not 16-bit PCM)" - } + # Detect audio format by checking file headers + audio_format = "unknown" + sample_rate = 16000 # Default fallback + channels = 1 # Default fallback + + # Debug: Log first few bytes for format detection + logger.debug(f"Audio header bytes: {audio_content[:20].hex()}") + logger.debug(f"Audio content length: {len(audio_content)} bytes") + + # Check for WEBM/OPUS format (common from web recordings) + if audio_content.startswith(b'\x1a\x45\xdf\xa3'): + audio_format = "webm_opus" + sample_rate = 48000 # WEBM OPUS typically uses 48kHz + channels = 1 + logger.info(f"Detected WEBM OPUS format: {sample_rate}Hz, {channels}ch") + + # Check for specific header patterns seen in logs (43c381...) 
+ # This appears to be a different audio format or corrupted WEBM + elif audio_content.startswith(b'\x43\xc3\x81') and len(audio_content) > 1000: + # This might be a different format or corrupted audio + # Try to detect if it's actually WEBM by looking deeper + if b'webm' in audio_content[:200] or b'opus' in audio_content[:200]: + audio_format = "webm_opus" + sample_rate = 48000 + channels = 1 + logger.info(f"Detected WEBM format (deep scan): {sample_rate}Hz, {channels}ch") + else: + # Unknown format, try as LINEAR16 + audio_format = "linear16" + sample_rate = 16000 + channels = 1 + logger.warning(f"Unknown audio format with header {audio_content[:8].hex()}, trying LINEAR16") + + # Check for WEBM format (alternative detection) + elif b'webm' in audio_content[:100].lower() or b'opus' in audio_content[:100].lower(): + audio_format = "webm_opus" + sample_rate = 48000 # WEBM OPUS typically uses 48kHz + channels = 1 + logger.info(f"Detected WEBM format: {sample_rate}Hz, {channels}ch") + + # Check for MediaRecorder WEBM chunks (common in browser recordings) + elif audio_content.startswith(b'\x1a\x45\xdf\xa3') and len(audio_content) > 1000: + audio_format = "webm_opus" + sample_rate = 48000 # Browser MediaRecorder typically uses 48kHz + channels = 1 + logger.info(f"Detected MediaRecorder WEBM: {sample_rate}Hz, {channels}ch") + + # Check for OPUS format by looking for OPUS magic bytes + elif audio_content.startswith(b'OpusHead') or b'OpusHead' in audio_content[:50]: + audio_format = "webm_opus" + sample_rate = 48000 # OPUS typically uses 48kHz + channels = 1 + logger.info(f"Detected OPUS format: {sample_rate}Hz, {channels}ch") + + # Check for OGG format (often contains OPUS) + elif audio_content.startswith(b'OggS'): + audio_format = "webm_opus" + sample_rate = 48000 # OGG OPUS typically uses 48kHz + channels = 1 + logger.info(f"Detected OGG format: {sample_rate}Hz, {channels}ch") + + # Check for WAV format + elif audio_content.startswith(b'RIFF') and b'WAVE' in 
audio_content[:12]: + audio_format = "wav" + # Try to extract sample rate from WAV header + try: + # WAV header sample rate is at offset 24-27 (little endian) + sample_rate = int.from_bytes(audio_content[24:28], 'little') + channels = int.from_bytes(audio_content[22:24], 'little') + logger.info(f"Detected WAV format: {sample_rate}Hz, {channels}ch") + except: + sample_rate = 16000 # Fallback + channels = 1 + + # Check for MP3 format + elif audio_content.startswith(b'\xff\xfb') or audio_content.startswith(b'ID3'): + audio_format = "mp3" + sample_rate = 44100 # MP3 typically uses 44.1kHz + channels = 2 # Usually stereo + logger.info(f"Detected MP3 format: {sample_rate}Hz, {channels}ch") + + # Check for FLAC format + elif audio_content.startswith(b'fLaC'): + audio_format = "flac" + sample_rate = 44100 # Common FLAC sample rate + channels = 2 + logger.info(f"Detected FLAC format: {sample_rate}Hz, {channels}ch") + + else: + # Unknown format, try WEBM_OPUS as it's most common for web recordings + audio_format = "webm_opus" + sample_rate = 48000 # Try 48kHz for web recordings + channels = 1 + logger.warning(f"Unknown audio format, trying WEBM_OPUS: {sample_rate}Hz, {channels}ch") + + # Calculate estimated duration + if audio_format == "webm_opus": + # WEBM OPUS duration is hard to calculate without decoding + estimated_duration = 3.0 # Assume 3 seconds for web recordings + else: + # Rough estimate for uncompressed audio + estimated_duration = len(audio_content) / (sample_rate * channels * 2) # 16-bit = 2 bytes per sample + + # Check if audio is too short (less than 0.5 seconds) + if estimated_duration < 0.5: + logger.warning(f"Audio too short: {estimated_duration:.2f}s, may not be recognized") + + # Log audio details for debugging + logger.info(f"Audio analysis: {len(audio_content)} bytes, {estimated_duration:.2f}s, {sample_rate}Hz, {channels}ch, format={audio_format}") + + # Check audio levels (simple check for silence) + if audio_format == "webm_opus": + # For WEBM, we 
can't easily check levels, but log the first few bytes + logger.debug(f"Audio sample bytes: {audio_content[:20].hex()}") + # Check if audio has some variation (not all same bytes) + if len(audio_content) > 100: + sample_bytes = audio_content[100:200] # Skip header + if len(set(sample_bytes)) < 5: # Less than 5 different byte values + logger.warning("Audio may be silent or very quiet (low byte variation)") + else: + logger.debug(f"Audio has good byte variation: {len(set(sample_bytes))} unique values") + else: + # For PCM audio, check for silence + if len(audio_content) > 100: + # Convert first 100 bytes to check for silence + sample_bytes = audio_content[:100] + if all(b == 0 for b in sample_bytes): + logger.warning("Audio appears to be silent (all zeros)") + else: + logger.debug(f"Audio sample bytes: {sample_bytes[:20].hex()}") + # Check for low variation + if len(set(sample_bytes)) < 5: + logger.warning("Audio may be very quiet (low byte variation)") return { "valid": True, - "format": "pcm", + "format": audio_format, + "sample_rate": sample_rate, + "channels": channels, "size": len(audio_content), - "estimated_duration": len(audio_content) / (16000 * 2) # Rough estimate for 16kHz, 16-bit + "estimated_duration": estimated_duration } except Exception as e: @@ -283,3 +615,181 @@ class ConnectorGoogleSpeech: "valid": False, "error": f"Validation error: {e}" } + + async def text_to_speech(self, text: str, language_code: str = "de-DE", voice_name: str = None) -> Dict[str, Any]: + """ + Convert text to speech using Google Cloud Text-to-Speech. + + Args: + text: Text to convert to speech + language_code: Language code (e.g., 'de-DE', 'en-US') + voice_name: Specific voice name (optional) + + Returns: + Dict with success status and audio data + """ + try: + logger.info(f"Converting text to speech: '{text[:50]}...' 
in {language_code}") + + # Set up the synthesis input + synthesis_input = texttospeech.SynthesisInput(text=text) + + # Build the voice request + selected_voice = voice_name or self._get_default_voice(language_code) + logger.info(f"Using TTS voice: {selected_voice} for language: {language_code}") + + voice = texttospeech.VoiceSelectionParams( + language_code=language_code, + name=selected_voice, + ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL + ) + + # Select the type of audio file to return + audio_config = texttospeech.AudioConfig( + audio_encoding=texttospeech.AudioEncoding.MP3 + ) + + # Perform the text-to-speech request + response = self.tts_client.synthesize_speech( + input=synthesis_input, + voice=voice, + audio_config=audio_config + ) + + # Return the audio content + return { + "success": True, + "audio_content": response.audio_content, + "audio_format": "mp3", + "language_code": language_code, + "voice_name": voice.name + } + + except Exception as e: + logger.error(f"Text-to-Speech error: {e}") + return { + "success": False, + "error": f"Text-to-Speech failed: {str(e)}" + } + + def _get_default_voice(self, language_code: str) -> str: + """ + Get default voice name for a language code. + Uses female voices as default for better user experience. 
+ """ + voice_mapping = { + # European Languages + 'de-DE': 'de-DE-Wavenet-B', # German, female + 'en-US': 'en-US-Wavenet-B', # English US, female + 'en-GB': 'en-GB-Wavenet-B', # English UK, female + 'en-AU': 'en-AU-Wavenet-B', # English Australia, female + 'en-CA': 'en-CA-Wavenet-B', # English Canada, female + 'en-IN': 'en-IN-Wavenet-B', # English India, female + 'fr-FR': 'fr-FR-Wavenet-B', # French, female + 'fr-CA': 'fr-CA-Wavenet-B', # French Canada, female + 'es-ES': 'es-ES-Wavenet-B', # Spanish Spain, female + 'es-MX': 'es-MX-Wavenet-B', # Spanish Mexico, female + 'es-AR': 'es-AR-Wavenet-B', # Spanish Argentina, female + 'es-CO': 'es-CO-Wavenet-B', # Spanish Colombia, female + 'es-PE': 'es-PE-Wavenet-B', # Spanish Peru, female + 'es-VE': 'es-VE-Wavenet-B', # Spanish Venezuela, female + 'es-CL': 'es-CL-Wavenet-B', # Spanish Chile, female + 'es-UY': 'es-UY-Wavenet-B', # Spanish Uruguay, female + 'es-BO': 'es-BO-Wavenet-B', # Spanish Bolivia, female + 'es-CR': 'es-CR-Wavenet-B', # Spanish Costa Rica, female + 'es-EC': 'es-EC-Wavenet-B', # Spanish Ecuador, female + 'es-GT': 'es-GT-Wavenet-B', # Spanish Guatemala, female + 'es-HN': 'es-HN-Wavenet-B', # Spanish Honduras, female + 'es-NI': 'es-NI-Wavenet-B', # Spanish Nicaragua, female + 'es-PA': 'es-PA-Wavenet-B', # Spanish Panama, female + 'es-PY': 'es-PY-Wavenet-B', # Spanish Paraguay, female + 'es-PR': 'es-PR-Wavenet-B', # Spanish Puerto Rico, female + 'es-DO': 'es-DO-Wavenet-B', # Spanish Dominican Republic, female + 'es-SV': 'es-SV-Wavenet-B', # Spanish El Salvador, female + 'it-IT': 'it-IT-Wavenet-B', # Italian, female + 'pt-PT': 'pt-PT-Wavenet-B', # Portuguese Portugal, female + 'pt-BR': 'pt-BR-Wavenet-B', # Portuguese Brazil, female + 'nl-NL': 'nl-NL-Wavenet-B', # Dutch, female + 'pl-PL': 'pl-PL-Wavenet-B', # Polish, female + 'ru-RU': 'ru-RU-Wavenet-B', # Russian, female + 'uk-UA': 'uk-UA-Wavenet-B', # Ukrainian, female + 'cs-CZ': 'cs-CZ-Wavenet-B', # Czech, female + 'sk-SK': 'sk-SK-Wavenet-B', # Slovak, 
female + 'hu-HU': 'hu-HU-Wavenet-B', # Hungarian, female + 'ro-RO': 'ro-RO-Wavenet-B', # Romanian, female + 'bg-BG': 'bg-BG-Wavenet-B', # Bulgarian, female + 'hr-HR': 'hr-HR-Wavenet-B', # Croatian, female + 'sr-RS': 'sr-RS-Wavenet-B', # Serbian, female + 'sl-SI': 'sl-SI-Wavenet-B', # Slovenian, female + 'et-EE': 'et-EE-Wavenet-B', # Estonian, female + 'lv-LV': 'lv-LV-Wavenet-B', # Latvian, female + 'lt-LT': 'lt-LT-Wavenet-B', # Lithuanian, female + 'fi-FI': 'fi-FI-Wavenet-B', # Finnish, female + 'sv-SE': 'sv-SE-Wavenet-B', # Swedish, female + 'no-NO': 'no-NO-Wavenet-B', # Norwegian, female + 'da-DK': 'da-DK-Wavenet-B', # Danish, female + 'is-IS': 'is-IS-Wavenet-B', # Icelandic, female + 'el-GR': 'el-GR-Wavenet-B', # Greek, female + 'ca-ES': 'ca-ES-Wavenet-B', # Catalan, female + 'eu-ES': 'eu-ES-Wavenet-B', # Basque, female + 'gl-ES': 'gl-ES-Wavenet-B', # Galician, female + 'cy-GB': 'cy-GB-Wavenet-B', # Welsh, female + 'ga-IE': 'ga-IE-Wavenet-B', # Irish, female + 'mt-MT': 'mt-MT-Wavenet-B', # Maltese, female + + # Asian Languages + 'ja-JP': 'ja-JP-Wavenet-B', # Japanese, female + 'ko-KR': 'ko-KR-Wavenet-B', # Korean, female + 'zh-CN': 'cmn-CN-Wavenet-B', # Chinese Mandarin, female + 'zh-TW': 'cmn-TW-Wavenet-B', # Chinese Traditional, female + 'zh-HK': 'cmn-HK-Wavenet-B', # Chinese Hong Kong, female + 'hi-IN': 'hi-IN-Wavenet-B', # Hindi, female + 'bn-IN': 'bn-IN-Wavenet-B', # Bengali, female + 'te-IN': 'te-IN-Wavenet-B', # Telugu, female + 'ta-IN': 'ta-IN-Wavenet-B', # Tamil, female + 'gu-IN': 'gu-IN-Wavenet-B', # Gujarati, female + 'kn-IN': 'kn-IN-Wavenet-B', # Kannada, female + 'ml-IN': 'ml-IN-Wavenet-B', # Malayalam, female + 'pa-IN': 'pa-IN-Wavenet-B', # Punjabi, female + 'or-IN': 'or-IN-Wavenet-B', # Odia, female + 'as-IN': 'as-IN-Wavenet-B', # Assamese, female + 'ne-NP': 'ne-NP-Wavenet-B', # Nepali, female + 'si-LK': 'si-LK-Wavenet-B', # Sinhala, female + 'th-TH': 'th-TH-Wavenet-B', # Thai, female + 'vi-VN': 'vi-VN-Wavenet-B', # Vietnamese, female + 'id-ID': 
'id-ID-Wavenet-B', # Indonesian, female + 'ms-MY': 'ms-MY-Wavenet-B', # Malay, female + 'tl-PH': 'fil-PH-Wavenet-B', # Filipino, female + 'tr-TR': 'tr-TR-Wavenet-B', # Turkish, female + + # Middle Eastern & African Languages + 'ar-SA': 'ar-SA-Wavenet-B', # Arabic Saudi Arabia, female + 'ar-EG': 'ar-EG-Wavenet-B', # Arabic Egypt, female + 'ar-AE': 'ar-AE-Wavenet-B', # Arabic UAE, female + 'ar-JO': 'ar-JO-Wavenet-B', # Arabic Jordan, female + 'ar-KW': 'ar-KW-Wavenet-B', # Arabic Kuwait, female + 'ar-LB': 'ar-LB-Wavenet-B', # Arabic Lebanon, female + 'ar-QA': 'ar-QA-Wavenet-B', # Arabic Qatar, female + 'ar-BH': 'ar-BH-Wavenet-B', # Arabic Bahrain, female + 'ar-OM': 'ar-OM-Wavenet-B', # Arabic Oman, female + 'ar-IQ': 'ar-IQ-Wavenet-B', # Arabic Iraq, female + 'ar-PS': 'ar-PS-Wavenet-B', # Arabic Palestine, female + 'ar-SY': 'ar-SY-Wavenet-B', # Arabic Syria, female + 'ar-YE': 'ar-YE-Wavenet-B', # Arabic Yemen, female + 'ar-MA': 'ar-MA-Wavenet-B', # Arabic Morocco, female + 'ar-DZ': 'ar-DZ-Wavenet-B', # Arabic Algeria, female + 'ar-TN': 'ar-TN-Wavenet-B', # Arabic Tunisia, female + 'ar-LY': 'ar-LY-Wavenet-B', # Arabic Libya, female + 'ar-SD': 'ar-SD-Wavenet-B', # Arabic Sudan, female + 'he-IL': 'he-IL-Wavenet-B', # Hebrew, female + 'fa-IR': 'fa-IR-Wavenet-B', # Persian, female + 'ur-PK': 'ur-PK-Wavenet-B', # Urdu, female + 'af-ZA': 'af-ZA-Wavenet-B', # Afrikaans, female + 'sw-KE': 'sw-KE-Wavenet-B', # Swahili Kenya, female + 'am-ET': 'am-ET-Wavenet-B', # Amharic, female + 'sw-TZ': 'sw-TZ-Wavenet-B', # Swahili Tanzania, female + 'zu-ZA': 'zu-ZA-Wavenet-B', # Zulu, female + 'xh-ZA': 'xh-ZA-Wavenet-B', # Xhosa, female + } + return voice_mapping.get(language_code, 'en-US-Wavenet-B') + diff --git a/modules/routes/routeVoiceGoogle.py b/modules/routes/routeVoiceGoogle.py index a44921e6..048e483d 100644 --- a/modules/routes/routeVoiceGoogle.py +++ b/modules/routes/routeVoiceGoogle.py @@ -5,14 +5,16 @@ Replaces Azure voice services with Google Cloud Speech-to-Text and 
Translation import os import logging -from fastapi import APIRouter, File, Form, UploadFile, Depends, HTTPException -from typing import Optional +from fastapi import APIRouter, File, Form, UploadFile, Depends, HTTPException, Body +from fastapi.responses import Response +from typing import Optional, Dict, Any from modules.connectors.connectorGoogleSpeech import ConnectorGoogleSpeech from modules.security.auth import getCurrentUser from modules.interfaces.interfaceAppModel import User +from modules.interfaces.interfaceComponentObjects import getInterface logger = logging.getLogger(__name__) -router = APIRouter(prefix="/voice-google", tags=["voice-google"]) +router = APIRouter(prefix="/voice-google", tags=["Voice Google"]) # Global connector instance _google_speech_connector = None @@ -23,28 +25,7 @@ def get_google_speech_connector() -> ConnectorGoogleSpeech: if _google_speech_connector is None: try: - # Get credentials path from environment or config - credentials_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS") - if not credentials_path: - # Try to find credentials in common locations - possible_paths = [ - "credentials/google-service-account.json", - "config/google-credentials.json", - "google-credentials.json" - ] - - for path in possible_paths: - if os.path.exists(path): - credentials_path = path - break - - if not credentials_path: - raise HTTPException( - status_code=500, - detail="Google Cloud credentials not found. Please set GOOGLE_APPLICATION_CREDENTIALS environment variable or place credentials file in project directory." 
@router.post("/text-to-speech")
async def text_to_speech(
    text: str = Form(...),
    language: str = Form("de-DE"),
    voice: str = Form(None),
    current_user: User = Depends(getCurrentUser)
):
    """Convert text to speech using Google Cloud Text-to-Speech."""
    try:
        logger.info(f"Text-to-Speech request: '{text[:50]}...' in {language}")

        # Reject whitespace-only input before touching the connector.
        if not text.strip():
            raise HTTPException(
                status_code=400,
                detail="Empty text provided for text-to-speech"
            )

        connector = get_google_speech_connector()
        result = await connector.text_to_speech(
            text=text,
            language_code=language,
            voice_name=voice
        )

        if not result["success"]:
            raise HTTPException(
                status_code=400,
                detail=f"Text-to-Speech failed: {result.get('error', 'Unknown error')}"
            )

        # Ship the MP3 bytes back with voice metadata in custom headers.
        response_headers = {
            "Content-Disposition": "attachment; filename=speech.mp3",
            "X-Voice-Name": result["voice_name"],
            "X-Language-Code": result["language_code"]
        }
        return Response(
            content=result["audio_content"],
            media_type="audio/mpeg",
            headers=response_headers
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Text-to-Speech error: {e}")
        raise HTTPException(
            status_code=500,
            detail=f"Text-to-Speech processing failed: {str(e)}"
        )

@router.get("/settings")
async def get_voice_settings(current_user: User = Depends(getCurrentUser)):
    """Get voice settings for the current user."""
    try:
        logger.info(f"Getting voice settings for user: {current_user.id}")

        # Database interface scoped to the requesting user.
        interface = getInterface(current_user)
        voice_settings = interface.getOrCreateVoiceSettings(current_user.id)

        # Defaults shipped alongside the user's stored settings.
        fallback_defaults = {
            "sttLanguage": "de-DE",
            "ttsLanguage": "de-DE",
            "ttsVoice": "de-DE-Wavenet-A",
            "translationEnabled": True,
            "targetLanguage": "en-US"
        }

        if not voice_settings:
            logger.warning("Failed to get voice settings from database, using defaults")
        user_settings = voice_settings.to_dict() if voice_settings else None

        return {
            "success": True,
            "data": {
                "user_settings": user_settings,
                "default_settings": fallback_defaults
            }
        }

    except Exception as e:
        logger.error(f"Error getting voice settings: {e}")
        raise HTTPException(
            status_code=500,
            detail=f"Failed to get voice settings: {str(e)}"
        )

@router.post("/settings")
async def save_voice_settings(
    settings: Dict[str, Any] = Body(...),
    current_user: User = Depends(getCurrentUser)
):
    """Save voice settings for the current user."""
    try:
        logger.info(f"Saving voice settings for user: {current_user.id}")
        logger.info(f"Settings: {settings}")

        # Reject payloads missing any mandatory key.
        for field in ("sttLanguage", "ttsLanguage", "ttsVoice"):
            if field not in settings:
                raise HTTPException(
                    status_code=400,
                    detail=f"Missing required field: {field}"
                )

        # Fill in optional keys when the client omitted them.
        settings.setdefault("translationEnabled", True)
        settings.setdefault("targetLanguage", "en-US")

        interface = getInterface(current_user)
        existing_settings = interface.getVoiceSettings(current_user.id)

        if existing_settings:
            # Update path: settings row already exists for this user.
            logger.info(f"Updating existing voice settings for user {current_user.id}")
            updated_settings = interface.updateVoiceSettings(current_user.id, settings)
            logger.info(f"Voice settings updated for user {current_user.id}: {updated_settings}")
        else:
            # Create path: first save for this user; stamp ownership.
            logger.info(f"Creating new voice settings for user {current_user.id}")
            settings["userId"] = current_user.id
            created_settings = interface.createVoiceSettings(settings)
            logger.info(f"Voice settings created for user {current_user.id}: {created_settings}")

        return {
            "success": True,
            "message": "Voice settings saved successfully",
            "data": settings
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error saving voice settings: {e}")
        raise HTTPException(
            status_code=500,
            detail=f"Failed to save voice settings: {str(e)}"
        )
class ConnectionManager:
    """Registry of open WebSocket connections for the voice streaming routes."""

    def __init__(self):
        self.active_connections: List[WebSocket] = []

    async def connect(self, websocket: WebSocket, connection_id: str):
        """Accept the socket and register it locally and in the module map."""
        await websocket.accept()
        self.active_connections.append(websocket)
        active_connections[connection_id] = websocket
        logger.info(f"WebSocket connected: {connection_id}")

    def disconnect(self, websocket: WebSocket, connection_id: str):
        """Drop the socket from both registries; safe to call twice."""
        if websocket in self.active_connections:
            self.active_connections.remove(websocket)
        if connection_id in active_connections:
            del active_connections[connection_id]
        logger.info(f"WebSocket disconnected: {connection_id}")

    async def send_personal_message(self, message: dict, websocket: WebSocket):
        """JSON-encode *message* and send it to one client; log send failures."""
        try:
            await websocket.send_text(json.dumps(message))
        except Exception as e:
            logger.error(f"Error sending message: {e}")

manager = ConnectionManager()

@router.websocket("/realtime-interpreter")
async def websocket_realtime_interpreter(
    websocket: WebSocket,
    user_id: str = "default",
    from_language: str = "de-DE",
    to_language: str = "en-US"
):
    """WebSocket endpoint for real-time voice interpretation"""
    connection_id = f"realtime_{user_id}_{from_language}_{to_language}"

    try:
        await manager.connect(websocket, connection_id)

        # Tell the client the channel is ready.
        await manager.send_personal_message({
            "type": "connected",
            "connection_id": connection_id,
            "message": "Connected to real-time interpreter"
        }, websocket)

        # Instantiated up front; speech processing to be wired in later.
        google_speech = ConnectorGoogleSpeech()

        while True:
            payload = json.loads(await websocket.receive_text())
            kind = payload["type"]

            if kind == "ping":
                await manager.send_personal_message({
                    "type": "pong",
                    "timestamp": payload.get("timestamp")
                }, websocket)
            elif kind == "audio_chunk":
                try:
                    chunk = base64.b64decode(payload["data"])
                    # Placeholder: a full implementation would buffer chunks,
                    # stream them through Google Speech-to-Text, send partial
                    # results back, and handle translation.
                    await manager.send_personal_message({
                        "type": "audio_received",
                        "chunk_size": len(chunk),
                        "timestamp": payload.get("timestamp")
                    }, websocket)
                except Exception as e:
                    logger.error(f"Error processing audio chunk: {e}")
                    await manager.send_personal_message({
                        "type": "error",
                        "error": f"Failed to process audio: {str(e)}"
                    }, websocket)
            else:
                logger.warning(f"Unknown message type: {payload['type']}")

    except WebSocketDisconnect:
        manager.disconnect(websocket, connection_id)
        logger.info(f"Client disconnected: {connection_id}")
    except Exception as e:
        logger.error(f"WebSocket error: {e}")
        manager.disconnect(websocket, connection_id)

@router.websocket("/speech-to-text")
async def websocket_speech_to_text(
    websocket: WebSocket,
    user_id: str = "default",
    language: str = "de-DE"
):
    """WebSocket endpoint for real-time speech-to-text"""
    connection_id = f"stt_{user_id}_{language}"

    try:
        await manager.connect(websocket, connection_id)
        await manager.send_personal_message({
            "type": "connected",
            "connection_id": connection_id,
            "message": "Connected to speech-to-text"
        }, websocket)

        # Instantiated up front; streaming STT integration pending.
        google_speech = ConnectorGoogleSpeech()

        while True:
            payload = json.loads(await websocket.receive_text())

            if payload["type"] == "audio_chunk":
                try:
                    chunk = base64.b64decode(payload["data"])
                    # Placeholder until the streaming STT API is wired in.
                    await manager.send_personal_message({
                        "type": "transcription_result",
                        "text": "Audio chunk received",  # Placeholder
                        "confidence": 0.95,
                        "is_final": False
                    }, websocket)
                except Exception as e:
                    logger.error(f"Error processing audio: {e}")
                    await manager.send_personal_message({
                        "type": "error",
                        "error": f"Failed to process audio: {str(e)}"
                    }, websocket)
            elif payload["type"] == "ping":
                await manager.send_personal_message({
                    "type": "pong",
                    "timestamp": payload.get("timestamp")
                }, websocket)

    except WebSocketDisconnect:
        manager.disconnect(websocket, connection_id)
    except Exception as e:
        logger.error(f"WebSocket error: {e}")
        manager.disconnect(websocket, connection_id)

@router.websocket("/text-to-speech")
async def websocket_text_to_speech(
    websocket: WebSocket,
    user_id: str = "default",
    language: str = "de-DE",
    voice: str = "de-DE-Wavenet-A"
):
    """WebSocket endpoint for real-time text-to-speech"""
    connection_id = f"tts_{user_id}_{language}_{voice}"

    try:
        await manager.connect(websocket, connection_id)
        await manager.send_personal_message({
            "type": "connected",
            "connection_id": connection_id,
            "message": "Connected to text-to-speech"
        }, websocket)

        while True:
            payload = json.loads(await websocket.receive_text())

            if payload["type"] == "text_to_speak":
                try:
                    text = payload["text"]
                    # Placeholder until Google Cloud Text-to-Speech is wired in.
                    await manager.send_personal_message({
                        "type": "audio_data",
                        "audio": "base64_encoded_audio_here",  # Placeholder
                        "format": "mp3"
                    }, websocket)
                except Exception as e:
                    logger.error(f"Error processing text-to-speech: {e}")
                    await manager.send_personal_message({
                        "type": "error",
                        "error": f"Failed to process text: {str(e)}"
                    }, websocket)
            elif payload["type"] == "ping":
                await manager.send_personal_message({
                    "type": "pong",
                    "timestamp": payload.get("timestamp")
                }, websocket)

    except WebSocketDisconnect:
        manager.disconnect(websocket, connection_id)
    except Exception as e:
        logger.error(f"WebSocket error: {e}")
        manager.disconnect(websocket, connection_id)
def handleJsonSecret(value: str) -> str:
    """
    Handle JSON secret values (like Google service account keys).

    Parses *value* to confirm it is well-formed JSON and returns the
    original string unchanged.

    Args:
        value: The JSON secret value to handle

    Returns:
        str: The original JSON string, untouched

    Raises:
        ValueError: If the value is not valid JSON
    """
    import json

    try:
        # Round-trip parse purely for validation; the parsed object is discarded.
        json.loads(value)
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON in secret value: {e}")
    return value