gateway/modules/connectors/connectorVoiceGoogle.py

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Google Cloud Speech-to-Text and Translation Connector
Replaces Azure Speech Services with Google Cloud APIs
"""

import json
import html
import asyncio
import logging
from typing import Dict, Optional, Any
from google.cloud import speech
from google.cloud import translate_v2 as translate
from google.cloud import texttospeech
from modules.shared.configuration import APP_CONFIG

logger = logging.getLogger(__name__)

class ConnectorGoogleSpeech:
    """
    Google Cloud Speech-to-Text and Translation connector.
    Handles audio processing, speech recognition, and translation.
    """

    def __init__(self):
        """
        Initialize Google Cloud Speech and Translation clients using config.ini.
        """
        try:
            # Get JSON key from config.ini
            apiKey = APP_CONFIG.get("Connector_GoogleSpeech_API_KEY_SECRET")

            if not apiKey or apiKey == "YOUR_GOOGLE_SERVICE_ACCOUNT_JSON_KEY_HERE":
                raise ValueError("Google Speech API key not configured. Please set Connector_GoogleSpeech_API_KEY_SECRET in config.ini with the full service account JSON key")

            # Parse the JSON key and set up authentication
            try:
                credentialsInfo = json.loads(apiKey)

                # Create credentials object directly (no file needed!)
                from google.oauth2 import service_account
                credentials = service_account.Credentials.from_service_account_info(credentialsInfo)

                logger.info("✅ Using Google Speech credentials from config.ini")

            except json.JSONDecodeError as e:
                raise ValueError(f"Invalid JSON in Google Speech API key: {e}")

            # Initialize clients with explicit credentials
            self.speech_client = speech.SpeechClient(credentials=credentials)
            self.translate_client = translate.Client(credentials=credentials)
            self.tts_client = texttospeech.TextToSpeechClient(credentials=credentials)

            logger.info("✅ Google Cloud Speech and Translation clients initialized successfully")

        except Exception as e:
            logger.error(f"❌ Failed to initialize Google Cloud clients: {e}")
            raise

    async def speechToText(self, audioContent: bytes, language: str = "de-DE",
                           sampleRate: int = None, channels: int = None,
                           skipFallbacks: bool = False) -> Dict:
        """
        Convert speech to text using Google Cloud Speech-to-Text API.

        Args:
            audioContent: Raw audio data (various formats supported)
            language: Language code (e.g., 'de-DE', 'en-US')
            sample_rate: Audio sample rate (auto-detected if None)
            channels: Number of audio channels (auto-detected if None)

        Returns:
            Dict containing transcribed text, confidence, and metadata
        """
        try:
            # Treat sampleRate=0 as unknown (invalid value from client)
            if sampleRate is not None and sampleRate <= 0:
                logger.warning(f"Invalid sampleRate={sampleRate}, treating as unknown for auto-detection")
                sampleRate = None

            # Auto-detect audio format if not provided
            if sampleRate is None or channels is None:
                validation = self.validateAudioFormat(audioContent)
                if not validation["valid"]:
                    return {
                        "success": False,
                        "text": "",
                        "confidence": 0.0,
                        "error": f"Invalid audio format: {validation.get('error', 'Unknown error')}"
                    }
                sampleRate = validation["sample_rate"]
                channels = validation["channels"]
                audioFormat = validation["format"]
                logger.info(f"Auto-detected audio: {audioFormat}, {sampleRate}Hz, {channels}ch")
            else:
                # When sampleRate and channels are explicitly provided, assume raw PCM (LINEAR16)
                audioFormat = "linear16"

            logger.info(f"Processing audio with Google Cloud Speech-to-Text")
            logger.info(f"Audio: {len(audioContent)} bytes, {sampleRate}Hz, {channels}ch")

            # Configure audio settings
            audio = speech.RecognitionAudio(content=audioContent)

            # Determine encoding based on detected format
            # Google Cloud Speech API has specific requirements for different formats
            if audioFormat == "webm_opus":
                # For WEBM OPUS, we need to ensure proper format
                encoding = speech.RecognitionConfig.AudioEncoding.WEBM_OPUS
                # WEBM_OPUS requires specific sample rate handling - must match header
                if sampleRate != 48000:
                    logger.warning(f"WEBM_OPUS detected but sample rate is {sampleRate}, adjusting to 48000")
                    sampleRate = 48000
                # For WEBM_OPUS, don't specify sample_rate_hertz in config
                # Google Cloud will read it from the WEBM header
                useSampleRate = False
            elif audioFormat == "linear16":
                # For LINEAR16 format (PCM)
                encoding = speech.RecognitionConfig.AudioEncoding.LINEAR16
                # Ensure sample rate is reasonable
                if sampleRate not in [8000, 16000, 22050, 24000, 32000, 44100, 48000]:
                    logger.warning(f"Unusual sample rate {sampleRate}, adjusting to 16000")
                    sampleRate = 16000
                useSampleRate = True
            elif audioFormat == "mp3":
                # For MP3 format
                encoding = speech.RecognitionConfig.AudioEncoding.MP3
                useSampleRate = True
            elif audioFormat == "flac":
                # For FLAC format
                encoding = speech.RecognitionConfig.AudioEncoding.FLAC
                useSampleRate = True
            elif audioFormat == "wav":
                # For WAV format
                encoding = speech.RecognitionConfig.AudioEncoding.LINEAR16
                useSampleRate = True
            else:
                # For unknown formats, try LINEAR16 as fallback
                encoding = speech.RecognitionConfig.AudioEncoding.LINEAR16
                sampleRate = 16000  # Use standard sample rate
                channels = 1  # Use mono
                useSampleRate = True
                logger.warning(f"Unknown audio format '{audioFormat}', using LINEAR16 encoding with 16000Hz")

            # Build config based on format requirements
            configParams = {
                "encoding": encoding,
                "audio_channel_count": channels,
                "language_code": language,
                "enable_automatic_punctuation": True,
                "model": "latest_long",  # Try latest_long model for better recognition
                "enable_word_time_offsets": True,  # Enable word-level timing
                "enable_word_confidence": True,    # Enable word-level confidence
                "max_alternatives": 3,             # Try more alternatives
                "use_enhanced": True               # Use enhanced model if available
            }

            # Only add sample_rate_hertz if needed (not for WEBM_OPUS)
            if useSampleRate:
                configParams["sample_rate_hertz"] = sampleRate
                logger.debug(f"Recognition config: encoding={encoding}, sample_rate={sampleRate}, channels={channels}, language={language}")
            else:
                logger.debug(f"Recognition config: encoding={encoding}, sample_rate=auto (from header), channels={channels}, language={language}")

            config = speech.RecognitionConfig(**configParams)

            # Perform speech recognition
            logger.info("Sending audio to Google Cloud Speech-to-Text...")

            try:
                # Use regular recognition for single audio files (not streaming)
                # Run in thread pool to avoid blocking the asyncio event loop
                logger.info("Using regular recognition for single audio file...")
                response = await asyncio.to_thread(
                    self.speech_client.recognize, config=config, audio=audio
                )
                logger.debug(f"Google Cloud response: {response}")

            except Exception as apiError:
                logger.error(f"Google Cloud API error: {apiError}")
                # Try with different encoding as fallback
                if encoding != speech.RecognitionConfig.AudioEncoding.LINEAR16:
                    logger.info("Trying fallback with LINEAR16 encoding...")
                    fallbackConfig = speech.RecognitionConfig(
                        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
                        sample_rate_hertz=16000,
                        audio_channel_count=1,
                        language_code=language,
                        enable_automatic_punctuation=True,
                        model="latest_long"
                    )

                    try:
                        response = await asyncio.to_thread(
                            self.speech_client.recognize, config=fallbackConfig, audio=audio
                        )
                        logger.debug(f"Google Cloud fallback response: {response}")
                    except Exception as fallbackError:
                        logger.error(f"Google Cloud fallback error: {fallbackError}")
                        raise apiError
                else:
                    raise apiError

            # Process results
            if response.results:
                result = response.results[0]
                if result.alternatives:
                    alternative = result.alternatives[0]
                    transcribed_text = alternative.transcript
                    confidence = alternative.confidence

                    logger.info(f"Transcription successful: '{transcribed_text}' (confidence: {confidence:.2f})")

                    return {
                        "success": True,
                        "text": transcribed_text,
                        "confidence": confidence,
                        "language": language,
                        "raw_result": {
                            "transcript": transcribed_text,
                            "confidence": confidence,
                            "language_code": language
                        }
                    }
                else:
                    logger.warning("No transcription alternatives found")
                    logger.debug(f"Result details: {result}")
                    return {
                        "success": False,
                        "text": "",
                        "confidence": 0.0,
                        "error": "No transcription alternatives found"
                    }
            else:
                logger.warning("No recognition results from Google Cloud")
                logger.debug(f"Response details: {response}")

                # Check if there are any error messages in the response
                if hasattr(response, 'error') and response.error:
                    logger.error(f"Google Cloud error: {response.error}")
                    return {
                        "success": False,
                        "text": "",
                        "confidence": 0.0,
                        "error": f"Google Cloud error: {response.error}"
                    }

                # Skip fallbacks when format is known (e.g. teamsbot with explicit LINEAR16 16kHz)
                if skipFallbacks:
                    return {
                        "success": False,
                        "text": "",
                        "confidence": 0.0,
                        "error": "No recognition results (silence or unclear audio)"
                    }

                # Try multiple fallback approaches
                fallback_configs = []

                if encoding != speech.RecognitionConfig.AudioEncoding.LINEAR16:
                    # For WEBM_OPUS, don't try LINEAR16 with detected sample rate as it causes conflicts
                    if audioFormat != "webm_opus":
                        # Try LINEAR16 with detected sample rate for non-WEBM formats
                        fallback_configs.append({
                            "encoding": speech.RecognitionConfig.AudioEncoding.LINEAR16,
                            "sample_rate": sampleRate,
                            "channels": channels,
                            "use_sample_rate": True,
                            "description": f"LINEAR16 with {sampleRate}Hz"
                        })

                # For WEBM_OPUS, only try compatible sample rates or skip sample rate specification
                if audioFormat == "webm_opus":
                    # Try WEBM_OPUS without sample rate specification (let Google read from header)
                    fallback_configs.append({
                        "encoding": speech.RecognitionConfig.AudioEncoding.WEBM_OPUS,
                        "sample_rate": 48000,
                        "channels": 1,
                        "use_sample_rate": False,  # Don't specify sample rate
                        "description": f"WEBM_OPUS with auto sample rate"
                    })
                    # Try WEBM_OPUS with explicit 48000Hz (matching header)
                    fallback_configs.append({
                        "encoding": speech.RecognitionConfig.AudioEncoding.WEBM_OPUS,
                        "sample_rate": 48000,
                        "channels": 1,
                        "use_sample_rate": True,
                        "description": f"WEBM_OPUS with 48000Hz"
                    })
                    # Try LINEAR16 with 48000Hz as last resort (may not work with WEBM data)
                    fallback_configs.append({
                        "encoding": speech.RecognitionConfig.AudioEncoding.LINEAR16,
                        "sample_rate": 48000,  # Match the WEBM header
                        "channels": 1,
                        "use_sample_rate": True,
                        "description": f"LINEAR16 with 48000Hz (WEBM compatible)"
                    })
                else:
                    # For other formats, try standard sample rates
                    for std_rate in [16000, 8000, 22050, 44100]:
                        if std_rate != sampleRate:
                            fallback_configs.append({
                                "encoding": speech.RecognitionConfig.AudioEncoding.LINEAR16,
                                "sample_rate": std_rate,
                                "channels": 1,
                                "use_sample_rate": True,
                                "description": f"LINEAR16 with {std_rate}Hz"
                            })

                # Detect likely silence before expensive fallback loop
                if len(audioContent) > 100:
                    sampleSlice = audioContent[100:min(500, len(audioContent))]
                    if len(set(sampleSlice)) < 3:
                        logger.warning("Audio appears silent (low byte variation) - skipping fallbacks")
                        return {
                            "success": False,
                            "text": "",
                            "confidence": 0.0,
                            "error": "No recognition results (silence or unclear audio)"
                        }

                models = ["latest_long", "phone_call", "latest_short"]

                for fallback_config in fallback_configs:
                    for model in models:
                        try:
                            logger.info(f"Trying fallback: {fallback_config['description']} with {model} model...")

                            fallback_config_params = {
                                "encoding": fallback_config["encoding"],
                                "audio_channel_count": fallback_config["channels"],
                                "language_code": language,
                                "enable_automatic_punctuation": True,
                                "model": model
                            }

                            if fallback_config["use_sample_rate"]:
                                fallback_config_params["sample_rate_hertz"] = fallback_config["sample_rate"]

                            fallback_config_obj = speech.RecognitionConfig(**fallback_config_params)
                            fallback_response = await asyncio.to_thread(
                                self.speech_client.recognize, config=fallback_config_obj, audio=audio
                            )

                            if fallback_response.results:
                                result = fallback_response.results[0]
                                if result.alternatives:
                                    alternative = result.alternatives[0]
                                    transcribed_text = alternative.transcript
                                    confidence = alternative.confidence

                                    logger.info(f"Fallback transcription successful: '{transcribed_text}' (confidence: {confidence:.2f})")

                                    return {
                                        "success": True,
                                        "text": transcribed_text,
                                        "confidence": confidence,
                                        "language": language,
                                        "raw_result": {
                                            "transcript": transcribed_text,
                                            "confidence": confidence,
                                            "language_code": language
                                        }
                                    }

                        except Exception as e:
                            logger.debug(f"Fallback failed: {e}")
                            continue

                logger.warning("All fallback attempts failed")
                return {
                    "success": False,
                    "text": "",
                    "confidence": 0.0,
                    "error": "No recognition results - audio may be too short, unclear, or in unsupported format"
                }

        except Exception as e:
            logger.error(f"Google Cloud Speech-to-Text error: {e}")
            return {
                "success": False,
                "text": "",
                "confidence": 0.0,
                "error": str(e)
            }

    async def translateText(self, text: str, targetLanguage: str = "en",
                           sourceLanguage: str = "de") -> Dict:
        """
        Translate text using Google Cloud Translation API.

        Args:
            text: Text to translate
            target_language: Target language code (e.g., 'en', 'de')
            source_language: Source language code (e.g., 'de', 'en')

        Returns:
            Dict containing translated text and metadata
        """
        try:
            if not text.strip():
                logger.warning("⚠️ Empty text provided for translation")
                return {
                    "success": False,
                    "translated_text": "",
                    "error": "Empty text provided"
                }

            logger.info(f"🌐 Translating: '{text}' ({sourceLanguage} -> {targetLanguage})")

            # Perform translation
            result = self.translate_client.translate(
                text,
                source_language=sourceLanguage,
                target_language=targetLanguage
            )

            translatedText = result['translatedText']
            detectedLanguage = result.get('detectedSourceLanguage', sourceLanguage)

            # Decode HTML entities in translated text
            translatedText = html.unescape(translatedText)

            logger.info(f"✅ Translation successful: '{translatedText}'")

            return {
                "success": True,
                "translated_text": translatedText,
                "source_language": detectedLanguage,
                "target_language": targetLanguage,
                "original_text": text
            }

        except Exception as e:
            logger.error(f"❌ Google Cloud Translation error: {e}")
            return {
                "success": False,
                "translated_text": "",
                "error": str(e)
            }

    async def detectLanguage(self, text: str) -> Dict:
        """
        Detect the language of text using Google Cloud Translation API.

        Args:
            text: Text to detect language for

        Returns:
            Dict containing detected language code and confidence
        """
        try:
            if not text.strip():
                logger.warning("⚠️ Empty text provided for language detection")
                return {
                    "success": False,
                    "language": "",
                    "error": "Empty text provided"
                }

            # Use a sample of the text (middle 1000 bytes or full text if smaller)
            textBytes = text.encode('utf-8')
            if len(textBytes) > 1000:
                # Take 1000 bytes from the middle
                startPos = (len(textBytes) - 1000) // 2
                textSample = textBytes[startPos:startPos + 1000].decode('utf-8', errors='ignore')
            else:
                textSample = text

            logger.info(f"🔍 Detecting language for text sample: '{textSample[:100]}...'")

            # Use translation API with auto-detection (source_language=None)
            result = self.translate_client.translate(
                textSample,
                source_language=None,  # Auto-detect
                target_language='en'  # Dummy target, we only need detection
            )

            detectedLanguage = result.get('detectedSourceLanguage', '')

            logger.info(f"✅ Language detected: {detectedLanguage}")

            return {
                "success": True,
                "language": detectedLanguage,
                "confidence": 1.0  # Google Translation API doesn't provide confidence, assume high
            }

        except Exception as e:
            logger.error(f"❌ Google Cloud Language Detection error: {e}")
            return {
                "success": False,
                "language": "",
                "error": str(e)
            }

    async def speechToTranslatedText(self, audioContent: bytes,
                                     fromLanguage: str = "de-DE",
                                     toLanguage: str = "en") -> Dict:
        """
        Complete pipeline: Speech-to-Text + Translation.

        Args:
            audioContent: Raw audio data
            from_language: Source language for speech recognition
            to_language: Target language for translation

        Returns:
            Dict containing original text, translated text, and metadata
        """
        try:
            logger.info(f"🔄 Starting speech-to-translation pipeline: {fromLanguage} -> {toLanguage}")

            # Step 1: Speech-to-Text
            speechResult = await self.speechToText(
                audioContent=audioContent,
                language=fromLanguage
            )

            if not speechResult["success"]:
                return {
                    "success": False,
                    "original_text": "",
                    "translated_text": "",
                    "error": f"Speech recognition failed: {speechResult.get('error', 'Unknown error')}"
                }

            originalText = speechResult["text"]

            # Step 2: Translation
            translationResult = await self.translateText(
                text=originalText,
                sourceLanguage=fromLanguage.split('-')[0],  # Convert 'de-DE' to 'de'
                targetLanguage=toLanguage.split('-')[0]     # Convert 'en-US' to 'en'
            )

            if not translationResult["success"]:
                return {
                    "success": False,
                    "original_text": originalText,
                    "translated_text": "",
                    "error": f"Translation failed: {translationResult.get('error', 'Unknown error')}"
                }

            translatedText = translationResult["translated_text"]

            logger.info(f"✅ Complete pipeline successful:")
            logger.info(f"   Original: '{originalText}'")
            logger.info(f"   Translated: '{translatedText}'")

            return {
                "success": True,
                "original_text": originalText,
                "translated_text": translatedText,
                "confidence": speechResult["confidence"],
                "source_language": fromLanguage,
                "target_language": toLanguage
            }

        except Exception as e:
            logger.error(f"❌ Speech-to-translation pipeline error: {e}")
            return {
                "success": False,
                "original_text": "",
                "translated_text": "",
                "error": str(e)
            }

    def validateAudioFormat(self, audioContent: bytes) -> Dict:
        """
        Validate audio format for Google Cloud Speech-to-Text.

        Args:
            audioContent: Raw audio data

        Returns:
            Dict containing validation results
        """
        try:
            # Basic validation
            if len(audioContent) < 100:
                return {
                    "valid": False,
                    "error": "Audio too short (less than 100 bytes)"
                }

            # Detect audio format by checking file headers
            audio_format = "unknown"
            sample_rate = 16000  # Default fallback
            channels = 1  # Default fallback

            # Debug: Log first few bytes for format detection
            logger.debug(f"Audio header bytes: {audioContent[:20].hex()}")
            logger.debug(f"Audio content length: {len(audioContent)} bytes")

            # Check for WEBM/OPUS format (common from web recordings)
            if audioContent.startswith(b'\x1a\x45\xdf\xa3'):
                audio_format = "webm_opus"
                sample_rate = 48000  # WEBM OPUS typically uses 48kHz
                channels = 1
                logger.info(f"Detected WEBM OPUS format: {sample_rate}Hz, {channels}ch")

            # Check for specific header patterns seen in logs (43c381...)
            # This appears to be a different audio format or corrupted WEBM
            elif audioContent.startswith(b'\x43\xc3\x81') and len(audioContent) > 1000:
                # This might be a different format or corrupted audio
                # Try to detect if it's actually WEBM by looking deeper
                if b'webm' in audioContent[:200] or b'opus' in audioContent[:200]:
                    audio_format = "webm_opus"
                    sample_rate = 48000
                    channels = 1
                    logger.info(f"Detected WEBM format (deep scan): {sample_rate}Hz, {channels}ch")
                else:
                    # Unknown format, try as LINEAR16
                    audio_format = "linear16"
                    sample_rate = 16000
                    channels = 1
                    logger.warning(f"Unknown audio format with header {audioContent[:8].hex()}, trying LINEAR16")

            # Check for WEBM format (alternative detection)
            elif b'webm' in audioContent[:100].lower() or b'opus' in audioContent[:100].lower():
                audio_format = "webm_opus"
                sample_rate = 48000  # WEBM OPUS typically uses 48kHz
                channels = 1
                logger.info(f"Detected WEBM format: {sample_rate}Hz, {channels}ch")

            # Check for MediaRecorder WEBM chunks (common in browser recordings)
            elif audioContent.startswith(b'\x1a\x45\xdf\xa3') and len(audioContent) > 1000:
                audio_format = "webm_opus"
                sample_rate = 48000  # Browser MediaRecorder typically uses 48kHz
                channels = 1
                logger.info(f"Detected MediaRecorder WEBM: {sample_rate}Hz, {channels}ch")

            # Check for OPUS format by looking for OPUS magic bytes
            elif audioContent.startswith(b'OpusHead') or b'OpusHead' in audioContent[:50]:
                audio_format = "webm_opus"
                sample_rate = 48000  # OPUS typically uses 48kHz
                channels = 1
                logger.info(f"Detected OPUS format: {sample_rate}Hz, {channels}ch")

            # Check for OGG format (often contains OPUS)
            elif audioContent.startswith(b'OggS'):
                audio_format = "webm_opus"
                sample_rate = 48000  # OGG OPUS typically uses 48kHz
                channels = 1
                logger.info(f"Detected OGG format: {sample_rate}Hz, {channels}ch")

            # Check for WAV format
            elif audioContent.startswith(b'RIFF') and b'WAVE' in audioContent[:12]:
                audio_format = "wav"
                # Try to extract sample rate from WAV header
                try:
                    # WAV header sample rate is at offset 24-27 (little endian)
                    sample_rate = int.from_bytes(audioContent[24:28], 'little')
                    channels = int.from_bytes(audioContent[22:24], 'little')
                    logger.info(f"Detected WAV format: {sample_rate}Hz, {channels}ch")
                except:
                    sample_rate = 16000  # Fallback
                    channels = 1

            # Check for MP3 format
            elif audioContent.startswith(b'\xff\xfb') or audioContent.startswith(b'ID3'):
                audio_format = "mp3"
                sample_rate = 44100  # MP3 typically uses 44.1kHz
                channels = 2  # Usually stereo
                logger.info(f"Detected MP3 format: {sample_rate}Hz, {channels}ch")

            # Check for FLAC format
            elif audioContent.startswith(b'fLaC'):
                audio_format = "flac"
                sample_rate = 44100  # Common FLAC sample rate
                channels = 2
                logger.info(f"Detected FLAC format: {sample_rate}Hz, {channels}ch")

            else:
                # Unknown format, try WEBM_OPUS as it's most common for web recordings
                audio_format = "webm_opus"
                sample_rate = 48000  # Try 48kHz for web recordings
                channels = 1
                logger.warning(f"Unknown audio format, trying WEBM_OPUS: {sample_rate}Hz, {channels}ch")

            # Calculate estimated duration
            if audio_format == "webm_opus":
                # WEBM OPUS duration is hard to calculate without decoding
                estimated_duration = 3.0  # Assume 3 seconds for web recordings
            else:
                # Rough estimate for uncompressed audio
                estimated_duration = len(audioContent) / (sample_rate * channels * 2)  # 16-bit = 2 bytes per sample

            # Check if audio is too short (less than 0.5 seconds)
            if estimated_duration < 0.5:
                logger.warning(f"Audio too short: {estimated_duration:.2f}s, may not be recognized")

            # Log audio details for debugging
            logger.info(f"Audio analysis: {len(audioContent)} bytes, {estimated_duration:.2f}s, {sample_rate}Hz, {channels}ch, format={audio_format}")

            # Check audio levels (simple check for silence)
            if audio_format == "webm_opus":
                # For WEBM, we can't easily check levels, but log the first few bytes
                logger.debug(f"Audio sample bytes: {audioContent[:20].hex()}")
                # Check if audio has some variation (not all same bytes)
                if len(audioContent) > 100:
                    sample_bytes = audioContent[100:200]  # Skip header
                    if len(set(sample_bytes)) < 5:  # Less than 5 different byte values
                        logger.warning("Audio may be silent or very quiet (low byte variation)")
                    else:
                        logger.debug(f"Audio has good byte variation: {len(set(sample_bytes))} unique values")
            else:
                # For PCM audio, check for silence
                if len(audioContent) > 100:
                    # Convert first 100 bytes to check for silence
                    sample_bytes = audioContent[:100]
                    if all(b == 0 for b in sample_bytes):
                        logger.warning("Audio appears to be silent (all zeros)")
                    else:
                        logger.debug(f"Audio sample bytes: {sample_bytes[:20].hex()}")
                        # Check for low variation
                        if len(set(sample_bytes)) < 5:
                            logger.warning("Audio may be very quiet (low byte variation)")

            return {
                "valid": True,
                "format": audio_format,
                "sample_rate": sample_rate,
                "channels": channels,
                "size": len(audioContent),
                "estimated_duration": estimated_duration
            }

        except Exception as e:
            return {
                "valid": False,
                "error": f"Validation error: {e}"
            }

    async def textToSpeech(self, text: str, languageCode: str = "de-DE", voiceName: str = None) -> Dict[str, Any]:
        """
        Convert text to speech using Google Cloud Text-to-Speech.

        Args:
            text: Text to convert to speech
            language_code: Language code (e.g., 'de-DE', 'en-US')
            voice_name: Specific voice name (optional)

        Returns:
            Dict with success status and audio data
        """
        try:
            logger.info(f"Converting text to speech: '{text[:50]}...' in {languageCode}")

            # Set up the synthesis input
            synthesisInput = texttospeech.SynthesisInput(text=text)

            # Build the voice request
            selectedVoice = voiceName or self._getDefaultVoice(languageCode)

            if not selectedVoice:
                return {
                    "success": False,
                    "error": f"No voice specified for language {languageCode}. Please select a voice."
                }

            logger.info(f"Using TTS voice: {selectedVoice} for language: {languageCode}")

            voice = texttospeech.VoiceSelectionParams(
                language_code=languageCode,
                name=selectedVoice,
                ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL
            )

            # Select the type of audio file to return
            audioConfig = texttospeech.AudioConfig(
                audio_encoding=texttospeech.AudioEncoding.MP3
            )

            # Perform the text-to-speech request
            response = self.tts_client.synthesize_speech(
                input=synthesisInput,
                voice=voice,
                audio_config=audioConfig
            )

            # Return the audio content
            return {
                "success": True,
                "audio_content": response.audio_content,
                "audio_format": "mp3",
                "language_code": languageCode,
                "voice_name": voice.name
            }

        except Exception as e:
            logger.error(f"Text-to-Speech error: {e}")
            return {
                "success": False,
                "error": f"Text-to-Speech failed: {str(e)}"
            }

    def _getDefaultVoice(self, languageCode: str) -> str:
        """
        Get default voice name for a language code.
        Falls back to a Wavenet voice for common languages.
        """
        _defaults = {
            "de-DE": "de-DE-Wavenet-A",
            "de-CH": "de-DE-Wavenet-A",
            "en-US": "en-US-Wavenet-C",
            "en-GB": "en-GB-Wavenet-A",
            "fr-FR": "fr-FR-Wavenet-A",
            "it-IT": "it-IT-Wavenet-A",
        }
        return _defaults.get(languageCode)

    async def getAvailableLanguages(self) -> Dict[str, Any]:
        """
        Get available languages from Google Cloud Text-to-Speech.

        Returns:
            Dict containing success status and list of available languages
        """
        try:
            logger.info("🌐 Getting available languages from Google Cloud TTS")

            # List voices from Google Cloud TTS
            response = self.tts_client.list_voices()

            # Extract unique language codes
            # Note: Google TTS API doesn't provide language descriptions, only codes
            language_codes = set()
            for voice in response.voices:
                if voice.language_codes:
                    language_codes.update(voice.language_codes)

            # Convert to sorted list of language codes
            available_languages = sorted(list(language_codes))

            logger.info(f"✅ Found {len(available_languages)} available languages")

            return {
                "success": True,
                "languages": available_languages
            }

        except Exception as e:
            logger.error(f"❌ Failed to get available languages: {e}")
            return {
                "success": False,
                "error": str(e),
                "languages": []
            }

    async def getAvailableVoices(self, languageCode: Optional[str] = None) -> Dict[str, Any]:
        """
        Get available voices from Google Cloud Text-to-Speech.

        Args:
            language_code: Optional language code to filter voices (e.g., 'de-DE', 'en-US')

        Returns:
            Dict containing success status and list of available voices
        """
        try:
            logger.info(f"🎤 Getting available voices from Google Cloud TTS, language filter: {languageCode}")

            # List voices from Google Cloud TTS
            response = self.tts_client.list_voices()

            availableVoices = []

            for voice in response.voices:
                # Extract language code from voice name (e.g., 'de-DE-Wavenet-A' -> 'de-DE')
                voiceLanguage = voice.language_codes[0] if voice.language_codes else None

                # Filter by language if specified
                if languageCode and voiceLanguage != languageCode:
                    continue

                # Determine gender from voice name (A/C = male, B/D = female)
                gender = "Unknown"
                if voice.name:
                    if voice.name.endswith(('-A', '-C')):
                        gender = "Male"
                    elif voice.name.endswith(('-B', '-D')):
                        gender = "Female"

                # Create voice info with all available fields from Google API
                voiceInfo = {
                    "name": voice.name,
                    "language_code": voiceLanguage,
                    "language_codes": list(voice.language_codes) if voice.language_codes else [],
                    "gender": gender,
                    "ssml_gender": voice.ssml_gender.name if voice.ssml_gender else "NEUTRAL",
                    "natural_sample_rate_hertz": voice.natural_sample_rate_hertz
                }

                # Include any additional fields if available from Google API
                # Check for common fields that might exist
                for field_name in ['description', 'display_name', 'labels']:
                    if hasattr(voice, field_name):
                        field_value = getattr(voice, field_name, None)
                        if field_value:
                            voiceInfo[field_name] = field_value

                availableVoices.append(voiceInfo)

            # Sort by language code, then by gender, then by name
            availableVoices.sort(key=lambda x: (x["language_code"], x["gender"], x["name"]))

            logger.info(f"✅ Found {len(availableVoices)} voices for language filter: {languageCode}")

            return {
                "success": True,
                "voices": availableVoices,
                "total_count": len(availableVoices)
            }

        except Exception as e:
            logger.error(f"❌ Failed to get available voices: {e}")
            return {
                "success": False,
                "error": str(e),
                "voices": []
            }