815 lines
37 KiB
Python
815 lines
37 KiB
Python
"""
|
|
Google Cloud Speech-to-Text and Translation Connector
|
|
Replaces Azure Speech Services with Google Cloud APIs
|
|
"""
|
|
|
|
import json
|
|
import html
|
|
import logging
|
|
from typing import Dict, Optional, Any
|
|
from google.cloud import speech
|
|
from google.cloud import translate_v2 as translate
|
|
from google.cloud import texttospeech
|
|
from modules.shared.configuration import APP_CONFIG
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
class ConnectorGoogleSpeech:
    """
    Google Cloud Speech-to-Text and Translation connector.

    Handles audio processing, speech recognition, and translation.
    """

    def __init__(self):
        """
        Initialize Google Cloud Speech and Translation clients using config.ini.

        Raises:
            ValueError: If the API key is missing, still a placeholder, or
                not valid JSON.
        """
        try:
            # The "key" is the full service-account JSON document, stored
            # directly in config.ini rather than as a file on disk.
            apiKey = APP_CONFIG.get("Connector_GoogleSpeech_API_KEY_SECRET")

            if not apiKey or apiKey == "YOUR_GOOGLE_SERVICE_ACCOUNT_JSON_KEY_HERE":
                raise ValueError("Google Speech API key not configured. Please set Connector_GoogleSpeech_API_KEY_SECRET in config.ini with the full service account JSON key")

            try:
                credentialsInfo = json.loads(apiKey)

                # Build the credentials object in memory -- no key file needed.
                from google.oauth2 import service_account
                credentials = service_account.Credentials.from_service_account_info(credentialsInfo)

                logger.info("✅ Using Google Speech credentials from config.ini")
            except json.JSONDecodeError as e:
                raise ValueError(f"Invalid JSON in Google Speech API key: {e}")

            # All three clients share the same explicit credentials.
            self.speech_client = speech.SpeechClient(credentials=credentials)
            self.translate_client = translate.Client(credentials=credentials)
            self.tts_client = texttospeech.TextToSpeechClient(credentials=credentials)

            logger.info("✅ Google Cloud Speech and Translation clients initialized successfully")

        except Exception as e:
            logger.error(f"❌ Failed to initialize Google Cloud clients: {e}")
            raise
|
|
|
|
async def speechToText(self, audioContent: bytes, language: str = "de-DE",
|
|
sampleRate: int = None, channels: int = None) -> Dict:
|
|
"""
|
|
Convert speech to text using Google Cloud Speech-to-Text API.
|
|
|
|
Args:
|
|
audioContent: Raw audio data (various formats supported)
|
|
language: Language code (e.g., 'de-DE', 'en-US')
|
|
sample_rate: Audio sample rate (auto-detected if None)
|
|
channels: Number of audio channels (auto-detected if None)
|
|
|
|
Returns:
|
|
Dict containing transcribed text, confidence, and metadata
|
|
"""
|
|
try:
|
|
# Auto-detect audio format if not provided
|
|
if sampleRate is None or channels is None:
|
|
validation = self.validateAudioFormat(audioContent)
|
|
if not validation["valid"]:
|
|
return {
|
|
"success": False,
|
|
"text": "",
|
|
"confidence": 0.0,
|
|
"error": f"Invalid audio format: {validation.get('error', 'Unknown error')}"
|
|
}
|
|
sampleRate = validation["sample_rate"]
|
|
channels = validation["channels"]
|
|
audioFormat = validation["format"]
|
|
logger.info(f"Auto-detected audio: {audioFormat}, {sampleRate}Hz, {channels}ch")
|
|
|
|
logger.info(f"Processing audio with Google Cloud Speech-to-Text")
|
|
logger.info(f"Audio: {len(audioContent)} bytes, {sampleRate}Hz, {channels}ch")
|
|
|
|
# Configure audio settings
|
|
audio = speech.RecognitionAudio(content=audioContent)
|
|
|
|
# Determine encoding based on detected format
|
|
# Google Cloud Speech API has specific requirements for different formats
|
|
if audioFormat == "webm_opus":
|
|
# For WEBM OPUS, we need to ensure proper format
|
|
encoding = speech.RecognitionConfig.AudioEncoding.WEBM_OPUS
|
|
# WEBM_OPUS requires specific sample rate handling - must match header
|
|
if sampleRate != 48000:
|
|
logger.warning(f"WEBM_OPUS detected but sample rate is {sampleRate}, adjusting to 48000")
|
|
sampleRate = 48000
|
|
# For WEBM_OPUS, don't specify sample_rate_hertz in config
|
|
# Google Cloud will read it from the WEBM header
|
|
useSampleRate = False
|
|
elif audioFormat == "linear16":
|
|
# For LINEAR16 format (PCM)
|
|
encoding = speech.RecognitionConfig.AudioEncoding.LINEAR16
|
|
# Ensure sample rate is reasonable
|
|
if sampleRate not in [8000, 16000, 22050, 24000, 32000, 44100, 48000]:
|
|
logger.warning(f"Unusual sample rate {sampleRate}, adjusting to 16000")
|
|
sampleRate = 16000
|
|
useSampleRate = True
|
|
elif audioFormat == "mp3":
|
|
# For MP3 format
|
|
encoding = speech.RecognitionConfig.AudioEncoding.MP3
|
|
useSampleRate = True
|
|
elif audioFormat == "flac":
|
|
# For FLAC format
|
|
encoding = speech.RecognitionConfig.AudioEncoding.FLAC
|
|
useSampleRate = True
|
|
elif audioFormat == "wav":
|
|
# For WAV format
|
|
encoding = speech.RecognitionConfig.AudioEncoding.LINEAR16
|
|
useSampleRate = True
|
|
else:
|
|
# For unknown formats, try LINEAR16 as fallback
|
|
encoding = speech.RecognitionConfig.AudioEncoding.LINEAR16
|
|
sampleRate = 16000 # Use standard sample rate
|
|
channels = 1 # Use mono
|
|
useSampleRate = True
|
|
logger.warning(f"Unknown audio format '{audioFormat}', using LINEAR16 encoding with 16000Hz")
|
|
|
|
# Build config based on format requirements
|
|
configParams = {
|
|
"encoding": encoding,
|
|
"audio_channel_count": channels,
|
|
"language_code": language,
|
|
"enable_automatic_punctuation": True,
|
|
"model": "latest_long", # Try latest_long model for better recognition
|
|
"enable_word_time_offsets": True, # Enable word-level timing
|
|
"enable_word_confidence": True, # Enable word-level confidence
|
|
"max_alternatives": 3, # Try more alternatives
|
|
"use_enhanced": True # Use enhanced model if available
|
|
}
|
|
|
|
# Only add sample_rate_hertz if needed (not for WEBM_OPUS)
|
|
if useSampleRate:
|
|
configParams["sample_rate_hertz"] = sampleRate
|
|
logger.debug(f"Recognition config: encoding={encoding}, sample_rate={sampleRate}, channels={channels}, language={language}")
|
|
else:
|
|
logger.debug(f"Recognition config: encoding={encoding}, sample_rate=auto (from header), channels={channels}, language={language}")
|
|
|
|
config = speech.RecognitionConfig(**configParams)
|
|
|
|
# Perform speech recognition
|
|
logger.info("Sending audio to Google Cloud Speech-to-Text...")
|
|
|
|
try:
|
|
# Use regular recognition for single audio files (not streaming)
|
|
logger.info("Using regular recognition for single audio file...")
|
|
response = self.speech_client.recognize(config=config, audio=audio)
|
|
logger.debug(f"Google Cloud response: {response}")
|
|
|
|
except Exception as apiError:
|
|
logger.error(f"Google Cloud API error: {apiError}")
|
|
# Try with different encoding as fallback
|
|
if encoding != speech.RecognitionConfig.AudioEncoding.LINEAR16:
|
|
logger.info("Trying fallback with LINEAR16 encoding...")
|
|
fallbackConfig = speech.RecognitionConfig(
|
|
encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
|
|
sample_rate_hertz=16000, # Use standard sample rate
|
|
audio_channel_count=1,
|
|
language_code=language,
|
|
enable_automatic_punctuation=True,
|
|
model="latest_long"
|
|
)
|
|
|
|
try:
|
|
response = self.speech_client.recognize(config=fallbackConfig, audio=audio)
|
|
logger.debug(f"Google Cloud fallback response: {response}")
|
|
except Exception as fallbackError:
|
|
logger.error(f"Google Cloud fallback error: {fallbackError}")
|
|
raise apiError
|
|
else:
|
|
raise apiError
|
|
|
|
# Process results
|
|
if response.results:
|
|
result = response.results[0]
|
|
if result.alternatives:
|
|
alternative = result.alternatives[0]
|
|
transcribed_text = alternative.transcript
|
|
confidence = alternative.confidence
|
|
|
|
logger.info(f"Transcription successful: '{transcribed_text}' (confidence: {confidence:.2f})")
|
|
|
|
return {
|
|
"success": True,
|
|
"text": transcribed_text,
|
|
"confidence": confidence,
|
|
"language": language,
|
|
"raw_result": {
|
|
"transcript": transcribed_text,
|
|
"confidence": confidence,
|
|
"language_code": language
|
|
}
|
|
}
|
|
else:
|
|
logger.warning("No transcription alternatives found")
|
|
logger.debug(f"Result details: {result}")
|
|
return {
|
|
"success": False,
|
|
"text": "",
|
|
"confidence": 0.0,
|
|
"error": "No transcription alternatives found"
|
|
}
|
|
else:
|
|
logger.warning("No recognition results from Google Cloud")
|
|
logger.debug(f"Response details: {response}")
|
|
|
|
# Check if there are any error messages in the response
|
|
if hasattr(response, 'error') and response.error:
|
|
logger.error(f"Google Cloud error: {response.error}")
|
|
return {
|
|
"success": False,
|
|
"text": "",
|
|
"confidence": 0.0,
|
|
"error": f"Google Cloud error: {response.error}"
|
|
}
|
|
|
|
# Try multiple fallback approaches
|
|
fallback_configs = []
|
|
|
|
if encoding != speech.RecognitionConfig.AudioEncoding.LINEAR16:
|
|
# For WEBM_OPUS, don't try LINEAR16 with detected sample rate as it causes conflicts
|
|
if audioFormat != "webm_opus":
|
|
# Try LINEAR16 with detected sample rate for non-WEBM formats
|
|
fallback_configs.append({
|
|
"encoding": speech.RecognitionConfig.AudioEncoding.LINEAR16,
|
|
"sample_rate": sampleRate,
|
|
"channels": channels,
|
|
"use_sample_rate": True,
|
|
"description": f"LINEAR16 with {sampleRate}Hz"
|
|
})
|
|
|
|
# For WEBM_OPUS, only try compatible sample rates or skip sample rate specification
|
|
if audioFormat == "webm_opus":
|
|
# Try WEBM_OPUS without sample rate specification (let Google read from header)
|
|
fallback_configs.append({
|
|
"encoding": speech.RecognitionConfig.AudioEncoding.WEBM_OPUS,
|
|
"sample_rate": 48000,
|
|
"channels": 1,
|
|
"use_sample_rate": False, # Don't specify sample rate
|
|
"description": f"WEBM_OPUS with auto sample rate"
|
|
})
|
|
# Try WEBM_OPUS with explicit 48000Hz (matching header)
|
|
fallback_configs.append({
|
|
"encoding": speech.RecognitionConfig.AudioEncoding.WEBM_OPUS,
|
|
"sample_rate": 48000,
|
|
"channels": 1,
|
|
"use_sample_rate": True,
|
|
"description": f"WEBM_OPUS with 48000Hz"
|
|
})
|
|
# Try LINEAR16 with 48000Hz as last resort (may not work with WEBM data)
|
|
fallback_configs.append({
|
|
"encoding": speech.RecognitionConfig.AudioEncoding.LINEAR16,
|
|
"sample_rate": 48000, # Match the WEBM header
|
|
"channels": 1,
|
|
"use_sample_rate": True,
|
|
"description": f"LINEAR16 with 48000Hz (WEBM compatible)"
|
|
})
|
|
else:
|
|
# For other formats, try standard sample rates
|
|
for std_rate in [16000, 8000, 22050, 44100]:
|
|
if std_rate != sampleRate:
|
|
fallback_configs.append({
|
|
"encoding": speech.RecognitionConfig.AudioEncoding.LINEAR16,
|
|
"sample_rate": std_rate,
|
|
"channels": 1,
|
|
"use_sample_rate": True,
|
|
"description": f"LINEAR16 with {std_rate}Hz"
|
|
})
|
|
|
|
# Try with different models
|
|
models = ["latest_long", "phone_call", "latest_short"]
|
|
|
|
for fallback_config in fallback_configs:
|
|
for model in models:
|
|
try:
|
|
logger.info(f"Trying fallback: {fallback_config['description']} with {model} model...")
|
|
|
|
# Build fallback config with proper sample rate handling
|
|
fallback_config_params = {
|
|
"encoding": fallback_config["encoding"],
|
|
"audio_channel_count": fallback_config["channels"],
|
|
"language_code": language,
|
|
"enable_automatic_punctuation": True,
|
|
"model": model
|
|
}
|
|
|
|
# Only add sample_rate_hertz if needed
|
|
if fallback_config["use_sample_rate"]:
|
|
fallback_config_params["sample_rate_hertz"] = fallback_config["sample_rate"]
|
|
|
|
fallback_config_obj = speech.RecognitionConfig(**fallback_config_params)
|
|
fallback_response = self.speech_client.recognize(config=fallback_config_obj, audio=audio)
|
|
|
|
if fallback_response.results:
|
|
result = fallback_response.results[0]
|
|
if result.alternatives:
|
|
alternative = result.alternatives[0]
|
|
transcribed_text = alternative.transcript
|
|
confidence = alternative.confidence
|
|
|
|
logger.info(f"Fallback transcription successful: '{transcribed_text}' (confidence: {confidence:.2f})")
|
|
|
|
return {
|
|
"success": True,
|
|
"text": transcribed_text,
|
|
"confidence": confidence,
|
|
"language": language,
|
|
"raw_result": {
|
|
"transcript": transcribed_text,
|
|
"confidence": confidence,
|
|
"language_code": language
|
|
}
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.debug(f"Fallback failed: {e}")
|
|
continue
|
|
|
|
logger.warning("All fallback attempts failed")
|
|
return {
|
|
"success": False,
|
|
"text": "",
|
|
"confidence": 0.0,
|
|
"error": "No recognition results - audio may be too short, unclear, or in unsupported format"
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Google Cloud Speech-to-Text error: {e}")
|
|
return {
|
|
"success": False,
|
|
"text": "",
|
|
"confidence": 0.0,
|
|
"error": str(e)
|
|
}
|
|
|
|
async def translateText(self, text: str, targetLanguage: str = "en",
|
|
sourceLanguage: str = "de") -> Dict:
|
|
"""
|
|
Translate text using Google Cloud Translation API.
|
|
|
|
Args:
|
|
text: Text to translate
|
|
target_language: Target language code (e.g., 'en', 'de')
|
|
source_language: Source language code (e.g., 'de', 'en')
|
|
|
|
Returns:
|
|
Dict containing translated text and metadata
|
|
"""
|
|
try:
|
|
if not text.strip():
|
|
logger.warning("⚠️ Empty text provided for translation")
|
|
return {
|
|
"success": False,
|
|
"translated_text": "",
|
|
"error": "Empty text provided"
|
|
}
|
|
|
|
logger.info(f"🌐 Translating: '{text}' ({sourceLanguage} -> {targetLanguage})")
|
|
|
|
# Perform translation
|
|
result = self.translate_client.translate(
|
|
text,
|
|
source_language=sourceLanguage,
|
|
target_language=targetLanguage
|
|
)
|
|
|
|
translatedText = result['translatedText']
|
|
detectedLanguage = result.get('detectedSourceLanguage', sourceLanguage)
|
|
|
|
# Decode HTML entities in translated text
|
|
translatedText = html.unescape(translatedText)
|
|
|
|
logger.info(f"✅ Translation successful: '{translatedText}'")
|
|
|
|
return {
|
|
"success": True,
|
|
"translated_text": translatedText,
|
|
"source_language": detectedLanguage,
|
|
"target_language": targetLanguage,
|
|
"original_text": text
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"❌ Google Cloud Translation error: {e}")
|
|
return {
|
|
"success": False,
|
|
"translated_text": "",
|
|
"error": str(e)
|
|
}
|
|
|
|
async def speechToTranslatedText(self, audioContent: bytes,
|
|
fromLanguage: str = "de-DE",
|
|
toLanguage: str = "en") -> Dict:
|
|
"""
|
|
Complete pipeline: Speech-to-Text + Translation.
|
|
|
|
Args:
|
|
audioContent: Raw audio data
|
|
from_language: Source language for speech recognition
|
|
to_language: Target language for translation
|
|
|
|
Returns:
|
|
Dict containing original text, translated text, and metadata
|
|
"""
|
|
try:
|
|
logger.info(f"🔄 Starting speech-to-translation pipeline: {fromLanguage} -> {toLanguage}")
|
|
|
|
# Step 1: Speech-to-Text
|
|
speechResult = await self.speechToText(
|
|
audioContent=audioContent,
|
|
language=fromLanguage
|
|
)
|
|
|
|
if not speechResult["success"]:
|
|
return {
|
|
"success": False,
|
|
"original_text": "",
|
|
"translated_text": "",
|
|
"error": f"Speech recognition failed: {speechResult.get('error', 'Unknown error')}"
|
|
}
|
|
|
|
originalText = speechResult["text"]
|
|
|
|
# Step 2: Translation
|
|
translationResult = await self.translateText(
|
|
text=originalText,
|
|
sourceLanguage=fromLanguage.split('-')[0], # Convert 'de-DE' to 'de'
|
|
targetLanguage=toLanguage.split('-')[0] # Convert 'en-US' to 'en'
|
|
)
|
|
|
|
if not translationResult["success"]:
|
|
return {
|
|
"success": False,
|
|
"original_text": originalText,
|
|
"translated_text": "",
|
|
"error": f"Translation failed: {translationResult.get('error', 'Unknown error')}"
|
|
}
|
|
|
|
translatedText = translationResult["translated_text"]
|
|
|
|
logger.info(f"✅ Complete pipeline successful:")
|
|
logger.info(f" Original: '{originalText}'")
|
|
logger.info(f" Translated: '{translatedText}'")
|
|
|
|
return {
|
|
"success": True,
|
|
"original_text": originalText,
|
|
"translated_text": translatedText,
|
|
"confidence": speechResult["confidence"],
|
|
"source_language": fromLanguage,
|
|
"target_language": toLanguage
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"❌ Speech-to-translation pipeline error: {e}")
|
|
return {
|
|
"success": False,
|
|
"original_text": "",
|
|
"translated_text": "",
|
|
"error": str(e)
|
|
}
|
|
|
|
def validateAudioFormat(self, audioContent: bytes) -> Dict:
|
|
"""
|
|
Validate audio format for Google Cloud Speech-to-Text.
|
|
|
|
Args:
|
|
audioContent: Raw audio data
|
|
|
|
Returns:
|
|
Dict containing validation results
|
|
"""
|
|
try:
|
|
# Basic validation
|
|
if len(audioContent) < 100:
|
|
return {
|
|
"valid": False,
|
|
"error": "Audio too short (less than 100 bytes)"
|
|
}
|
|
|
|
# Detect audio format by checking file headers
|
|
audio_format = "unknown"
|
|
sample_rate = 16000 # Default fallback
|
|
channels = 1 # Default fallback
|
|
|
|
# Debug: Log first few bytes for format detection
|
|
logger.debug(f"Audio header bytes: {audioContent[:20].hex()}")
|
|
logger.debug(f"Audio content length: {len(audioContent)} bytes")
|
|
|
|
# Check for WEBM/OPUS format (common from web recordings)
|
|
if audioContent.startswith(b'\x1a\x45\xdf\xa3'):
|
|
audio_format = "webm_opus"
|
|
sample_rate = 48000 # WEBM OPUS typically uses 48kHz
|
|
channels = 1
|
|
logger.info(f"Detected WEBM OPUS format: {sample_rate}Hz, {channels}ch")
|
|
|
|
# Check for specific header patterns seen in logs (43c381...)
|
|
# This appears to be a different audio format or corrupted WEBM
|
|
elif audioContent.startswith(b'\x43\xc3\x81') and len(audioContent) > 1000:
|
|
# This might be a different format or corrupted audio
|
|
# Try to detect if it's actually WEBM by looking deeper
|
|
if b'webm' in audioContent[:200] or b'opus' in audioContent[:200]:
|
|
audio_format = "webm_opus"
|
|
sample_rate = 48000
|
|
channels = 1
|
|
logger.info(f"Detected WEBM format (deep scan): {sample_rate}Hz, {channels}ch")
|
|
else:
|
|
# Unknown format, try as LINEAR16
|
|
audio_format = "linear16"
|
|
sample_rate = 16000
|
|
channels = 1
|
|
logger.warning(f"Unknown audio format with header {audioContent[:8].hex()}, trying LINEAR16")
|
|
|
|
# Check for WEBM format (alternative detection)
|
|
elif b'webm' in audioContent[:100].lower() or b'opus' in audioContent[:100].lower():
|
|
audio_format = "webm_opus"
|
|
sample_rate = 48000 # WEBM OPUS typically uses 48kHz
|
|
channels = 1
|
|
logger.info(f"Detected WEBM format: {sample_rate}Hz, {channels}ch")
|
|
|
|
# Check for MediaRecorder WEBM chunks (common in browser recordings)
|
|
elif audioContent.startswith(b'\x1a\x45\xdf\xa3') and len(audioContent) > 1000:
|
|
audio_format = "webm_opus"
|
|
sample_rate = 48000 # Browser MediaRecorder typically uses 48kHz
|
|
channels = 1
|
|
logger.info(f"Detected MediaRecorder WEBM: {sample_rate}Hz, {channels}ch")
|
|
|
|
# Check for OPUS format by looking for OPUS magic bytes
|
|
elif audioContent.startswith(b'OpusHead') or b'OpusHead' in audioContent[:50]:
|
|
audio_format = "webm_opus"
|
|
sample_rate = 48000 # OPUS typically uses 48kHz
|
|
channels = 1
|
|
logger.info(f"Detected OPUS format: {sample_rate}Hz, {channels}ch")
|
|
|
|
# Check for OGG format (often contains OPUS)
|
|
elif audioContent.startswith(b'OggS'):
|
|
audio_format = "webm_opus"
|
|
sample_rate = 48000 # OGG OPUS typically uses 48kHz
|
|
channels = 1
|
|
logger.info(f"Detected OGG format: {sample_rate}Hz, {channels}ch")
|
|
|
|
# Check for WAV format
|
|
elif audioContent.startswith(b'RIFF') and b'WAVE' in audioContent[:12]:
|
|
audio_format = "wav"
|
|
# Try to extract sample rate from WAV header
|
|
try:
|
|
# WAV header sample rate is at offset 24-27 (little endian)
|
|
sample_rate = int.from_bytes(audioContent[24:28], 'little')
|
|
channels = int.from_bytes(audioContent[22:24], 'little')
|
|
logger.info(f"Detected WAV format: {sample_rate}Hz, {channels}ch")
|
|
except:
|
|
sample_rate = 16000 # Fallback
|
|
channels = 1
|
|
|
|
# Check for MP3 format
|
|
elif audioContent.startswith(b'\xff\xfb') or audioContent.startswith(b'ID3'):
|
|
audio_format = "mp3"
|
|
sample_rate = 44100 # MP3 typically uses 44.1kHz
|
|
channels = 2 # Usually stereo
|
|
logger.info(f"Detected MP3 format: {sample_rate}Hz, {channels}ch")
|
|
|
|
# Check for FLAC format
|
|
elif audioContent.startswith(b'fLaC'):
|
|
audio_format = "flac"
|
|
sample_rate = 44100 # Common FLAC sample rate
|
|
channels = 2
|
|
logger.info(f"Detected FLAC format: {sample_rate}Hz, {channels}ch")
|
|
|
|
else:
|
|
# Unknown format, try WEBM_OPUS as it's most common for web recordings
|
|
audio_format = "webm_opus"
|
|
sample_rate = 48000 # Try 48kHz for web recordings
|
|
channels = 1
|
|
logger.warning(f"Unknown audio format, trying WEBM_OPUS: {sample_rate}Hz, {channels}ch")
|
|
|
|
# Calculate estimated duration
|
|
if audio_format == "webm_opus":
|
|
# WEBM OPUS duration is hard to calculate without decoding
|
|
estimated_duration = 3.0 # Assume 3 seconds for web recordings
|
|
else:
|
|
# Rough estimate for uncompressed audio
|
|
estimated_duration = len(audioContent) / (sample_rate * channels * 2) # 16-bit = 2 bytes per sample
|
|
|
|
# Check if audio is too short (less than 0.5 seconds)
|
|
if estimated_duration < 0.5:
|
|
logger.warning(f"Audio too short: {estimated_duration:.2f}s, may not be recognized")
|
|
|
|
# Log audio details for debugging
|
|
logger.info(f"Audio analysis: {len(audioContent)} bytes, {estimated_duration:.2f}s, {sample_rate}Hz, {channels}ch, format={audio_format}")
|
|
|
|
# Check audio levels (simple check for silence)
|
|
if audio_format == "webm_opus":
|
|
# For WEBM, we can't easily check levels, but log the first few bytes
|
|
logger.debug(f"Audio sample bytes: {audioContent[:20].hex()}")
|
|
# Check if audio has some variation (not all same bytes)
|
|
if len(audioContent) > 100:
|
|
sample_bytes = audioContent[100:200] # Skip header
|
|
if len(set(sample_bytes)) < 5: # Less than 5 different byte values
|
|
logger.warning("Audio may be silent or very quiet (low byte variation)")
|
|
else:
|
|
logger.debug(f"Audio has good byte variation: {len(set(sample_bytes))} unique values")
|
|
else:
|
|
# For PCM audio, check for silence
|
|
if len(audioContent) > 100:
|
|
# Convert first 100 bytes to check for silence
|
|
sample_bytes = audioContent[:100]
|
|
if all(b == 0 for b in sample_bytes):
|
|
logger.warning("Audio appears to be silent (all zeros)")
|
|
else:
|
|
logger.debug(f"Audio sample bytes: {sample_bytes[:20].hex()}")
|
|
# Check for low variation
|
|
if len(set(sample_bytes)) < 5:
|
|
logger.warning("Audio may be very quiet (low byte variation)")
|
|
|
|
return {
|
|
"valid": True,
|
|
"format": audio_format,
|
|
"sample_rate": sample_rate,
|
|
"channels": channels,
|
|
"size": len(audioContent),
|
|
"estimated_duration": estimated_duration
|
|
}
|
|
|
|
except Exception as e:
|
|
return {
|
|
"valid": False,
|
|
"error": f"Validation error: {e}"
|
|
}
|
|
|
|
async def textToSpeech(self, text: str, languageCode: str = "de-DE", voiceName: str = None) -> Dict[str, Any]:
|
|
"""
|
|
Convert text to speech using Google Cloud Text-to-Speech.
|
|
|
|
Args:
|
|
text: Text to convert to speech
|
|
language_code: Language code (e.g., 'de-DE', 'en-US')
|
|
voice_name: Specific voice name (optional)
|
|
|
|
Returns:
|
|
Dict with success status and audio data
|
|
"""
|
|
try:
|
|
logger.info(f"Converting text to speech: '{text[:50]}...' in {languageCode}")
|
|
|
|
# Set up the synthesis input
|
|
synthesisInput = texttospeech.SynthesisInput(text=text)
|
|
|
|
# Build the voice request
|
|
selectedVoice = voiceName or self._getDefaultVoice(languageCode)
|
|
|
|
if not selectedVoice:
|
|
return {
|
|
"success": False,
|
|
"error": f"No voice specified for language {languageCode}. Please select a voice."
|
|
}
|
|
|
|
logger.info(f"Using TTS voice: {selectedVoice} for language: {languageCode}")
|
|
|
|
voice = texttospeech.VoiceSelectionParams(
|
|
language_code=languageCode,
|
|
name=selectedVoice,
|
|
ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL
|
|
)
|
|
|
|
# Select the type of audio file to return
|
|
audioConfig = texttospeech.AudioConfig(
|
|
audio_encoding=texttospeech.AudioEncoding.MP3
|
|
)
|
|
|
|
# Perform the text-to-speech request
|
|
response = self.tts_client.synthesize_speech(
|
|
input=synthesisInput,
|
|
voice=voice,
|
|
audio_config=audioConfig
|
|
)
|
|
|
|
# Return the audio content
|
|
return {
|
|
"success": True,
|
|
"audio_content": response.audio_content,
|
|
"audio_format": "mp3",
|
|
"language_code": languageCode,
|
|
"voice_name": voice.name
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Text-to-Speech error: {e}")
|
|
return {
|
|
"success": False,
|
|
"error": f"Text-to-Speech failed: {str(e)}"
|
|
}
|
|
|
|
def _getDefaultVoice(self, languageCode: str) -> str:
|
|
"""
|
|
Get default voice name for a language code.
|
|
Returns None - no defaults, let the frontend handle voice selection.
|
|
"""
|
|
return None
|
|
|
|
async def getAvailableLanguages(self) -> Dict[str, Any]:
|
|
"""
|
|
Get available languages from Google Cloud Text-to-Speech.
|
|
|
|
Returns:
|
|
Dict containing success status and list of available languages
|
|
"""
|
|
try:
|
|
logger.info("🌐 Getting available languages from Google Cloud TTS")
|
|
|
|
# List voices from Google Cloud TTS
|
|
voices = self.tts_client.list_voices()
|
|
|
|
# Extract unique language codes
|
|
language_codes = set()
|
|
for voice in voices:
|
|
if voice.language_codes:
|
|
language_codes.update(voice.language_codes)
|
|
|
|
# Convert to sorted list
|
|
available_languages = sorted(list(language_codes))
|
|
|
|
logger.info(f"✅ Found {len(available_languages)} available languages")
|
|
|
|
return {
|
|
"success": True,
|
|
"languages": available_languages
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"❌ Failed to get available languages: {e}")
|
|
return {
|
|
"success": False,
|
|
"error": str(e),
|
|
"languages": []
|
|
}
|
|
|
|
async def getAvailableVoices(self, languageCode: Optional[str] = None) -> Dict[str, Any]:
|
|
"""
|
|
Get available voices from Google Cloud Text-to-Speech.
|
|
|
|
Args:
|
|
language_code: Optional language code to filter voices (e.g., 'de-DE', 'en-US')
|
|
|
|
Returns:
|
|
Dict containing success status and list of available voices
|
|
"""
|
|
try:
|
|
logger.info(f"🎤 Getting available voices from Google Cloud TTS, language filter: {languageCode}")
|
|
|
|
# List voices from Google Cloud TTS
|
|
voices = self.tts_client.list_voices()
|
|
|
|
availableVoices = []
|
|
|
|
for voice in voices:
|
|
# Extract language code from voice name (e.g., 'de-DE-Wavenet-A' -> 'de-DE')
|
|
voiceLanguage = voice.language_codes[0] if voice.language_codes else None
|
|
|
|
# Filter by language if specified
|
|
if languageCode and voiceLanguage != languageCode:
|
|
continue
|
|
|
|
# Determine gender from voice name (A/C = male, B/D = female)
|
|
gender = "Unknown"
|
|
if voice.name:
|
|
if voice.name.endswith(('-A', '-C')):
|
|
gender = "Male"
|
|
elif voice.name.endswith(('-B', '-D')):
|
|
gender = "Female"
|
|
|
|
# Create voice info
|
|
voiceInfo = {
|
|
"name": voice.name,
|
|
"language_code": voiceLanguage,
|
|
"gender": gender,
|
|
"ssml_gender": voice.ssml_gender.name if voice.ssml_gender else "NEUTRAL",
|
|
"natural_sample_rate_hertz": voice.natural_sample_rate_hertz
|
|
}
|
|
|
|
availableVoices.append(voiceInfo)
|
|
|
|
# Sort by language code, then by gender, then by name
|
|
availableVoices.sort(key=lambda x: (x["language_code"], x["gender"], x["name"]))
|
|
|
|
logger.info(f"✅ Found {len(availableVoices)} voices for language filter: {languageCode}")
|
|
|
|
return {
|
|
"success": True,
|
|
"voices": availableVoices,
|
|
"total_count": len(availableVoices)
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"❌ Failed to get available voices: {e}")
|
|
return {
|
|
"success": False,
|
|
"error": str(e),
|
|
"voices": []
|
|
}
|
|
|