gateway/modules/interfaces/interfaceVoiceObjects.py
2025-09-26 23:36:56 +02:00

498 lines
17 KiB
Python

"""
Interface for Voice Services
Provides a generic interface layer between routes and voice connectors.
Handles voice operations including speech-to-text, text-to-speech, and translation.
"""
import logging
from typing import Dict, Any, Optional, List
from datetime import datetime, UTC
from modules.connectors.connectorVoiceGoogle import ConnectorGoogleSpeech
from modules.datamodels.datamodelVoice import VoiceSettings
from modules.datamodels.datamodelUam import User
from modules.shared.timezoneUtils import get_utc_timestamp
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Registry intended for cached Voice interface instances.
# NOTE(review): nothing in the visible code reads or writes this dict;
# getVoiceInterface() currently builds a fresh VoiceObjects on every call.
_instancesVoice = {}
class VoiceObjects:
    """
    Interface for Voice Services.

    Provides a generic interface layer between routes and voice connectors.
    Speech-to-text, translation and text-to-speech calls are delegated to a
    lazily created ConnectorGoogleSpeech instance. The service methods catch
    every exception, log it, and report the failure through the returned
    dict (or ``None`` for the settings helpers) instead of raising.
    """

    def __init__(self):
        """Initialize the Voice Interface without user context or connector."""
        self.currentUser: Optional[User] = None
        self.userId: Optional[str] = None
        # Created on first use by _getGoogleSpeechConnector().
        self._google_speech_connector: Optional[ConnectorGoogleSpeech] = None

    def setUserContext(self, currentUser: Optional[User]):
        """
        Set the user context for the interface.

        Args:
            currentUser: User whose ``id`` becomes the interface's userId.
                A falsy value leaves the interface without user context.

        Raises:
            ValueError: If the user has no usable id. The previously stored
                context is left untouched in that case.
        """
        if not currentUser:
            logger.info("Initializing voice interface without user context")
            return

        # Validate before assigning so a rejected user never ends up
        # half-stored on the instance.
        userId = currentUser.id
        if not userId:
            raise ValueError("Invalid user context: id is required")

        self.currentUser = currentUser
        self.userId = userId
        logger.debug(f"Voice interface user context set: userId={self.userId}")

    def _getGoogleSpeechConnector(self) -> ConnectorGoogleSpeech:
        """
        Get or create the Google Cloud Speech connector instance.

        Returns:
            The connector cached on this interface instance.

        Raises:
            Exception: Whatever the connector constructor raised; it is
                logged and re-raised unchanged.
        """
        if self._google_speech_connector is None:
            try:
                self._google_speech_connector = ConnectorGoogleSpeech()
                logger.info("✅ Google Cloud Speech connector initialized")
            except Exception as e:
                logger.error(f"❌ Failed to initialize Google Cloud Speech connector: {e}")
                raise
        return self._google_speech_connector

    # Speech-to-Text Operations

    async def speechToText(self, audioContent: bytes, language: str = "de-DE",
                           sampleRate: Optional[int] = None,
                           channels: Optional[int] = None) -> Dict[str, Any]:
        """
        Convert speech to text using Google Cloud Speech-to-Text API.

        Args:
            audioContent: Raw audio data
            language: Language code (e.g., 'de-DE', 'en-US')
            sampleRate: Audio sample rate (auto-detected if None)
            channels: Number of audio channels (auto-detected if None)

        Returns:
            Dict containing transcribed text, confidence, and metadata.
            On failure the dict has ``success`` False and an ``error`` text.
        """
        try:
            # Reject missing/empty audio up front instead of failing on
            # len(None) or spending a connector call on nothing.
            if not audioContent:
                logger.warning("⚠️ Speech-to-text failed: Empty audio content provided")
                return {
                    "success": False,
                    "text": "",
                    "confidence": 0.0,
                    "error": "Empty audio content provided"
                }

            logger.info(f"🎤 Speech-to-text request: {len(audioContent)} bytes, language: {language}")
            connector = self._getGoogleSpeechConnector()
            result = await connector.speech_to_text(
                audio_content=audioContent,
                language=language,
                sample_rate=sampleRate,
                channels=channels
            )

            if result["success"]:
                # Tolerant lookups: a missing optional key must not turn a
                # successful transcription into an error via the log line.
                confidence = result.get("confidence") or 0.0
                logger.info(f"✅ Speech-to-text successful: '{result.get('text', '')}' (confidence: {confidence:.2f})")
            else:
                logger.warning(f"⚠️ Speech-to-text failed: {result.get('error', 'Unknown error')}")
            return result
        except Exception as e:
            logger.error(f"❌ Speech-to-text error: {e}")
            return {
                "success": False,
                "text": "",
                "confidence": 0.0,
                "error": str(e)
            }

    # Translation Operations

    async def translateText(self, text: str, sourceLanguage: str = "de",
                            targetLanguage: str = "en") -> Dict[str, Any]:
        """
        Translate text using Google Cloud Translation API.

        Args:
            text: Text to translate
            sourceLanguage: Source language code (e.g., 'de', 'en')
            targetLanguage: Target language code (e.g., 'en', 'de')

        Returns:
            Dict containing translated text and metadata. Missing or
            whitespace-only text yields ``success`` False without calling
            the connector.
        """
        try:
            logger.info(f"🌐 Translation request: '{text}' ({sourceLanguage} -> {targetLanguage})")

            # "not text" also covers None, which has no .strip().
            if not text or not text.strip():
                return {
                    "success": False,
                    "translated_text": "",
                    "error": "Empty text provided"
                }

            connector = self._getGoogleSpeechConnector()
            result = await connector.translate_text(
                text=text,
                source_language=sourceLanguage,
                target_language=targetLanguage
            )

            if result["success"]:
                logger.info(f"✅ Translation successful: '{result.get('translated_text', '')}'")
            else:
                logger.warning(f"⚠️ Translation failed: {result.get('error', 'Unknown error')}")
            return result
        except Exception as e:
            logger.error(f"❌ Translation error: {e}")
            return {
                "success": False,
                "translated_text": "",
                "error": str(e)
            }

    # Combined Operations

    async def speechToTranslatedText(self, audioContent: bytes,
                                     fromLanguage: str = "de-DE",
                                     toLanguage: str = "en") -> Dict[str, Any]:
        """
        Complete pipeline: Speech-to-Text + Translation.

        Args:
            audioContent: Raw audio data
            fromLanguage: Source language for speech recognition
            toLanguage: Target language for translation

        Returns:
            Dict containing original text, translated text, and metadata.
            On failure the dict has ``success`` False and an ``error`` text.
        """
        try:
            logger.info(f"🔄 Speech-to-translation pipeline: {fromLanguage} -> {toLanguage}")

            # Same empty-input guard as speechToText.
            if not audioContent:
                logger.warning("⚠️ Speech-to-translation pipeline failed: Empty audio content provided")
                return {
                    "success": False,
                    "original_text": "",
                    "translated_text": "",
                    "error": "Empty audio content provided"
                }

            connector = self._getGoogleSpeechConnector()
            result = await connector.speech_to_translated_text(
                audio_content=audioContent,
                from_language=fromLanguage,
                to_language=toLanguage
            )

            if result["success"]:
                logger.info("✅ Complete pipeline successful:")
                logger.info(f" Original: '{result.get('original_text', '')}'")
                logger.info(f" Translated: '{result.get('translated_text', '')}'")
            else:
                logger.warning(f"⚠️ Speech-to-translation pipeline failed: {result.get('error', 'Unknown error')}")
            return result
        except Exception as e:
            logger.error(f"❌ Speech-to-translation pipeline error: {e}")
            return {
                "success": False,
                "original_text": "",
                "translated_text": "",
                "error": str(e)
            }

    # Text-to-Speech Operations

    async def textToSpeech(self, text: str, languageCode: str = "de-DE",
                           voiceName: Optional[str] = None) -> Dict[str, Any]:
        """
        Convert text to speech using Google Cloud Text-to-Speech.

        Args:
            text: Text to convert to speech
            languageCode: Language code (e.g., 'de-DE', 'en-US')
            voiceName: Specific voice name (optional)

        Returns:
            Dict with success status and audio data. Missing or
            whitespace-only text yields ``success`` False without calling
            the connector.
        """
        try:
            # Normalise None to "" so the preview and the emptiness check
            # cannot raise; only add the ellipsis when text was truncated.
            content = text or ""
            preview = content if len(content) <= 50 else f"{content[:50]}..."
            logger.info(f"🔊 Text-to-Speech request: '{preview}' in {languageCode}")

            if not content.strip():
                return {
                    "success": False,
                    "error": "Empty text provided for text-to-speech"
                }

            connector = self._getGoogleSpeechConnector()
            result = await connector.text_to_speech(
                text=text,
                language_code=languageCode,
                voice_name=voiceName
            )

            if result["success"]:
                audioSize = len(result.get("audio_content") or b"")
                logger.info(f"✅ Text-to-Speech successful: {audioSize} bytes")
            else:
                logger.warning(f"⚠️ Text-to-Speech failed: {result.get('error', 'Unknown error')}")
            return result
        except Exception as e:
            logger.error(f"❌ Text-to-Speech error: {e}")
            return {
                "success": False,
                "error": str(e)
            }

    # Voice Settings Management

    def getVoiceSettings(self, userId: str) -> Optional[VoiceSettings]:
        """
        Get voice settings for a user.

        Currently a placeholder: no lookup is performed here and the method
        always returns None.

        Args:
            userId: User ID to get settings for

        Returns:
            VoiceSettings object or None if not found
        """
        try:
            # This would typically query the database
            # For now, return None as this is handled by the database interface
            logger.debug(f"Getting voice settings for user: {userId}")
            return None
        except Exception as e:
            logger.error(f"❌ Error getting voice settings: {e}")
            return None

    def createVoiceSettings(self, settingsData: Dict[str, Any]) -> Optional[VoiceSettings]:
        """
        Create new voice settings.

        Builds a VoiceSettings object from ``settingsData`` plus fresh
        ``creationDate`` / ``lastModified`` timestamps. Nothing is persisted
        here, and the caller's dict is not modified.

        Args:
            settingsData: Dictionary containing voice settings data

        Returns:
            Created VoiceSettings object or None if failed
        """
        try:
            logger.info(f"Creating voice settings: {settingsData}")

            # Work on a copy so the caller's dict is left untouched.
            payload = dict(settingsData)
            currentTime = get_utc_timestamp()
            payload["creationDate"] = currentTime
            payload["lastModified"] = currentTime

            voiceSettings = VoiceSettings(**payload)
            logger.info(f"✅ Voice settings created: {voiceSettings.id}")
            return voiceSettings
        except Exception as e:
            logger.error(f"❌ Error creating voice settings: {e}")
            return None

    def updateVoiceSettings(self, userId: str, settingsData: Dict[str, Any]) -> Optional[VoiceSettings]:
        """
        Update existing voice settings.

        Builds a VoiceSettings object from ``settingsData`` with a refreshed
        ``lastModified`` timestamp. Nothing is persisted here, and the
        caller's dict is not modified.

        Args:
            userId: User ID to update settings for (only used for logging;
                the object is built from ``settingsData`` alone)
            settingsData: Dictionary containing updated voice settings data

        Returns:
            Updated VoiceSettings object or None if failed
        """
        try:
            logger.info(f"Updating voice settings for user {userId}: {settingsData}")

            # Work on a copy so the caller's dict is left untouched.
            payload = dict(settingsData)
            payload["lastModified"] = get_utc_timestamp()

            voiceSettings = VoiceSettings(**payload)
            logger.info(f"✅ Voice settings updated: {voiceSettings.id}")
            return voiceSettings
        except Exception as e:
            logger.error(f"❌ Error updating voice settings: {e}")
            return None

    def getOrCreateVoiceSettings(self, userId: str) -> Optional[VoiceSettings]:
        """
        Get existing voice settings or create default ones.

        Because getVoiceSettings() is a placeholder that returns None, this
        currently always builds the default settings.

        Args:
            userId: User ID to get/create settings for

        Returns:
            VoiceSettings object, or None if creation failed
        """
        try:
            existingSettings = self.getVoiceSettings(userId)
            if existingSettings:
                return existingSettings

            # Defaults used when the user has no stored settings.
            defaultSettings = {
                "userId": userId,
                "sttLanguage": "de-DE",
                "ttsLanguage": "de-DE",
                "ttsVoice": "de-DE-Wavenet-A",
                "translationEnabled": True,
                "targetLanguage": "en-US"
            }
            return self.createVoiceSettings(defaultSettings)
        except Exception as e:
            logger.error(f"❌ Error getting or creating voice settings: {e}")
            return None

    # Language and Voice Information

    async def getAvailableLanguages(self) -> Dict[str, Any]:
        """
        Get available languages from Google Cloud Text-to-Speech.

        Returns:
            Dict containing success status and list of available languages
        """
        try:
            logger.info("🌐 Getting available languages from Google Cloud TTS")
            connector = self._getGoogleSpeechConnector()
            result = await connector.get_available_languages()

            if result["success"]:
                languageCount = len(result.get("languages") or [])
                logger.info(f"✅ Found {languageCount} available languages")
            else:
                logger.warning(f"⚠️ Failed to get languages: {result.get('error', 'Unknown error')}")
            return result
        except Exception as e:
            logger.error(f"❌ Error getting available languages: {e}")
            return {
                "success": False,
                "error": str(e),
                "languages": []
            }

    async def getAvailableVoices(self, languageCode: Optional[str] = None) -> Dict[str, Any]:
        """
        Get available voices from Google Cloud Text-to-Speech.

        Args:
            languageCode: Optional language code to filter voices

        Returns:
            Dict containing success status and list of available voices
        """
        try:
            logger.info(f"🎤 Getting available voices, language filter: {languageCode}")
            connector = self._getGoogleSpeechConnector()
            result = await connector.get_available_voices(language_code=languageCode)

            if result["success"]:
                voiceCount = len(result.get("voices") or [])
                logger.info(f"✅ Found {voiceCount} voices for language filter: {languageCode}")
            else:
                logger.warning(f"⚠️ Failed to get voices: {result.get('error', 'Unknown error')}")
            return result
        except Exception as e:
            logger.error(f"❌ Error getting available voices: {e}")
            return {
                "success": False,
                "error": str(e),
                "voices": []
            }

    # Audio Validation

    def validateAudioFormat(self, audioContent: bytes) -> Dict[str, Any]:
        """
        Validate audio format for Google Cloud Speech-to-Text.

        Args:
            audioContent: Raw audio data

        Returns:
            Dict containing validation results; ``valid`` is False and
            ``error`` is set when validation fails or raises.
        """
        try:
            logger.debug(f"Validating audio format: {len(audioContent)} bytes")
            connector = self._getGoogleSpeechConnector()
            result = connector.validate_audio_format(audioContent)

            if result["valid"]:
                # .get() keeps a missing detail key from flipping a valid
                # result into an error through the log line.
                logger.debug(
                    f"✅ Audio validation successful: {result.get('format')}, "
                    f"{result.get('sample_rate')}Hz, {result.get('channels')}ch"
                )
            else:
                logger.warning(f"⚠️ Audio validation failed: {result.get('error', 'Unknown error')}")
            return result
        except Exception as e:
            logger.error(f"❌ Audio validation error: {e}")
            return {
                "valid": False,
                "error": str(e)
            }

    # Health Check

    async def healthCheck(self) -> Dict[str, Any]:
        """
        Perform health check for voice services.

        Sends a one-word translation ("Hello", en -> de) through the
        connector and reports the outcome.

        Returns:
            Dict containing health status and test results
        """
        try:
            logger.info("🏥 Performing voice services health check")
            connector = self._getGoogleSpeechConnector()

            # Test with a simple translation
            testResult = await connector.translate_text(
                text="Hello",
                source_language="en",
                target_language="de"
            )

            if testResult["success"]:
                return {
                    "status": "healthy",
                    "service": "Google Cloud Speech-to-Text & Translation",
                    "test_translation": testResult["translated_text"]
                }
            return {
                "status": "unhealthy",
                "error": testResult.get("error", "Unknown error")
            }
        except Exception as e:
            logger.error(f"❌ Health check failed: {e}")
            return {
                "status": "unhealthy",
                "error": str(e)
            }
def getVoiceInterface(currentUser: User = None) -> VoiceObjects:
    """
    Build a Voice interface instance, optionally bound to a user.

    Args:
        currentUser: User object for context (optional)

    Returns:
        VoiceObjects instance
    """
    # A new instance is built on every call; per-user caching could be
    # added here later.
    instance = VoiceObjects()
    if not currentUser:
        return instance
    instance.setUserContext(currentUser)
    return instance