""" Azure Voice Services Route Provides endpoints for Azure Speech Services integration including: - Speech-to-Text (STT) - Text-to-Speech (TTS) - Translation services - Real-time conversation """ import logging import asyncio import json from typing import Dict, Any, Optional from fastapi import APIRouter, HTTPException, Depends, UploadFile, File, Form from fastapi.responses import StreamingResponse from pydantic import BaseModel import io import base64 import asyncio from typing import AsyncGenerator from datetime import datetime from modules.interfaces.interfaceAppObjects import getRootInterface from modules.interfaces.interfaceAppModel import UserInDB from modules.security.auth import getCurrentUser from modules.connectors.connectorAzureSpeech import ConnectorAzureSpeech logger = logging.getLogger(__name__) router = APIRouter(prefix="/api/voice", tags=["voice"]) # Pydantic models for request/response class SpeechToTextRequest(BaseModel): language: str = "de-DE" format: str = "detailed" # "simple" or "detailed" class TextToSpeechRequest(BaseModel): text: str language: str = "de-DE" voice: str = "de-DE-KatjaNeural" format: str = "audio-16khz-128kbitrate-mono-mp3" class TranslationRequest(BaseModel): text: str from_language: str = "de-DE" to_language: str = "en-US" class ConversationRequest(BaseModel): message: str language: str = "de-DE" response_voice: str = "de-DE-KatjaNeural" class VoiceSettingsRequest(BaseModel): stt_language: str = "de-DE" tts_language: str = "de-DE" tts_voice: str = "de-DE-KatjaNeural" translation_enabled: bool = True target_language: str = "en-US" # Get Azure Speech connector for current user async def get_azure_speech_connector(current_user: UserInDB = Depends(getCurrentUser), connection_id: Optional[str] = None) -> ConnectorAzureSpeech: """Get Azure Speech connector for the current user.""" try: root_interface = getRootInterface() # Get user connections user_connections = root_interface.getUserConnections(current_user.id) 
azure_connection = None if connection_id: # Find specific connection by ID for connection in user_connections: if connection.id == connection_id and connection.authority == "msft": azure_connection = connection break else: # Find first Azure connection for connection in user_connections: if connection.authority == "msft": azure_connection = connection break if not azure_connection: if connection_id: raise HTTPException( status_code=400, detail=f"Azure connection with ID '{connection_id}' not found." ) else: raise HTTPException( status_code=400, detail="No Azure connection found. Please connect your Microsoft account first." ) # Get connection token connection_token = root_interface.getConnectionToken(azure_connection.id) if not connection_token: raise HTTPException( status_code=400, detail="Azure connection token not found. Please reconnect your Microsoft account." ) # For Azure Speech Services, we need a subscription key, not an access token # Use Azure Speech Services subscription key from configuration from modules.shared.configuration import APP_CONFIG subscription_key = APP_CONFIG.get("Connector_AzureSpeech_SUBSCRIPTION_KEY") # Debug: Log subscription key (masked for security) logger.info(f"Loaded subscription key: {subscription_key[:8]}...{subscription_key[-8:] if subscription_key and len(subscription_key) > 16 else 'INVALID'}") logger.info(f"Key length: {len(subscription_key) if subscription_key else 'None'}") if not subscription_key or subscription_key == "your-azure-speech-subscription-key-here": raise HTTPException( status_code=500, detail="Azure Speech Services subscription key not configured. 
Please set Connector_AzureSpeech_SUBSCRIPTION_KEY in config.ini" ) # Get region from configuration region = APP_CONFIG.get("Connector_AzureSpeech_REGION", "westeurope") # Create Azure Speech connector connector = ConnectorAzureSpeech(subscription_key=subscription_key, region=region) return connector except Exception as e: logger.error(f"Error getting Azure Speech connector: {str(e)}") raise HTTPException(status_code=500, detail=f"Failed to initialize Azure Speech connector: {str(e)}") @router.get("/connections") async def get_user_connections(current_user: UserInDB = Depends(getCurrentUser)): """Get all Microsoft connections for the current user.""" try: root_interface = getRootInterface() # Get user connections user_connections = root_interface.getUserConnections(current_user.id) # Filter for Microsoft connections only msft_connections = [] for connection in user_connections: if connection.authority == "msft": # Check if this connection has speech services subscription key has_speech_key = False if hasattr(connection, 'metadata') and connection.metadata: try: metadata = json.loads(connection.metadata) if isinstance(connection.metadata, str) else connection.metadata has_speech_key = bool(metadata.get('speech_subscription_key') or metadata.get('service_type') == 'speech_services') except (json.JSONDecodeError, AttributeError): pass msft_connections.append({ "id": connection.id, "externalUsername": connection.externalUsername, "authority": connection.authority, "isActive": connection.status == "active", "hasSpeechKey": has_speech_key, "connectedAt": connection.connectedAt, "lastChecked": connection.lastChecked }) return { "success": True, "connections": msft_connections, "count": len(msft_connections) } except Exception as e: logger.error(f"Error getting user connections: {str(e)}") raise HTTPException(status_code=500, detail=str(e)) class SpeechSubscriptionRequest(BaseModel): subscription_key: str region: str = "westeurope" connection_id: Optional[str] = None 
# Shortest subscription key accepted by POST /subscription; shorter keys are
# also never partially revealed by GET /subscription.
_MIN_SUBSCRIPTION_KEY_LENGTH = 32

# Regions accepted by POST /subscription, in the order they are listed in the
# validation error message.
_SUPPORTED_SPEECH_REGIONS = (
    "westeurope", "eastus", "westus2", "eastus2", "southeastasia",
    "westcentralus", "eastasia", "northeurope", "southcentralus",
    "centralus", "australiaeast", "brazilsouth", "canadacentral",
    "centralindia", "francecentral", "germanywestcentral", "japaneast",
    "koreacentral", "norwayeast", "southafricanorth", "switzerlandnorth",
    "uaenorth", "uksouth", "westus3",
)

# Settings reported when the user has not stored any voice settings yet.
_DEFAULT_VOICE_SETTINGS = {
    "sttLanguage": "de-DE",
    "ttsLanguage": "de-DE",
    "ttsVoice": "de-DE-KatjaNeural",
    "translationEnabled": True,
    "targetLanguage": "en-US",
}

# Headers shared by all streaming responses.
_STREAM_HEADERS = {
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
}


def _mask_subscription_key(key: str) -> str:
    """Return a display-safe form of a subscription key.

    Keys of plausible length show their first and last 8 characters. Shorter
    keys are masked completely, because the two slices would otherwise
    overlap and reveal the whole value.
    """
    if len(key) < _MIN_SUBSCRIPTION_KEY_LENGTH:
        return "***"
    return f"{key[:8]}...{key[-8:]}"


def _read_connection_metadata(connection) -> dict:
    """Return a private copy of a connection's metadata as a dict.

    Accepts a JSON string or a mapping. Anything missing, unparsable or not
    a mapping yields an empty dict. The connection object is never mutated.
    """
    raw_metadata = getattr(connection, "metadata", None)
    if not raw_metadata:
        return {}
    if isinstance(raw_metadata, str):
        try:
            raw_metadata = json.loads(raw_metadata)
        except json.JSONDecodeError:
            return {}
    return dict(raw_metadata) if isinstance(raw_metadata, dict) else {}


def _read_setting(settings, name: str):
    """Read one voice setting from either an object or a dict."""
    if hasattr(settings, name):
        return getattr(settings, name)
    return settings.get(name)


def _split_sentences(text: str) -> list:
    """Split text on ". " into chunks that each end with punctuation and a space.

    A fragment that already ends with ".", "!" or "?" is not given a second
    full stop.
    """
    chunks = []
    for fragment in text.split('. '):
        sentence = fragment.strip()
        if not sentence:
            continue
        if not sentence.endswith(('.', '!', '?')):
            sentence += '.'
        chunks.append(sentence + ' ')
    return chunks


def _split_words(text: str) -> set:
    """Return the set of alphanumeric words in text, ignoring punctuation."""
    return set("".join(ch if ch.isalnum() else " " for ch in text).split())


def _format_sse(payload) -> str:
    """Serialise one payload as a server-sent-event ``data:`` frame."""
    return f"data: {json.dumps(payload)}\n\n"


async def _iter_upload_chunks(upload: UploadFile, chunk_size: int = 4096) -> AsyncGenerator[bytes, None]:
    """Yield the uploaded file in chunks of at most ``chunk_size`` bytes."""
    while True:
        chunk = await upload.read(chunk_size)
        if not chunk:
            break
        yield chunk


@router.post("/subscription")
async def set_speech_subscription(
    request: SpeechSubscriptionRequest,
    current_user: UserInDB = Depends(getCurrentUser)
):
    """Set Azure Speech Services subscription key for the user.

    Validates the key length and region, proves the key works with a short
    test synthesis, and resolves the Microsoft connection it belongs to.

    Returns:
        A dict with ``success``, ``message``, ``connection_id``, ``region``
        and ``configured_at``.

    Raises:
        HTTPException: 400 for an invalid key, region, failed key test or
            missing connection; 500 for unexpected errors.
    """
    try:
        root_interface = getRootInterface()

        # Validate subscription key format (basic validation)
        if not request.subscription_key or len(request.subscription_key) < _MIN_SUBSCRIPTION_KEY_LENGTH:
            raise HTTPException(
                status_code=400,
                detail="Invalid subscription key format. Please provide a valid Azure Speech Services subscription key."
            )

        # Validate region
        if request.region not in _SUPPORTED_SPEECH_REGIONS:
            raise HTTPException(
                status_code=400,
                detail=f"Invalid region. Supported regions: {', '.join(_SUPPORTED_SPEECH_REGIONS)}"
            )

        # Test the subscription key with a simple TTS request
        try:
            test_connector = ConnectorAzureSpeech(subscription_key=request.subscription_key, region=request.region)
            test_audio = await test_connector.text_to_speech("Test", "en-US", "en-US-AriaNeural")
        except Exception as e:
            raise HTTPException(
                status_code=400,
                detail=f"Invalid subscription key or region. Test failed: {str(e)}"
            ) from e
        if not test_audio:
            raise HTTPException(
                status_code=400,
                detail="Invalid subscription key or region. Test failed: Subscription key test failed"
            )

        # Find the target connection among the user's Microsoft connections
        user_connections = root_interface.getUserConnections(current_user.id)
        msft_connections = [c for c in user_connections if c.authority == "msft"]

        if request.connection_id:
            target_connection = next(
                (c for c in msft_connections if c.id == request.connection_id), None
            )
            if not target_connection:
                raise HTTPException(
                    status_code=400,
                    detail=f"Connection with ID '{request.connection_id}' not found."
                )
        else:
            target_connection = msft_connections[0] if msft_connections else None
            if not target_connection:
                # Creating a connection here would require support in the interface.
                raise HTTPException(
                    status_code=400,
                    detail="No Microsoft connection found. Please connect your Microsoft account first."
                )

        # Build the updated metadata on a copy so the loaded connection object
        # is not modified behind the interface's back.
        metadata = _read_connection_metadata(target_connection)
        metadata['speech_subscription_key'] = request.subscription_key
        metadata['speech_region'] = request.region
        metadata['speech_configured_at'] = datetime.now().isoformat()

        # NOTE(review): the metadata is not persisted yet. Storing it requires
        # an update operation in the database interface; until that exists
        # this endpoint only validates the key and reports success.

        return {
            "success": True,
            "message": "Azure Speech Services subscription key configured successfully",
            "connection_id": target_connection.id,
            "region": request.region,
            "configured_at": metadata['speech_configured_at']
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.exception("Error setting speech subscription: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e


@router.get("/subscription")
async def get_speech_subscription(
    current_user: UserInDB = Depends(getCurrentUser)
):
    """Get Azure Speech Services subscription information for the user.

    Reads the key and region from the application configuration and runs the
    connector's connection test. The key is only ever returned masked.

    Returns:
        A dict with ``success`` and ``has_subscription``. When a key is
        configured, ``subscription`` holds the masked key, region, test
        result, source and, if the test raised, an ``error`` string.

    Raises:
        HTTPException: 500 for unexpected errors.
    """
    try:
        # Use Azure Speech Services subscription key from configuration
        from modules.shared.configuration import APP_CONFIG
        subscription_key = APP_CONFIG.get("Connector_AzureSpeech_SUBSCRIPTION_KEY")
        region = APP_CONFIG.get("Connector_AzureSpeech_REGION", "westeurope")

        if not subscription_key or subscription_key == "your-azure-speech-subscription-key-here":
            return {
                "success": True,
                "has_subscription": False,
                "message": "Azure Speech Services subscription key not configured. Please set Connector_AzureSpeech_SUBSCRIPTION_KEY in config.ini"
            }

        # Test the connection; a failing test is reported, not raised.
        try:
            connector = ConnectorAzureSpeech(subscription_key=subscription_key, region=region)
            test_outcome = {"connection_test": await connector.test_connection()}
        except Exception as test_error:
            test_outcome = {"connection_test": False, "error": str(test_error)}

        return {
            "success": True,
            "has_subscription": True,
            "subscription": {
                "subscription_key": _mask_subscription_key(subscription_key),
                "region": region,
                **test_outcome,
                "source": "config.ini"
            }
        }

    except Exception as e:
        logger.exception("Error getting speech subscription: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e


@router.get("/settings")
async def get_voice_settings(
    current_user: UserInDB = Depends(getCurrentUser),
    connection_id: Optional[str] = None
):
    """Get available voice settings and languages.

    Returns:
        A dict with the connector's ``voices`` and ``languages``, the user's
        stored ``user_settings`` (defaults when none exist) and the
        ``default_settings``.

    Raises:
        HTTPException: whatever the connector lookup raises, or 500 for
            unexpected errors.
    """
    try:
        connector = await get_azure_speech_connector(current_user, connection_id)

        # Get available voices and languages
        voices = await connector.get_available_voices()
        languages = await connector.get_available_languages()

        # Get user's saved settings from database
        from modules.interfaces.interfaceComponentObjects import getInterface
        component_interface = getInterface(current_user)
        user_settings = component_interface.getVoiceSettings(current_user.id)

        # If no settings exist, use defaults
        if not user_settings:
            user_settings = dict(_DEFAULT_VOICE_SETTINGS)

        return {
            "voices": voices,
            "languages": languages,
            "user_settings": {
                name: _read_setting(user_settings, name)
                for name in _DEFAULT_VOICE_SETTINGS
            },
            "default_settings": dict(_DEFAULT_VOICE_SETTINGS)
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.exception("Error getting voice settings: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e


@router.post("/settings")
async def save_voice_settings(
    settings: dict,
    current_user: UserInDB = Depends(getCurrentUser)
):
    """Save voice settings for the current user.

    Creates the settings record when the user has none, otherwise updates
    the existing one.

    Returns:
        A dict with ``success``, a ``message`` and the stored ``settings``.

    Raises:
        HTTPException: 500 when the settings cannot be stored.
    """
    try:
        from modules.interfaces.interfaceComponentObjects import getInterface
        component_interface = getInterface(current_user)

        # Check if settings exist, if not create them
        existing_settings = component_interface.getVoiceSettings(current_user.id)

        if not existing_settings:
            # Ownership fields always come from the authenticated user.
            new_settings = {
                **settings,
                "userId": current_user.id,
                "mandateId": current_user.mandateId,
            }
            created_settings = component_interface.createVoiceSettings(new_settings)
            return {
                "success": True,
                "message": "Voice settings created successfully",
                "settings": created_settings
            }

        # Update existing settings
        updated_settings = component_interface.updateVoiceSettings(
            current_user.id,
            settings
        )
        return {
            "success": True,
            "message": "Voice settings saved successfully",
            "settings": updated_settings
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.exception("Error saving voice settings: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e


@router.post("/speech-to-text")
async def speech_to_text(
    audio_file: UploadFile = File(...),
    language: str = Form("de-DE"),
    format: str = Form("detailed"),
    connection_id: Optional[str] = Form(None),
    current_user: UserInDB = Depends(getCurrentUser)
):
    """Convert speech to text using Azure Speech Services.

    Returns:
        A dict with ``success``, the recognised ``text``, its ``confidence``,
        the ``language`` and the requested ``format``.

    Raises:
        HTTPException: whatever the connector lookup raises, or 500 for
            unexpected errors.
    """
    try:
        connector = await get_azure_speech_connector(current_user, connection_id)

        # Read audio file
        audio_content = await audio_file.read()

        # Convert speech to text
        result = await connector.speech_to_text(
            audio_content=audio_content,
            language=language,
            format=format
        )

        return {
            "success": True,
            "text": result.get("text", ""),
            "confidence": result.get("confidence", 0.0),
            "language": result.get("language", language),
            "format": format
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.exception("Error in speech-to-text: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e


@router.post("/text-to-speech")
async def text_to_speech(
    request: TextToSpeechRequest,
    current_user: UserInDB = Depends(getCurrentUser),
    connection_id: Optional[str] = None
):
    """Convert text to speech using Azure Speech Services.

    Returns:
        A dict with ``success``, the base64-encoded ``audio_data`` and the
        ``format``, ``voice`` and ``language`` that were used.

    Raises:
        HTTPException: whatever the connector lookup raises, or 500 for
            unexpected errors.
    """
    try:
        connector = await get_azure_speech_connector(current_user, connection_id)

        # Convert text to speech
        audio_data = await connector.text_to_speech(
            text=request.text,
            language=request.language,
            voice=request.voice,
            format=request.format
        )

        # Return audio as base64 encoded string
        audio_base64 = base64.b64encode(audio_data).decode('utf-8')

        return {
            "success": True,
            "audio_data": audio_base64,
            "format": request.format,
            "voice": request.voice,
            "language": request.language
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.exception("Error in text-to-speech: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e


@router.post("/translate")
async def translate_text(
    request: TranslationRequest,
    current_user: UserInDB = Depends(getCurrentUser),
    connection_id: Optional[str] = None
):
    """Translate text using Azure Translator.

    Returns:
        A dict with ``success``, the ``original_text``, the
        ``translated_text`` and both language codes.

    Raises:
        HTTPException: whatever the connector lookup raises, or 500 for
            unexpected errors.
    """
    try:
        connector = await get_azure_speech_connector(current_user, connection_id)

        # Translate text
        translated_text = await connector.translate_text(
            text=request.text,
            from_language=request.from_language,
            to_language=request.to_language
        )

        return {
            "success": True,
            "original_text": request.text,
            "translated_text": translated_text,
            "from_language": request.from_language,
            "to_language": request.to_language
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.exception("Error in translation: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e


@router.post("/conversation")
async def conversation(
    request: ConversationRequest,
    current_user: UserInDB = Depends(getCurrentUser),
    connection_id: Optional[str] = None
):
    """Handle conversation with voice response.

    Generates a text reply to the message and synthesises it as MP3 audio.

    Returns:
        A dict with ``success``, the ``response_text``, its base64-encoded
        ``audio_data`` and the ``voice`` and ``language`` used.

    Raises:
        HTTPException: whatever the connector lookup raises, or 500 for
            unexpected errors.
    """
    try:
        connector = await get_azure_speech_connector(current_user, connection_id)

        # Simple AI response logic - in production, integrate with OpenAI, Azure OpenAI, or similar
        response_text = await _generate_ai_response(request.message, request.language)

        # Convert response to speech
        audio_data = await connector.text_to_speech(
            text=response_text,
            language=request.language,
            voice=request.response_voice,
            format="audio-16khz-128kbitrate-mono-mp3"
        )

        # Return both text and audio
        audio_base64 = base64.b64encode(audio_data).decode('utf-8')

        return {
            "success": True,
            "response_text": response_text,
            "audio_data": audio_base64,
            "voice": request.response_voice,
            "language": request.language
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.exception("Error in conversation: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e


async def _generate_ai_response(message: str, language: str) -> str:
    """Generate AI response for conversation.

    Simple rule-based replies for demonstration; in production this would
    call an AI service. German rules apply when ``language`` starts with
    "de", English rules when it starts with "en"; any other language gets a
    generic English reply. The first matching rule wins.

    Greetings are matched as whole words so that short tokens such as "hi"
    do not fire inside unrelated words ("nicht", "this", "which"). All other
    keywords are matched as substrings so compounds like "Uhrzeit" or
    "Wetterbericht" still hit.

    Returns:
        The reply text. On any internal error a fixed apology is returned
        instead of raising.
    """
    try:
        message_lower = message.lower()
        words = _split_words(message_lower)

        def mentions(*phrases: str) -> bool:
            return any(phrase in message_lower for phrase in phrases)

        # German responses
        if language.startswith("de"):
            if words & {"hallo", "hi", "hey"}:
                return "Hallo! Wie kann ich Ihnen heute helfen?"
            if mentions("wie geht", "wie gehts"):
                return "Mir geht es gut, danke der Nachfrage! Wie geht es Ihnen?"
            if mentions("danke", "dankeschön"):
                return "Gern geschehen! Gibt es noch etwas, womit ich helfen kann?"
            if mentions("zeit", "uhr"):
                current_time = datetime.now().strftime("%H:%M")
                return f"Es ist jetzt {current_time} Uhr."
            if mentions("wetter", "temperatur"):
                return "Ich kann leider keine aktuellen Wetterdaten abrufen. Bitte schauen Sie in eine Wetter-App."
            return f"Das ist interessant: '{message}'. Können Sie mir mehr darüber erzählen?"

        # English responses
        if language.startswith("en"):
            if words & {"hello", "hi", "hey"}:
                return "Hello! How can I help you today?"
            if mentions("how are you", "how's it going"):
                return "I'm doing well, thank you for asking! How are you?"
            if mentions("thank you", "thanks"):
                return "You're welcome! Is there anything else I can help you with?"
            if mentions("time", "clock"):
                current_time = datetime.now().strftime("%H:%M")
                return f"It's currently {current_time}."
            if mentions("weather", "temperature"):
                return "I'm sorry, I can't retrieve current weather data. Please check a weather app."
            return f"That's interesting: '{message}'. Can you tell me more about that?"

        # Default response
        return f"I heard: '{message}'. How can I help you with that?"

    except Exception as e:
        logger.exception("Error generating AI response: %s", e)
        return "I'm sorry, I didn't understand that. Could you please repeat?"


@router.post("/realtime-interpreter")
async def realtime_interpreter(
    audio_file: UploadFile = File(...),
    from_language: str = Form("de-DE"),
    to_language: str = Form("en-US"),
    connection_id: Optional[str] = Form(None),
    current_user: UserInDB = Depends(getCurrentUser)
):
    """Real-time interpreter: speech to translated text.

    Recognises the uploaded audio and translates the text when the two
    languages differ. A failed translation falls back to the original text.

    Returns:
        A dict with ``success``, ``original_text``, ``translated_text``,
        both language codes and the recognition ``confidence``.

    Raises:
        HTTPException: whatever the connector lookup raises, or 500 for
            unexpected errors.
    """
    try:
        logger.info(f"Realtime interpreter called with from_language='{from_language}', to_language='{to_language}'")
        connector = await get_azure_speech_connector(current_user, connection_id)

        # Read audio file
        audio_content = await audio_file.read()

        # Convert speech to text
        stt_result = await connector.speech_to_text(
            audio_content=audio_content,
            language=from_language,
            format="detailed"
        )

        original_text = stt_result.get("text", "")

        # Translate only when the languages differ and something was recognised.
        translated_text = original_text
        if original_text and from_language != to_language:
            try:
                translated_text = await connector.translate_text(
                    text=original_text,
                    from_language=from_language,
                    to_language=to_language
                )
            except Exception as e:
                logger.warning(f"Translation failed, using original text: {str(e)}")
                translated_text = original_text

        return {
            "success": True,
            "original_text": original_text,
            "translated_text": translated_text,
            "from_language": from_language,
            "to_language": to_language,
            "confidence": stt_result.get("confidence", 0.0)
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.exception("Error in realtime interpreter: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e


@router.post("/stream/speech-to-text")
async def stream_speech_to_text(
    audio_file: UploadFile = File(...),
    language: str = Form("de-DE"),
    format: str = Form("detailed"),
    audio_format: str = Form("wav"),
    connection_id: Optional[str] = Form(None),
    current_user: UserInDB = Depends(getCurrentUser)
):
    """Stream speech to text using Azure Speech Services.

    Feeds the upload to the connector in 4 KB chunks and relays each result
    as a server-sent event. An error during streaming is sent as a final
    event with ``error`` and ``is_final``.

    Raises:
        HTTPException: whatever the connector lookup raises, or 500 for
            unexpected errors before streaming starts.
    """
    try:
        connector = await get_azure_speech_connector(current_user, connection_id)

        async def response_generator():
            """Generate streaming responses."""
            try:
                async for result in connector.stream_speech_to_text(
                    _iter_upload_chunks(audio_file),
                    language=language,
                    format=format,
                    audio_format=audio_format
                ):
                    yield _format_sse(result)
            except Exception as e:
                # The response has already started, so report in-band.
                logger.exception("Error while streaming speech-to-text: %s", e)
                yield _format_sse({"error": str(e), "is_final": True})

        return StreamingResponse(
            response_generator(),
            media_type="text/event-stream",
            headers=dict(_STREAM_HEADERS)
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.exception("Error in streaming speech-to-text: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e


@router.post("/stream/text-to-speech")
async def stream_text_to_speech(
    request: TextToSpeechRequest,
    current_user: UserInDB = Depends(getCurrentUser),
    connection_id: Optional[str] = None
):
    """Stream text to speech using Azure Speech Services.

    Splits the text into sentences, passes them to the connector and streams
    the resulting audio chunks as ``audio/mpeg``. An error during synthesis
    is logged and ends the stream with an empty chunk.

    Raises:
        HTTPException: whatever the connector lookup raises, or 500 for
            unexpected errors before streaming starts.
    """
    try:
        connector = await get_azure_speech_connector(current_user, connection_id)

        async def text_chunk_generator():
            """Generate text chunks from the request."""
            # Split text into sentences for better streaming
            for sentence in _split_sentences(request.text):
                yield sentence

        async def audio_generator():
            """Generate streaming audio data."""
            try:
                async for audio_chunk in connector.stream_text_to_speech(
                    text_chunk_generator(),
                    language=request.language,
                    voice=request.voice,
                    format=request.format
                ):
                    yield audio_chunk
            except Exception as e:
                logger.exception("Error in audio generation: %s", e)
                # Best effort: end the audio stream instead of failing mid-response.
                yield b""

        return StreamingResponse(
            audio_generator(),
            media_type="audio/mpeg",
            headers=dict(_STREAM_HEADERS)
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.exception("Error in streaming text-to-speech: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e


@router.post("/stream/realtime-interpreter")
async def stream_realtime_interpreter(
    audio_file: UploadFile = File(...),
    from_language: str = Form("de-DE"),
    to_language: str = Form("en-US"),
    connection_id: Optional[str] = Form(None),
    current_user: UserInDB = Depends(getCurrentUser)
):
    """Stream real-time interpreter: speech to translated text.

    Feeds the upload to the connector in 4 KB chunks and relays each
    interpreter result as a server-sent event. An error during streaming is
    sent as a final event with ``error`` and ``is_final``.

    Raises:
        HTTPException: whatever the connector lookup raises, or 500 for
            unexpected errors before streaming starts.
    """
    try:
        connector = await get_azure_speech_connector(current_user, connection_id)

        async def response_generator():
            """Generate streaming translation responses."""
            try:
                async for result in connector.stream_realtime_interpreter(
                    _iter_upload_chunks(audio_file),
                    from_language=from_language,
                    to_language=to_language
                ):
                    yield _format_sse(result)
            except Exception as e:
                # The response has already started, so report in-band.
                logger.exception("Error while streaming realtime interpreter: %s", e)
                yield _format_sse({"error": str(e), "is_final": True})

        return StreamingResponse(
            response_generator(),
            media_type="text/event-stream",
            headers=dict(_STREAM_HEADERS)
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.exception("Error in streaming realtime interpreter: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e


@router.get("/health")
async def health_check():
    """Health check endpoint for voice services.

    Returns a static payload with the service name and a list of endpoint
    paths; no backend is contacted.
    """
    return {
        "status": "healthy",
        "service": "azure-voice",
        "endpoints": [
            "speech-to-text",
            "text-to-speech",
            "translate",
            "conversation",
            "realtime-interpreter",
            "stream/speech-to-text",
            "stream/text-to-speech",
            "stream/realtime-interpreter",
            "subscription",
            "connections"
        ]
    }