gateway/modules/routes/routeVoiceAzure.py
2025-09-13 01:57:53 +02:00

813 lines
32 KiB
Python

"""
Azure Voice Services Route
Provides endpoints for Azure Speech Services integration including:
- Speech-to-Text (STT)
- Text-to-Speech (TTS)
- Translation services
- Real-time conversation
"""
import logging
import asyncio
import json
from typing import Dict, Any, Optional
from fastapi import APIRouter, HTTPException, Depends, UploadFile, File, Form
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
import io
import base64
import asyncio
from typing import AsyncGenerator
from datetime import datetime
from modules.interfaces.interfaceAppObjects import getRootInterface
from modules.interfaces.interfaceAppModel import UserInDB
from modules.security.auth import getCurrentUser
from modules.connectors.connectorAzureSpeech import ConnectorAzureSpeech
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/voice", tags=["voice"])
# Pydantic models for request/response
class SpeechToTextRequest(BaseModel):
    """Request model carrying speech-to-text options (language and result format)."""

    language: str = "de-DE"
    # Accepted values: "simple" or "detailed"
    format: str = "detailed"
class TextToSpeechRequest(BaseModel):
    """Request body for the text-to-speech endpoints.

    Only ``text`` is required; language, voice and output format fall back to
    the defaults declared below.
    """

    text: str
    language: str = "de-DE"
    voice: str = "de-DE-KatjaNeural"
    format: str = "audio-16khz-128kbitrate-mono-mp3"
class TranslationRequest(BaseModel):
    """Request body for the translation endpoint.

    ``text`` is required; source and target language default to German and
    US English respectively.
    """

    text: str
    from_language: str = "de-DE"
    to_language: str = "en-US"
class ConversationRequest(BaseModel):
    """Request body for the conversation endpoint.

    ``message`` is required; ``language`` and ``response_voice`` default to
    German values.
    """

    message: str
    language: str = "de-DE"
    response_voice: str = "de-DE-KatjaNeural"
class VoiceSettingsRequest(BaseModel):
    """Voice preference model; every field is optional and has a default."""

    # Speech recognition language
    stt_language: str = "de-DE"
    # Speech synthesis language and voice
    tts_language: str = "de-DE"
    tts_voice: str = "de-DE-KatjaNeural"
    # Translation toggle and its target language
    translation_enabled: bool = True
    target_language: str = "en-US"
# Get Azure Speech connector for current user
async def get_azure_speech_connector(current_user: UserInDB = Depends(getCurrentUser), connection_id: Optional[str] = None) -> ConnectorAzureSpeech:
    """Build an Azure Speech connector for the current user.

    The user must own a Microsoft ("msft") connection that has a stored
    token. The connector itself is created from the subscription key and
    region read from APP_CONFIG, not from the connection token.

    Args:
        current_user: The authenticated user whose connections are searched.
        connection_id: Optional ID of a specific connection. When omitted,
            the first "msft" connection of the user is used.

    Returns:
        A ConnectorAzureSpeech configured with subscription key and region.

    Raises:
        HTTPException: 400 when no matching connection or connection token
            exists; 500 when the subscription key is not configured or any
            unexpected error occurs.
    """
    try:
        root_interface = getRootInterface()

        # Get user connections
        user_connections = root_interface.getUserConnections(current_user.id)

        # Pick the requested connection, or the first Microsoft one when no ID is given
        azure_connection = next(
            (
                connection
                for connection in user_connections
                if connection.authority == "msft"
                and (not connection_id or connection.id == connection_id)
            ),
            None,
        )

        if not azure_connection:
            if connection_id:
                raise HTTPException(
                    status_code=400,
                    detail=f"Azure connection with ID '{connection_id}' not found."
                )
            raise HTTPException(
                status_code=400,
                detail="No Azure connection found. Please connect your Microsoft account first."
            )

        # Get connection token
        connection_token = root_interface.getConnectionToken(azure_connection.id)
        if not connection_token:
            raise HTTPException(
                status_code=400,
                detail="Azure connection token not found. Please reconnect your Microsoft account."
            )

        # For Azure Speech Services, we need a subscription key, not an access token
        # Use Azure Speech Services subscription key from configuration
        from modules.shared.configuration import APP_CONFIG
        subscription_key = APP_CONFIG.get("Connector_AzureSpeech_SUBSCRIPTION_KEY")

        # Mask defensively: a missing key must not be sliced (TypeError), and a
        # short key must not have its leading characters written to the log.
        if subscription_key and len(subscription_key) > 16:
            masked_key = f"{subscription_key[:8]}...{subscription_key[-8:]}"
        else:
            masked_key = "INVALID"
        logger.info(f"Loaded subscription key: {masked_key}")
        logger.info(f"Key length: {len(subscription_key) if subscription_key else 'None'}")

        if not subscription_key or subscription_key == "your-azure-speech-subscription-key-here":
            raise HTTPException(
                status_code=500,
                detail="Azure Speech Services subscription key not configured. Please set Connector_AzureSpeech_SUBSCRIPTION_KEY in config.ini"
            )

        # Get region from configuration
        region = APP_CONFIG.get("Connector_AzureSpeech_REGION", "westeurope")

        # Create Azure Speech connector
        return ConnectorAzureSpeech(subscription_key=subscription_key, region=region)
    except HTTPException:
        # Deliberate HTTP errors raised above keep their status code and detail
        raise
    except Exception as e:
        logger.error(f"Error getting Azure Speech connector: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Failed to initialize Azure Speech connector: {str(e)}")
@router.get("/connections")
async def get_user_connections(current_user: UserInDB = Depends(getCurrentUser)):
    """List the current user's Microsoft ("msft") connections.

    Each entry reports whether the connection metadata carries a
    ``speech_subscription_key`` or has ``service_type == 'speech_services'``.

    Returns:
        Dict with ``success``, the ``connections`` list and its ``count``.

    Raises:
        HTTPException: 500 on any error while reading the connections.
    """

    def _summarize(connection):
        """Build the response entry for a single Microsoft connection."""
        speech_key_present = False
        raw_metadata = getattr(connection, 'metadata', None)
        if raw_metadata:
            try:
                # Metadata may be stored either as a JSON string or as a parsed object
                parsed = json.loads(raw_metadata) if isinstance(raw_metadata, str) else raw_metadata
                speech_key_present = bool(
                    parsed.get('speech_subscription_key')
                    or parsed.get('service_type') == 'speech_services'
                )
            except (json.JSONDecodeError, AttributeError):
                # Unparseable or non-dict metadata counts as "no speech key"
                pass
        return {
            "id": connection.id,
            "externalUsername": connection.externalUsername,
            "authority": connection.authority,
            "isActive": connection.status == "active",
            "hasSpeechKey": speech_key_present,
            "connectedAt": connection.connectedAt,
            "lastChecked": connection.lastChecked
        }

    try:
        root_interface = getRootInterface()
        all_connections = root_interface.getUserConnections(current_user.id)

        # Keep Microsoft connections only
        msft_connections = [
            _summarize(connection)
            for connection in all_connections
            if connection.authority == "msft"
        ]

        return {
            "success": True,
            "connections": msft_connections,
            "count": len(msft_connections)
        }
    except Exception as e:
        logger.error(f"Error getting user connections: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
class SpeechSubscriptionRequest(BaseModel):
    """Request body for setting the speech subscription.

    ``subscription_key`` is required; ``region`` defaults to "westeurope" and
    ``connection_id`` may be omitted.
    """

    subscription_key: str
    region: str = "westeurope"
    connection_id: Optional[str] = None
@router.post("/subscription")
async def set_speech_subscription(
    request: SpeechSubscriptionRequest,
    current_user: UserInDB = Depends(getCurrentUser)
):
    """Validate an Azure Speech subscription key and attach it to a connection's metadata.

    The key is checked for minimum length, the region against a fixed list,
    and both are verified with a short test synthesis. The speech fields are
    then written into the target connection's metadata dict in memory.
    NOTE: nothing is persisted here; storing the updated metadata still has
    to be implemented in the database interface.

    Args:
        request: Subscription key, region and optional connection ID.
        current_user: The authenticated user.

    Returns:
        Dict with ``success``, ``message``, ``connection_id``, ``region`` and
        ``configured_at``.

    Raises:
        HTTPException: 400 for an invalid key/region, a failed key test or a
            missing connection; 500 for unexpected errors.
    """
    try:
        root_interface = getRootInterface()

        # Validate subscription key format (basic validation)
        if not request.subscription_key or len(request.subscription_key) < 32:
            raise HTTPException(
                status_code=400,
                detail="Invalid subscription key format. Please provide a valid Azure Speech Services subscription key."
            )

        # Validate region
        valid_regions = [
            "westeurope", "eastus", "westus2", "eastus2", "southeastasia",
            "westcentralus", "eastasia", "northeurope", "southcentralus",
            "centralus", "australiaeast", "brazilsouth", "canadacentral",
            "centralindia", "francecentral", "germanywestcentral", "japaneast",
            "koreacentral", "norwayeast", "southafricanorth",
            "switzerlandnorth", "uaenorth", "uksouth", "westus3",
        ]
        if request.region not in valid_regions:
            raise HTTPException(
                status_code=400,
                detail=f"Invalid region. Supported regions: {', '.join(valid_regions)}"
            )

        # Verify key and region with a throwaway connector and a short synthesis
        try:
            probe = ConnectorAzureSpeech(subscription_key=request.subscription_key, region=request.region)
            probe_audio = await probe.text_to_speech("Test", "en-US", "en-US-AriaNeural")
            if len(probe_audio) == 0:
                raise Exception("Subscription key test failed")
        except Exception as e:
            raise HTTPException(
                status_code=400,
                detail=f"Invalid subscription key or region. Test failed: {str(e)}"
            )

        user_connections = root_interface.getUserConnections(current_user.id)

        # Resolve the target connection
        if request.connection_id:
            # A specific Microsoft connection was requested
            target_connection = next(
                (
                    candidate
                    for candidate in user_connections
                    if candidate.id == request.connection_id and candidate.authority == "msft"
                ),
                None,
            )
            if not target_connection:
                raise HTTPException(
                    status_code=400,
                    detail=f"Connection with ID '{request.connection_id}' not found."
                )
        else:
            # Fall back to the first Microsoft connection
            target_connection = next(
                (candidate for candidate in user_connections if candidate.authority == "msft"),
                None,
            )
            if not target_connection:
                # Creating a new connection would require support in the interface
                raise HTTPException(
                    status_code=400,
                    detail="No Microsoft connection found. Please connect your Microsoft account first."
                )

        # Load existing metadata (JSON string or already-parsed object)
        metadata = {}
        stored_metadata = getattr(target_connection, 'metadata', None)
        if stored_metadata:
            try:
                metadata = json.loads(stored_metadata) if isinstance(stored_metadata, str) else stored_metadata
            except (json.JSONDecodeError, AttributeError):
                metadata = {}

        # Add speech services information
        metadata['speech_subscription_key'] = request.subscription_key
        metadata['speech_region'] = request.region
        metadata['speech_configured_at'] = datetime.now().isoformat()

        # The connection update itself is not implemented yet; success is
        # reported without writing the metadata back to the database.
        return {
            "success": True,
            "message": "Azure Speech Services subscription key configured successfully",
            "connection_id": target_connection.id,
            "region": request.region,
            "configured_at": metadata['speech_configured_at']
        }
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error setting speech subscription: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/subscription")
async def get_speech_subscription(
    current_user: UserInDB = Depends(getCurrentUser)
):
    """Report the configured Azure Speech subscription and test it.

    The subscription key and region are read from APP_CONFIG. The key is
    only ever returned in masked form.

    Args:
        current_user: The authenticated user (required for access only).

    Returns:
        Dict with ``success`` and ``has_subscription``. When a key is
        configured, ``subscription`` holds the masked key, the region, the
        ``connection_test`` result, an ``error`` text if the test raised, and
        the ``source``. Otherwise a ``message`` explains what to configure.

    Raises:
        HTTPException: 500 on unexpected errors.
    """
    try:
        # Use Azure Speech Services subscription key from configuration
        from modules.shared.configuration import APP_CONFIG
        subscription_key = APP_CONFIG.get("Connector_AzureSpeech_SUBSCRIPTION_KEY")
        region = APP_CONFIG.get("Connector_AzureSpeech_REGION", "westeurope")

        if not subscription_key or subscription_key == "your-azure-speech-subscription-key-here":
            return {
                "success": True,
                "has_subscription": False,
                "message": "Azure Speech Services subscription key not configured. Please set Connector_AzureSpeech_SUBSCRIPTION_KEY in config.ini"
            }

        # Showing the first and last 8 characters would reveal the complete
        # key when it has 16 characters or fewer, so short keys are fully hidden.
        if len(subscription_key) > 16:
            masked_key = subscription_key[:8] + "..." + subscription_key[-8:]
        else:
            masked_key = "***"

        subscription = {
            "subscription_key": masked_key,
            "region": region,
        }

        # Test the connection; a failing test is reported, not raised
        try:
            connector = ConnectorAzureSpeech(subscription_key=subscription_key, region=region)
            subscription["connection_test"] = await connector.test_connection()
        except Exception as test_error:
            subscription["connection_test"] = False
            subscription["error"] = str(test_error)
        subscription["source"] = "config.ini"

        return {
            "success": True,
            "has_subscription": True,
            "subscription": subscription
        }
    except Exception as e:
        logger.error(f"Error getting speech subscription: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/settings")
async def get_voice_settings(
    current_user: UserInDB = Depends(getCurrentUser),
    connection_id: Optional[str] = None
):
    """Return available voices and languages together with the user's voice settings.

    Args:
        current_user: The authenticated user.
        connection_id: Optional connection ID passed on to the connector lookup.

    Returns:
        Dict with ``voices``, ``languages``, the user's ``user_settings``
        (defaults when none are stored) and the ``default_settings``.

    Raises:
        HTTPException: Errors from the connector lookup are passed through
            unchanged; any other failure is reported as 500.
    """
    try:
        connector = await get_azure_speech_connector(current_user, connection_id)

        # Get available voices and languages
        voices = await connector.get_available_voices()
        languages = await connector.get_available_languages()

        # Get user's saved settings from database
        from modules.interfaces.interfaceComponentObjects import getInterface
        component_interface = getInterface(current_user)
        user_settings = component_interface.getVoiceSettings(current_user.id)

        default_settings = {
            "sttLanguage": "de-DE",
            "ttsLanguage": "de-DE",
            "ttsVoice": "de-DE-KatjaNeural",
            "translationEnabled": True,
            "targetLanguage": "en-US"
        }

        # If no settings exist, use defaults
        if not user_settings:
            user_settings = dict(default_settings)

        def _setting(name):
            """Read a setting from either an attribute-style object or a dict."""
            if hasattr(user_settings, name):
                return getattr(user_settings, name)
            return user_settings.get(name)

        return {
            "voices": voices,
            "languages": languages,
            "user_settings": {name: _setting(name) for name in default_settings},
            "default_settings": default_settings
        }
    except HTTPException:
        # Keep status code and detail of errors raised by the connector lookup
        raise
    except Exception as e:
        logger.error(f"Error getting voice settings: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
@router.post("/settings")
async def save_voice_settings(
    settings: dict,
    current_user: UserInDB = Depends(getCurrentUser)
):
    """Create or update the current user's voice settings.

    When the user has no stored settings, ``userId`` and ``mandateId`` are
    added to ``settings`` and a new record is created; otherwise the existing
    record is updated.

    Returns:
        Dict with ``success``, a ``message`` and the stored ``settings``.

    Raises:
        HTTPException: 500 on any error.
    """
    try:
        from modules.interfaces.interfaceComponentObjects import getInterface
        component_interface = getInterface(current_user)

        if component_interface.getVoiceSettings(current_user.id):
            # Settings already exist: update them
            stored = component_interface.updateVoiceSettings(
                current_user.id,
                settings
            )
            return {
                "success": True,
                "message": "Voice settings saved successfully",
                "settings": stored
            }

        # No settings yet: create them, owned by this user and mandate
        settings["userId"] = current_user.id
        settings["mandateId"] = current_user.mandateId
        stored = component_interface.createVoiceSettings(settings)
        return {
            "success": True,
            "message": "Voice settings created successfully",
            "settings": stored
        }
    except Exception as e:
        logger.error(f"Error saving voice settings: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
@router.post("/speech-to-text")
async def speech_to_text(
    audio_file: UploadFile = File(...),
    language: str = Form("de-DE"),
    format: str = Form("detailed"),
    connection_id: str = Form(None),
    current_user: UserInDB = Depends(getCurrentUser)
):
    """Convert an uploaded audio file to text using Azure Speech Services.

    Args:
        audio_file: Uploaded audio; it is read completely into memory.
        language: Recognition language passed to the connector.
        format: Result format passed to the connector.
        connection_id: Optional connection ID for the connector lookup.
        current_user: The authenticated user.

    Returns:
        Dict with ``success``, ``text``, ``confidence``, ``language`` and
        ``format``; missing result fields fall back to "", 0.0 and the
        requested language.

    Raises:
        HTTPException: Errors from the connector lookup are passed through
            unchanged; any other failure is reported as 500.
    """
    try:
        connector = await get_azure_speech_connector(current_user, connection_id)

        # Read audio file
        audio_content = await audio_file.read()

        # Convert speech to text
        result = await connector.speech_to_text(
            audio_content=audio_content,
            language=language,
            format=format
        )

        return {
            "success": True,
            "text": result.get("text", ""),
            "confidence": result.get("confidence", 0.0),
            "language": result.get("language", language),
            "format": format
        }
    except HTTPException:
        # Keep status code and detail of errors raised by the connector lookup
        raise
    except Exception as e:
        logger.error(f"Error in speech-to-text: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
@router.post("/text-to-speech")
async def text_to_speech(
    request: TextToSpeechRequest,
    current_user: UserInDB = Depends(getCurrentUser),
    connection_id: Optional[str] = None
):
    """Convert text to speech and return the audio base64-encoded.

    Args:
        request: Text plus language, voice and output format.
        current_user: The authenticated user.
        connection_id: Optional connection ID for the connector lookup.

    Returns:
        Dict with ``success``, base64 ``audio_data`` and the echoed
        ``format``, ``voice`` and ``language``.

    Raises:
        HTTPException: Errors from the connector lookup are passed through
            unchanged; any other failure is reported as 500.
    """
    try:
        connector = await get_azure_speech_connector(current_user, connection_id)

        # Convert text to speech
        audio_data = await connector.text_to_speech(
            text=request.text,
            language=request.language,
            voice=request.voice,
            format=request.format
        )

        # Return audio as base64 encoded string so it fits into the JSON response
        audio_base64 = base64.b64encode(audio_data).decode('utf-8')

        return {
            "success": True,
            "audio_data": audio_base64,
            "format": request.format,
            "voice": request.voice,
            "language": request.language
        }
    except HTTPException:
        # Keep status code and detail of errors raised by the connector lookup
        raise
    except Exception as e:
        logger.error(f"Error in text-to-speech: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
@router.post("/translate")
async def translate_text(
    request: TranslationRequest,
    current_user: UserInDB = Depends(getCurrentUser),
    connection_id: Optional[str] = None
):
    """Translate text through the Azure Speech connector.

    Args:
        request: Text with source and target language.
        current_user: The authenticated user.
        connection_id: Optional connection ID for the connector lookup.

    Returns:
        Dict with ``success``, ``original_text``, ``translated_text``,
        ``from_language`` and ``to_language``.

    Raises:
        HTTPException: Errors from the connector lookup are passed through
            unchanged; any other failure is reported as 500.
    """
    try:
        connector = await get_azure_speech_connector(current_user, connection_id)

        # Translate text
        translated_text = await connector.translate_text(
            text=request.text,
            from_language=request.from_language,
            to_language=request.to_language
        )

        return {
            "success": True,
            "original_text": request.text,
            "translated_text": translated_text,
            "from_language": request.from_language,
            "to_language": request.to_language
        }
    except HTTPException:
        # Keep status code and detail of errors raised by the connector lookup
        raise
    except Exception as e:
        logger.error(f"Error in translation: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
@router.post("/conversation")
async def conversation(
    request: ConversationRequest,
    current_user: UserInDB = Depends(getCurrentUser),
    connection_id: Optional[str] = None
):
    """Answer a message with a generated reply as text and synthesized speech.

    The reply text comes from ``_generate_ai_response`` and is synthesized
    with the requested voice as "audio-16khz-128kbitrate-mono-mp3".

    Args:
        request: Message, language and response voice.
        current_user: The authenticated user.
        connection_id: Optional connection ID for the connector lookup.

    Returns:
        Dict with ``success``, ``response_text``, base64 ``audio_data``,
        ``voice`` and ``language``.

    Raises:
        HTTPException: Errors from the connector lookup are passed through
            unchanged; any other failure is reported as 500.
    """
    try:
        connector = await get_azure_speech_connector(current_user, connection_id)

        # Simple AI response logic - in production, integrate with OpenAI, Azure OpenAI, or similar
        response_text = await _generate_ai_response(request.message, request.language)

        # Convert response to speech
        audio_data = await connector.text_to_speech(
            text=response_text,
            language=request.language,
            voice=request.response_voice,
            format="audio-16khz-128kbitrate-mono-mp3"
        )

        # Return both text and audio
        audio_base64 = base64.b64encode(audio_data).decode('utf-8')

        return {
            "success": True,
            "response_text": response_text,
            "audio_data": audio_base64,
            "voice": request.response_voice,
            "language": request.language
        }
    except HTTPException:
        # Keep status code and detail of errors raised by the connector lookup
        raise
    except Exception as e:
        logger.error(f"Error in conversation: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
async def _generate_ai_response(message: str, language: str) -> str:
    """Generate a simple rule-based reply for the conversation endpoint.

    This is a demonstration stand-in; in production it would call an AI
    service such as OpenAI or Azure OpenAI.

    Greetings are matched as whole words only, so that "hi" does not fire
    inside words like "this" or "hier". All other keywords are matched as
    substrings, which keeps phrases ("how are you") and German compound
    words ("Uhrzeit") working.

    Args:
        message: The user's message.
        language: Language tag; a "de" prefix selects German replies, an "en"
            prefix English replies, anything else a generic English reply.

    Returns:
        The reply text. If anything goes wrong (e.g. ``message`` is not a
        string), a generic apology is returned instead of raising.
    """
    import re
    from datetime import datetime

    try:
        message_lower = message.lower()
        # Individual words of the message, used for whole-word greeting matches
        spoken_words = set(re.findall(r"\w+", message_lower))

        def greets(*greetings: str) -> bool:
            """Return True when any greeting occurs as a complete word."""
            return not spoken_words.isdisjoint(greetings)

        def mentions(*fragments: str) -> bool:
            """Return True when any fragment occurs anywhere in the message."""
            return any(fragment in message_lower for fragment in fragments)

        # German responses
        if language.startswith("de"):
            if greets("hallo", "hi", "hey"):
                return "Hallo! Wie kann ich Ihnen heute helfen?"
            if mentions("wie geht", "wie gehts"):
                return "Mir geht es gut, danke der Nachfrage! Wie geht es Ihnen?"
            if mentions("danke", "dankeschön"):
                return "Gern geschehen! Gibt es noch etwas, womit ich helfen kann?"
            if mentions("zeit", "uhr"):
                current_time = datetime.now().strftime("%H:%M")
                return f"Es ist jetzt {current_time} Uhr."
            if mentions("wetter", "temperatur"):
                return "Ich kann leider keine aktuellen Wetterdaten abrufen. Bitte schauen Sie in eine Wetter-App."
            return f"Das ist interessant: '{message}'. Können Sie mir mehr darüber erzählen?"

        # English responses
        if language.startswith("en"):
            if greets("hello", "hi", "hey"):
                return "Hello! How can I help you today?"
            if mentions("how are you", "how's it going"):
                return "I'm doing well, thank you for asking! How are you?"
            if mentions("thank you", "thanks"):
                return "You're welcome! Is there anything else I can help you with?"
            if mentions("time", "clock"):
                current_time = datetime.now().strftime("%H:%M")
                return f"It's currently {current_time}."
            if mentions("weather", "temperature"):
                return "I'm sorry, I can't retrieve current weather data. Please check a weather app."
            return f"That's interesting: '{message}'. Can you tell me more about that?"

        # Default response
        return f"I heard: '{message}'. How can I help you with that?"
    except Exception as e:
        logger.error(f"Error generating AI response: {str(e)}")
        return "I'm sorry, I didn't understand that. Could you please repeat?"
@router.post("/realtime-interpreter")
async def realtime_interpreter(
    audio_file: UploadFile = File(...),
    from_language: str = Form("de-DE"),
    to_language: str = Form("en-US"),
    connection_id: str = Form(None),
    current_user: UserInDB = Depends(getCurrentUser)
):
    """Real-time interpreter: recognize speech and translate the recognized text.

    Translation is best effort: when it fails, the recognized text is
    returned as the translation and a warning is logged.

    Args:
        audio_file: Uploaded audio; it is read completely into memory.
        from_language: Language of the audio.
        to_language: Language to translate into.
        connection_id: Optional connection ID for the connector lookup.
        current_user: The authenticated user.

    Returns:
        Dict with ``success``, ``original_text``, ``translated_text``,
        ``from_language``, ``to_language`` and ``confidence``.

    Raises:
        HTTPException: Errors from the connector lookup are passed through
            unchanged; any other failure is reported as 500.
    """
    try:
        logger.info(f"Realtime interpreter called with from_language='{from_language}', to_language='{to_language}'")
        connector = await get_azure_speech_connector(current_user, connection_id)

        # Read audio file
        audio_content = await audio_file.read()

        # Convert speech to text
        stt_result = await connector.speech_to_text(
            audio_content=audio_content,
            language=from_language,
            format="detailed"
        )
        original_text = stt_result.get("text", "")

        # Translate only when the languages differ and something was
        # recognized; an empty text needs no round trip to the translator.
        translated_text = original_text
        if original_text and from_language != to_language:
            try:
                translated_text = await connector.translate_text(
                    text=original_text,
                    from_language=from_language,
                    to_language=to_language
                )
            except Exception as e:
                logger.warning(f"Translation failed, using original text: {str(e)}")
                translated_text = original_text

        return {
            "success": True,
            "original_text": original_text,
            "translated_text": translated_text,
            "from_language": from_language,
            "to_language": to_language,
            "confidence": stt_result.get("confidence", 0.0)
        }
    except HTTPException:
        # Keep status code and detail of errors raised by the connector lookup
        raise
    except Exception as e:
        logger.error(f"Error in realtime interpreter: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
@router.post("/stream/speech-to-text")
async def stream_speech_to_text(
    audio_file: UploadFile = File(...),
    language: str = Form("de-DE"),
    format: str = Form("detailed"),
    audio_format: str = Form("wav"),
    connection_id: str = Form(None),
    current_user: UserInDB = Depends(getCurrentUser)
):
    """Stream speech-to-text results for an uploaded audio file.

    The upload is fed to the connector in 4 KB chunks and every result is
    emitted as a ``data: <json>`` event. Errors that occur while streaming
    are emitted as a final event with an ``error`` field.

    Args:
        audio_file: Uploaded audio, read chunk by chunk.
        language: Recognition language passed to the connector.
        format: Result format passed to the connector.
        audio_format: Audio container format passed to the connector.
        connection_id: Optional connection ID for the connector lookup.
        current_user: The authenticated user.

    Returns:
        A StreamingResponse producing the result events.

    Raises:
        HTTPException: Errors from the connector lookup are passed through
            unchanged; any other setup failure is reported as 500.
    """
    try:
        connector = await get_azure_speech_connector(current_user, connection_id)

        async def audio_chunk_generator():
            """Generate audio chunks from uploaded file."""
            chunk_size = 4096  # 4KB chunks
            while True:
                chunk = await audio_file.read(chunk_size)
                if not chunk:
                    break
                yield chunk

        async def response_generator():
            """Generate streaming responses."""
            try:
                async for result in connector.stream_speech_to_text(
                    audio_chunk_generator(),
                    language=language,
                    format=format,
                    audio_format=audio_format
                ):
                    yield f"data: {json.dumps(result)}\n\n"
            except Exception as e:
                # The response has already started, so the error travels in-band
                error_result = {"error": str(e), "is_final": True}
                yield f"data: {json.dumps(error_result)}\n\n"

        # NOTE(review): the explicit Content-Type header presumably takes
        # precedence over media_type - confirm against the framework docs.
        return StreamingResponse(
            response_generator(),
            media_type="text/plain",
            headers={
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
                "Content-Type": "text/event-stream"
            }
        )
    except HTTPException:
        # Keep status code and detail of errors raised by the connector lookup
        raise
    except Exception as e:
        logger.error(f"Error in streaming speech-to-text: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
@router.post("/stream/text-to-speech")
async def stream_text_to_speech(
    request: TextToSpeechRequest,
    current_user: UserInDB = Depends(getCurrentUser),
    connection_id: Optional[str] = None
):
    """Stream synthesized speech for the given text.

    The text is split at ". " into sentences which are handed to the
    connector one by one; the resulting audio chunks are streamed back.

    Args:
        request: Text plus language, voice and output format.
        current_user: The authenticated user.
        connection_id: Optional connection ID for the connector lookup.

    Returns:
        A StreamingResponse producing audio chunks.

    Raises:
        HTTPException: Errors from the connector lookup are passed through
            unchanged; any other setup failure is reported as 500.
    """
    try:
        connector = await get_azure_speech_connector(current_user, connection_id)

        async def text_chunk_generator():
            """Generate text chunks from the request."""
            # Split text into sentences for better streaming
            pieces = request.text.split('. ')
            last_index = len(pieces) - 1
            for index, piece in enumerate(pieces):
                sentence = piece.strip()
                if not sentence:
                    continue
                # Inner pieces lost their ". " separator in the split and get
                # it back. The final piece keeps its own end punctuation, so
                # a text ending in "." is not turned into "..".
                if index == last_index and sentence.endswith(('.', '!', '?')):
                    yield sentence + ' '
                else:
                    yield sentence + '. '

        async def audio_generator():
            """Generate streaming audio data."""
            try:
                async for audio_chunk in connector.stream_text_to_speech(
                    text_chunk_generator(),
                    language=request.language,
                    voice=request.voice,
                    format=request.format
                ):
                    yield audio_chunk
            except Exception as e:
                logger.error(f"Error in audio generation: {str(e)}")
                # The response has already started; end the stream with an
                # empty chunk (could be a beep or silence instead)
                yield b""

        return StreamingResponse(
            audio_generator(),
            media_type="audio/mpeg",
            headers={
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
                "Content-Type": "audio/mpeg"
            }
        )
    except HTTPException:
        # Keep status code and detail of errors raised by the connector lookup
        raise
    except Exception as e:
        logger.error(f"Error in streaming text-to-speech: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
@router.post("/stream/realtime-interpreter")
async def stream_realtime_interpreter(
    audio_file: UploadFile = File(...),
    from_language: str = Form("de-DE"),
    to_language: str = Form("en-US"),
    connection_id: str = Form(None),
    current_user: UserInDB = Depends(getCurrentUser)
):
    """Stream real-time interpreter results: speech to translated text.

    The upload is fed to the connector in 4 KB chunks and every result is
    emitted as a ``data: <json>`` event. Errors that occur while streaming
    are emitted as a final event with an ``error`` field.

    Args:
        audio_file: Uploaded audio, read chunk by chunk.
        from_language: Language of the audio.
        to_language: Language to translate into.
        connection_id: Optional connection ID for the connector lookup.
        current_user: The authenticated user.

    Returns:
        A StreamingResponse producing the result events.

    Raises:
        HTTPException: Errors from the connector lookup are passed through
            unchanged; any other setup failure is reported as 500.
    """
    try:
        connector = await get_azure_speech_connector(current_user, connection_id)

        async def audio_chunk_generator():
            """Generate audio chunks from uploaded file."""
            chunk_size = 4096  # 4KB chunks
            while True:
                chunk = await audio_file.read(chunk_size)
                if not chunk:
                    break
                yield chunk

        async def response_generator():
            """Generate streaming translation responses."""
            try:
                async for result in connector.stream_realtime_interpreter(
                    audio_chunk_generator(),
                    from_language=from_language,
                    to_language=to_language
                ):
                    yield f"data: {json.dumps(result)}\n\n"
            except Exception as e:
                # The response has already started, so the error travels in-band
                error_result = {"error": str(e), "is_final": True}
                yield f"data: {json.dumps(error_result)}\n\n"

        # NOTE(review): the explicit Content-Type header presumably takes
        # precedence over media_type - confirm against the framework docs.
        return StreamingResponse(
            response_generator(),
            media_type="text/plain",
            headers={
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
                "Content-Type": "text/event-stream"
            }
        )
    except HTTPException:
        # Keep status code and detail of errors raised by the connector lookup
        raise
    except Exception as e:
        logger.error(f"Error in streaming realtime interpreter: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/health")
async def health_check():
    """Report a static health status together with a list of voice endpoints.

    No backend service is contacted; the response is a constant.
    """
    listed_endpoints = [
        "speech-to-text",
        "text-to-speech",
        "translate",
        "conversation",
        "realtime-interpreter",
        "stream/speech-to-text",
        "stream/text-to-speech",
        "stream/realtime-interpreter",
        "subscription",
        "connections",
    ]
    return {
        "status": "healthy",
        "service": "azure-voice",
        "endpoints": listed_endpoints
    }