# (export artifact: original file metadata banner — "813 lines, 32 KiB, Python")
"""
Azure Voice Services Route

Provides endpoints for Azure Speech Services integration including:
- Speech-to-Text (STT)
- Text-to-Speech (TTS)
- Translation services
- Real-time conversation
"""
|
|
|
|
import logging
|
|
import asyncio
|
|
import json
|
|
from typing import Dict, Any, Optional
|
|
from fastapi import APIRouter, HTTPException, Depends, UploadFile, File, Form
|
|
from fastapi.responses import StreamingResponse
|
|
from pydantic import BaseModel
|
|
import io
|
|
import base64
|
|
import asyncio
|
|
from typing import AsyncGenerator
|
|
from datetime import datetime
|
|
|
|
from modules.interfaces.interfaceAppObjects import getRootInterface
|
|
from modules.interfaces.interfaceAppModel import UserInDB
|
|
from modules.security.auth import getCurrentUser
|
|
from modules.connectors.connectorAzureSpeech import ConnectorAzureSpeech
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
router = APIRouter(prefix="/api/voice", tags=["voice"])
|
|
|
|
# Pydantic models for request/response
|
|
class SpeechToTextRequest(BaseModel):
    """Request body for speech-to-text conversion."""

    # Recognition locale passed to Azure STT (BCP-47 style, e.g. "de-DE").
    language: str = "de-DE"
    # Azure result format.
    format: str = "detailed"  # "simple" or "detailed"
|
|
|
|
class TextToSpeechRequest(BaseModel):
    """Request body for text-to-speech synthesis."""

    # Text to synthesize.
    text: str
    # Synthesis locale and neural voice name.
    language: str = "de-DE"
    voice: str = "de-DE-KatjaNeural"
    # Azure audio output format identifier.
    format: str = "audio-16khz-128kbitrate-mono-mp3"
|
|
|
|
class TranslationRequest(BaseModel):
    """Request body for text translation."""

    # Text to translate.
    text: str
    # Source and target locales.
    from_language: str = "de-DE"
    to_language: str = "en-US"
|
|
|
|
class ConversationRequest(BaseModel):
    """Request body for a conversational turn with a spoken reply."""

    # User utterance (already transcribed text).
    message: str
    # Locale of the conversation and voice used to speak the reply.
    language: str = "de-DE"
    response_voice: str = "de-DE-KatjaNeural"
|
|
|
|
class VoiceSettingsRequest(BaseModel):
    """Request body describing a user's voice preferences."""

    # Locale used for speech recognition.
    stt_language: str = "de-DE"
    # Locale and neural voice used for synthesis.
    tts_language: str = "de-DE"
    tts_voice: str = "de-DE-KatjaNeural"
    # Presumably toggles translating recognized text into target_language —
    # confirm against the settings consumers.
    translation_enabled: bool = True
    target_language: str = "en-US"
|
|
|
|
# Get Azure Speech connector for current user
|
|
async def get_azure_speech_connector(current_user: UserInDB = Depends(getCurrentUser), connection_id: Optional[str] = None) -> ConnectorAzureSpeech:
    """Build an Azure Speech connector for the current user.

    Resolves the user's Microsoft ("msft") connection — a specific one when
    ``connection_id`` is given, otherwise the first one found — verifies a
    connection token exists, then constructs the connector from the Azure
    Speech subscription key/region configured in config.ini.

    Raises:
        HTTPException(400): no matching Azure connection, or missing token.
        HTTPException(500): subscription key not configured, or any
            unexpected failure while building the connector.
    """
    try:
        root_interface = getRootInterface()

        # Locate the Azure (Microsoft) connection for this user.
        user_connections = root_interface.getUserConnections(current_user.id)
        azure_connection = None
        for connection in user_connections:
            if connection.authority != "msft":
                continue
            if connection_id is None or connection.id == connection_id:
                azure_connection = connection
                break

        if not azure_connection:
            if connection_id:
                raise HTTPException(
                    status_code=400,
                    detail=f"Azure connection with ID '{connection_id}' not found."
                )
            raise HTTPException(
                status_code=400,
                detail="No Azure connection found. Please connect your Microsoft account first."
            )

        # The OAuth token itself is not used by Speech Services (which needs a
        # subscription key), but its absence indicates a stale connection.
        connection_token = root_interface.getConnectionToken(azure_connection.id)
        if not connection_token:
            raise HTTPException(
                status_code=400,
                detail="Azure connection token not found. Please reconnect your Microsoft account."
            )

        # Azure Speech Services authenticates with a subscription key from
        # configuration, not with the access token above.
        from modules.shared.configuration import APP_CONFIG
        subscription_key = APP_CONFIG.get("Connector_AzureSpeech_SUBSCRIPTION_KEY")

        # Debug: log the key masked (first/last 8 chars only). Guarded so a
        # missing key cannot raise TypeError here (None is not sliceable) and
        # hide the clearer "not configured" error below.
        if subscription_key and len(subscription_key) > 16:
            logger.info(f"Loaded subscription key: {subscription_key[:8]}...{subscription_key[-8:]}")
        else:
            logger.info("Loaded subscription key: INVALID")
        logger.info(f"Key length: {len(subscription_key) if subscription_key else 'None'}")

        if not subscription_key or subscription_key == "your-azure-speech-subscription-key-here":
            raise HTTPException(
                status_code=500,
                detail="Azure Speech Services subscription key not configured. Please set Connector_AzureSpeech_SUBSCRIPTION_KEY in config.ini"
            )

        region = APP_CONFIG.get("Connector_AzureSpeech_REGION", "westeurope")
        return ConnectorAzureSpeech(subscription_key=subscription_key, region=region)

    except HTTPException:
        # Preserve the deliberate 400/500 responses raised above instead of
        # re-wrapping them into a generic 500 below.
        raise
    except Exception as e:
        logger.error(f"Error getting Azure Speech connector: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Failed to initialize Azure Speech connector: {str(e)}")
|
|
|
|
@router.get("/connections")
async def get_user_connections(current_user: UserInDB = Depends(getCurrentUser)):
    """Get all Microsoft connections for the current user."""
    try:
        root_interface = getRootInterface()
        connections = root_interface.getUserConnections(current_user.id)

        msft_connections = []
        for conn in connections:
            if conn.authority != "msft":
                continue

            # Does this connection carry a Speech Services subscription key?
            speech_key_present = False
            raw_meta = getattr(conn, 'metadata', None)
            if raw_meta:
                try:
                    meta = json.loads(raw_meta) if isinstance(raw_meta, str) else raw_meta
                    speech_key_present = bool(
                        meta.get('speech_subscription_key')
                        or meta.get('service_type') == 'speech_services'
                    )
                except (json.JSONDecodeError, AttributeError):
                    pass  # malformed metadata: treat as "no key"

            msft_connections.append({
                "id": conn.id,
                "externalUsername": conn.externalUsername,
                "authority": conn.authority,
                "isActive": conn.status == "active",
                "hasSpeechKey": speech_key_present,
                "connectedAt": conn.connectedAt,
                "lastChecked": conn.lastChecked
            })

        return {
            "success": True,
            "connections": msft_connections,
            "count": len(msft_connections)
        }

    except Exception as e:
        logger.error(f"Error getting user connections: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
class SpeechSubscriptionRequest(BaseModel):
    """Request body for configuring an Azure Speech subscription key."""

    # Azure Speech Services subscription key (validated against Azure).
    subscription_key: str
    # Azure region hosting the Speech resource.
    region: str = "westeurope"
    # Optional: attach the key to a specific Microsoft connection.
    connection_id: Optional[str] = None
|
|
|
|
@router.post("/subscription")
async def set_speech_subscription(
    request: SpeechSubscriptionRequest,
    current_user: UserInDB = Depends(getCurrentUser)
):
    """Set Azure Speech Services subscription key for the user.

    Validates the key format and region, verifies the key against Azure via
    a one-word test synthesis, then attaches key/region to the metadata of
    one of the user's Microsoft connections (a specific one when
    ``request.connection_id`` is given, otherwise the first found).

    NOTE(review): the updated metadata is built but never written back — the
    code below admits the interface lacks an update method, so the reported
    success is not actually persisted. Confirm before relying on this route.
    """
    try:
        root_interface = getRootInterface()

        # Validate subscription key format (basic length check only; the real
        # validation is the live test call below).
        if not request.subscription_key or len(request.subscription_key) < 32:
            raise HTTPException(
                status_code=400,
                detail="Invalid subscription key format. Please provide a valid Azure Speech Services subscription key."
            )

        # Validate region against the known Speech Services regions.
        valid_regions = ["westeurope", "eastus", "westus2", "eastus2", "southeastasia", "westcentralus", "eastasia", "northeurope", "southcentralus", "centralus", "australiaeast", "brazilsouth", "canadacentral", "centralindia", "francecentral", "germanywestcentral", "japaneast", "koreacentral", "norwayeast", "southafricanorth", "switzerlandnorth", "uaenorth", "uksouth", "westus3"]
        if request.region not in valid_regions:
            raise HTTPException(
                status_code=400,
                detail=f"Invalid region. Supported regions: {', '.join(valid_regions)}"
            )

        # Test the subscription key by creating a temporary connector and
        # synthesizing a single word; empty audio counts as failure.
        try:
            test_connector = ConnectorAzureSpeech(subscription_key=request.subscription_key, region=request.region)
            test_audio = await test_connector.text_to_speech("Test", "en-US", "en-US-AriaNeural")
            if len(test_audio) == 0:
                raise Exception("Subscription key test failed")
        except Exception as e:
            raise HTTPException(
                status_code=400,
                detail=f"Invalid subscription key or region. Test failed: {str(e)}"
            )

        # Get user connections.
        user_connections = root_interface.getUserConnections(current_user.id)

        # Find the target connection.
        target_connection = None
        if request.connection_id:
            # Use the specific connection requested by the caller.
            for connection in user_connections:
                if connection.id == request.connection_id and connection.authority == "msft":
                    target_connection = connection
                    break
            if not target_connection:
                raise HTTPException(
                    status_code=400,
                    detail=f"Connection with ID '{request.connection_id}' not found."
                )
        else:
            # Fall back to the first Microsoft connection.
            target_connection = None
            for connection in user_connections:
                if connection.authority == "msft":
                    target_connection = connection
                    break

            if not target_connection:
                # Creating a connection here would require interface support.
                raise HTTPException(
                    status_code=400,
                    detail="No Microsoft connection found. Please connect your Microsoft account first."
                )

        # Merge the key into the connection's existing metadata (stored either
        # as a JSON string or a dict); unparseable metadata is discarded.
        metadata = {}
        if hasattr(target_connection, 'metadata') and target_connection.metadata:
            try:
                metadata = json.loads(target_connection.metadata) if isinstance(target_connection.metadata, str) else target_connection.metadata
            except (json.JSONDecodeError, AttributeError):
                metadata = {}

        # Add speech services information.
        metadata['speech_subscription_key'] = request.subscription_key
        metadata['speech_region'] = request.region
        metadata['speech_configured_at'] = datetime.now().isoformat()

        # The connection update itself is not implemented yet — the metadata
        # built above is returned but not persisted (see docstring NOTE).

        return {
            "success": True,
            "message": "Azure Speech Services subscription key configured successfully",
            "connection_id": target_connection.id,
            "region": request.region,
            "configured_at": metadata['speech_configured_at']
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error setting speech subscription: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
@router.get("/subscription")
async def get_speech_subscription(
    current_user: UserInDB = Depends(getCurrentUser)
):
    """Get Azure Speech Services subscription information for the user.

    Reads the subscription key/region from config.ini. When a key is
    configured, it is returned masked (first/last 8 characters) together
    with the result of a live connection probe; a failed probe is reported
    in the payload rather than raised.
    """
    try:
        # Use Azure Speech Services subscription key from configuration.
        from modules.shared.configuration import APP_CONFIG
        subscription_key = APP_CONFIG.get("Connector_AzureSpeech_SUBSCRIPTION_KEY")
        region = APP_CONFIG.get("Connector_AzureSpeech_REGION", "westeurope")

        if not subscription_key or subscription_key == "your-azure-speech-subscription-key-here":
            return {
                "success": True,
                "has_subscription": False,
                "message": "Azure Speech Services subscription key not configured. Please set Connector_AzureSpeech_SUBSCRIPTION_KEY in config.ini"
            }

        # Build the (shared) subscription payload once; never expose the
        # full key — only the first/last 8 characters.
        subscription_info = {
            "subscription_key": subscription_key[:8] + "..." + subscription_key[-8:],
            "region": region,
            "source": "config.ini"
        }

        # Probe the subscription; report failure in-band instead of erroring.
        try:
            connector = ConnectorAzureSpeech(subscription_key=subscription_key, region=region)
            subscription_info["connection_test"] = await connector.test_connection()
        except Exception as test_error:
            subscription_info["connection_test"] = False
            subscription_info["error"] = str(test_error)

        return {
            "success": True,
            "has_subscription": True,
            "subscription": subscription_info
        }

    except Exception as e:
        logger.error(f"Error getting speech subscription: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
def _setting_value(settings, key):
    """Read one voice setting from either an attribute-style record or a dict."""
    return getattr(settings, key) if hasattr(settings, key) else settings.get(key)


@router.get("/settings")
async def get_voice_settings(
    current_user: UserInDB = Depends(getCurrentUser),
    connection_id: Optional[str] = None
):
    """Get available voice settings and languages.

    Returns the Azure voice/language catalogs plus the user's saved
    settings (falling back to defaults when none exist).
    """
    try:
        connector = await get_azure_speech_connector(current_user, connection_id)

        # Catalog data from Azure.
        voices = await connector.get_available_voices()
        languages = await connector.get_available_languages()

        # User's saved settings from the database; may be an ORM-style
        # object with attributes or a plain dict (see _setting_value).
        from modules.interfaces.interfaceComponentObjects import getInterface
        component_interface = getInterface(current_user)
        user_settings = component_interface.getVoiceSettings(current_user.id)

        # Single source of truth for the defaults (also used as the
        # fallback when the user has no saved settings yet).
        default_settings = {
            "sttLanguage": "de-DE",
            "ttsLanguage": "de-DE",
            "ttsVoice": "de-DE-KatjaNeural",
            "translationEnabled": True,
            "targetLanguage": "en-US"
        }
        if not user_settings:
            user_settings = dict(default_settings)

        setting_keys = ("sttLanguage", "ttsLanguage", "ttsVoice", "translationEnabled", "targetLanguage")
        return {
            "voices": voices,
            "languages": languages,
            "user_settings": {key: _setting_value(user_settings, key) for key in setting_keys},
            "default_settings": default_settings
        }

    except Exception as e:
        logger.error(f"Error getting voice settings: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
@router.post("/settings")
async def save_voice_settings(
    settings: dict,
    current_user: UserInDB = Depends(getCurrentUser)
):
    """Save voice settings for the current user (create or update).

    Creates a new settings record stamped with the user's id/mandate when
    none exists, otherwise updates the existing record in place.
    """
    try:
        from modules.interfaces.interfaceComponentObjects import getInterface
        component_interface = getInterface(current_user)

        # Check if settings exist; if not, create them.
        existing_settings = component_interface.getVoiceSettings(current_user.id)
        if not existing_settings:
            # Stamp ownership fields onto a copy so the caller's dict is
            # not mutated as a side effect.
            payload = dict(settings)
            payload["userId"] = current_user.id
            payload["mandateId"] = current_user.mandateId
            created_settings = component_interface.createVoiceSettings(payload)
            return {
                "success": True,
                "message": "Voice settings created successfully",
                "settings": created_settings
            }

        # Update existing settings.
        updated_settings = component_interface.updateVoiceSettings(
            current_user.id,
            settings
        )
        return {
            "success": True,
            "message": "Voice settings saved successfully",
            "settings": updated_settings
        }

    except Exception as e:
        logger.error(f"Error saving voice settings: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
@router.post("/speech-to-text")
async def speech_to_text(
    audio_file: UploadFile = File(...),
    language: str = Form("de-DE"),
    format: str = Form("detailed"),
    connection_id: str = Form(None),
    current_user: UserInDB = Depends(getCurrentUser)
):
    """Convert speech to text using Azure Speech Services."""
    try:
        connector = await get_azure_speech_connector(current_user, connection_id)

        # The whole upload is buffered in memory before recognition.
        audio_content = await audio_file.read()

        stt_result = await connector.speech_to_text(
            audio_content=audio_content,
            language=language,
            format=format
        )

        response = {
            "success": True,
            "text": stt_result.get("text", ""),
            "confidence": stt_result.get("confidence", 0.0),
            "language": stt_result.get("language", language),
            "format": format
        }
        return response

    except Exception as e:
        logger.error(f"Error in speech-to-text: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
@router.post("/text-to-speech")
async def text_to_speech(
    request: TextToSpeechRequest,
    current_user: UserInDB = Depends(getCurrentUser),
    connection_id: Optional[str] = None
):
    """Convert text to speech using Azure Speech Services."""
    try:
        connector = await get_azure_speech_connector(current_user, connection_id)

        # Synthesize, then ship the audio base64-encoded so it fits in JSON.
        raw_audio = await connector.text_to_speech(
            text=request.text,
            language=request.language,
            voice=request.voice,
            format=request.format
        )
        encoded_audio = base64.b64encode(raw_audio).decode('utf-8')

        return {
            "success": True,
            "audio_data": encoded_audio,
            "format": request.format,
            "voice": request.voice,
            "language": request.language
        }

    except Exception as e:
        logger.error(f"Error in text-to-speech: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
@router.post("/translate")
async def translate_text(
    request: TranslationRequest,
    current_user: UserInDB = Depends(getCurrentUser),
    connection_id: Optional[str] = None
):
    """Translate text using Azure Translator."""
    try:
        connector = await get_azure_speech_connector(current_user, connection_id)

        # Delegate the actual translation to the connector.
        translation = await connector.translate_text(
            text=request.text,
            from_language=request.from_language,
            to_language=request.to_language
        )

        # Echo the inputs back alongside the result for the client.
        return {
            "success": True,
            "original_text": request.text,
            "translated_text": translation,
            "from_language": request.from_language,
            "to_language": request.to_language
        }

    except Exception as e:
        logger.error(f"Error in translation: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
@router.post("/conversation")
async def conversation(
    request: ConversationRequest,
    current_user: UserInDB = Depends(getCurrentUser),
    connection_id: Optional[str] = None
):
    """Handle conversation with voice response."""
    try:
        connector = await get_azure_speech_connector(current_user, connection_id)

        # Placeholder reply generator — in production, integrate with
        # OpenAI, Azure OpenAI, or similar.
        reply = await _generate_ai_response(request.message, request.language)

        # Speak the reply with the requested voice.
        spoken_reply = await connector.text_to_speech(
            text=reply,
            language=request.language,
            voice=request.response_voice,
            format="audio-16khz-128kbitrate-mono-mp3"
        )

        # Return both the text and the base64-encoded audio.
        return {
            "success": True,
            "response_text": reply,
            "audio_data": base64.b64encode(spoken_reply).decode('utf-8'),
            "voice": request.response_voice,
            "language": request.language
        }

    except Exception as e:
        logger.error(f"Error in conversation: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
async def _generate_ai_response(message: str, language: str) -> str:
    """Produce a canned assistant reply to *message* in *language*.

    Rule-based stand-in for a real AI backend (OpenAI / Azure OpenAI):
    matches keyword groups for German and English (greeting, well-being,
    thanks, time, weather) and otherwise echoes the message back.
    """
    try:
        text = message.lower()

        if language.startswith("de"):
            # German keyword groups, checked in order.
            if any(kw in text for kw in ("hallo", "hi", "hey")):
                return "Hallo! Wie kann ich Ihnen heute helfen?"
            if any(kw in text for kw in ("wie geht", "wie gehts")):
                return "Mir geht es gut, danke der Nachfrage! Wie geht es Ihnen?"
            if any(kw in text for kw in ("danke", "dankeschön")):
                return "Gern geschehen! Gibt es noch etwas, womit ich helfen kann?"
            if any(kw in text for kw in ("zeit", "uhr")):
                # datetime is imported at module level.
                now = datetime.now().strftime("%H:%M")
                return f"Es ist jetzt {now} Uhr."
            if any(kw in text for kw in ("wetter", "temperatur")):
                return "Ich kann leider keine aktuellen Wetterdaten abrufen. Bitte schauen Sie in eine Wetter-App."
            return f"Das ist interessant: '{message}'. Können Sie mir mehr darüber erzählen?"

        if language.startswith("en"):
            # English keyword groups, checked in order.
            if any(kw in text for kw in ("hello", "hi", "hey")):
                return "Hello! How can I help you today?"
            if any(kw in text for kw in ("how are you", "how's it going")):
                return "I'm doing well, thank you for asking! How are you?"
            if any(kw in text for kw in ("thank you", "thanks")):
                return "You're welcome! Is there anything else I can help you with?"
            if any(kw in text for kw in ("time", "clock")):
                now = datetime.now().strftime("%H:%M")
                return f"It's currently {now}."
            if any(kw in text for kw in ("weather", "temperature")):
                return "I'm sorry, I can't retrieve current weather data. Please check a weather app."
            return f"That's interesting: '{message}'. Can you tell me more about that?"

        # Unknown language: echo back in English.
        return f"I heard: '{message}'. How can I help you with that?"

    except Exception as e:
        logger.error(f"Error generating AI response: {str(e)}")
        return "I'm sorry, I didn't understand that. Could you please repeat?"
|
|
|
|
@router.post("/realtime-interpreter")
async def realtime_interpreter(
    audio_file: UploadFile = File(...),
    from_language: str = Form("de-DE"),
    to_language: str = Form("en-US"),
    connection_id: str = Form(None),
    current_user: UserInDB = Depends(getCurrentUser)
):
    """Real-time interpreter: speech to translated text.

    Transcribes the uploaded audio in *from_language*, then translates the
    transcript into *to_language* (skipped when both languages are equal).
    A failed translation falls back to the original transcript rather than
    erroring, so the caller always gets usable text.
    """
    try:
        logger.info(f"Realtime interpreter called with from_language='{from_language}', to_language='{to_language}'")

        connector = await get_azure_speech_connector(current_user, connection_id)

        # Read audio file (fully buffered in memory).
        audio_content = await audio_file.read()

        # Convert speech to text.
        stt_result = await connector.speech_to_text(
            audio_content=audio_content,
            language=from_language,
            format="detailed"
        )

        original_text = stt_result.get("text", "")

        # Translate text only when the languages actually differ.
        translated_text = original_text
        if from_language != to_language:
            try:
                translated_text = await connector.translate_text(
                    text=original_text,
                    from_language=from_language,
                    to_language=to_language
                )
            except Exception as e:
                # Best-effort: keep the untranslated transcript on failure.
                logger.warning(f"Translation failed, using original text: {str(e)}")
                translated_text = original_text

        return {
            "success": True,
            "original_text": original_text,
            "translated_text": translated_text,
            "from_language": from_language,
            "to_language": to_language,
            "confidence": stt_result.get("confidence", 0.0)
        }

    except Exception as e:
        logger.error(f"Error in realtime interpreter: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
@router.post("/stream/speech-to-text")
async def stream_speech_to_text(
    audio_file: UploadFile = File(...),
    language: str = Form("de-DE"),
    format: str = Form("detailed"),
    audio_format: str = Form("wav"),
    connection_id: str = Form(None),
    current_user: UserInDB = Depends(getCurrentUser)
):
    """Stream speech to text using Azure Speech Services.

    Pipes the uploaded audio through the connector's streaming recognizer
    and relays each result to the client as a server-sent event
    (``data: <json>`` lines).
    """
    try:
        connector = await get_azure_speech_connector(current_user, connection_id)

        async def audio_chunk_generator():
            """Generate audio chunks from uploaded file."""
            chunk_size = 4096  # 4KB chunks
            while True:
                chunk = await audio_file.read(chunk_size)
                if not chunk:
                    break
                yield chunk

        async def response_generator():
            """Generate streaming responses."""
            try:
                async for result in connector.stream_speech_to_text(
                    audio_chunk_generator(),
                    language=language,
                    format=format,
                    audio_format=audio_format
                ):
                    yield f"data: {json.dumps(result)}\n\n"
            except Exception as e:
                # The HTTP status is committed once streaming starts, so
                # errors are delivered in-band as a final SSE event.
                error_result = {"error": str(e), "is_final": True}
                yield f"data: {json.dumps(error_result)}\n\n"

        return StreamingResponse(
            response_generator(),
            media_type="text/plain",
            headers={
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
                "Content-Type": "text/event-stream"
            }
        )

    except Exception as e:
        logger.error(f"Error in streaming speech-to-text: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
@router.post("/stream/text-to-speech")
async def stream_text_to_speech(
    request: TextToSpeechRequest,
    current_user: UserInDB = Depends(getCurrentUser),
    connection_id: Optional[str] = None
):
    """Stream text to speech using Azure Speech Services.

    Splits the request text into sentences, feeds them to the connector's
    streaming synthesizer, and streams the resulting MP3 audio back.
    """
    try:
        connector = await get_azure_speech_connector(current_user, connection_id)

        async def text_chunk_generator():
            """Generate text chunks from the request."""
            # Split text into sentences for better streaming.
            # NOTE(review): ". " is re-appended to every fragment, so text
            # that does not end with a period gains one — confirm this is
            # acceptable for the spoken output.
            sentences = request.text.split('. ')
            for sentence in sentences:
                if sentence.strip():
                    yield sentence.strip() + '. '

        async def audio_generator():
            """Generate streaming audio data."""
            try:
                async for audio_chunk in connector.stream_text_to_speech(
                    text_chunk_generator(),
                    language=request.language,
                    voice=request.voice,
                    format=request.format
                ):
                    yield audio_chunk
            except Exception as e:
                logger.error(f"Error in audio generation: {str(e)}")
                # Return error as audio (could be a beep or silence) — the
                # stream ends with an empty chunk instead of raising.
                yield b""

        return StreamingResponse(
            audio_generator(),
            media_type="audio/mpeg",
            headers={
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
                "Content-Type": "audio/mpeg"
            }
        )

    except Exception as e:
        logger.error(f"Error in streaming text-to-speech: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
@router.post("/stream/realtime-interpreter")
async def stream_realtime_interpreter(
    audio_file: UploadFile = File(...),
    from_language: str = Form("de-DE"),
    to_language: str = Form("en-US"),
    connection_id: str = Form(None),
    current_user: UserInDB = Depends(getCurrentUser)
):
    """Stream real-time interpreter: speech to translated text."""
    try:
        connector = await get_azure_speech_connector(current_user, connection_id)

        async def read_audio_chunks():
            """Yield the uploaded audio in 4 KB pieces."""
            while chunk := await audio_file.read(4096):
                yield chunk

        async def sse_events():
            """Relay interpreter results as server-sent events."""
            try:
                interpreter_stream = connector.stream_realtime_interpreter(
                    read_audio_chunks(),
                    from_language=from_language,
                    to_language=to_language
                )
                async for result in interpreter_stream:
                    yield f"data: {json.dumps(result)}\n\n"
            except Exception as e:
                # Deliver errors in-band; the HTTP status is already sent.
                failure = {"error": str(e), "is_final": True}
                yield f"data: {json.dumps(failure)}\n\n"

        sse_headers = {
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Content-Type": "text/event-stream"
        }
        return StreamingResponse(sse_events(), media_type="text/plain", headers=sse_headers)

    except Exception as e:
        logger.error(f"Error in streaming realtime interpreter: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
@router.get("/health")
async def health_check():
    """Report liveness and the set of voice endpoints this router serves."""
    available_endpoints = [
        "speech-to-text",
        "text-to-speech",
        "translate",
        "conversation",
        "realtime-interpreter",
        "stream/speech-to-text",
        "stream/text-to-speech",
        "stream/realtime-interpreter",
        "subscription",
        "connections"
    ]
    return {
        "status": "healthy",
        "service": "azure-voice",
        "endpoints": available_endpoints
    }
|