fixed delta sync - added voice basics
This commit is contained in:
parent
ada61061c4
commit
ff079f5428
35 changed files with 582 additions and 2300 deletions
7
app.py
7
app.py
|
|
@ -249,8 +249,5 @@ app.include_router(msftRouter)
|
||||||
from modules.routes.routeSecurityGoogle import router as googleRouter
|
from modules.routes.routeSecurityGoogle import router as googleRouter
|
||||||
app.include_router(googleRouter)
|
app.include_router(googleRouter)
|
||||||
|
|
||||||
from modules.routes.routeVoiceAzure import router as voiceRouter
|
from modules.routes.routeVoiceGoogle import router as voiceGoogleRouter
|
||||||
app.include_router(voiceRouter)
|
app.include_router(voiceGoogleRouter)
|
||||||
|
|
||||||
from modules.routes.routeVoiceWebSocket import router as voiceWebSocketRouter
|
|
||||||
app.include_router(voiceWebSocketRouter)
|
|
||||||
|
|
@ -48,9 +48,8 @@ Service_GOOGLE_CLIENT_SECRET = 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
|
||||||
# Tavily Web Search configuration
|
# Tavily Web Search configuration
|
||||||
Connector_WebTavily_API_KEY = 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
|
Connector_WebTavily_API_KEY = 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
|
||||||
|
|
||||||
# Azure Speech Services configuration
|
# Google Cloud Speech Services configuration
|
||||||
Connector_AzureSpeech_SUBSCRIPTION_KEY = 4HuOIbNfeZRyQp3ouqCzyYKpRf0rJxMNwPaoKwdoJDUxl5u9FzK2JQQJ99BIACI8hq2XJ3w3AAAYACOGrE9c
|
# Set GOOGLE_APPLICATION_CREDENTIALS environment variable or place credentials file in project directory
|
||||||
Connector_AzureSpeech_REGION = switzerlandnorth
|
|
||||||
|
|
||||||
# Web Search configuration
|
# Web Search configuration
|
||||||
Web_Search_MAX_QUERY_LENGTH = 400
|
Web_Search_MAX_QUERY_LENGTH = 400
|
||||||
|
|
|
||||||
BIN
debug_audio/audio_20250913_223438.wav
Normal file
BIN
debug_audio/audio_20250913_223438.wav
Normal file
Binary file not shown.
BIN
debug_audio/audio_20250913_223658.wav
Normal file
BIN
debug_audio/audio_20250913_223658.wav
Normal file
Binary file not shown.
BIN
debug_audio/audio_20250913_224003.wav
Normal file
BIN
debug_audio/audio_20250913_224003.wav
Normal file
Binary file not shown.
BIN
debug_audio/audio_20250913_224258.wav
Normal file
BIN
debug_audio/audio_20250913_224258.wav
Normal file
Binary file not shown.
BIN
debug_audio/audio_20250913_224524.wav
Normal file
BIN
debug_audio/audio_20250913_224524.wav
Normal file
Binary file not shown.
BIN
debug_audio/audio_20250913_224801.wav
Normal file
BIN
debug_audio/audio_20250913_224801.wav
Normal file
Binary file not shown.
BIN
debug_audio/audio_20250913_230817.wav
Normal file
BIN
debug_audio/audio_20250913_230817.wav
Normal file
Binary file not shown.
BIN
debug_audio/audio_20250913_230927.wav
Normal file
BIN
debug_audio/audio_20250913_230927.wav
Normal file
Binary file not shown.
BIN
debug_audio/audio_20250913_231253.wav
Normal file
BIN
debug_audio/audio_20250913_231253.wav
Normal file
Binary file not shown.
BIN
debug_audio/audio_20250913_231321.wav
Normal file
BIN
debug_audio/audio_20250913_231321.wav
Normal file
Binary file not shown.
BIN
debug_audio/audio_20250913_231611.wav
Normal file
BIN
debug_audio/audio_20250913_231611.wav
Normal file
Binary file not shown.
BIN
debug_audio/audio_20250913_231935.wav
Normal file
BIN
debug_audio/audio_20250913_231935.wav
Normal file
Binary file not shown.
BIN
debug_audio/audio_20250913_232141.wav
Normal file
BIN
debug_audio/audio_20250913_232141.wav
Normal file
Binary file not shown.
BIN
debug_audio/audio_20250913_232309.wav
Normal file
BIN
debug_audio/audio_20250913_232309.wav
Normal file
Binary file not shown.
BIN
debug_audio/audio_20250913_232518.wav
Normal file
BIN
debug_audio/audio_20250913_232518.wav
Normal file
Binary file not shown.
BIN
debug_audio/audio_20250913_232659.wav
Normal file
BIN
debug_audio/audio_20250913_232659.wav
Normal file
Binary file not shown.
BIN
debug_audio/audio_20250913_232941.wav
Normal file
BIN
debug_audio/audio_20250913_232941.wav
Normal file
Binary file not shown.
BIN
debug_audio/audio_20250913_233053.wav
Normal file
BIN
debug_audio/audio_20250913_233053.wav
Normal file
Binary file not shown.
BIN
debug_audio/audio_20250913_233155.wav
Normal file
BIN
debug_audio/audio_20250913_233155.wav
Normal file
Binary file not shown.
BIN
debug_audio/audio_20250913_233607.wav
Normal file
BIN
debug_audio/audio_20250913_233607.wav
Normal file
Binary file not shown.
BIN
debug_audio/audio_20250913_234106.wav
Normal file
BIN
debug_audio/audio_20250913_234106.wav
Normal file
Binary file not shown.
BIN
debug_audio/audio_20250913_234245.wav
Normal file
BIN
debug_audio/audio_20250913_234245.wav
Normal file
Binary file not shown.
BIN
debug_audio/audio_20250913_234843.wav
Normal file
BIN
debug_audio/audio_20250913_234843.wav
Normal file
Binary file not shown.
BIN
debug_audio/audio_20250913_235136.wav
Normal file
BIN
debug_audio/audio_20250913_235136.wav
Normal file
Binary file not shown.
BIN
debug_audio/audio_20250913_235409.wav
Normal file
BIN
debug_audio/audio_20250913_235409.wav
Normal file
Binary file not shown.
|
|
@ -1,44 +0,0 @@
|
||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Script to force Microsoft re-authentication for SharePoint access.
|
|
||||||
This will disconnect the existing connection and provide a new login URL.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import requests
|
|
||||||
import json
|
|
||||||
|
|
||||||
# Configuration
|
|
||||||
BASE_URL = "http://localhost:8000" # Adjust if your server runs on different port
|
|
||||||
CONNECTION_ID = "cc62583d-3a68-44b6-8283-726725916a7e" # From the logs
|
|
||||||
|
|
||||||
def force_reauth():
|
|
||||||
"""Force Microsoft re-authentication by disconnecting and providing new login URL."""
|
|
||||||
|
|
||||||
print("🔄 Forcing Microsoft re-authentication for SharePoint access...")
|
|
||||||
|
|
||||||
# Step 1: Disconnect existing connection
|
|
||||||
print(f"1. Disconnecting connection {CONNECTION_ID}...")
|
|
||||||
disconnect_url = f"{BASE_URL}/api/connections/{CONNECTION_ID}/disconnect"
|
|
||||||
|
|
||||||
try:
|
|
||||||
response = requests.post(disconnect_url)
|
|
||||||
if response.status_code == 200:
|
|
||||||
print("✅ Connection disconnected successfully")
|
|
||||||
else:
|
|
||||||
print(f"❌ Failed to disconnect: {response.status_code} - {response.text}")
|
|
||||||
return
|
|
||||||
except Exception as e:
|
|
||||||
print(f"❌ Error disconnecting: {e}")
|
|
||||||
return
|
|
||||||
|
|
||||||
# Step 2: Get new login URL
|
|
||||||
print("2. Getting new Microsoft login URL...")
|
|
||||||
login_url = f"{BASE_URL}/api/msft/login?state=connection&connectionId={CONNECTION_ID}"
|
|
||||||
|
|
||||||
print(f"\n🔗 Please visit this URL to re-authenticate with SharePoint permissions:")
|
|
||||||
print(f" {login_url}")
|
|
||||||
print("\nAfter re-authentication, the JIRA sync should work with SharePoint access.")
|
|
||||||
print("\nNote: The new token will include Sites.ReadWrite.All and Files.ReadWrite.All scopes.")
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
force_reauth()
|
|
||||||
|
|
@ -1,928 +0,0 @@
|
||||||
"""
|
|
||||||
Azure Speech Services Connector
|
|
||||||
Handles integration with Azure Speech Services for:
|
|
||||||
- Speech-to-Text (STT)
|
|
||||||
- Text-to-Speech (TTS)
|
|
||||||
- Translation services
|
|
||||||
"""
|
|
||||||
|
|
||||||
import logging
|
|
||||||
import asyncio
|
|
||||||
import json
|
|
||||||
import base64
|
|
||||||
from typing import Dict, Any, Optional, List, AsyncGenerator
|
|
||||||
import aiohttp
|
|
||||||
import io
|
|
||||||
import wave
|
|
||||||
import struct
|
|
||||||
import tempfile
|
|
||||||
import os
|
|
||||||
import time
|
|
||||||
from pathlib import Path
|
|
||||||
from datetime import datetime, timedelta
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
class ConnectorAzureSpeech:
|
|
||||||
"""Connector for Azure Speech Services."""
|
|
||||||
|
|
||||||
def __init__(self, subscription_key: str, region: str = "westeurope"):
|
|
||||||
"""
|
|
||||||
Initialize Azure Speech connector.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
subscription_key: Azure Speech Services subscription key
|
|
||||||
region: Azure region (default: westeurope)
|
|
||||||
"""
|
|
||||||
self.subscription_key = subscription_key
|
|
||||||
self.region = region
|
|
||||||
self.base_url = f"https://{region}.stt.speech.microsoft.com"
|
|
||||||
self.translator_url = "https://api.cognitive.microsofttranslator.com"
|
|
||||||
|
|
||||||
# Supported audio formats
|
|
||||||
self.supported_stt_formats = {
|
|
||||||
"wav": {"mime": "audio/wav", "codec": "audio/pcm", "sample_rate": 16000},
|
|
||||||
"mp3": {"mime": "audio/mp3", "codec": "audio/mp3", "sample_rate": 16000},
|
|
||||||
"ogg": {"mime": "audio/ogg", "codec": "audio/ogg", "sample_rate": 16000}
|
|
||||||
}
|
|
||||||
|
|
||||||
self.supported_tts_formats = [
|
|
||||||
"audio-16khz-128kbitrate-mono-mp3",
|
|
||||||
"audio-16khz-32kbitrate-mono-mp3",
|
|
||||||
"audio-16khz-64kbitrate-mono-mp3",
|
|
||||||
"audio-24khz-160kbitrate-mono-mp3",
|
|
||||||
"audio-24khz-48kbitrate-mono-mp3",
|
|
||||||
"audio-24khz-96kbitrate-mono-mp3",
|
|
||||||
"audio-48khz-192kbitrate-mono-mp3",
|
|
||||||
"audio-48khz-96kbitrate-mono-mp3",
|
|
||||||
"riff-16khz-16bit-mono-pcm",
|
|
||||||
"riff-24khz-16bit-mono-pcm",
|
|
||||||
"riff-48khz-16bit-mono-pcm"
|
|
||||||
]
|
|
||||||
|
|
||||||
# Rate limiting
|
|
||||||
self.rate_limits = {
|
|
||||||
"stt": {"requests_per_minute": 20, "last_reset": time.time(), "request_count": 0},
|
|
||||||
"tts": {"requests_per_minute": 20, "last_reset": time.time(), "request_count": 0},
|
|
||||||
"translation": {"requests_per_minute": 20, "last_reset": time.time(), "request_count": 0}
|
|
||||||
}
|
|
||||||
|
|
||||||
# Request timeout settings
|
|
||||||
self.timeout = aiohttp.ClientTimeout(total=30, connect=10)
|
|
||||||
|
|
||||||
def _check_rate_limit(self, service_type: str) -> bool:
|
|
||||||
"""Check if rate limit is exceeded for a service type."""
|
|
||||||
current_time = time.time()
|
|
||||||
rate_limit = self.rate_limits[service_type]
|
|
||||||
|
|
||||||
# Reset counter if minute has passed
|
|
||||||
if current_time - rate_limit["last_reset"] >= 60:
|
|
||||||
rate_limit["request_count"] = 0
|
|
||||||
rate_limit["last_reset"] = current_time
|
|
||||||
|
|
||||||
# Check if limit exceeded
|
|
||||||
if rate_limit["request_count"] >= rate_limit["requests_per_minute"]:
|
|
||||||
return False
|
|
||||||
|
|
||||||
# Increment counter
|
|
||||||
rate_limit["request_count"] += 1
|
|
||||||
return True
|
|
||||||
|
|
||||||
def _handle_azure_error(self, response_status: int, error_text: str) -> Exception:
|
|
||||||
"""Handle Azure API errors with specific error messages."""
|
|
||||||
if response_status == 401:
|
|
||||||
return Exception("Authentication failed. Please check your Azure Speech Services subscription key.")
|
|
||||||
elif response_status == 403:
|
|
||||||
return Exception("Access forbidden. Please check your Azure Speech Services permissions.")
|
|
||||||
elif response_status == 429:
|
|
||||||
return Exception("Rate limit exceeded. Please wait before making more requests.")
|
|
||||||
elif response_status == 400:
|
|
||||||
return Exception(f"Bad request: {error_text}")
|
|
||||||
elif response_status == 500:
|
|
||||||
return Exception("Azure Speech Services internal error. Please try again later.")
|
|
||||||
elif response_status == 503:
|
|
||||||
return Exception("Azure Speech Services temporarily unavailable. Please try again later.")
|
|
||||||
else:
|
|
||||||
return Exception(f"Azure API error {response_status}: {error_text}")
|
|
||||||
|
|
||||||
async def _make_request_with_retry(self, url: str, method: str = "GET", headers: Dict = None, data: bytes = None, params: Dict = None, max_retries: int = 3) -> Dict:
|
|
||||||
"""Make HTTP request with retry logic."""
|
|
||||||
if headers is None:
|
|
||||||
headers = {}
|
|
||||||
|
|
||||||
headers.update({
|
|
||||||
"Ocp-Apim-Subscription-Key": self.subscription_key,
|
|
||||||
"User-Agent": "PowerOn-Voice-Services/1.0"
|
|
||||||
})
|
|
||||||
|
|
||||||
# Debug: Log subscription key (masked for security)
|
|
||||||
logger.debug(f"Using subscription key: {self.subscription_key[:8]}...{self.subscription_key[-8:] if len(self.subscription_key) > 16 else 'SHORT'}")
|
|
||||||
logger.debug(f"Request URL: {url}")
|
|
||||||
logger.debug(f"Request params: {params}")
|
|
||||||
|
|
||||||
for attempt in range(max_retries):
|
|
||||||
try:
|
|
||||||
async with aiohttp.ClientSession(timeout=self.timeout) as session:
|
|
||||||
if method.upper() == "GET":
|
|
||||||
async with session.get(url, headers=headers, params=params) as response:
|
|
||||||
return await self._handle_response(response)
|
|
||||||
elif method.upper() == "POST":
|
|
||||||
async with session.post(url, headers=headers, data=data, params=params) as response:
|
|
||||||
return await self._handle_response(response)
|
|
||||||
else:
|
|
||||||
raise ValueError(f"Unsupported HTTP method: {method}")
|
|
||||||
except asyncio.TimeoutError:
|
|
||||||
if attempt == max_retries - 1:
|
|
||||||
raise Exception("Request timeout after multiple retries")
|
|
||||||
logger.warning(f"Request timeout, retrying... (attempt {attempt + 1}/{max_retries})")
|
|
||||||
await asyncio.sleep(2 ** attempt) # Exponential backoff
|
|
||||||
except Exception as e:
|
|
||||||
if attempt == max_retries - 1:
|
|
||||||
raise
|
|
||||||
logger.warning(f"Request failed, retrying... (attempt {attempt + 1}/{max_retries}): {str(e)}")
|
|
||||||
await asyncio.sleep(2 ** attempt) # Exponential backoff
|
|
||||||
|
|
||||||
async def _make_request(self, url: str, method: str = "GET", headers: Dict = None, data: bytes = None) -> Dict:
|
|
||||||
"""Make HTTP request to Azure services."""
|
|
||||||
if headers is None:
|
|
||||||
headers = {}
|
|
||||||
|
|
||||||
headers.update({
|
|
||||||
"Ocp-Apim-Subscription-Key": self.subscription_key,
|
|
||||||
"User-Agent": "PowerOn-Voice-Services/1.0"
|
|
||||||
})
|
|
||||||
|
|
||||||
async with aiohttp.ClientSession() as session:
|
|
||||||
try:
|
|
||||||
if method.upper() == "GET":
|
|
||||||
async with session.get(url, headers=headers) as response:
|
|
||||||
return await self._handle_response(response)
|
|
||||||
elif method.upper() == "POST":
|
|
||||||
async with session.post(url, headers=headers, data=data) as response:
|
|
||||||
return await self._handle_response(response)
|
|
||||||
else:
|
|
||||||
raise ValueError(f"Unsupported HTTP method: {method}")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Request failed: {str(e)}")
|
|
||||||
raise
|
|
||||||
|
|
||||||
async def _handle_response(self, response) -> Dict:
|
|
||||||
"""Handle HTTP response."""
|
|
||||||
if response.status == 200:
|
|
||||||
content_type = response.headers.get('content-type', '')
|
|
||||||
if 'application/json' in content_type:
|
|
||||||
return await response.json()
|
|
||||||
else:
|
|
||||||
# For audio responses, return binary data
|
|
||||||
return {"data": await response.read()}
|
|
||||||
else:
|
|
||||||
error_text = await response.text()
|
|
||||||
logger.error(f"API request failed: {response.status} - {error_text}")
|
|
||||||
raise self._handle_azure_error(response.status, error_text)
|
|
||||||
|
|
||||||
def _validate_audio_format(self, audio_content: bytes, expected_format: str = "wav") -> Dict[str, Any]:
|
|
||||||
"""Validate audio format and return format information."""
|
|
||||||
try:
|
|
||||||
# Try to detect format from content
|
|
||||||
if audio_content.startswith(b'RIFF') and b'WAVE' in audio_content[:12]:
|
|
||||||
format_type = "wav"
|
|
||||||
elif audio_content.startswith(b'\xff\xfb') or audio_content.startswith(b'ID3'):
|
|
||||||
format_type = "mp3"
|
|
||||||
elif audio_content.startswith(b'OggS'):
|
|
||||||
format_type = "ogg"
|
|
||||||
elif audio_content.startswith(b'fLaC'):
|
|
||||||
format_type = "flac"
|
|
||||||
else:
|
|
||||||
# If we can't detect format, assume it's raw audio or WAV without proper header
|
|
||||||
format_type = "wav" # Azure Speech Services can handle this
|
|
||||||
|
|
||||||
# Validate WAV format specifically
|
|
||||||
if format_type == "wav":
|
|
||||||
try:
|
|
||||||
with io.BytesIO(audio_content) as audio_io:
|
|
||||||
with wave.open(audio_io, 'rb') as wav_file:
|
|
||||||
sample_rate = wav_file.getframerate()
|
|
||||||
channels = wav_file.getnchannels()
|
|
||||||
sample_width = wav_file.getsampwidth()
|
|
||||||
|
|
||||||
return {
|
|
||||||
"valid": True,
|
|
||||||
"format": format_type,
|
|
||||||
"sample_rate": sample_rate,
|
|
||||||
"channels": channels,
|
|
||||||
"sample_width": sample_width,
|
|
||||||
"duration": wav_file.getnframes() / sample_rate
|
|
||||||
}
|
|
||||||
except Exception as e:
|
|
||||||
# If WAV validation fails, it might be raw audio data
|
|
||||||
# Azure Speech Services can handle raw audio, so we'll allow it
|
|
||||||
logger.info(f"WAV validation failed, treating as raw audio: {str(e)}")
|
|
||||||
return {
|
|
||||||
"valid": True,
|
|
||||||
"format": "raw_audio",
|
|
||||||
"sample_rate": 16000, # Default assumption for raw audio
|
|
||||||
"channels": 1, # Default assumption
|
|
||||||
"sample_width": 2, # Default assumption
|
|
||||||
"duration": len(audio_content) / (16000 * 2) # Rough estimate
|
|
||||||
}
|
|
||||||
|
|
||||||
# For other formats, assume valid if we can detect them
|
|
||||||
return {
|
|
||||||
"valid": True,
|
|
||||||
"format": format_type,
|
|
||||||
"sample_rate": 16000, # Default assumption
|
|
||||||
"channels": 1, # Default assumption
|
|
||||||
"sample_width": 2, # Default assumption
|
|
||||||
"duration": 0 # Unknown
|
|
||||||
}
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Audio validation failed: {str(e)}")
|
|
||||||
return {"valid": False, "error": str(e)}
|
|
||||||
|
|
||||||
def _convert_audio_to_wav(self, audio_content: bytes, target_sample_rate: int = 16000) -> bytes:
|
|
||||||
"""Convert audio to WAV format with specified sample rate."""
|
|
||||||
try:
|
|
||||||
# If it's already WAV, try to resample if needed
|
|
||||||
if audio_content.startswith(b'RIFF') and b'WAVE' in audio_content[:12]:
|
|
||||||
with io.BytesIO(audio_content) as audio_io:
|
|
||||||
with wave.open(audio_io, 'rb') as wav_file:
|
|
||||||
current_sample_rate = wav_file.getframerate()
|
|
||||||
|
|
||||||
# If sample rate matches, return as-is
|
|
||||||
if current_sample_rate == target_sample_rate:
|
|
||||||
return audio_content
|
|
||||||
|
|
||||||
# For now, return original (in production, implement resampling)
|
|
||||||
logger.warning(f"Audio sample rate {current_sample_rate} doesn't match target {target_sample_rate}")
|
|
||||||
return audio_content
|
|
||||||
|
|
||||||
# If it's raw audio data (no header), create a basic WAV header
|
|
||||||
elif not audio_content.startswith(b'RIFF'):
|
|
||||||
logger.info("Converting raw audio data to WAV format")
|
|
||||||
return self._create_wav_header(audio_content, target_sample_rate)
|
|
||||||
|
|
||||||
# For other formats, return as-is for now
|
|
||||||
# In production, implement proper conversion with pydub or ffmpeg
|
|
||||||
logger.info("Audio format conversion not fully implemented - returning original")
|
|
||||||
return audio_content
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Audio conversion failed: {str(e)}")
|
|
||||||
raise Exception(f"Audio conversion failed: {str(e)}")
|
|
||||||
|
|
||||||
def _create_wav_header(self, audio_data: bytes, sample_rate: int = 16000, channels: int = 1, sample_width: int = 2) -> bytes:
|
|
||||||
"""Create a WAV header for raw audio data."""
|
|
||||||
try:
|
|
||||||
import struct
|
|
||||||
|
|
||||||
# Calculate data size
|
|
||||||
data_size = len(audio_data)
|
|
||||||
file_size = 36 + data_size
|
|
||||||
|
|
||||||
# Create WAV header
|
|
||||||
header = struct.pack('<4sI4s4sIHHIIHH4sI',
|
|
||||||
b'RIFF', # Chunk ID
|
|
||||||
file_size, # Chunk size
|
|
||||||
b'WAVE', # Format
|
|
||||||
b'fmt ', # Subchunk1 ID
|
|
||||||
16, # Subchunk1 size
|
|
||||||
1, # Audio format (PCM)
|
|
||||||
channels, # Number of channels
|
|
||||||
sample_rate, # Sample rate
|
|
||||||
sample_rate * channels * sample_width, # Byte rate
|
|
||||||
channels * sample_width, # Block align
|
|
||||||
sample_width * 8, # Bits per sample
|
|
||||||
b'data', # Subchunk2 ID
|
|
||||||
data_size # Subchunk2 size
|
|
||||||
)
|
|
||||||
|
|
||||||
return header + audio_data
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to create WAV header: {str(e)}")
|
|
||||||
# Return original data if header creation fails
|
|
||||||
return audio_data
|
|
||||||
|
|
||||||
def _get_audio_content_type(self, audio_format: str) -> str:
|
|
||||||
"""Get MIME type for audio format."""
|
|
||||||
if audio_format in self.supported_stt_formats:
|
|
||||||
return self.supported_stt_formats[audio_format]["mime"]
|
|
||||||
return "audio/wav" # Default
|
|
||||||
|
|
||||||
async def speech_to_text(self, audio_content: bytes, language: str = "de-DE", format: str = "detailed", audio_format: str = "wav") -> Dict:
|
|
||||||
"""
|
|
||||||
Convert speech to text using Azure Speech Services.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
audio_content: Audio file content as bytes
|
|
||||||
language: Language code (e.g., "de-DE")
|
|
||||||
format: Response format ("simple" or "detailed")
|
|
||||||
audio_format: Audio format ("wav", "mp3", "ogg")
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Dict with transcription results
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
# Check rate limit
|
|
||||||
if not self._check_rate_limit("stt"):
|
|
||||||
raise Exception("Rate limit exceeded for speech-to-text service. Please wait before making more requests.")
|
|
||||||
|
|
||||||
# Validate audio format
|
|
||||||
validation_result = self._validate_audio_format(audio_content, audio_format)
|
|
||||||
if not validation_result.get("valid", False):
|
|
||||||
raise Exception(f"Invalid audio format: {validation_result.get('error', 'Unknown error')}")
|
|
||||||
|
|
||||||
# Convert audio to required format if needed
|
|
||||||
processed_audio = self._convert_audio_to_wav(audio_content)
|
|
||||||
|
|
||||||
# Update audio_format based on validation result
|
|
||||||
detected_format = validation_result.get("format", audio_format)
|
|
||||||
if detected_format == "raw_audio":
|
|
||||||
audio_format = "wav" # Treat raw audio as WAV for Azure
|
|
||||||
|
|
||||||
url = f"{self.base_url}/speech/recognition/conversation/cognitiveservices/v1"
|
|
||||||
|
|
||||||
# Get appropriate content type
|
|
||||||
content_type = self._get_audio_content_type(audio_format)
|
|
||||||
if audio_format == "wav":
|
|
||||||
content_type = f"{content_type}; codecs=audio/pcm; samplerate=16000"
|
|
||||||
|
|
||||||
headers = {
|
|
||||||
"Content-Type": content_type,
|
|
||||||
"Accept": "application/json",
|
|
||||||
"Ocp-Apim-Subscription-Region": self.region
|
|
||||||
}
|
|
||||||
|
|
||||||
params = {
|
|
||||||
"language": language,
|
|
||||||
"format": "detailed" if format == "detailed" else "simple"
|
|
||||||
}
|
|
||||||
|
|
||||||
# Make API call with retry logic
|
|
||||||
result = await self._make_request_with_retry(
|
|
||||||
url=url,
|
|
||||||
method="POST",
|
|
||||||
headers=headers,
|
|
||||||
data=processed_audio,
|
|
||||||
params=params
|
|
||||||
)
|
|
||||||
|
|
||||||
# Parse the response based on format
|
|
||||||
if format == "detailed":
|
|
||||||
return {
|
|
||||||
"text": result.get("DisplayText", ""),
|
|
||||||
"confidence": result.get("Confidence", 0.0),
|
|
||||||
"language": result.get("RecognitionStatus", language),
|
|
||||||
"format": format,
|
|
||||||
"audio_info": validation_result,
|
|
||||||
"raw_result": result
|
|
||||||
}
|
|
||||||
else:
|
|
||||||
return {
|
|
||||||
"text": result.get("DisplayText", ""),
|
|
||||||
"confidence": 1.0, # Simple format doesn't provide confidence
|
|
||||||
"language": language,
|
|
||||||
"format": format,
|
|
||||||
"audio_info": validation_result
|
|
||||||
}
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Speech-to-text failed: {str(e)}")
|
|
||||||
raise
|
|
||||||
|
|
||||||
async def text_to_speech(self, text: str, language: str = "de-DE", voice: str = "de-DE-KatjaNeural", format: str = "audio-16khz-128kbitrate-mono-mp3") -> bytes:
|
|
||||||
"""
|
|
||||||
Convert text to speech using Azure Speech Services.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
text: Text to convert to speech
|
|
||||||
language: Language code
|
|
||||||
voice: Voice name
|
|
||||||
format: Audio format
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Audio data as bytes
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
# Check rate limit
|
|
||||||
if not self._check_rate_limit("tts"):
|
|
||||||
raise Exception("Rate limit exceeded for text-to-speech service. Please wait before making more requests.")
|
|
||||||
|
|
||||||
# Validate format
|
|
||||||
if format not in self.supported_tts_formats:
|
|
||||||
raise Exception(f"Unsupported TTS format: {format}. Supported formats: {', '.join(self.supported_tts_formats)}")
|
|
||||||
|
|
||||||
url = f"https://{self.region}.tts.speech.microsoft.com/cognitiveservices/v1"
|
|
||||||
|
|
||||||
headers = {
|
|
||||||
"Content-Type": "application/ssml+xml",
|
|
||||||
"X-Microsoft-OutputFormat": format,
|
|
||||||
"Ocp-Apim-Subscription-Key": self.subscription_key,
|
|
||||||
"User-Agent": "PowerOn-Voice-Services/1.0"
|
|
||||||
}
|
|
||||||
|
|
||||||
# Create SSML with proper escaping
|
|
||||||
escaped_text = text.replace("&", "&").replace("<", "<").replace(">", ">").replace('"', """).replace("'", "'")
|
|
||||||
ssml = f"""<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='{language}'>
|
|
||||||
<voice name='{voice}'>
|
|
||||||
{escaped_text}
|
|
||||||
</voice>
|
|
||||||
</speak>"""
|
|
||||||
|
|
||||||
# Make API call with retry logic
|
|
||||||
result = await self._make_request_with_retry(
|
|
||||||
url=url,
|
|
||||||
method="POST",
|
|
||||||
headers=headers,
|
|
||||||
data=ssml.encode('utf-8'),
|
|
||||||
params=None
|
|
||||||
)
|
|
||||||
|
|
||||||
# Return audio data
|
|
||||||
return result.get("data", b"")
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Text-to-speech failed: {str(e)}")
|
|
||||||
raise
|
|
||||||
|
|
||||||
async def translate_text(self, text: str, from_language: str, to_language: str) -> str:
|
|
||||||
"""
|
|
||||||
Translate text using Azure Translator.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
text: Text to translate
|
|
||||||
from_language: Source language code
|
|
||||||
to_language: Target language code
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Translated text
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
# Check if text is empty
|
|
||||||
if not text or not text.strip():
|
|
||||||
logger.debug("Empty text provided, returning original text")
|
|
||||||
return text
|
|
||||||
|
|
||||||
# Check rate limit
|
|
||||||
if not self._check_rate_limit("translation"):
|
|
||||||
raise Exception("Rate limit exceeded for translation service. Please wait before making more requests.")
|
|
||||||
|
|
||||||
url = f"{self.translator_url}/translate"
|
|
||||||
|
|
||||||
headers = {
|
|
||||||
"Ocp-Apim-Subscription-Key": self.subscription_key,
|
|
||||||
"Ocp-Apim-Subscription-Region": self.region,
|
|
||||||
"Content-Type": "application/json"
|
|
||||||
}
|
|
||||||
|
|
||||||
params = {
|
|
||||||
"api-version": "3.0",
|
|
||||||
"from": from_language,
|
|
||||||
"to": to_language
|
|
||||||
}
|
|
||||||
|
|
||||||
data = [{"text": text}]
|
|
||||||
|
|
||||||
# Debug: Log translation request details
|
|
||||||
logger.debug(f"Translation request - URL: {url}")
|
|
||||||
logger.debug(f"Translation request - Headers: {headers}")
|
|
||||||
logger.debug(f"Translation request - Data: {data}")
|
|
||||||
|
|
||||||
# Make API call with retry logic
|
|
||||||
result = await self._make_request_with_retry(
|
|
||||||
url=url,
|
|
||||||
method="POST",
|
|
||||||
headers=headers,
|
|
||||||
data=json.dumps(data).encode('utf-8'),
|
|
||||||
params=None
|
|
||||||
)
|
|
||||||
|
|
||||||
if result and len(result) > 0 and 'translations' in result[0]:
|
|
||||||
return result[0]['translations'][0]['text']
|
|
||||||
else:
|
|
||||||
logger.warning(f"Unexpected translation response format: {result}")
|
|
||||||
return text # Return original text if translation fails
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Translation failed: {str(e)}")
|
|
||||||
raise
|
|
||||||
|
|
||||||
async def get_available_voices(self) -> List[Dict]:
    """Return the catalogue of Azure neural voices supported by this connector.

    Azure Speech Services does not expose a lightweight REST endpoint for
    enumerating voices, so a curated static list is returned instead.

    Returns:
        List of dicts, each with keys ``name``, ``language``, ``gender``,
        ``style`` and ``locale``.
    """
    # (locale, short name, gender) triples.  The full voice identifier is
    # "<locale>-<short name>Neural" and the style is always "Neural", so
    # generating the dicts from this table removes ~50 duplicated literals.
    # The previous try/except around the literal list was unreachable dead
    # code (a list literal cannot raise) and has been dropped.
    voice_specs = [
        # German voices
        ("de-DE", "Katja", "Female"),
        ("de-DE", "Conrad", "Male"),
        ("de-DE", "Amala", "Female"),
        ("de-DE", "Bernd", "Male"),
        ("de-DE", "Christoph", "Male"),
        ("de-DE", "Elke", "Female"),
        ("de-DE", "Gisela", "Female"),
        ("de-DE", "Joerg", "Male"),
        ("de-DE", "Kasper", "Male"),
        ("de-DE", "Killian", "Male"),
        ("de-DE", "Klaus", "Male"),
        ("de-DE", "Louisa", "Female"),
        ("de-DE", "Maja", "Female"),
        ("de-DE", "Ralf", "Male"),
        ("de-DE", "Tanja", "Female"),

        # English (US) voices
        ("en-US", "Aria", "Female"),
        ("en-US", "Davis", "Male"),
        ("en-US", "Guy", "Male"),
        ("en-US", "Jane", "Female"),
        ("en-US", "Jason", "Male"),
        ("en-US", "Jenny", "Female"),
        ("en-US", "Michelle", "Female"),
        ("en-US", "Ryan", "Male"),
        ("en-US", "Sara", "Female"),
        ("en-US", "Tony", "Male"),

        # English (UK) voices
        ("en-GB", "Libby", "Female"),
        ("en-GB", "Maisie", "Female"),
        ("en-GB", "Ryan", "Male"),
        ("en-GB", "Sonia", "Female"),
        ("en-GB", "Thomas", "Male"),

        # French voices
        ("fr-FR", "Denise", "Female"),
        ("fr-FR", "Henri", "Male"),
        ("fr-FR", "Ariane", "Female"),
        ("fr-FR", "Claude", "Male"),
        ("fr-FR", "Jacqueline", "Female"),
        ("fr-FR", "Jerome", "Male"),
        ("fr-FR", "Josephine", "Female"),
        ("fr-FR", "Maurice", "Male"),
        ("fr-FR", "Yvette", "Female"),

        # Spanish voices
        ("es-ES", "Elvira", "Female"),
        ("es-ES", "Alvaro", "Male"),
        ("es-ES", "Arnau", "Male"),
        ("es-ES", "Dario", "Male"),
        ("es-ES", "Elias", "Male"),
        ("es-ES", "Estrella", "Female"),
        ("es-ES", "Irene", "Female"),
        ("es-ES", "Laia", "Female"),
        ("es-ES", "Lia", "Female"),
        ("es-ES", "Nil", "Male"),
        ("es-ES", "Saul", "Male"),
        ("es-ES", "Teo", "Male"),
        ("es-ES", "Triana", "Female"),
        ("es-ES", "Vera", "Female"),
    ]

    return [
        {
            "name": f"{locale}-{short_name}Neural",
            "language": locale,
            "gender": gender,
            "style": "Neural",
            "locale": locale,
        }
        for locale, short_name, gender in voice_specs
    ]
|
|
||||||
|
|
||||||
async def get_available_languages(self) -> List[Dict]:
    """Return the list of languages supported by Azure Speech Services.

    Every catalogue entry advertises STT, TTS and translation support, so
    the dicts are generated from compact (code, name) pairs instead of
    repeating the three identical boolean flags ~100 times.  The previous
    try/except around the literal list was unreachable dead code (a list
    literal cannot raise) and has been dropped.

    Returns:
        List of dicts with keys ``code``, ``name``, ``stt``, ``tts``
        and ``translation``.
    """
    language_names = [
        # European languages
        ("de-DE", "German (Germany)"),
        ("en-US", "English (United States)"),
        ("en-GB", "English (United Kingdom)"),
        ("fr-FR", "French (France)"),
        ("es-ES", "Spanish (Spain)"),
        ("es-MX", "Spanish (Mexico)"),
        ("it-IT", "Italian (Italy)"),
        ("pt-BR", "Portuguese (Brazil)"),
        ("pt-PT", "Portuguese (Portugal)"),
        ("ru-RU", "Russian (Russia)"),
        ("nl-NL", "Dutch (Netherlands)"),
        ("sv-SE", "Swedish (Sweden)"),
        ("no-NO", "Norwegian (Norway)"),
        ("da-DK", "Danish (Denmark)"),
        ("fi-FI", "Finnish (Finland)"),
        ("pl-PL", "Polish (Poland)"),
        ("cs-CZ", "Czech (Czech Republic)"),
        ("hu-HU", "Hungarian (Hungary)"),
        ("ro-RO", "Romanian (Romania)"),
        ("bg-BG", "Bulgarian (Bulgaria)"),
        ("hr-HR", "Croatian (Croatia)"),
        ("sk-SK", "Slovak (Slovakia)"),
        ("sl-SI", "Slovenian (Slovenia)"),
        ("et-EE", "Estonian (Estonia)"),
        ("lv-LV", "Latvian (Latvia)"),
        ("lt-LT", "Lithuanian (Lithuania)"),
        ("mt-MT", "Maltese (Malta)"),
        ("ga-IE", "Irish (Ireland)"),
        ("cy-GB", "Welsh (United Kingdom)"),

        # Asian languages
        ("ja-JP", "Japanese (Japan)"),
        ("ko-KR", "Korean (Korea)"),
        ("zh-CN", "Chinese (Simplified)"),
        ("zh-TW", "Chinese (Traditional)"),
        ("zh-HK", "Chinese (Hong Kong)"),
        ("th-TH", "Thai (Thailand)"),
        ("vi-VN", "Vietnamese (Vietnam)"),
        ("id-ID", "Indonesian (Indonesia)"),
        ("ms-MY", "Malay (Malaysia)"),
        ("tl-PH", "Filipino (Philippines)"),

        # Middle Eastern and African languages
        ("ar-SA", "Arabic (Saudi Arabia)"),
        ("ar-EG", "Arabic (Egypt)"),
        ("ar-AE", "Arabic (UAE)"),
        ("ar-KW", "Arabic (Kuwait)"),
        ("ar-QA", "Arabic (Qatar)"),
        ("ar-BH", "Arabic (Bahrain)"),
        ("ar-OM", "Arabic (Oman)"),
        ("ar-JO", "Arabic (Jordan)"),
        ("ar-LB", "Arabic (Lebanon)"),
        ("ar-PS", "Arabic (Palestine)"),
        ("ar-SY", "Arabic (Syria)"),
        ("ar-IQ", "Arabic (Iraq)"),
        ("ar-MA", "Arabic (Morocco)"),
        ("ar-DZ", "Arabic (Algeria)"),
        ("ar-TN", "Arabic (Tunisia)"),
        ("ar-LY", "Arabic (Libya)"),
        ("ar-SD", "Arabic (Sudan)"),
        ("he-IL", "Hebrew (Israel)"),
        ("tr-TR", "Turkish (Turkey)"),
        ("fa-IR", "Persian (Iran)"),
        ("ur-PK", "Urdu (Pakistan)"),
        ("hi-IN", "Hindi (India)"),
        ("bn-BD", "Bengali (Bangladesh)"),
        ("ta-IN", "Tamil (India)"),
        ("te-IN", "Telugu (India)"),
        ("ml-IN", "Malayalam (India)"),
        ("kn-IN", "Kannada (India)"),
        ("gu-IN", "Gujarati (India)"),
        ("pa-IN", "Punjabi (India)"),
        ("mr-IN", "Marathi (India)"),
        ("ne-NP", "Nepali (Nepal)"),
        ("si-LK", "Sinhala (Sri Lanka)"),
        ("my-MM", "Burmese (Myanmar)"),
        ("km-KH", "Khmer (Cambodia)"),
        ("lo-LA", "Lao (Laos)"),

        # African languages
        ("sw-KE", "Swahili (Kenya)"),
        ("sw-TZ", "Swahili (Tanzania)"),
        ("am-ET", "Amharic (Ethiopia)"),
        ("zu-ZA", "Zulu (South Africa)"),
        ("af-ZA", "Afrikaans (South Africa)"),
        ("yo-NG", "Yoruba (Nigeria)"),
        ("ig-NG", "Igbo (Nigeria)"),
        ("ha-NG", "Hausa (Nigeria)"),

        # Other languages
        ("is-IS", "Icelandic (Iceland)"),
        ("mk-MK", "Macedonian (North Macedonia)"),
        ("sq-AL", "Albanian (Albania)"),
        ("sr-RS", "Serbian (Serbia)"),
        ("bs-BA", "Bosnian (Bosnia and Herzegovina)"),
        ("me-ME", "Montenegrin (Montenegro)"),
        ("uk-UA", "Ukrainian (Ukraine)"),
        ("be-BY", "Belarusian (Belarus)"),
        ("ka-GE", "Georgian (Georgia)"),
        ("hy-AM", "Armenian (Armenia)"),
        ("az-AZ", "Azerbaijani (Azerbaijan)"),
        ("kk-KZ", "Kazakh (Kazakhstan)"),
        ("ky-KG", "Kyrgyz (Kyrgyzstan)"),
        ("uz-UZ", "Uzbek (Uzbekistan)"),
        ("tg-TJ", "Tajik (Tajikistan)"),
        ("mn-MN", "Mongolian (Mongolia)"),
    ]

    return [
        {"code": code, "name": name, "stt": True, "tts": True, "translation": True}
        for code, name in language_names
    ]
|
|
||||||
|
|
||||||
async def test_connection(self) -> bool:
    """Probe Azure Speech Services with a tiny synthesis round-trip.

    Synthesizes the word "Test" with a known en-US voice and reports
    whether non-empty audio came back.  Any failure is logged and mapped
    to ``False``; this method never raises.
    """
    try:
        sample_audio = await self.text_to_speech("Test", "en-US", "en-US-AriaNeural")
        return len(sample_audio) > 0
    except Exception as exc:
        logger.error(f"Connection test failed: {str(exc)}")
        return False
|
|
||||||
|
|
||||||
async def stream_speech_to_text(self, audio_stream: AsyncGenerator[bytes, None], language: str = "de-DE", format: str = "detailed", audio_format: str = "wav") -> AsyncGenerator[Dict, None]:
    """
    Stream speech to text using Azure Speech Services.

    Each chunk from ``audio_stream`` is posted to the short-audio REST
    recognition endpoint independently; partial results are yielded as
    they arrive.  A failed chunk yields an error dict (``is_final`` True)
    but the loop keeps consuming the stream.

    Args:
        audio_stream: Async generator yielding audio chunks
        language: Language code (e.g., "de-DE")
        format: Response format ("simple" or "detailed")
        audio_format: Audio format ("wav", "mp3", "ogg")

    Yields:
        Dict with partial transcription results
    """
    try:
        # Check rate limit before opening any connections.
        if not self._check_rate_limit("stt"):
            raise Exception("Rate limit exceeded for speech-to-text service. Please wait before making more requests.")

        url = f"{self.base_url}/speech/recognition/conversation/cognitiveservices/v1"

        # Get appropriate content type; WAV additionally declares the raw
        # PCM codec and sample rate expected by the endpoint.
        content_type = self._get_audio_content_type(audio_format)
        if audio_format == "wav":
            content_type = f"{content_type}; codecs=audio/pcm; samplerate=16000"

        headers = {
            "Content-Type": content_type,
            "Accept": "application/json",
            "Ocp-Apim-Subscription-Key": self.subscription_key,
            "Ocp-Apim-Subscription-Region": self.region
        }

        params = {
            "language": language,
            "format": "detailed" if format == "detailed" else "simple"
        }

        # Process audio stream in chunks over a single HTTP session.
        async with aiohttp.ClientSession(timeout=self.timeout) as session:
            async for audio_chunk in audio_stream:
                try:
                    # Skip empty chunks rather than posting a zero-length body.
                    if not audio_chunk:
                        continue

                    # Make API call for this chunk
                    async with session.post(
                        url,
                        headers=headers,
                        params=params,
                        data=audio_chunk
                    ) as response:
                        if response.status == 200:
                            result = await response.json()

                            # Yield partial result
                            if format == "detailed":
                                yield {
                                    "text": result.get("DisplayText", ""),
                                    # NOTE(review): the detailed response nests
                                    # confidence under NBest; a top-level
                                    # "Confidence" key may always be absent —
                                    # confirm against the API response shape.
                                    "confidence": result.get("Confidence", 0.0),
                                    # BUG FIX: previously this reported the
                                    # RecognitionStatus value under "language".
                                    "language": language,
                                    "format": format,
                                    "is_final": result.get("RecognitionStatus") == "Success",
                                    "raw_result": result
                                }
                            else:
                                yield {
                                    "text": result.get("DisplayText", ""),
                                    "confidence": 1.0,
                                    "language": language,
                                    "format": format,
                                    "is_final": result.get("RecognitionStatus") == "Success"
                                }
                        else:
                            error_text = await response.text()
                            logger.error(f"Streaming STT API failed: {response.status} - {error_text}")
                            yield {
                                "error": f"API error {response.status}: {error_text}",
                                "is_final": True
                            }

                except Exception as e:
                    logger.error(f"Error processing audio chunk: {str(e)}")
                    yield {
                        "error": str(e),
                        "is_final": True
                    }

    except Exception as e:
        logger.error(f"Streaming speech-to-text failed: {str(e)}")
        yield {
            "error": str(e),
            "is_final": True
        }
|
|
||||||
|
|
||||||
async def stream_text_to_speech(self, text_stream: AsyncGenerator[str, None], language: str = "de-DE", voice: str = "de-DE-KatjaNeural", format: str = "audio-16khz-128kbitrate-mono-mp3") -> AsyncGenerator[bytes, None]:
    """
    Stream text to speech using Azure Speech Services.

    Each text chunk is wrapped in an SSML document and synthesized with a
    separate POST; audio bytes are yielded per chunk.  A failed chunk is
    logged and skipped so one bad chunk does not abort the whole stream.

    Args:
        text_stream: Async generator yielding text chunks
        language: Language code
        voice: Voice name
        format: Audio format (must be in ``self.supported_tts_formats``)

    Yields:
        Audio data chunks as bytes
    """
    # Local import: only needed for SSML escaping in this method.
    from xml.sax.saxutils import escape

    try:
        # Check rate limit
        if not self._check_rate_limit("tts"):
            raise Exception("Rate limit exceeded for text-to-speech service. Please wait before making more requests.")

        # Validate format
        if format not in self.supported_tts_formats:
            raise Exception(f"Unsupported TTS format: {format}. Supported formats: {', '.join(self.supported_tts_formats)}")

        url = f"https://{self.region}.tts.speech.microsoft.com/cognitiveservices/v1"

        headers = {
            "Content-Type": "application/ssml+xml",
            "X-Microsoft-OutputFormat": format,
            "Ocp-Apim-Subscription-Key": self.subscription_key,
            "User-Agent": "PowerOn-Voice-Services/1.0"
        }

        # Process text stream in chunks over a single HTTP session.
        async with aiohttp.ClientSession(timeout=self.timeout) as session:
            async for text_chunk in text_stream:
                try:
                    if not text_chunk.strip():
                        continue

                    # BUG FIX: the previous chained .replace() calls replaced
                    # each markup character with itself (a no-op), so raw
                    # & < > " ' leaked into the SSML document and produced
                    # invalid XML.  escape() emits proper entities, with the
                    # two quote characters added explicitly.
                    escaped_text = escape(text_chunk, {'"': "&quot;", "'": "&apos;"})
                    ssml = f"""<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='{language}'>
<voice name='{voice}'>
{escaped_text}
</voice>
</speak>"""

                    # Make API call for this chunk
                    async with session.post(
                        url,
                        headers=headers,
                        data=ssml.encode('utf-8')
                    ) as response:
                        if response.status == 200:
                            audio_data = await response.read()
                            if audio_data:
                                yield audio_data
                        else:
                            error_text = await response.text()
                            logger.error(f"Streaming TTS API failed: {response.status} - {error_text}")

                except Exception as e:
                    logger.error(f"Error processing text chunk: {str(e)}")

    except Exception as e:
        logger.error(f"Streaming text-to-speech failed: {str(e)}")
|
|
||||||
|
|
||||||
async def stream_realtime_interpreter(self, audio_stream: AsyncGenerator[bytes, None], from_language: str = "de-DE", to_language: str = "en-US") -> AsyncGenerator[Dict, None]:
    """
    Stream real-time interpreter: speech to translated text.

    Pipes the audio stream through streaming STT, then translates each
    recognized snippet when source and target languages differ.  STT error
    dicts are passed through unchanged; translation failures fall back to
    the untranslated text.

    Args:
        audio_stream: Async generator yielding audio chunks
        from_language: Source language code
        to_language: Target language code

    Yields:
        Dict with translation results
    """
    try:
        # Both downstream services must have budget before we start.
        if not self._check_rate_limit("stt") or not self._check_rate_limit("translation"):
            raise Exception("Rate limit exceeded for interpreter service. Please wait before making more requests.")

        async for recognition in self.stream_speech_to_text(audio_stream, from_language):
            # Forward STT errors verbatim and keep consuming the stream.
            if "error" in recognition:
                yield recognition
                continue

            source_text = recognition.get("text", "")
            target_text = source_text

            # Only call the translator for non-empty text crossing languages.
            if from_language != to_language and source_text.strip():
                try:
                    target_text = await self.translate_text(
                        text=source_text,
                        from_language=from_language,
                        to_language=to_language
                    )
                except Exception as translation_error:
                    logger.warning(f"Translation failed: {str(translation_error)}")
                    target_text = source_text

            yield {
                "original_text": source_text,
                "translated_text": target_text,
                "from_language": from_language,
                "to_language": to_language,
                "confidence": recognition.get("confidence", 0.0),
                "is_final": recognition.get("is_final", False)
            }

    except Exception as e:
        logger.error(f"Streaming realtime interpreter failed: {str(e)}")
        yield {
            "error": str(e),
            "is_final": True
        }
|
|
||||||
285
modules/connectors/connectorGoogleSpeech.py
Normal file
285
modules/connectors/connectorGoogleSpeech.py
Normal file
|
|
@ -0,0 +1,285 @@
|
||||||
|
"""
|
||||||
|
Google Cloud Speech-to-Text and Translation Connector
|
||||||
|
Replaces Azure Speech Services with Google Cloud APIs
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import io
|
||||||
|
import logging
|
||||||
|
import asyncio
|
||||||
|
from typing import Dict, Optional, Any
|
||||||
|
from google.cloud import speech
|
||||||
|
from google.cloud import translate_v2 as translate
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
class ConnectorGoogleSpeech:
|
||||||
|
"""
|
||||||
|
Google Cloud Speech-to-Text and Translation connector.
|
||||||
|
Handles audio processing, speech recognition, and translation.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, credentials_path: Optional[str] = None):
    """
    Initialize Google Cloud Speech and Translation clients.

    Args:
        credentials_path: Path to Google Cloud service account JSON file.
            When provided it is exported via GOOGLE_APPLICATION_CREDENTIALS
            so both client libraries pick it up; otherwise the libraries
            fall back to whatever ambient credentials are configured.

    Raises:
        Exception: any client construction failure is logged and re-raised
            so callers never hold a half-initialized connector.
    """
    try:
        # Set up authentication
        if credentials_path:
            # NOTE: mutates process-wide environment; affects every Google
            # client created afterwards in this process.
            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path

        # Initialize clients
        self.speech_client = speech.SpeechClient()
        self.translate_client = translate.Client()

        logger.info("✅ Google Cloud Speech and Translation clients initialized successfully")

    except Exception as e:
        logger.error(f"❌ Failed to initialize Google Cloud clients: {e}")
        raise
|
||||||
|
|
||||||
|
async def speech_to_text(self, audio_content: bytes, language: str = "de-DE",
                         sample_rate: int = 16000, channels: int = 1) -> Dict:
    """
    Convert speech to text using Google Cloud Speech-to-Text API.

    Args:
        audio_content: Raw audio data (PCM format)
        language: Language code (e.g., 'de-DE', 'en-US')
        sample_rate: Audio sample rate (default: 16000 Hz)
        channels: Number of audio channels (default: 1)

    Returns:
        Dict containing transcribed text, confidence, and metadata.  On
        failure the dict carries ``success=False`` and an ``error`` string
        instead of raising.
    """
    try:
        logger.info(f"🎤 Processing audio with Google Cloud Speech-to-Text")
        logger.info(f"📊 Audio: {len(audio_content)} bytes, {sample_rate}Hz, {channels}ch")

        # Configure audio settings
        audio = speech.RecognitionAudio(content=audio_content)
        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=sample_rate,
            audio_channel_count=channels,
            language_code=language,
            enable_automatic_punctuation=True,
            model="latest_long"  # Use the latest model
        )

        # Perform speech recognition
        logger.info("🔄 Sending audio to Google Cloud Speech-to-Text...")
        # BUG FIX: recognize() is a blocking RPC; calling it directly inside
        # this coroutine stalls the entire event loop for the duration of
        # the request.  Run it on a worker thread instead.
        response = await asyncio.to_thread(
            self.speech_client.recognize, config=config, audio=audio
        )

        # Process results — only the first result/alternative is used.
        # NOTE(review): long audio may yield multiple results; confirm
        # whether callers expect them concatenated.
        if response.results:
            result = response.results[0]
            if result.alternatives:
                alternative = result.alternatives[0]
                transcribed_text = alternative.transcript
                confidence = alternative.confidence

                logger.info(f"✅ Transcription successful: '{transcribed_text}' (confidence: {confidence:.2f})")

                return {
                    "success": True,
                    "text": transcribed_text,
                    "confidence": confidence,
                    "language": language,
                    "raw_result": {
                        "transcript": transcribed_text,
                        "confidence": confidence,
                        "language_code": language
                    }
                }
            else:
                logger.warning("⚠️ No transcription alternatives found")
                return {
                    "success": False,
                    "text": "",
                    "confidence": 0.0,
                    "error": "No transcription alternatives found"
                }
        else:
            logger.warning("⚠️ No recognition results from Google Cloud")
            return {
                "success": False,
                "text": "",
                "confidence": 0.0,
                "error": "No recognition results"
            }

    except Exception as e:
        logger.error(f"❌ Google Cloud Speech-to-Text error: {e}")
        return {
            "success": False,
            "text": "",
            "confidence": 0.0,
            "error": str(e)
        }
|
||||||
|
|
||||||
|
async def translate_text(self, text: str, target_language: str = "en",
                         source_language: str = "de") -> Dict:
    """Translate *text* with the Google Cloud Translation API.

    Args:
        text: Text to translate.
        target_language: Target language code (e.g. 'en', 'de').
        source_language: Source language code (e.g. 'de', 'en').

    Returns:
        Dict with "success" plus either the translated text and language
        metadata or an "error" message.
    """
    try:
        # Guard: nothing to translate.
        if not text.strip():
            logger.warning("⚠️ Empty text provided for translation")
            return {
                "success": False,
                "translated_text": "",
                "error": "Empty text provided",
            }

        logger.info(f"🌐 Translating: '{text}' ({source_language} -> {target_language})")

        api_result = self.translate_client.translate(
            text,
            source_language=source_language,
            target_language=target_language,
        )

        translation = api_result['translatedText']
        # The API may report the language it actually detected; fall back
        # to the caller-supplied source language otherwise.
        detected = api_result.get('detectedSourceLanguage', source_language)

        logger.info(f"✅ Translation successful: '{translation}'")

        return {
            "success": True,
            "translated_text": translation,
            "source_language": detected,
            "target_language": target_language,
            "original_text": text,
        }

    except Exception as e:
        logger.error(f"❌ Google Cloud Translation error: {e}")
        return {
            "success": False,
            "translated_text": "",
            "error": str(e),
        }
|
async def speech_to_translated_text(self, audio_content: bytes,
                                    from_language: str = "de-DE",
                                    to_language: str = "en") -> Dict:
    """Run the full pipeline: speech recognition followed by translation.

    Args:
        audio_content: Raw audio data.
        from_language: Source language for speech recognition (BCP-47, e.g. 'de-DE').
        to_language: Target language for translation.

    Returns:
        Dict with the recognized text, its translation and metadata, or an
        "error" entry describing which stage failed.
    """
    try:
        logger.info(f"🔄 Starting speech-to-translation pipeline: {from_language} -> {to_language}")

        # Stage 1: recognize speech.
        recognition = await self.speech_to_text(
            audio_content=audio_content,
            language=from_language,
        )
        if not recognition["success"]:
            return {
                "success": False,
                "original_text": "",
                "translated_text": "",
                "error": f"Speech recognition failed: {recognition.get('error', 'Unknown error')}",
            }

        recognized_text = recognition["text"]

        # Stage 2: translate. The translation API expects bare ISO codes,
        # so strip any region suffix ('de-DE' -> 'de', 'en-US' -> 'en').
        translation = await self.translate_text(
            text=recognized_text,
            source_language=from_language.split('-')[0],
            target_language=to_language.split('-')[0],
        )
        if not translation["success"]:
            return {
                "success": False,
                "original_text": recognized_text,
                "translated_text": "",
                "error": f"Translation failed: {translation.get('error', 'Unknown error')}",
            }

        final_text = translation["translated_text"]

        logger.info(f"✅ Complete pipeline successful:")
        logger.info(f"   Original: '{recognized_text}'")
        logger.info(f"   Translated: '{final_text}'")

        return {
            "success": True,
            "original_text": recognized_text,
            "translated_text": final_text,
            "confidence": recognition["confidence"],
            "source_language": from_language,
            "target_language": to_language,
        }

    except Exception as e:
        logger.error(f"❌ Speech-to-translation pipeline error: {e}")
        return {
            "success": False,
            "original_text": "",
            "translated_text": "",
            "error": str(e),
        }
|
def validate_audio_format(self, audio_content: bytes) -> Dict:
    """Run lightweight sanity checks on raw audio before recognition.

    Only structural checks are performed (minimum size and 16-bit sample
    alignment); Google Cloud Speech-to-Text itself accepts many formats.

    Args:
        audio_content: Raw audio data.

    Returns:
        Dict with "valid" plus either format metadata or an "error" string.
    """
    try:
        size = len(audio_content)

        # Reject payloads too small to contain meaningful audio.
        if size < 100:
            return {
                "valid": False,
                "error": "Audio too short (less than 100 bytes)",
            }

        # 16-bit PCM samples occupy two bytes each, so a valid buffer
        # always has an even length.
        if size % 2 != 0:
            return {
                "valid": False,
                "error": "Audio data length is odd (not 16-bit PCM)",
            }

        return {
            "valid": True,
            "format": "pcm",
            "size": size,
            # Rough duration assuming 16 kHz mono, 16-bit samples.
            "estimated_duration": size / (16000 * 2),
        }

    except Exception as e:
        return {
            "valid": False,
            "error": f"Validation error: {e}",
        }
|
@ -1,813 +0,0 @@
|
||||||
"""
|
|
||||||
Azure Voice Services Route
|
|
||||||
Provides endpoints for Azure Speech Services integration including:
|
|
||||||
- Speech-to-Text (STT)
|
|
||||||
- Text-to-Speech (TTS)
|
|
||||||
- Translation services
|
|
||||||
- Real-time conversation
|
|
||||||
"""
|
|
||||||
|
|
||||||
import logging
|
|
||||||
import asyncio
|
|
||||||
import json
|
|
||||||
from typing import Dict, Any, Optional
|
|
||||||
from fastapi import APIRouter, HTTPException, Depends, UploadFile, File, Form
|
|
||||||
from fastapi.responses import StreamingResponse
|
|
||||||
from pydantic import BaseModel
|
|
||||||
import io
|
|
||||||
import base64
|
|
||||||
import asyncio
|
|
||||||
from typing import AsyncGenerator
|
|
||||||
from datetime import datetime
|
|
||||||
|
|
||||||
from modules.interfaces.interfaceAppObjects import getRootInterface
|
|
||||||
from modules.interfaces.interfaceAppModel import UserInDB
|
|
||||||
from modules.security.auth import getCurrentUser
|
|
||||||
from modules.connectors.connectorAzureSpeech import ConnectorAzureSpeech
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
# Router exposing all voice endpoints under /api/voice; included by the app entry point.
router = APIRouter(prefix="/api/voice", tags=["voice"])
|
|
||||||
|
|
||||||
# Pydantic models for request/response
|
|
||||||
class SpeechToTextRequest(BaseModel):
    """Request body for speech recognition: language and result verbosity."""
    language: str = "de-DE"
    format: str = "detailed"  # "simple" or "detailed"
|
||||||
|
|
||||||
class TextToSpeechRequest(BaseModel):
    """Request body for speech synthesis."""
    text: str
    language: str = "de-DE"
    # Azure neural voice identifier.
    voice: str = "de-DE-KatjaNeural"
    # Azure output format identifier.
    format: str = "audio-16khz-128kbitrate-mono-mp3"
|
||||||
|
|
||||||
class TranslationRequest(BaseModel):
    """Request body for text translation between two languages."""
    text: str
    from_language: str = "de-DE"
    to_language: str = "en-US"
|
||||||
|
|
||||||
class ConversationRequest(BaseModel):
    """Request body for a conversational turn answered with synthesized speech."""
    message: str
    language: str = "de-DE"
    response_voice: str = "de-DE-KatjaNeural"
|
||||||
|
|
||||||
class VoiceSettingsRequest(BaseModel):
    """Per-user voice preferences: STT/TTS languages, voice, translation target."""
    stt_language: str = "de-DE"
    tts_language: str = "de-DE"
    tts_voice: str = "de-DE-KatjaNeural"
    translation_enabled: bool = True
    target_language: str = "en-US"
|
||||||
|
|
||||||
# Get Azure Speech connector for current user
|
|
||||||
async def get_azure_speech_connector(current_user: UserInDB = Depends(getCurrentUser), connection_id: Optional[str] = None) -> ConnectorAzureSpeech:
    """Get Azure Speech connector for the current user.

    Resolves the user's Microsoft ("msft") connection — a specific one when
    *connection_id* is given, otherwise the first — verifies a connection
    token exists, then builds the connector from the subscription key and
    region configured in config.ini.

    Raises:
        HTTPException: 400 when no usable connection exists, 500 when the
            subscription key is missing or initialization fails.
    """
    try:
        root_interface = getRootInterface()

        # Locate the user's Microsoft connection.
        user_connections = root_interface.getUserConnections(current_user.id)
        azure_connection = None
        for connection in user_connections:
            if connection.authority != "msft":
                continue
            # Either match the requested id, or take the first msft connection.
            if connection_id is None or connection.id == connection_id:
                azure_connection = connection
                break

        if not azure_connection:
            if connection_id:
                raise HTTPException(
                    status_code=400,
                    detail=f"Azure connection with ID '{connection_id}' not found."
                )
            raise HTTPException(
                status_code=400,
                detail="No Azure connection found. Please connect your Microsoft account first."
            )

        # A token must exist even though Azure Speech itself authenticates
        # with a subscription key — it proves the account link is healthy.
        connection_token = root_interface.getConnectionToken(azure_connection.id)
        if not connection_token:
            raise HTTPException(
                status_code=400,
                detail="Azure connection token not found. Please reconnect your Microsoft account."
            )

        # Azure Speech Services needs a subscription key, not the OAuth
        # access token; it comes from application configuration.
        from modules.shared.configuration import APP_CONFIG
        subscription_key = APP_CONFIG.get("Connector_AzureSpeech_SUBSCRIPTION_KEY")

        # Validate BEFORE touching the value: the old code sliced the key
        # for a debug log first, which raised TypeError when it was None.
        if not subscription_key or subscription_key == "your-azure-speech-subscription-key-here":
            raise HTTPException(
                status_code=500,
                detail="Azure Speech Services subscription key not configured. Please set Connector_AzureSpeech_SUBSCRIPTION_KEY in config.ini"
            )

        # Never log key material; the length is enough for diagnostics.
        logger.debug(f"Azure Speech subscription key loaded (length: {len(subscription_key)})")

        region = APP_CONFIG.get("Connector_AzureSpeech_REGION", "westeurope")

        return ConnectorAzureSpeech(subscription_key=subscription_key, region=region)

    except HTTPException:
        # Preserve intentional client-facing errors instead of rewrapping
        # them as a generic 500 below (the old broad except did exactly that).
        raise
    except Exception as e:
        logger.error(f"Error getting Azure Speech connector: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Failed to initialize Azure Speech connector: {str(e)}")
|
|
||||||
|
|
||||||
@router.get("/connections")
async def get_user_connections(current_user: UserInDB = Depends(getCurrentUser)):
    """Get all Microsoft connections for the current user."""
    try:
        root_interface = getRootInterface()
        user_connections = root_interface.getUserConnections(current_user.id)

        def _has_speech_key(connection) -> bool:
            # A connection advertises speech support through its metadata,
            # either an explicit key or a service-type marker. Metadata may
            # be a JSON string or an already-parsed dict.
            if not (hasattr(connection, 'metadata') and connection.metadata):
                return False
            try:
                meta = json.loads(connection.metadata) if isinstance(connection.metadata, str) else connection.metadata
                return bool(meta.get('speech_subscription_key') or meta.get('service_type') == 'speech_services')
            except (json.JSONDecodeError, AttributeError):
                return False

        msft_connections = [
            {
                "id": connection.id,
                "externalUsername": connection.externalUsername,
                "authority": connection.authority,
                "isActive": connection.status == "active",
                "hasSpeechKey": _has_speech_key(connection),
                "connectedAt": connection.connectedAt,
                "lastChecked": connection.lastChecked,
            }
            for connection in user_connections
            if connection.authority == "msft"
        ]

        return {
            "success": True,
            "connections": msft_connections,
            "count": len(msft_connections),
        }

    except Exception as e:
        logger.error(f"Error getting user connections: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
|
||||||
|
|
||||||
class SpeechSubscriptionRequest(BaseModel):
    """Request body for registering an Azure Speech subscription key."""
    subscription_key: str
    # Azure region hosting the Speech resource.
    region: str = "westeurope"
    # Optional: target a specific Microsoft connection instead of the first one.
    connection_id: Optional[str] = None
|
||||||
|
|
||||||
@router.post("/subscription")
async def set_speech_subscription(
    request: SpeechSubscriptionRequest,
    current_user: UserInDB = Depends(getCurrentUser)
):
    """Set Azure Speech Services subscription key for the user.

    Validates the key format and region, verifies the key with a minimal
    live TTS request, then merges it into the user's Microsoft connection
    metadata.

    NOTE(review): the updated metadata is built but never written back to
    storage (see the comment before the return) — the endpoint reports
    success without persisting. Confirm whether persistence exists elsewhere.

    Raises:
        HTTPException: 400 for invalid key/region or missing connection,
            500 for unexpected failures.
    """
    try:
        root_interface = getRootInterface()

        # Validate subscription key format (basic validation)
        if not request.subscription_key or len(request.subscription_key) < 32:
            raise HTTPException(
                status_code=400,
                detail="Invalid subscription key format. Please provide a valid Azure Speech Services subscription key."
            )

        # Validate region against the known Azure regions.
        valid_regions = ["westeurope", "eastus", "westus2", "eastus2", "southeastasia", "westcentralus", "eastasia", "northeurope", "southcentralus", "centralus", "australiaeast", "brazilsouth", "canadacentral", "centralindia", "francecentral", "germanywestcentral", "japaneast", "koreacentral", "norwayeast", "southafricanorth", "switzerlandnorth", "uaenorth", "uksouth", "westus3"]
        if request.region not in valid_regions:
            raise HTTPException(
                status_code=400,
                detail=f"Invalid region. Supported regions: {', '.join(valid_regions)}"
            )

        # Test the subscription key by creating a temporary connector and
        # running a minimal TTS request against the live service.
        try:
            test_connector = ConnectorAzureSpeech(subscription_key=request.subscription_key, region=request.region)
            # Test with a simple TTS request
            test_audio = await test_connector.text_to_speech("Test", "en-US", "en-US-AriaNeural")
            if len(test_audio) == 0:
                raise Exception("Subscription key test failed")
        except Exception as e:
            raise HTTPException(
                status_code=400,
                detail=f"Invalid subscription key or region. Test failed: {str(e)}"
            )

        # Get user connections
        user_connections = root_interface.getUserConnections(current_user.id)

        # Find the target connection: a specific one when connection_id is
        # provided, otherwise the user's first Microsoft connection.
        target_connection = None
        if request.connection_id:
            # Use specific connection
            for connection in user_connections:
                if connection.id == request.connection_id and connection.authority == "msft":
                    target_connection = connection
                    break
            if not target_connection:
                raise HTTPException(
                    status_code=400,
                    detail=f"Connection with ID '{request.connection_id}' not found."
                )
        else:
            # Use first Microsoft connection or create a new one
            target_connection = None
            for connection in user_connections:
                if connection.authority == "msft":
                    target_connection = connection
                    break

            if not target_connection:
                # Create a new connection for speech services
                # This would require implementing connection creation in the interface
                raise HTTPException(
                    status_code=400,
                    detail="No Microsoft connection found. Please connect your Microsoft account first."
                )

        # Update connection metadata with speech subscription key.
        # Metadata may be stored as a JSON string or an already-parsed dict.
        metadata = {}
        if hasattr(target_connection, 'metadata') and target_connection.metadata:
            try:
                metadata = json.loads(target_connection.metadata) if isinstance(target_connection.metadata, str) else target_connection.metadata
            except (json.JSONDecodeError, AttributeError):
                metadata = {}

        # Add speech services information
        metadata['speech_subscription_key'] = request.subscription_key
        metadata['speech_region'] = request.region
        metadata['speech_configured_at'] = datetime.now().isoformat()

        # Update the connection (this would require implementing update in the interface)
        # For now, we'll just return success - the actual update would need to be implemented
        # in the database interface

        return {
            "success": True,
            "message": "Azure Speech Services subscription key configured successfully",
            "connection_id": target_connection.id,
            "region": request.region,
            "configured_at": metadata['speech_configured_at']
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error setting speech subscription: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
|
|
||||||
|
|
||||||
@router.get("/subscription")
async def get_speech_subscription(
    current_user: UserInDB = Depends(getCurrentUser)
):
    """Get Azure Speech Services subscription information for the user."""
    try:
        # The key and region come from application configuration, not from
        # the user's stored connections.
        from modules.shared.configuration import APP_CONFIG
        subscription_key = APP_CONFIG.get("Connector_AzureSpeech_SUBSCRIPTION_KEY")
        region = APP_CONFIG.get("Connector_AzureSpeech_REGION", "westeurope")

        if not subscription_key or subscription_key == "your-azure-speech-subscription-key-here":
            return {
                "success": True,
                "has_subscription": False,
                "message": "Azure Speech Services subscription key not configured. Please set Connector_AzureSpeech_SUBSCRIPTION_KEY in config.ini"
            }

        # Expose only a masked fragment of the key.
        masked_key = subscription_key[:8] + "..." + subscription_key[-8:]

        # Probe the service; a failed probe is reported, not raised.
        try:
            connector = ConnectorAzureSpeech(subscription_key=subscription_key, region=region)
            probe_result = await connector.test_connection()
            subscription_info = {
                "subscription_key": masked_key,
                "region": region,
                "connection_test": probe_result,
                "source": "config.ini"
            }
        except Exception as test_error:
            subscription_info = {
                "subscription_key": masked_key,
                "region": region,
                "connection_test": False,
                "error": str(test_error),
                "source": "config.ini"
            }

        return {
            "success": True,
            "has_subscription": True,
            "subscription": subscription_info
        }

    except Exception as e:
        logger.error(f"Error getting speech subscription: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
|
||||||
|
|
||||||
@router.get("/settings")
async def get_voice_settings(
    current_user: UserInDB = Depends(getCurrentUser),
    connection_id: Optional[str] = None
):
    """Get available voice settings and languages.

    Returns the voices/languages offered by Azure Speech together with the
    user's stored preferences (or the defaults when none are stored yet).

    Raises:
        HTTPException: 500 on any failure (connector, interface, lookup).
    """
    try:
        connector = await get_azure_speech_connector(current_user, connection_id)

        # Catalog data from Azure Speech.
        voices = await connector.get_available_voices()
        languages = await connector.get_available_languages()

        # The user's saved settings from the database.
        from modules.interfaces.interfaceComponentObjects import getInterface
        component_interface = getInterface(current_user)
        user_settings = component_interface.getVoiceSettings(current_user.id)

        # Single source of truth for defaults (previously duplicated).
        default_settings = {
            "sttLanguage": "de-DE",
            "ttsLanguage": "de-DE",
            "ttsVoice": "de-DE-KatjaNeural",
            "translationEnabled": True,
            "targetLanguage": "en-US"
        }

        # If no settings exist, fall back to the defaults.
        if not user_settings:
            user_settings = dict(default_settings)

        def _setting(name: str):
            # Stored settings may be an ORM-style object (attribute access)
            # or a plain dict (key access); support both uniformly instead
            # of repeating the hasattr/get dance per field.
            if hasattr(user_settings, name):
                return getattr(user_settings, name)
            return user_settings.get(name)

        return {
            "voices": voices,
            "languages": languages,
            "user_settings": {
                "sttLanguage": _setting("sttLanguage"),
                "ttsLanguage": _setting("ttsLanguage"),
                "ttsVoice": _setting("ttsVoice"),
                "translationEnabled": _setting("translationEnabled"),
                "targetLanguage": _setting("targetLanguage")
            },
            "default_settings": default_settings
        }

    except Exception as e:
        logger.error(f"Error getting voice settings: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
|
||||||
|
|
||||||
@router.post("/settings")
async def save_voice_settings(
    settings: dict,
    current_user: UserInDB = Depends(getCurrentUser)
):
    """Save voice settings for the current user."""
    try:
        from modules.interfaces.interfaceComponentObjects import getInterface
        component_interface = getInterface(current_user)

        # Update when settings already exist, otherwise create them.
        if component_interface.getVoiceSettings(current_user.id):
            stored = component_interface.updateVoiceSettings(
                current_user.id,
                settings
            )
            return {
                "success": True,
                "message": "Voice settings saved successfully",
                "settings": stored
            }

        # First save: stamp ownership before persisting.
        settings["userId"] = current_user.id
        settings["mandateId"] = current_user.mandateId
        stored = component_interface.createVoiceSettings(settings)
        return {
            "success": True,
            "message": "Voice settings created successfully",
            "settings": stored
        }

    except Exception as e:
        logger.error(f"Error saving voice settings: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
|
||||||
|
|
||||||
@router.post("/speech-to-text")
async def speech_to_text(
    audio_file: UploadFile = File(...),
    language: str = Form("de-DE"),
    format: str = Form("detailed"),
    connection_id: str = Form(None),
    current_user: UserInDB = Depends(getCurrentUser)
):
    """Convert speech to text using Azure Speech Services."""
    try:
        connector = await get_azure_speech_connector(current_user, connection_id)

        # The whole upload is buffered in memory before recognition.
        audio_content = await audio_file.read()

        recognition = await connector.speech_to_text(
            audio_content=audio_content,
            language=language,
            format=format
        )

        return {
            "success": True,
            "text": recognition.get("text", ""),
            "confidence": recognition.get("confidence", 0.0),
            "language": recognition.get("language", language),
            "format": format
        }

    except Exception as e:
        logger.error(f"Error in speech-to-text: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
|
||||||
|
|
||||||
@router.post("/text-to-speech")
async def text_to_speech(
    request: TextToSpeechRequest,
    current_user: UserInDB = Depends(getCurrentUser),
    connection_id: Optional[str] = None
):
    """Convert text to speech using Azure Speech Services."""
    try:
        connector = await get_azure_speech_connector(current_user, connection_id)

        synthesized = await connector.text_to_speech(
            text=request.text,
            language=request.language,
            voice=request.voice,
            format=request.format
        )

        # Audio travels to the client as base64 inside the JSON body.
        encoded_audio = base64.b64encode(synthesized).decode('utf-8')

        return {
            "success": True,
            "audio_data": encoded_audio,
            "format": request.format,
            "voice": request.voice,
            "language": request.language
        }

    except Exception as e:
        logger.error(f"Error in text-to-speech: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
|
||||||
|
|
||||||
@router.post("/translate")
async def translate_text(
    request: TranslationRequest,
    current_user: UserInDB = Depends(getCurrentUser),
    connection_id: Optional[str] = None
):
    """Translate text using Azure Translator."""
    try:
        connector = await get_azure_speech_connector(current_user, connection_id)

        translation = await connector.translate_text(
            text=request.text,
            from_language=request.from_language,
            to_language=request.to_language
        )

        # Echo the original alongside the translation for the client.
        return {
            "success": True,
            "original_text": request.text,
            "translated_text": translation,
            "from_language": request.from_language,
            "to_language": request.to_language
        }

    except Exception as e:
        logger.error(f"Error in translation: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
|
||||||
|
|
||||||
@router.post("/conversation")
async def conversation(
    request: ConversationRequest,
    current_user: UserInDB = Depends(getCurrentUser),
    connection_id: Optional[str] = None
):
    """Handle conversation with voice response."""
    try:
        connector = await get_azure_speech_connector(current_user, connection_id)

        # Produce the assistant's text reply (rule-based placeholder — in
        # production this would call an AI service such as Azure OpenAI).
        reply = await _generate_ai_response(request.message, request.language)

        # Synthesize the reply in the requested voice.
        spoken_reply = await connector.text_to_speech(
            text=reply,
            language=request.language,
            voice=request.response_voice,
            format="audio-16khz-128kbitrate-mono-mp3"
        )

        # Deliver both the text and the base64-encoded audio.
        return {
            "success": True,
            "response_text": reply,
            "audio_data": base64.b64encode(spoken_reply).decode('utf-8'),
            "voice": request.response_voice,
            "language": request.language
        }

    except Exception as e:
        logger.error(f"Error in conversation: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
|
||||||
|
|
||||||
async def _generate_ai_response(message: str, language: str) -> str:
|
|
||||||
"""Generate AI response for conversation."""
|
|
||||||
try:
|
|
||||||
# Simple rule-based responses for demonstration
|
|
||||||
# In production, this would call an AI service like OpenAI or Azure OpenAI
|
|
||||||
|
|
||||||
message_lower = message.lower()
|
|
||||||
|
|
||||||
# German responses
|
|
||||||
if language.startswith("de"):
|
|
||||||
if any(word in message_lower for word in ["hallo", "hi", "hey"]):
|
|
||||||
return "Hallo! Wie kann ich Ihnen heute helfen?"
|
|
||||||
elif any(word in message_lower for word in ["wie geht", "wie gehts"]):
|
|
||||||
return "Mir geht es gut, danke der Nachfrage! Wie geht es Ihnen?"
|
|
||||||
elif any(word in message_lower for word in ["danke", "dankeschön"]):
|
|
||||||
return "Gern geschehen! Gibt es noch etwas, womit ich helfen kann?"
|
|
||||||
elif any(word in message_lower for word in ["zeit", "uhr"]):
|
|
||||||
from datetime import datetime
|
|
||||||
current_time = datetime.now().strftime("%H:%M")
|
|
||||||
return f"Es ist jetzt {current_time} Uhr."
|
|
||||||
elif any(word in message_lower for word in ["wetter", "temperatur"]):
|
|
||||||
return "Ich kann leider keine aktuellen Wetterdaten abrufen. Bitte schauen Sie in eine Wetter-App."
|
|
||||||
else:
|
|
||||||
return f"Das ist interessant: '{message}'. Können Sie mir mehr darüber erzählen?"
|
|
||||||
|
|
||||||
# English responses
|
|
||||||
elif language.startswith("en"):
|
|
||||||
if any(word in message_lower for word in ["hello", "hi", "hey"]):
|
|
||||||
return "Hello! How can I help you today?"
|
|
||||||
elif any(word in message_lower for word in ["how are you", "how's it going"]):
|
|
||||||
return "I'm doing well, thank you for asking! How are you?"
|
|
||||||
elif any(word in message_lower for word in ["thank you", "thanks"]):
|
|
||||||
return "You're welcome! Is there anything else I can help you with?"
|
|
||||||
elif any(word in message_lower for word in ["time", "clock"]):
|
|
||||||
from datetime import datetime
|
|
||||||
current_time = datetime.now().strftime("%H:%M")
|
|
||||||
return f"It's currently {current_time}."
|
|
||||||
elif any(word in message_lower for word in ["weather", "temperature"]):
|
|
||||||
return "I'm sorry, I can't retrieve current weather data. Please check a weather app."
|
|
||||||
else:
|
|
||||||
return f"That's interesting: '{message}'. Can you tell me more about that?"
|
|
||||||
|
|
||||||
# Default response
|
|
||||||
else:
|
|
||||||
return f"I heard: '{message}'. How can I help you with that?"
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Error generating AI response: {str(e)}")
|
|
||||||
return "I'm sorry, I didn't understand that. Could you please repeat?"
|
|
||||||
|
|
||||||
@router.post("/realtime-interpreter")
async def realtime_interpreter(
    audio_file: UploadFile = File(...),
    from_language: str = Form("de-DE"),
    to_language: str = Form("en-US"),
    connection_id: str = Form(None),
    current_user: UserInDB = Depends(getCurrentUser)
):
    """Real-time interpreter: speech to translated text.

    Recognizes the uploaded audio in *from_language*, then translates the
    transcript into *to_language* when the two differ. A translation
    failure degrades gracefully to the untranslated transcript.
    """
    try:
        logger.info(f"Realtime interpreter called with from_language='{from_language}', to_language='{to_language}'")

        connector = await get_azure_speech_connector(current_user, connection_id)

        # Read audio file (the whole upload is buffered in memory).
        audio_content = await audio_file.read()

        # Convert speech to text
        stt_result = await connector.speech_to_text(
            audio_content=audio_content,
            language=from_language,
            format="detailed"
        )

        original_text = stt_result.get("text", "")

        # Translate text if different languages
        translated_text = original_text
        if from_language != to_language:
            try:
                translated_text = await connector.translate_text(
                    text=original_text,
                    from_language=from_language,
                    to_language=to_language
                )
            except Exception as e:
                # Best-effort: fall back to the untranslated transcript.
                logger.warning(f"Translation failed, using original text: {str(e)}")
                translated_text = original_text

        return {
            "success": True,
            "original_text": original_text,
            "translated_text": translated_text,
            "from_language": from_language,
            "to_language": to_language,
            "confidence": stt_result.get("confidence", 0.0)
        }

    except Exception as e:
        logger.error(f"Error in realtime interpreter: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
|
||||||
|
|
||||||
@router.post("/stream/speech-to-text")
|
|
||||||
async def stream_speech_to_text(
|
|
||||||
audio_file: UploadFile = File(...),
|
|
||||||
language: str = Form("de-DE"),
|
|
||||||
format: str = Form("detailed"),
|
|
||||||
audio_format: str = Form("wav"),
|
|
||||||
connection_id: str = Form(None),
|
|
||||||
current_user: UserInDB = Depends(getCurrentUser)
|
|
||||||
):
|
|
||||||
"""Stream speech to text using Azure Speech Services."""
|
|
||||||
try:
|
|
||||||
connector = await get_azure_speech_connector(current_user, connection_id)
|
|
||||||
|
|
||||||
async def audio_chunk_generator():
|
|
||||||
"""Generate audio chunks from uploaded file."""
|
|
||||||
chunk_size = 4096 # 4KB chunks
|
|
||||||
while True:
|
|
||||||
chunk = await audio_file.read(chunk_size)
|
|
||||||
if not chunk:
|
|
||||||
break
|
|
||||||
yield chunk
|
|
||||||
|
|
||||||
async def response_generator():
|
|
||||||
"""Generate streaming responses."""
|
|
||||||
try:
|
|
||||||
async for result in connector.stream_speech_to_text(
|
|
||||||
audio_chunk_generator(),
|
|
||||||
language=language,
|
|
||||||
format=format,
|
|
||||||
audio_format=audio_format
|
|
||||||
):
|
|
||||||
yield f"data: {json.dumps(result)}\n\n"
|
|
||||||
except Exception as e:
|
|
||||||
error_result = {"error": str(e), "is_final": True}
|
|
||||||
yield f"data: {json.dumps(error_result)}\n\n"
|
|
||||||
|
|
||||||
return StreamingResponse(
|
|
||||||
response_generator(),
|
|
||||||
media_type="text/plain",
|
|
||||||
headers={
|
|
||||||
"Cache-Control": "no-cache",
|
|
||||||
"Connection": "keep-alive",
|
|
||||||
"Content-Type": "text/event-stream"
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Error in streaming speech-to-text: {str(e)}")
|
|
||||||
raise HTTPException(status_code=500, detail=str(e))
|
|
||||||
|
|
||||||
@router.post("/stream/text-to-speech")
|
|
||||||
async def stream_text_to_speech(
|
|
||||||
request: TextToSpeechRequest,
|
|
||||||
current_user: UserInDB = Depends(getCurrentUser),
|
|
||||||
connection_id: Optional[str] = None
|
|
||||||
):
|
|
||||||
"""Stream text to speech using Azure Speech Services."""
|
|
||||||
try:
|
|
||||||
connector = await get_azure_speech_connector(current_user, connection_id)
|
|
||||||
|
|
||||||
async def text_chunk_generator():
|
|
||||||
"""Generate text chunks from the request."""
|
|
||||||
# Split text into sentences for better streaming
|
|
||||||
sentences = request.text.split('. ')
|
|
||||||
for sentence in sentences:
|
|
||||||
if sentence.strip():
|
|
||||||
yield sentence.strip() + '. '
|
|
||||||
|
|
||||||
async def audio_generator():
|
|
||||||
"""Generate streaming audio data."""
|
|
||||||
try:
|
|
||||||
async for audio_chunk in connector.stream_text_to_speech(
|
|
||||||
text_chunk_generator(),
|
|
||||||
language=request.language,
|
|
||||||
voice=request.voice,
|
|
||||||
format=request.format
|
|
||||||
):
|
|
||||||
yield audio_chunk
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Error in audio generation: {str(e)}")
|
|
||||||
# Return error as audio (could be a beep or silence)
|
|
||||||
yield b""
|
|
||||||
|
|
||||||
return StreamingResponse(
|
|
||||||
audio_generator(),
|
|
||||||
media_type="audio/mpeg",
|
|
||||||
headers={
|
|
||||||
"Cache-Control": "no-cache",
|
|
||||||
"Connection": "keep-alive",
|
|
||||||
"Content-Type": "audio/mpeg"
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Error in streaming text-to-speech: {str(e)}")
|
|
||||||
raise HTTPException(status_code=500, detail=str(e))
|
|
||||||
|
|
||||||
@router.post("/stream/realtime-interpreter")
|
|
||||||
async def stream_realtime_interpreter(
|
|
||||||
audio_file: UploadFile = File(...),
|
|
||||||
from_language: str = Form("de-DE"),
|
|
||||||
to_language: str = Form("en-US"),
|
|
||||||
connection_id: str = Form(None),
|
|
||||||
current_user: UserInDB = Depends(getCurrentUser)
|
|
||||||
):
|
|
||||||
"""Stream real-time interpreter: speech to translated text."""
|
|
||||||
try:
|
|
||||||
connector = await get_azure_speech_connector(current_user, connection_id)
|
|
||||||
|
|
||||||
async def audio_chunk_generator():
|
|
||||||
"""Generate audio chunks from uploaded file."""
|
|
||||||
chunk_size = 4096 # 4KB chunks
|
|
||||||
while True:
|
|
||||||
chunk = await audio_file.read(chunk_size)
|
|
||||||
if not chunk:
|
|
||||||
break
|
|
||||||
yield chunk
|
|
||||||
|
|
||||||
async def response_generator():
|
|
||||||
"""Generate streaming translation responses."""
|
|
||||||
try:
|
|
||||||
async for result in connector.stream_realtime_interpreter(
|
|
||||||
audio_chunk_generator(),
|
|
||||||
from_language=from_language,
|
|
||||||
to_language=to_language
|
|
||||||
):
|
|
||||||
yield f"data: {json.dumps(result)}\n\n"
|
|
||||||
except Exception as e:
|
|
||||||
error_result = {"error": str(e), "is_final": True}
|
|
||||||
yield f"data: {json.dumps(error_result)}\n\n"
|
|
||||||
|
|
||||||
return StreamingResponse(
|
|
||||||
response_generator(),
|
|
||||||
media_type="text/plain",
|
|
||||||
headers={
|
|
||||||
"Cache-Control": "no-cache",
|
|
||||||
"Connection": "keep-alive",
|
|
||||||
"Content-Type": "text/event-stream"
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Error in streaming realtime interpreter: {str(e)}")
|
|
||||||
raise HTTPException(status_code=500, detail=str(e))
|
|
||||||
|
|
||||||
@router.get("/health")
|
|
||||||
async def health_check():
|
|
||||||
"""Health check endpoint for voice services."""
|
|
||||||
return {
|
|
||||||
"status": "healthy",
|
|
||||||
"service": "azure-voice",
|
|
||||||
"endpoints": [
|
|
||||||
"speech-to-text",
|
|
||||||
"text-to-speech",
|
|
||||||
"translate",
|
|
||||||
"conversation",
|
|
||||||
"realtime-interpreter",
|
|
||||||
"stream/speech-to-text",
|
|
||||||
"stream/text-to-speech",
|
|
||||||
"stream/realtime-interpreter",
|
|
||||||
"subscription",
|
|
||||||
"connections"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
268
modules/routes/routeVoiceGoogle.py
Normal file
268
modules/routes/routeVoiceGoogle.py
Normal file
|
|
@ -0,0 +1,268 @@
|
||||||
|
"""
|
||||||
|
Google Cloud Voice Services Routes
|
||||||
|
Replaces Azure voice services with Google Cloud Speech-to-Text and Translation
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import logging
|
||||||
|
from fastapi import APIRouter, File, Form, UploadFile, Depends, HTTPException
|
||||||
|
from typing import Optional
|
||||||
|
from modules.connectors.connectorGoogleSpeech import ConnectorGoogleSpeech
|
||||||
|
from modules.security.auth import getCurrentUser
|
||||||
|
from modules.interfaces.interfaceAppModel import User
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
router = APIRouter(prefix="/voice-google", tags=["voice-google"])
|
||||||
|
|
||||||
|
# Global connector instance
|
||||||
|
_google_speech_connector = None
|
||||||
|
|
||||||
|
def get_google_speech_connector() -> ConnectorGoogleSpeech:
    """Get or create the process-wide Google Cloud Speech connector.

    Lazily initializes a module-level singleton. Credentials are resolved
    from the GOOGLE_APPLICATION_CREDENTIALS environment variable first,
    then from a few conventional file locations in the project tree.

    Raises:
        HTTPException: 500 when credentials cannot be found or the
            connector fails to initialize.
    """
    global _google_speech_connector

    if _google_speech_connector is None:
        try:
            # Get credentials path from environment or config.
            credentials_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
            if not credentials_path:
                # Try to find credentials in common locations.
                possible_paths = [
                    "credentials/google-service-account.json",
                    "config/google-credentials.json",
                    "google-credentials.json"
                ]
                for path in possible_paths:
                    if os.path.exists(path):
                        credentials_path = path
                        break

            if not credentials_path:
                raise HTTPException(
                    status_code=500,
                    detail="Google Cloud credentials not found. Please set GOOGLE_APPLICATION_CREDENTIALS environment variable or place credentials file in project directory."
                )

            _google_speech_connector = ConnectorGoogleSpeech(credentials_path)
            logger.info("✅ Google Cloud Speech connector initialized")

        except HTTPException:
            # Propagate the deliberate "credentials not found" error as-is.
            # Previously it was caught by the blanket handler below and
            # re-wrapped, mangling its detail message.
            raise
        except Exception as e:
            logger.error(f"❌ Failed to initialize Google Cloud Speech connector: {e}")
            raise HTTPException(
                status_code=500,
                detail=f"Failed to initialize Google Cloud Speech connector: {str(e)}"
            )

    return _google_speech_connector
|
||||||
|
|
||||||
|
@router.post("/speech-to-text")
|
||||||
|
async def speech_to_text(
|
||||||
|
audio_file: UploadFile = File(...),
|
||||||
|
language: str = Form("de-DE"),
|
||||||
|
current_user: User = Depends(getCurrentUser)
|
||||||
|
):
|
||||||
|
"""Convert speech to text using Google Cloud Speech-to-Text API."""
|
||||||
|
try:
|
||||||
|
logger.info(f"🎤 Speech-to-text request: {audio_file.filename}, language: {language}")
|
||||||
|
|
||||||
|
# Read audio file
|
||||||
|
audio_content = await audio_file.read()
|
||||||
|
logger.info(f"📊 Audio file size: {len(audio_content)} bytes")
|
||||||
|
|
||||||
|
# Validate audio format
|
||||||
|
connector = get_google_speech_connector()
|
||||||
|
validation = connector.validate_audio_format(audio_content)
|
||||||
|
|
||||||
|
if not validation["valid"]:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=400,
|
||||||
|
detail=f"Invalid audio format: {validation.get('error', 'Unknown error')}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Perform speech recognition
|
||||||
|
result = await connector.speech_to_text(
|
||||||
|
audio_content=audio_content,
|
||||||
|
language=language
|
||||||
|
)
|
||||||
|
|
||||||
|
if result["success"]:
|
||||||
|
return {
|
||||||
|
"success": True,
|
||||||
|
"text": result["text"],
|
||||||
|
"confidence": result["confidence"],
|
||||||
|
"language": result["language"],
|
||||||
|
"audio_info": {
|
||||||
|
"size": len(audio_content),
|
||||||
|
"format": validation["format"],
|
||||||
|
"estimated_duration": validation.get("estimated_duration", 0)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=400,
|
||||||
|
detail=f"Speech recognition failed: {result.get('error', 'Unknown error')}"
|
||||||
|
)
|
||||||
|
|
||||||
|
except HTTPException:
|
||||||
|
raise
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"❌ Speech-to-text error: {e}")
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=500,
|
||||||
|
detail=f"Speech-to-text processing failed: {str(e)}"
|
||||||
|
)
|
||||||
|
|
||||||
|
@router.post("/translate")
|
||||||
|
async def translate_text(
|
||||||
|
text: str = Form(...),
|
||||||
|
source_language: str = Form("de"),
|
||||||
|
target_language: str = Form("en"),
|
||||||
|
current_user: User = Depends(getCurrentUser)
|
||||||
|
):
|
||||||
|
"""Translate text using Google Cloud Translation API."""
|
||||||
|
try:
|
||||||
|
logger.info(f"🌐 Translation request: '{text}' ({source_language} -> {target_language})")
|
||||||
|
|
||||||
|
if not text.strip():
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=400,
|
||||||
|
detail="Empty text provided for translation"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Perform translation
|
||||||
|
connector = get_google_speech_connector()
|
||||||
|
result = await connector.translate_text(
|
||||||
|
text=text,
|
||||||
|
source_language=source_language,
|
||||||
|
target_language=target_language
|
||||||
|
)
|
||||||
|
|
||||||
|
if result["success"]:
|
||||||
|
return {
|
||||||
|
"success": True,
|
||||||
|
"original_text": result["original_text"],
|
||||||
|
"translated_text": result["translated_text"],
|
||||||
|
"source_language": result["source_language"],
|
||||||
|
"target_language": result["target_language"]
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=400,
|
||||||
|
detail=f"Translation failed: {result.get('error', 'Unknown error')}"
|
||||||
|
)
|
||||||
|
|
||||||
|
except HTTPException:
|
||||||
|
raise
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"❌ Translation error: {e}")
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=500,
|
||||||
|
detail=f"Translation processing failed: {str(e)}"
|
||||||
|
)
|
||||||
|
|
||||||
|
@router.post("/realtime-interpreter")
|
||||||
|
async def realtime_interpreter(
|
||||||
|
audio_file: UploadFile = File(...),
|
||||||
|
from_language: str = Form("de-DE"),
|
||||||
|
to_language: str = Form("en-US"),
|
||||||
|
connection_id: str = Form(None),
|
||||||
|
current_user: User = Depends(getCurrentUser)
|
||||||
|
):
|
||||||
|
"""Real-time interpreter: speech to translated text using Google Cloud APIs."""
|
||||||
|
try:
|
||||||
|
logger.info(f"🔄 Real-time interpreter request: {audio_file.filename}")
|
||||||
|
logger.info(f" From: {from_language} -> To: {to_language}")
|
||||||
|
|
||||||
|
# Read audio file
|
||||||
|
audio_content = await audio_file.read()
|
||||||
|
logger.info(f"📊 Audio file size: {len(audio_content)} bytes")
|
||||||
|
|
||||||
|
# Save audio file for debugging
|
||||||
|
debug_filename = f"debug_audio/audio_google_{audio_file.filename}"
|
||||||
|
os.makedirs("debug_audio", exist_ok=True)
|
||||||
|
with open(debug_filename, "wb") as f:
|
||||||
|
f.write(audio_content)
|
||||||
|
logger.info(f"💾 Saved audio file for debugging: {debug_filename}")
|
||||||
|
|
||||||
|
# Validate audio format
|
||||||
|
connector = get_google_speech_connector()
|
||||||
|
validation = connector.validate_audio_format(audio_content)
|
||||||
|
|
||||||
|
if not validation["valid"]:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=400,
|
||||||
|
detail=f"Invalid audio format: {validation.get('error', 'Unknown error')}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Perform complete pipeline: Speech-to-Text + Translation
|
||||||
|
result = await connector.speech_to_translated_text(
|
||||||
|
audio_content=audio_content,
|
||||||
|
from_language=from_language,
|
||||||
|
to_language=to_language
|
||||||
|
)
|
||||||
|
|
||||||
|
if result["success"]:
|
||||||
|
logger.info(f"✅ Real-time interpreter successful:")
|
||||||
|
logger.info(f" Original: '{result['original_text']}'")
|
||||||
|
logger.info(f" Translated: '{result['translated_text']}'")
|
||||||
|
|
||||||
|
return {
|
||||||
|
"success": True,
|
||||||
|
"original_text": result["original_text"],
|
||||||
|
"translated_text": result["translated_text"],
|
||||||
|
"confidence": result["confidence"],
|
||||||
|
"source_language": result["source_language"],
|
||||||
|
"target_language": result["target_language"],
|
||||||
|
"audio_info": {
|
||||||
|
"size": len(audio_content),
|
||||||
|
"format": validation["format"],
|
||||||
|
"estimated_duration": validation.get("estimated_duration", 0)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=400,
|
||||||
|
detail=f"Real-time interpreter failed: {result.get('error', 'Unknown error')}"
|
||||||
|
)
|
||||||
|
|
||||||
|
except HTTPException:
|
||||||
|
raise
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"❌ Real-time interpreter error: {e}")
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=500,
|
||||||
|
detail=f"Real-time interpreter processing failed: {str(e)}"
|
||||||
|
)
|
||||||
|
|
||||||
|
@router.get("/health")
|
||||||
|
async def health_check(current_user: User = Depends(getCurrentUser)):
|
||||||
|
"""Health check for Google Cloud voice services."""
|
||||||
|
try:
|
||||||
|
connector = get_google_speech_connector()
|
||||||
|
|
||||||
|
# Test with a simple translation
|
||||||
|
test_result = await connector.translate_text(
|
||||||
|
text="Hello",
|
||||||
|
source_language="en",
|
||||||
|
target_language="de"
|
||||||
|
)
|
||||||
|
|
||||||
|
if test_result["success"]:
|
||||||
|
return {
|
||||||
|
"status": "healthy",
|
||||||
|
"service": "Google Cloud Speech-to-Text & Translation",
|
||||||
|
"test_translation": test_result["translated_text"]
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
return {
|
||||||
|
"status": "unhealthy",
|
||||||
|
"error": test_result.get("error", "Unknown error")
|
||||||
|
}
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"❌ Health check failed: {e}")
|
||||||
|
return {
|
||||||
|
"status": "unhealthy",
|
||||||
|
"error": str(e)
|
||||||
|
}
|
||||||
|
|
@ -1,494 +0,0 @@
|
||||||
"""
|
|
||||||
Azure Voice Services WebSocket Routes
|
|
||||||
Provides real-time WebSocket endpoints for:
|
|
||||||
- Live microphone audio streaming
|
|
||||||
- Real-time speech-to-text
|
|
||||||
- Real-time translation
|
|
||||||
- Real-time text-to-speech
|
|
||||||
"""
|
|
||||||
|
|
||||||
import logging
|
|
||||||
import asyncio
|
|
||||||
import json
|
|
||||||
from typing import Dict, Any, Optional, List
|
|
||||||
from fastapi import APIRouter, WebSocket, WebSocketDisconnect, Depends, Query
|
|
||||||
from fastapi.websockets import WebSocketState
|
|
||||||
import base64
|
|
||||||
import io
|
|
||||||
from datetime import datetime
|
|
||||||
|
|
||||||
from modules.interfaces.interfaceAppObjects import getRootInterface
|
|
||||||
from modules.interfaces.interfaceAppModel import UserInDB
|
|
||||||
from modules.security.auth import getCurrentUser
|
|
||||||
from modules.connectors.connectorAzureSpeech import ConnectorAzureSpeech
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
router = APIRouter(prefix="/api/voice/ws", tags=["voice-websocket"])
|
|
||||||
|
|
||||||
class ConnectionManager:
    """Manages WebSocket connections for real-time voice services.

    Keeps three parallel maps: connection_id -> live socket,
    user_id -> connection_id, and connection_id -> per-connection metadata.
    """

    def __init__(self):
        # connection_id -> live socket
        self.active_connections: "Dict[str, WebSocket]" = {}
        # user_id -> connection_id
        self.user_connections: "Dict[str, str]" = {}
        # connection_id -> arbitrary per-connection metadata
        self.connection_metadata: "Dict[str, Dict]" = {}

    async def connect(self, websocket: "WebSocket", connection_id: str, user_id: str, metadata: "Dict" = None):
        """Accept a WebSocket connection and register it under its ids."""
        await websocket.accept()
        self.active_connections[connection_id] = websocket
        self.user_connections[user_id] = connection_id
        self.connection_metadata[connection_id] = metadata or {}
        logger.info(f"WebSocket connected: {connection_id} for user {user_id}")

    def disconnect(self, connection_id: str):
        """Drop every record of a connection; a no-op if it is unknown."""
        if connection_id not in self.active_connections:
            return
        del self.active_connections[connection_id]
        # Reverse-lookup the owning user and clear that mapping too.
        owner = next(
            (uid for uid, cid in self.user_connections.items() if cid == connection_id),
            None,
        )
        if owner is not None:
            del self.user_connections[owner]
        self.connection_metadata.pop(connection_id, None)
        logger.info(f"WebSocket disconnected: {connection_id}")

    async def send_message(self, connection_id: str, message: "Dict"):
        """Send a JSON message to one connection, pruning it on failure."""
        websocket = self.active_connections.get(connection_id)
        if websocket is None:
            return
        try:
            if websocket.client_state == WebSocketState.CONNECTED:
                await websocket.send_text(json.dumps(message))
            else:
                # Socket is gone; drop our bookkeeping for it.
                self.disconnect(connection_id)
        except Exception as e:
            logger.error(f"Error sending message to {connection_id}: {str(e)}")
            self.disconnect(connection_id)

    async def send_error(self, connection_id: str, error: str, error_code: str = "GENERAL_ERROR"):
        """Send a structured error payload to a connection."""
        await self.send_message(connection_id, {
            "type": "error",
            "error": error,
            "error_code": error_code,
            "timestamp": datetime.now().isoformat()
        })

    def get_connection_metadata(self, connection_id: str) -> "Dict":
        """Return the metadata registered for a connection (empty if unknown)."""
        return self.connection_metadata.get(connection_id, {})
|
|
||||||
|
|
||||||
# Global connection manager
|
|
||||||
manager = ConnectionManager()
|
|
||||||
|
|
||||||
async def get_azure_speech_connector_ws(user_id: str) -> Optional[ConnectorAzureSpeech]:
    """Build an Azure Speech connector for a WebSocket user.

    Returns None (rather than raising) on any failure, since WebSocket
    handlers report errors over the socket instead of via HTTP responses.
    """
    try:
        root_interface = getRootInterface()

        # The user must exist...
        user = root_interface.getUser(user_id)
        if not user:
            logger.error(f"User with ID {user_id} not found")
            return None

        # ...and have at least one Microsoft connection on file.
        connections = root_interface.getUserConnections(user_id)
        azure_connection = next(
            (conn for conn in connections if conn.authority == "msft"),
            None,
        )
        if not azure_connection:
            logger.error(f"No Azure connection found for user {user_id}")
            return None

        # The connection must carry a token.
        if not root_interface.getConnectionToken(azure_connection.id):
            logger.error(f"No connection token found for user {user_id}")
            return None

        # Speech Services auth comes from app configuration, not the token.
        from modules.shared.configuration import APP_CONFIG
        subscription_key = APP_CONFIG.get("Connector_AzureSpeech_SUBSCRIPTION_KEY")
        if not subscription_key or subscription_key == "your-azure-speech-subscription-key-here":
            logger.error("Azure Speech Services subscription key not configured")
            return None

        region = APP_CONFIG.get("Connector_AzureSpeech_REGION", "westeurope")
        return ConnectorAzureSpeech(subscription_key=subscription_key, region=region)

    except Exception as e:
        logger.error(f"Error getting Azure Speech connector for WebSocket: {str(e)}")
        return None
|
|
||||||
|
|
||||||
@router.websocket("/realtime-interpreter")
|
|
||||||
async def websocket_realtime_interpreter(
|
|
||||||
websocket: WebSocket,
|
|
||||||
user_id: str = Query(...),
|
|
||||||
from_language: str = Query("de-DE"),
|
|
||||||
to_language: str = Query("en-US"),
|
|
||||||
audio_format: str = Query("wav")
|
|
||||||
):
|
|
||||||
"""WebSocket endpoint for real-time interpreter with live audio streaming."""
|
|
||||||
connection_id = f"interpreter_{user_id}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
|
|
||||||
|
|
||||||
try:
|
|
||||||
# Connect WebSocket
|
|
||||||
await manager.connect(websocket, connection_id, user_id, {
|
|
||||||
"service": "realtime_interpreter",
|
|
||||||
"from_language": from_language,
|
|
||||||
"to_language": to_language,
|
|
||||||
"audio_format": audio_format
|
|
||||||
})
|
|
||||||
|
|
||||||
# Get Azure Speech connector
|
|
||||||
connector = await get_azure_speech_connector_ws(user_id)
|
|
||||||
if not connector:
|
|
||||||
await manager.send_error(connection_id, "Azure Speech Services not configured", "NO_CONNECTOR")
|
|
||||||
return
|
|
||||||
|
|
||||||
# Send connection confirmation
|
|
||||||
await manager.send_message(connection_id, {
|
|
||||||
"type": "connected",
|
|
||||||
"connection_id": connection_id,
|
|
||||||
"service": "realtime_interpreter",
|
|
||||||
"from_language": from_language,
|
|
||||||
"to_language": to_language,
|
|
||||||
"timestamp": datetime.now().isoformat()
|
|
||||||
})
|
|
||||||
|
|
||||||
# Process incoming audio chunks
|
|
||||||
audio_buffer = io.BytesIO()
|
|
||||||
chunk_count = 0
|
|
||||||
|
|
||||||
while True:
|
|
||||||
try:
|
|
||||||
# Receive message from client
|
|
||||||
data = await websocket.receive_text()
|
|
||||||
message = json.loads(data)
|
|
||||||
|
|
||||||
if message.get("type") == "audio_chunk":
|
|
||||||
# Decode base64 audio data
|
|
||||||
audio_data = base64.b64decode(message.get("data", ""))
|
|
||||||
audio_buffer.write(audio_data)
|
|
||||||
chunk_count += 1
|
|
||||||
|
|
||||||
# Process every 10 chunks or when buffer is large enough
|
|
||||||
if chunk_count % 10 == 0 or len(audio_data) > 8192:
|
|
||||||
audio_buffer.seek(0)
|
|
||||||
audio_content = audio_buffer.read()
|
|
||||||
|
|
||||||
if len(audio_content) > 0:
|
|
||||||
try:
|
|
||||||
# Convert speech to text
|
|
||||||
stt_result = await connector.speech_to_text(
|
|
||||||
audio_content=audio_content,
|
|
||||||
language=from_language,
|
|
||||||
format="detailed",
|
|
||||||
audio_format=audio_format
|
|
||||||
)
|
|
||||||
|
|
||||||
original_text = stt_result.get("text", "")
|
|
||||||
|
|
||||||
# Translate if different languages
|
|
||||||
translated_text = original_text
|
|
||||||
if from_language != to_language and original_text.strip():
|
|
||||||
try:
|
|
||||||
translated_text = await connector.translate_text(
|
|
||||||
text=original_text,
|
|
||||||
from_language=from_language,
|
|
||||||
to_language=to_language
|
|
||||||
)
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning(f"Translation failed: {str(e)}")
|
|
||||||
translated_text = original_text
|
|
||||||
|
|
||||||
# Send result back to client
|
|
||||||
await manager.send_message(connection_id, {
|
|
||||||
"type": "translation_result",
|
|
||||||
"original_text": original_text,
|
|
||||||
"translated_text": translated_text,
|
|
||||||
"from_language": from_language,
|
|
||||||
"to_language": to_language,
|
|
||||||
"confidence": stt_result.get("confidence", 0.0),
|
|
||||||
"timestamp": datetime.now().isoformat()
|
|
||||||
})
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Error processing audio chunk: {str(e)}")
|
|
||||||
await manager.send_error(connection_id, f"Audio processing failed: {str(e)}", "PROCESSING_ERROR")
|
|
||||||
|
|
||||||
# Reset buffer
|
|
||||||
audio_buffer = io.BytesIO()
|
|
||||||
chunk_count = 0
|
|
||||||
|
|
||||||
elif message.get("type") == "ping":
|
|
||||||
# Respond to ping with pong
|
|
||||||
await manager.send_message(connection_id, {
|
|
||||||
"type": "pong",
|
|
||||||
"timestamp": datetime.now().isoformat()
|
|
||||||
})
|
|
||||||
|
|
||||||
elif message.get("type") == "disconnect":
|
|
||||||
# Client requested disconnect
|
|
||||||
break
|
|
||||||
|
|
||||||
except WebSocketDisconnect:
|
|
||||||
break
|
|
||||||
except json.JSONDecodeError:
|
|
||||||
await manager.send_error(connection_id, "Invalid JSON message", "INVALID_JSON")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Error processing WebSocket message: {str(e)}")
|
|
||||||
await manager.send_error(connection_id, f"Message processing failed: {str(e)}", "MESSAGE_ERROR")
|
|
||||||
|
|
||||||
except WebSocketDisconnect:
|
|
||||||
pass
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"WebSocket error: {str(e)}")
|
|
||||||
try:
|
|
||||||
await manager.send_error(connection_id, f"Connection error: {str(e)}", "CONNECTION_ERROR")
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
finally:
|
|
||||||
manager.disconnect(connection_id)
|
|
||||||
|
|
||||||
@router.websocket("/speech-to-text")
|
|
||||||
async def websocket_speech_to_text(
|
|
||||||
websocket: WebSocket,
|
|
||||||
user_id: str = Query(...),
|
|
||||||
language: str = Query("de-DE"),
|
|
||||||
audio_format: str = Query("wav")
|
|
||||||
):
|
|
||||||
"""WebSocket endpoint for real-time speech-to-text with live audio streaming."""
|
|
||||||
connection_id = f"stt_{user_id}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
|
|
||||||
|
|
||||||
try:
|
|
||||||
# Connect WebSocket
|
|
||||||
await manager.connect(websocket, connection_id, user_id, {
|
|
||||||
"service": "speech_to_text",
|
|
||||||
"language": language,
|
|
||||||
"audio_format": audio_format
|
|
||||||
})
|
|
||||||
|
|
||||||
# Get Azure Speech connector
|
|
||||||
connector = await get_azure_speech_connector_ws(user_id)
|
|
||||||
if not connector:
|
|
||||||
await manager.send_error(connection_id, "Azure Speech Services not configured", "NO_CONNECTOR")
|
|
||||||
return
|
|
||||||
|
|
||||||
# Send connection confirmation
|
|
||||||
await manager.send_message(connection_id, {
|
|
||||||
"type": "connected",
|
|
||||||
"connection_id": connection_id,
|
|
||||||
"service": "speech_to_text",
|
|
||||||
"language": language,
|
|
||||||
"timestamp": datetime.now().isoformat()
|
|
||||||
})
|
|
||||||
|
|
||||||
# Process incoming audio chunks
|
|
||||||
audio_buffer = io.BytesIO()
|
|
||||||
chunk_count = 0
|
|
||||||
|
|
||||||
while True:
|
|
||||||
try:
|
|
||||||
# Receive message from client
|
|
||||||
data = await websocket.receive_text()
|
|
||||||
message = json.loads(data)
|
|
||||||
|
|
||||||
if message.get("type") == "audio_chunk":
|
|
||||||
# Decode base64 audio data
|
|
||||||
audio_data = base64.b64decode(message.get("data", ""))
|
|
||||||
audio_buffer.write(audio_data)
|
|
||||||
chunk_count += 1
|
|
||||||
|
|
||||||
# Process every 10 chunks or when buffer is large enough
|
|
||||||
if chunk_count % 10 == 0 or len(audio_data) > 8192:
|
|
||||||
audio_buffer.seek(0)
|
|
||||||
audio_content = audio_buffer.read()
|
|
||||||
|
|
||||||
if len(audio_content) > 0:
|
|
||||||
try:
|
|
||||||
# Convert speech to text
|
|
||||||
stt_result = await connector.speech_to_text(
|
|
||||||
audio_content=audio_content,
|
|
||||||
language=language,
|
|
||||||
format="detailed",
|
|
||||||
audio_format=audio_format
|
|
||||||
)
|
|
||||||
|
|
||||||
# Send result back to client
|
|
||||||
await manager.send_message(connection_id, {
|
|
||||||
"type": "transcription_result",
|
|
||||||
"text": stt_result.get("text", ""),
|
|
||||||
"confidence": stt_result.get("confidence", 0.0),
|
|
||||||
"language": stt_result.get("language", language),
|
|
||||||
"is_final": stt_result.get("RecognitionStatus") == "Success",
|
|
||||||
"timestamp": datetime.now().isoformat()
|
|
||||||
})
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Error processing audio chunk: {str(e)}")
|
|
||||||
await manager.send_error(connection_id, f"Audio processing failed: {str(e)}", "PROCESSING_ERROR")
|
|
||||||
|
|
||||||
# Reset buffer
|
|
||||||
audio_buffer = io.BytesIO()
|
|
||||||
chunk_count = 0
|
|
||||||
|
|
||||||
elif message.get("type") == "ping":
|
|
||||||
# Respond to ping with pong
|
|
||||||
await manager.send_message(connection_id, {
|
|
||||||
"type": "pong",
|
|
||||||
"timestamp": datetime.now().isoformat()
|
|
||||||
})
|
|
||||||
|
|
||||||
elif message.get("type") == "disconnect":
|
|
||||||
# Client requested disconnect
|
|
||||||
break
|
|
||||||
|
|
||||||
except WebSocketDisconnect:
|
|
||||||
break
|
|
||||||
except json.JSONDecodeError:
|
|
||||||
await manager.send_error(connection_id, "Invalid JSON message", "INVALID_JSON")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Error processing WebSocket message: {str(e)}")
|
|
||||||
await manager.send_error(connection_id, f"Message processing failed: {str(e)}", "MESSAGE_ERROR")
|
|
||||||
|
|
||||||
except WebSocketDisconnect:
|
|
||||||
pass
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"WebSocket error: {str(e)}")
|
|
||||||
try:
|
|
||||||
await manager.send_error(connection_id, f"Connection error: {str(e)}", "CONNECTION_ERROR")
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
finally:
|
|
||||||
manager.disconnect(connection_id)
|
|
||||||
|
|
||||||
@router.websocket("/text-to-speech")
|
|
||||||
async def websocket_text_to_speech(
|
|
||||||
websocket: WebSocket,
|
|
||||||
user_id: str = Query(...),
|
|
||||||
language: str = Query("de-DE"),
|
|
||||||
voice: str = Query("de-DE-KatjaNeural"),
|
|
||||||
audio_format: str = Query("audio-16khz-128kbitrate-mono-mp3")
|
|
||||||
):
|
|
||||||
"""WebSocket endpoint for real-time text-to-speech streaming."""
|
|
||||||
connection_id = f"tts_{user_id}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
|
|
||||||
|
|
||||||
try:
|
|
||||||
# Connect WebSocket
|
|
||||||
await manager.connect(websocket, connection_id, user_id, {
|
|
||||||
"service": "text_to_speech",
|
|
||||||
"language": language,
|
|
||||||
"voice": voice,
|
|
||||||
"audio_format": audio_format
|
|
||||||
})
|
|
||||||
|
|
||||||
# Get Azure Speech connector
|
|
||||||
connector = await get_azure_speech_connector_ws(user_id)
|
|
||||||
if not connector:
|
|
||||||
await manager.send_error(connection_id, "Azure Speech Services not configured", "NO_CONNECTOR")
|
|
||||||
return
|
|
||||||
|
|
||||||
# Send connection confirmation
|
|
||||||
await manager.send_message(connection_id, {
|
|
||||||
"type": "connected",
|
|
||||||
"connection_id": connection_id,
|
|
||||||
"service": "text_to_speech",
|
|
||||||
"language": language,
|
|
||||||
"voice": voice,
|
|
||||||
"timestamp": datetime.now().isoformat()
|
|
||||||
})
|
|
||||||
|
|
||||||
while True:
|
|
||||||
try:
|
|
||||||
# Receive message from client
|
|
||||||
data = await websocket.receive_text()
|
|
||||||
message = json.loads(data)
|
|
||||||
|
|
||||||
if message.get("type") == "text_to_speak":
|
|
||||||
text = message.get("text", "")
|
|
||||||
|
|
||||||
if text.strip():
|
|
||||||
try:
|
|
||||||
# Convert text to speech
|
|
||||||
audio_data = await connector.text_to_speech(
|
|
||||||
text=text,
|
|
||||||
language=language,
|
|
||||||
voice=voice,
|
|
||||||
format=audio_format
|
|
||||||
)
|
|
||||||
|
|
||||||
# Send audio data back to client
|
|
||||||
audio_base64 = base64.b64encode(audio_data).decode('utf-8')
|
|
||||||
await manager.send_message(connection_id, {
|
|
||||||
"type": "audio_data",
|
|
||||||
"audio_data": audio_base64,
|
|
||||||
"format": audio_format,
|
|
||||||
"voice": voice,
|
|
||||||
"text": text,
|
|
||||||
"timestamp": datetime.now().isoformat()
|
|
||||||
})
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Error converting text to speech: {str(e)}")
|
|
||||||
await manager.send_error(connection_id, f"TTS failed: {str(e)}", "TTS_ERROR")
|
|
||||||
|
|
||||||
elif message.get("type") == "ping":
|
|
||||||
# Respond to ping with pong
|
|
||||||
await manager.send_message(connection_id, {
|
|
||||||
"type": "pong",
|
|
||||||
"timestamp": datetime.now().isoformat()
|
|
||||||
})
|
|
||||||
|
|
||||||
elif message.get("type") == "disconnect":
|
|
||||||
# Client requested disconnect
|
|
||||||
break
|
|
||||||
|
|
||||||
except WebSocketDisconnect:
|
|
||||||
break
|
|
||||||
except json.JSONDecodeError:
|
|
||||||
await manager.send_error(connection_id, "Invalid JSON message", "INVALID_JSON")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Error processing WebSocket message: {str(e)}")
|
|
||||||
await manager.send_error(connection_id, f"Message processing failed: {str(e)}", "MESSAGE_ERROR")
|
|
||||||
|
|
||||||
except WebSocketDisconnect:
|
|
||||||
pass
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"WebSocket error: {str(e)}")
|
|
||||||
try:
|
|
||||||
await manager.send_error(connection_id, f"Connection error: {str(e)}", "CONNECTION_ERROR")
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
finally:
|
|
||||||
manager.disconnect(connection_id)
|
|
||||||
|
|
||||||
@router.get("/status")
|
|
||||||
async def websocket_status():
|
|
||||||
"""Get WebSocket connection status."""
|
|
||||||
return {
|
|
||||||
"active_connections": len(manager.active_connections),
|
|
||||||
"connected_users": len(manager.user_connections),
|
|
||||||
"services": {
|
|
||||||
"realtime_interpreter": len([c for c in manager.connection_metadata.values() if c.get("service") == "realtime_interpreter"]),
|
|
||||||
"speech_to_text": len([c for c in manager.connection_metadata.values() if c.get("service") == "speech_to_text"]),
|
|
||||||
"text_to_speech": len([c for c in manager.connection_metadata.values() if c.get("service") == "text_to_speech"])
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -57,18 +57,23 @@ class ManagerSyncDelta:
|
||||||
JIRA_ISSUE_TYPE = "Task"
|
JIRA_ISSUE_TYPE = "Task"
|
||||||
|
|
||||||
# Task sync definition for field mapping (like original synchronizer)
|
# Task sync definition for field mapping (like original synchronizer)
|
||||||
TASK_SYNC_DEFINITION = {
|
|
||||||
"ID": ["get", ["key"]],
|
task_sync_definition={
|
||||||
"Summary": ["get", ["fields", "summary"]],
|
#key=excel-header, [get:jira>excel | put: excel>jira, jira-xml-field-list]
|
||||||
"Status": ["get", ["fields", "status", "name"]],
|
'ID': ['get', ['key']],
|
||||||
"Assignee": ["get", ["fields", "assignee", "displayName"]],
|
'Module Category': ['get', ['fields', 'customfield_10058', 'value']],
|
||||||
"Reporter": ["get", ["fields", "reporter", "displayName"]],
|
'Summary': ['get', ['fields', 'summary']],
|
||||||
"Created": ["get", ["fields", "created"]],
|
'Description': ['get', ['fields', 'description']],
|
||||||
"Updated": ["get", ["fields", "updated"]],
|
'References': ['get', ['fields', 'customfield_10066']],
|
||||||
"Priority": ["get", ["fields", "priority", "name"]],
|
'Priority': ['get', ['fields', 'priority', 'name']],
|
||||||
"IssueType": ["get", ["fields", "issuetype", "name"]],
|
'Issue Status': ['get', ['fields', 'customfield_10062']],
|
||||||
"Project": ["get", ["fields", "project", "name"]],
|
'Assignee': ['get', ['fields', 'assignee', 'displayName']],
|
||||||
"Description": ["get", ["fields", "description"]],
|
'Issue Created': ['get', ['fields', 'created']],
|
||||||
|
'Due Date': ['get', ['fields', 'duedate']],
|
||||||
|
'DELTA Comments': ['get', ['fields', 'customfield_10060']],
|
||||||
|
'SELISE Ticket References': ['put', ['fields', 'customfield_10067']],
|
||||||
|
'SELISE Status Values': ['put', ['fields', 'customfield_10065']],
|
||||||
|
'SELISE Comments': ['put', ['fields', 'customfield_10064']],
|
||||||
}
|
}
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
|
|
||||||
|
|
@ -48,6 +48,9 @@ Office365-REST-Python-Client==2.6.2 # Easy Sharepoint integration
|
||||||
## Image Processing
|
## Image Processing
|
||||||
Pillow>=10.0.0 # Für Bildverarbeitung (als PIL importiert)
|
Pillow>=10.0.0 # Für Bildverarbeitung (als PIL importiert)
|
||||||
|
|
||||||
|
## Audio Processing
|
||||||
|
# Audio format conversion handled by pure Python implementation
|
||||||
|
|
||||||
## Utilities & Timezone Support
|
## Utilities & Timezone Support
|
||||||
python-dateutil==2.8.2
|
python-dateutil==2.8.2
|
||||||
python-dotenv==1.0.0
|
python-dotenv==1.0.0
|
||||||
|
|
@ -56,6 +59,10 @@ pytz>=2023.3 # For timezone handling and UTC operations
|
||||||
## Dependencies for trio (used by httpx)
|
## Dependencies for trio (used by httpx)
|
||||||
sortedcontainers>=2.4.0 # Required by trio
|
sortedcontainers>=2.4.0 # Required by trio
|
||||||
|
|
||||||
|
## Google Cloud Integration
|
||||||
|
google-cloud-speech==2.21.0
|
||||||
|
google-cloud-translate==3.11.1
|
||||||
|
|
||||||
## MSFT Integration
|
## MSFT Integration
|
||||||
msal==1.24.1
|
msal==1.24.1
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue