From ada61061c477091d22d2d7b32d92e0dbc7dda52f Mon Sep 17 00:00:00 2001 From: ValueOn AG Date: Sat, 13 Sep 2025 01:57:53 +0200 Subject: [PATCH] voice azure base --- app.py | 12 +- config.ini | 4 + modules/connectors/connectorAzureSpeech.py | 928 ++++++++++++++++++ .../interfaces/interfaceComponentAccess.py | 16 +- modules/interfaces/interfaceComponentModel.py | 93 ++ .../interfaces/interfaceComponentObjects.py | 151 ++- modules/routes/routeVoiceAzure.py | 813 +++++++++++++++ modules/routes/routeVoiceWebSocket.py | 494 ++++++++++ modules/services/serviceDeltaSync.py | 3 +- 9 files changed, 2507 insertions(+), 7 deletions(-) create mode 100644 modules/connectors/connectorAzureSpeech.py create mode 100644 modules/routes/routeVoiceAzure.py create mode 100644 modules/routes/routeVoiceWebSocket.py diff --git a/app.py b/app.py index 2e874678..3c4069f3 100644 --- a/app.py +++ b/app.py @@ -154,10 +154,10 @@ async def lifespan(app: FastAPI): scheduler = AsyncIOScheduler(timezone=ZoneInfo("Europe/Zurich")) try: from modules.services.serviceDeltaSync import perform_sync_jira_delta_group - # Schedule hourly sync at minute 0 + # Schedule sync every 20 minutes (at minutes 00, 20, 40) scheduler.add_job( perform_sync_jira_delta_group, - CronTrigger(minute="0"), + CronTrigger(minute="0,20,40"), id="jira_delta_group_sync", replace_existing=True, coalesce=True, @@ -165,7 +165,7 @@ async def lifespan(app: FastAPI): misfire_grace_time=1800, ) scheduler.start() - logger.info("APScheduler started (jira_delta_group_sync hourly)") + logger.info("APScheduler started (jira_delta_group_sync every 20 minutes at 00, 20, 40)") # Run initial sync on startup (non-blocking failure) try: @@ -248,3 +248,9 @@ app.include_router(msftRouter) from modules.routes.routeSecurityGoogle import router as googleRouter app.include_router(googleRouter) + +from modules.routes.routeVoiceAzure import router as voiceRouter +app.include_router(voiceRouter) + +from modules.routes.routeVoiceWebSocket import router as 
voiceWebSocketRouter +app.include_router(voiceWebSocketRouter) \ No newline at end of file diff --git a/config.ini b/config.ini index 9c529400..a42ed4c1 100644 --- a/config.ini +++ b/config.ini @@ -48,6 +48,10 @@ Service_GOOGLE_CLIENT_SECRET = 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 # Tavily Web Search configuration Connector_WebTavily_API_KEY = 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 +# Azure Speech Services configuration (SECURITY: do not commit real keys; load from environment/secret store and rotate any key previously committed) +Connector_AzureSpeech_SUBSCRIPTION_KEY = <set-via-environment-or-secret-store> +Connector_AzureSpeech_REGION = switzerlandnorth + # Web Search configuration Web_Search_MAX_QUERY_LENGTH = 400 Web_Search_MAX_RESULTS = 20 diff --git a/modules/connectors/connectorAzureSpeech.py b/modules/connectors/connectorAzureSpeech.py new file mode 100644 index 00000000..e8d7a683 --- /dev/null +++ b/modules/connectors/connectorAzureSpeech.py @@ -0,0 +1,928 @@ +""" +Azure Speech Services Connector +Handles integration with Azure Speech Services for: +- Speech-to-Text (STT) +- Text-to-Speech (TTS) +- Translation services +""" + +import logging +import asyncio +import json +import base64 +from typing import Dict, Any, Optional, List, AsyncGenerator +import aiohttp +import io +import wave +import struct +import tempfile +import os +import time +from pathlib import Path +from datetime import datetime, timedelta + +logger = logging.getLogger(__name__) + +class ConnectorAzureSpeech: +    """Connector for Azure Speech Services.""" + +    def __init__(self, subscription_key: str, region: str = "westeurope"): +        """ +        Initialize Azure Speech connector.
+ + Args: + subscription_key: Azure Speech Services subscription key + region: Azure region (default: westeurope) + """ + self.subscription_key = subscription_key + self.region = region + self.base_url = f"https://{region}.stt.speech.microsoft.com" + self.translator_url = "https://api.cognitive.microsofttranslator.com" + + # Supported audio formats + self.supported_stt_formats = { + "wav": {"mime": "audio/wav", "codec": "audio/pcm", "sample_rate": 16000}, + "mp3": {"mime": "audio/mp3", "codec": "audio/mp3", "sample_rate": 16000}, + "ogg": {"mime": "audio/ogg", "codec": "audio/ogg", "sample_rate": 16000} + } + + self.supported_tts_formats = [ + "audio-16khz-128kbitrate-mono-mp3", + "audio-16khz-32kbitrate-mono-mp3", + "audio-16khz-64kbitrate-mono-mp3", + "audio-24khz-160kbitrate-mono-mp3", + "audio-24khz-48kbitrate-mono-mp3", + "audio-24khz-96kbitrate-mono-mp3", + "audio-48khz-192kbitrate-mono-mp3", + "audio-48khz-96kbitrate-mono-mp3", + "riff-16khz-16bit-mono-pcm", + "riff-24khz-16bit-mono-pcm", + "riff-48khz-16bit-mono-pcm" + ] + + # Rate limiting + self.rate_limits = { + "stt": {"requests_per_minute": 20, "last_reset": time.time(), "request_count": 0}, + "tts": {"requests_per_minute": 20, "last_reset": time.time(), "request_count": 0}, + "translation": {"requests_per_minute": 20, "last_reset": time.time(), "request_count": 0} + } + + # Request timeout settings + self.timeout = aiohttp.ClientTimeout(total=30, connect=10) + + def _check_rate_limit(self, service_type: str) -> bool: + """Check if rate limit is exceeded for a service type.""" + current_time = time.time() + rate_limit = self.rate_limits[service_type] + + # Reset counter if minute has passed + if current_time - rate_limit["last_reset"] >= 60: + rate_limit["request_count"] = 0 + rate_limit["last_reset"] = current_time + + # Check if limit exceeded + if rate_limit["request_count"] >= rate_limit["requests_per_minute"]: + return False + + # Increment counter + rate_limit["request_count"] += 1 + return True 
+ + def _handle_azure_error(self, response_status: int, error_text: str) -> Exception: + """Handle Azure API errors with specific error messages.""" + if response_status == 401: + return Exception("Authentication failed. Please check your Azure Speech Services subscription key.") + elif response_status == 403: + return Exception("Access forbidden. Please check your Azure Speech Services permissions.") + elif response_status == 429: + return Exception("Rate limit exceeded. Please wait before making more requests.") + elif response_status == 400: + return Exception(f"Bad request: {error_text}") + elif response_status == 500: + return Exception("Azure Speech Services internal error. Please try again later.") + elif response_status == 503: + return Exception("Azure Speech Services temporarily unavailable. Please try again later.") + else: + return Exception(f"Azure API error {response_status}: {error_text}") + + async def _make_request_with_retry(self, url: str, method: str = "GET", headers: Dict = None, data: bytes = None, params: Dict = None, max_retries: int = 3) -> Dict: + """Make HTTP request with retry logic.""" + if headers is None: + headers = {} + + headers.update({ + "Ocp-Apim-Subscription-Key": self.subscription_key, + "User-Agent": "PowerOn-Voice-Services/1.0" + }) + + # Debug: Log subscription key (masked for security) + logger.debug(f"Using subscription key: {self.subscription_key[:8]}...{self.subscription_key[-8:] if len(self.subscription_key) > 16 else 'SHORT'}") + logger.debug(f"Request URL: {url}") + logger.debug(f"Request params: {params}") + + for attempt in range(max_retries): + try: + async with aiohttp.ClientSession(timeout=self.timeout) as session: + if method.upper() == "GET": + async with session.get(url, headers=headers, params=params) as response: + return await self._handle_response(response) + elif method.upper() == "POST": + async with session.post(url, headers=headers, data=data, params=params) as response: + return await 
self._handle_response(response) + else: + raise ValueError(f"Unsupported HTTP method: {method}") + except asyncio.TimeoutError: + if attempt == max_retries - 1: + raise Exception("Request timeout after multiple retries") + logger.warning(f"Request timeout, retrying... (attempt {attempt + 1}/{max_retries})") + await asyncio.sleep(2 ** attempt) # Exponential backoff + except Exception as e: + if attempt == max_retries - 1: + raise + logger.warning(f"Request failed, retrying... (attempt {attempt + 1}/{max_retries}): {str(e)}") + await asyncio.sleep(2 ** attempt) # Exponential backoff + + async def _make_request(self, url: str, method: str = "GET", headers: Dict = None, data: bytes = None) -> Dict: + """Make HTTP request to Azure services.""" + if headers is None: + headers = {} + + headers.update({ + "Ocp-Apim-Subscription-Key": self.subscription_key, + "User-Agent": "PowerOn-Voice-Services/1.0" + }) + + async with aiohttp.ClientSession() as session: + try: + if method.upper() == "GET": + async with session.get(url, headers=headers) as response: + return await self._handle_response(response) + elif method.upper() == "POST": + async with session.post(url, headers=headers, data=data) as response: + return await self._handle_response(response) + else: + raise ValueError(f"Unsupported HTTP method: {method}") + except Exception as e: + logger.error(f"Request failed: {str(e)}") + raise + + async def _handle_response(self, response) -> Dict: + """Handle HTTP response.""" + if response.status == 200: + content_type = response.headers.get('content-type', '') + if 'application/json' in content_type: + return await response.json() + else: + # For audio responses, return binary data + return {"data": await response.read()} + else: + error_text = await response.text() + logger.error(f"API request failed: {response.status} - {error_text}") + raise self._handle_azure_error(response.status, error_text) + + def _validate_audio_format(self, audio_content: bytes, expected_format: str = 
"wav") -> Dict[str, Any]: + """Validate audio format and return format information.""" + try: + # Try to detect format from content + if audio_content.startswith(b'RIFF') and b'WAVE' in audio_content[:12]: + format_type = "wav" + elif audio_content.startswith(b'\xff\xfb') or audio_content.startswith(b'ID3'): + format_type = "mp3" + elif audio_content.startswith(b'OggS'): + format_type = "ogg" + elif audio_content.startswith(b'fLaC'): + format_type = "flac" + else: + # If we can't detect format, assume it's raw audio or WAV without proper header + format_type = "wav" # Azure Speech Services can handle this + + # Validate WAV format specifically + if format_type == "wav": + try: + with io.BytesIO(audio_content) as audio_io: + with wave.open(audio_io, 'rb') as wav_file: + sample_rate = wav_file.getframerate() + channels = wav_file.getnchannels() + sample_width = wav_file.getsampwidth() + + return { + "valid": True, + "format": format_type, + "sample_rate": sample_rate, + "channels": channels, + "sample_width": sample_width, + "duration": wav_file.getnframes() / sample_rate + } + except Exception as e: + # If WAV validation fails, it might be raw audio data + # Azure Speech Services can handle raw audio, so we'll allow it + logger.info(f"WAV validation failed, treating as raw audio: {str(e)}") + return { + "valid": True, + "format": "raw_audio", + "sample_rate": 16000, # Default assumption for raw audio + "channels": 1, # Default assumption + "sample_width": 2, # Default assumption + "duration": len(audio_content) / (16000 * 2) # Rough estimate + } + + # For other formats, assume valid if we can detect them + return { + "valid": True, + "format": format_type, + "sample_rate": 16000, # Default assumption + "channels": 1, # Default assumption + "sample_width": 2, # Default assumption + "duration": 0 # Unknown + } + + except Exception as e: + logger.error(f"Audio validation failed: {str(e)}") + return {"valid": False, "error": str(e)} + + def _convert_audio_to_wav(self, 
audio_content: bytes, target_sample_rate: int = 16000) -> bytes: + """Convert audio to WAV format with specified sample rate.""" + try: + # If it's already WAV, try to resample if needed + if audio_content.startswith(b'RIFF') and b'WAVE' in audio_content[:12]: + with io.BytesIO(audio_content) as audio_io: + with wave.open(audio_io, 'rb') as wav_file: + current_sample_rate = wav_file.getframerate() + + # If sample rate matches, return as-is + if current_sample_rate == target_sample_rate: + return audio_content + + # For now, return original (in production, implement resampling) + logger.warning(f"Audio sample rate {current_sample_rate} doesn't match target {target_sample_rate}") + return audio_content + + # If it's raw audio data (no header), create a basic WAV header + elif not audio_content.startswith(b'RIFF'): + logger.info("Converting raw audio data to WAV format") + return self._create_wav_header(audio_content, target_sample_rate) + + # For other formats, return as-is for now + # In production, implement proper conversion with pydub or ffmpeg + logger.info("Audio format conversion not fully implemented - returning original") + return audio_content + + except Exception as e: + logger.error(f"Audio conversion failed: {str(e)}") + raise Exception(f"Audio conversion failed: {str(e)}") + + def _create_wav_header(self, audio_data: bytes, sample_rate: int = 16000, channels: int = 1, sample_width: int = 2) -> bytes: + """Create a WAV header for raw audio data.""" + try: + import struct + + # Calculate data size + data_size = len(audio_data) + file_size = 36 + data_size + + # Create WAV header + header = struct.pack('<4sI4s4sIHHIIHH4sI', + b'RIFF', # Chunk ID + file_size, # Chunk size + b'WAVE', # Format + b'fmt ', # Subchunk1 ID + 16, # Subchunk1 size + 1, # Audio format (PCM) + channels, # Number of channels + sample_rate, # Sample rate + sample_rate * channels * sample_width, # Byte rate + channels * sample_width, # Block align + sample_width * 8, # Bits per sample + 
b'data', # Subchunk2 ID + data_size # Subchunk2 size + ) + + return header + audio_data + + except Exception as e: + logger.error(f"Failed to create WAV header: {str(e)}") + # Return original data if header creation fails + return audio_data + + def _get_audio_content_type(self, audio_format: str) -> str: + """Get MIME type for audio format.""" + if audio_format in self.supported_stt_formats: + return self.supported_stt_formats[audio_format]["mime"] + return "audio/wav" # Default + + async def speech_to_text(self, audio_content: bytes, language: str = "de-DE", format: str = "detailed", audio_format: str = "wav") -> Dict: + """ + Convert speech to text using Azure Speech Services. + + Args: + audio_content: Audio file content as bytes + language: Language code (e.g., "de-DE") + format: Response format ("simple" or "detailed") + audio_format: Audio format ("wav", "mp3", "ogg") + + Returns: + Dict with transcription results + """ + try: + # Check rate limit + if not self._check_rate_limit("stt"): + raise Exception("Rate limit exceeded for speech-to-text service. 
Please wait before making more requests.") + + # Validate audio format + validation_result = self._validate_audio_format(audio_content, audio_format) + if not validation_result.get("valid", False): + raise Exception(f"Invalid audio format: {validation_result.get('error', 'Unknown error')}") + + # Convert audio to required format if needed + processed_audio = self._convert_audio_to_wav(audio_content) + + # Update audio_format based on validation result + detected_format = validation_result.get("format", audio_format) + if detected_format == "raw_audio": + audio_format = "wav" # Treat raw audio as WAV for Azure + + url = f"{self.base_url}/speech/recognition/conversation/cognitiveservices/v1" + + # Get appropriate content type + content_type = self._get_audio_content_type(audio_format) + if audio_format == "wav": + content_type = f"{content_type}; codecs=audio/pcm; samplerate=16000" + + headers = { + "Content-Type": content_type, + "Accept": "application/json", + "Ocp-Apim-Subscription-Region": self.region + } + + params = { + "language": language, + "format": "detailed" if format == "detailed" else "simple" + } + + # Make API call with retry logic + result = await self._make_request_with_retry( + url=url, + method="POST", + headers=headers, + data=processed_audio, + params=params + ) + + # Parse the response based on format + if format == "detailed": + return { + "text": result.get("DisplayText", ""), + "confidence": result.get("Confidence", 0.0), + "language": result.get("RecognitionStatus", language), + "format": format, + "audio_info": validation_result, + "raw_result": result + } + else: + return { + "text": result.get("DisplayText", ""), + "confidence": 1.0, # Simple format doesn't provide confidence + "language": language, + "format": format, + "audio_info": validation_result + } + + except Exception as e: + logger.error(f"Speech-to-text failed: {str(e)}") + raise + + async def text_to_speech(self, text: str, language: str = "de-DE", voice: str = 
"de-DE-KatjaNeural", format: str = "audio-16khz-128kbitrate-mono-mp3") -> bytes: +        """ +        Convert text to speech using Azure Speech Services. + +        Args: +            text: Text to convert to speech +            language: Language code +            voice: Voice name +            format: Audio format + +        Returns: +            Audio data as bytes +        """ +        try: +            # Check rate limit +            if not self._check_rate_limit("tts"): +                raise Exception("Rate limit exceeded for text-to-speech service. Please wait before making more requests.") + +            # Validate format +            if format not in self.supported_tts_formats: +                raise Exception(f"Unsupported TTS format: {format}. Supported formats: {', '.join(self.supported_tts_formats)}") + +            url = f"https://{self.region}.tts.speech.microsoft.com/cognitiveservices/v1" + +            headers = { +                "Content-Type": "application/ssml+xml", +                "X-Microsoft-OutputFormat": format, +                "Ocp-Apim-Subscription-Key": self.subscription_key, +                "User-Agent": "PowerOn-Voice-Services/1.0" +            } + +            # Create SSML with proper escaping (XML entities, so user text cannot break the SSML document) +            escaped_text = text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;").replace('"', "&quot;").replace("'", "&apos;") +            ssml = f"""<speak version='1.0' xml:lang='{language}'> +                <voice name='{voice}'> +                    {escaped_text} +                </voice> +            </speak>""" + +            # Make API call with retry logic +            result = await self._make_request_with_retry( +                url=url, +                method="POST", +                headers=headers, +                data=ssml.encode('utf-8'), +                params=None +            ) + +            # Return audio data +            return result.get("data", b"") + +        except Exception as e: +            logger.error(f"Text-to-speech failed: {str(e)}") +            raise + +    async def translate_text(self, text: str, from_language: str, to_language: str) -> str: +        """ +        Translate text using Azure Translator.
+ + Args: + text: Text to translate + from_language: Source language code + to_language: Target language code + + Returns: + Translated text + """ + try: + # Check if text is empty + if not text or not text.strip(): + logger.debug("Empty text provided, returning original text") + return text + + # Check rate limit + if not self._check_rate_limit("translation"): + raise Exception("Rate limit exceeded for translation service. Please wait before making more requests.") + + url = f"{self.translator_url}/translate" + + headers = { + "Ocp-Apim-Subscription-Key": self.subscription_key, + "Ocp-Apim-Subscription-Region": self.region, + "Content-Type": "application/json" + } + + params = { + "api-version": "3.0", + "from": from_language, + "to": to_language + } + + data = [{"text": text}] + + # Debug: Log translation request details + logger.debug(f"Translation request - URL: {url}") + logger.debug(f"Translation request - Headers: {headers}") + logger.debug(f"Translation request - Data: {data}") + + # Make API call with retry logic + result = await self._make_request_with_retry( + url=url, + method="POST", + headers=headers, + data=json.dumps(data).encode('utf-8'), + params=None + ) + + if result and len(result) > 0 and 'translations' in result[0]: + return result[0]['translations'][0]['text'] + else: + logger.warning(f"Unexpected translation response format: {result}") + return text # Return original text if translation fails + + except Exception as e: + logger.error(f"Translation failed: {str(e)}") + raise + + async def get_available_voices(self) -> List[Dict]: + """Get list of available voices from Azure Speech Services.""" + try: + # Azure doesn't provide a direct API for voice list, so we return a comprehensive list + # based on Azure's supported voices + voices = [ + # German voices + {"name": "de-DE-KatjaNeural", "language": "de-DE", "gender": "Female", "style": "Neural", "locale": "de-DE"}, + {"name": "de-DE-ConradNeural", "language": "de-DE", "gender": "Male", 
"style": "Neural", "locale": "de-DE"}, + {"name": "de-DE-AmalaNeural", "language": "de-DE", "gender": "Female", "style": "Neural", "locale": "de-DE"}, + {"name": "de-DE-BerndNeural", "language": "de-DE", "gender": "Male", "style": "Neural", "locale": "de-DE"}, + {"name": "de-DE-ChristophNeural", "language": "de-DE", "gender": "Male", "style": "Neural", "locale": "de-DE"}, + {"name": "de-DE-ElkeNeural", "language": "de-DE", "gender": "Female", "style": "Neural", "locale": "de-DE"}, + {"name": "de-DE-GiselaNeural", "language": "de-DE", "gender": "Female", "style": "Neural", "locale": "de-DE"}, + {"name": "de-DE-JoergNeural", "language": "de-DE", "gender": "Male", "style": "Neural", "locale": "de-DE"}, + {"name": "de-DE-KasperNeural", "language": "de-DE", "gender": "Male", "style": "Neural", "locale": "de-DE"}, + {"name": "de-DE-KillianNeural", "language": "de-DE", "gender": "Male", "style": "Neural", "locale": "de-DE"}, + {"name": "de-DE-KlausNeural", "language": "de-DE", "gender": "Male", "style": "Neural", "locale": "de-DE"}, + {"name": "de-DE-LouisaNeural", "language": "de-DE", "gender": "Female", "style": "Neural", "locale": "de-DE"}, + {"name": "de-DE-MajaNeural", "language": "de-DE", "gender": "Female", "style": "Neural", "locale": "de-DE"}, + {"name": "de-DE-RalfNeural", "language": "de-DE", "gender": "Male", "style": "Neural", "locale": "de-DE"}, + {"name": "de-DE-TanjaNeural", "language": "de-DE", "gender": "Female", "style": "Neural", "locale": "de-DE"}, + + # English (US) voices + {"name": "en-US-AriaNeural", "language": "en-US", "gender": "Female", "style": "Neural", "locale": "en-US"}, + {"name": "en-US-DavisNeural", "language": "en-US", "gender": "Male", "style": "Neural", "locale": "en-US"}, + {"name": "en-US-GuyNeural", "language": "en-US", "gender": "Male", "style": "Neural", "locale": "en-US"}, + {"name": "en-US-JaneNeural", "language": "en-US", "gender": "Female", "style": "Neural", "locale": "en-US"}, + {"name": "en-US-JasonNeural", "language": 
"en-US", "gender": "Male", "style": "Neural", "locale": "en-US"}, + {"name": "en-US-JennyNeural", "language": "en-US", "gender": "Female", "style": "Neural", "locale": "en-US"}, + {"name": "en-US-MichelleNeural", "language": "en-US", "gender": "Female", "style": "Neural", "locale": "en-US"}, + {"name": "en-US-RyanNeural", "language": "en-US", "gender": "Male", "style": "Neural", "locale": "en-US"}, + {"name": "en-US-SaraNeural", "language": "en-US", "gender": "Female", "style": "Neural", "locale": "en-US"}, + {"name": "en-US-TonyNeural", "language": "en-US", "gender": "Male", "style": "Neural", "locale": "en-US"}, + + # English (UK) voices + {"name": "en-GB-LibbyNeural", "language": "en-GB", "gender": "Female", "style": "Neural", "locale": "en-GB"}, + {"name": "en-GB-MaisieNeural", "language": "en-GB", "gender": "Female", "style": "Neural", "locale": "en-GB"}, + {"name": "en-GB-RyanNeural", "language": "en-GB", "gender": "Male", "style": "Neural", "locale": "en-GB"}, + {"name": "en-GB-SoniaNeural", "language": "en-GB", "gender": "Female", "style": "Neural", "locale": "en-GB"}, + {"name": "en-GB-ThomasNeural", "language": "en-GB", "gender": "Male", "style": "Neural", "locale": "en-GB"}, + + # French voices + {"name": "fr-FR-DeniseNeural", "language": "fr-FR", "gender": "Female", "style": "Neural", "locale": "fr-FR"}, + {"name": "fr-FR-HenriNeural", "language": "fr-FR", "gender": "Male", "style": "Neural", "locale": "fr-FR"}, + {"name": "fr-FR-ArianeNeural", "language": "fr-FR", "gender": "Female", "style": "Neural", "locale": "fr-FR"}, + {"name": "fr-FR-ClaudeNeural", "language": "fr-FR", "gender": "Male", "style": "Neural", "locale": "fr-FR"}, + {"name": "fr-FR-JacquelineNeural", "language": "fr-FR", "gender": "Female", "style": "Neural", "locale": "fr-FR"}, + {"name": "fr-FR-JeromeNeural", "language": "fr-FR", "gender": "Male", "style": "Neural", "locale": "fr-FR"}, + {"name": "fr-FR-JosephineNeural", "language": "fr-FR", "gender": "Female", "style": "Neural", 
"locale": "fr-FR"}, + {"name": "fr-FR-MauriceNeural", "language": "fr-FR", "gender": "Male", "style": "Neural", "locale": "fr-FR"}, + {"name": "fr-FR-YvetteNeural", "language": "fr-FR", "gender": "Female", "style": "Neural", "locale": "fr-FR"}, + + # Spanish voices + {"name": "es-ES-ElviraNeural", "language": "es-ES", "gender": "Female", "style": "Neural", "locale": "es-ES"}, + {"name": "es-ES-AlvaroNeural", "language": "es-ES", "gender": "Male", "style": "Neural", "locale": "es-ES"}, + {"name": "es-ES-ArnauNeural", "language": "es-ES", "gender": "Male", "style": "Neural", "locale": "es-ES"}, + {"name": "es-ES-DarioNeural", "language": "es-ES", "gender": "Male", "style": "Neural", "locale": "es-ES"}, + {"name": "es-ES-EliasNeural", "language": "es-ES", "gender": "Male", "style": "Neural", "locale": "es-ES"}, + {"name": "es-ES-EstrellaNeural", "language": "es-ES", "gender": "Female", "style": "Neural", "locale": "es-ES"}, + {"name": "es-ES-IreneNeural", "language": "es-ES", "gender": "Female", "style": "Neural", "locale": "es-ES"}, + {"name": "es-ES-LaiaNeural", "language": "es-ES", "gender": "Female", "style": "Neural", "locale": "es-ES"}, + {"name": "es-ES-LiaNeural", "language": "es-ES", "gender": "Female", "style": "Neural", "locale": "es-ES"}, + {"name": "es-ES-NilNeural", "language": "es-ES", "gender": "Male", "style": "Neural", "locale": "es-ES"}, + {"name": "es-ES-SaulNeural", "language": "es-ES", "gender": "Male", "style": "Neural", "locale": "es-ES"}, + {"name": "es-ES-TeoNeural", "language": "es-ES", "gender": "Male", "style": "Neural", "locale": "es-ES"}, + {"name": "es-ES-TrianaNeural", "language": "es-ES", "gender": "Female", "style": "Neural", "locale": "es-ES"}, + {"name": "es-ES-VeraNeural", "language": "es-ES", "gender": "Female", "style": "Neural", "locale": "es-ES"} + ] + return voices + + except Exception as e: + logger.error(f"Failed to get voices: {str(e)}") + return [] + + async def get_available_languages(self) -> List[Dict]: + """Get list 
of available languages supported by Azure Speech Services.""" + try: + # Comprehensive list of Azure Speech Services supported languages + languages = [ + # European languages + {"code": "de-DE", "name": "German (Germany)", "stt": True, "tts": True, "translation": True}, + {"code": "en-US", "name": "English (United States)", "stt": True, "tts": True, "translation": True}, + {"code": "en-GB", "name": "English (United Kingdom)", "stt": True, "tts": True, "translation": True}, + {"code": "fr-FR", "name": "French (France)", "stt": True, "tts": True, "translation": True}, + {"code": "es-ES", "name": "Spanish (Spain)", "stt": True, "tts": True, "translation": True}, + {"code": "es-MX", "name": "Spanish (Mexico)", "stt": True, "tts": True, "translation": True}, + {"code": "it-IT", "name": "Italian (Italy)", "stt": True, "tts": True, "translation": True}, + {"code": "pt-BR", "name": "Portuguese (Brazil)", "stt": True, "tts": True, "translation": True}, + {"code": "pt-PT", "name": "Portuguese (Portugal)", "stt": True, "tts": True, "translation": True}, + {"code": "ru-RU", "name": "Russian (Russia)", "stt": True, "tts": True, "translation": True}, + {"code": "nl-NL", "name": "Dutch (Netherlands)", "stt": True, "tts": True, "translation": True}, + {"code": "sv-SE", "name": "Swedish (Sweden)", "stt": True, "tts": True, "translation": True}, + {"code": "no-NO", "name": "Norwegian (Norway)", "stt": True, "tts": True, "translation": True}, + {"code": "da-DK", "name": "Danish (Denmark)", "stt": True, "tts": True, "translation": True}, + {"code": "fi-FI", "name": "Finnish (Finland)", "stt": True, "tts": True, "translation": True}, + {"code": "pl-PL", "name": "Polish (Poland)", "stt": True, "tts": True, "translation": True}, + {"code": "cs-CZ", "name": "Czech (Czech Republic)", "stt": True, "tts": True, "translation": True}, + {"code": "hu-HU", "name": "Hungarian (Hungary)", "stt": True, "tts": True, "translation": True}, + {"code": "ro-RO", "name": "Romanian (Romania)", "stt": 
True, "tts": True, "translation": True}, + {"code": "bg-BG", "name": "Bulgarian (Bulgaria)", "stt": True, "tts": True, "translation": True}, + {"code": "hr-HR", "name": "Croatian (Croatia)", "stt": True, "tts": True, "translation": True}, + {"code": "sk-SK", "name": "Slovak (Slovakia)", "stt": True, "tts": True, "translation": True}, + {"code": "sl-SI", "name": "Slovenian (Slovenia)", "stt": True, "tts": True, "translation": True}, + {"code": "et-EE", "name": "Estonian (Estonia)", "stt": True, "tts": True, "translation": True}, + {"code": "lv-LV", "name": "Latvian (Latvia)", "stt": True, "tts": True, "translation": True}, + {"code": "lt-LT", "name": "Lithuanian (Lithuania)", "stt": True, "tts": True, "translation": True}, + {"code": "mt-MT", "name": "Maltese (Malta)", "stt": True, "tts": True, "translation": True}, + {"code": "ga-IE", "name": "Irish (Ireland)", "stt": True, "tts": True, "translation": True}, + {"code": "cy-GB", "name": "Welsh (United Kingdom)", "stt": True, "tts": True, "translation": True}, + + # Asian languages + {"code": "ja-JP", "name": "Japanese (Japan)", "stt": True, "tts": True, "translation": True}, + {"code": "ko-KR", "name": "Korean (Korea)", "stt": True, "tts": True, "translation": True}, + {"code": "zh-CN", "name": "Chinese (Simplified)", "stt": True, "tts": True, "translation": True}, + {"code": "zh-TW", "name": "Chinese (Traditional)", "stt": True, "tts": True, "translation": True}, + {"code": "zh-HK", "name": "Chinese (Hong Kong)", "stt": True, "tts": True, "translation": True}, + {"code": "th-TH", "name": "Thai (Thailand)", "stt": True, "tts": True, "translation": True}, + {"code": "vi-VN", "name": "Vietnamese (Vietnam)", "stt": True, "tts": True, "translation": True}, + {"code": "id-ID", "name": "Indonesian (Indonesia)", "stt": True, "tts": True, "translation": True}, + {"code": "ms-MY", "name": "Malay (Malaysia)", "stt": True, "tts": True, "translation": True}, + {"code": "tl-PH", "name": "Filipino (Philippines)", "stt": True, 
"tts": True, "translation": True}, + + # Middle Eastern and African languages + {"code": "ar-SA", "name": "Arabic (Saudi Arabia)", "stt": True, "tts": True, "translation": True}, + {"code": "ar-EG", "name": "Arabic (Egypt)", "stt": True, "tts": True, "translation": True}, + {"code": "ar-AE", "name": "Arabic (UAE)", "stt": True, "tts": True, "translation": True}, + {"code": "ar-KW", "name": "Arabic (Kuwait)", "stt": True, "tts": True, "translation": True}, + {"code": "ar-QA", "name": "Arabic (Qatar)", "stt": True, "tts": True, "translation": True}, + {"code": "ar-BH", "name": "Arabic (Bahrain)", "stt": True, "tts": True, "translation": True}, + {"code": "ar-OM", "name": "Arabic (Oman)", "stt": True, "tts": True, "translation": True}, + {"code": "ar-JO", "name": "Arabic (Jordan)", "stt": True, "tts": True, "translation": True}, + {"code": "ar-LB", "name": "Arabic (Lebanon)", "stt": True, "tts": True, "translation": True}, + {"code": "ar-PS", "name": "Arabic (Palestine)", "stt": True, "tts": True, "translation": True}, + {"code": "ar-SY", "name": "Arabic (Syria)", "stt": True, "tts": True, "translation": True}, + {"code": "ar-IQ", "name": "Arabic (Iraq)", "stt": True, "tts": True, "translation": True}, + {"code": "ar-MA", "name": "Arabic (Morocco)", "stt": True, "tts": True, "translation": True}, + {"code": "ar-DZ", "name": "Arabic (Algeria)", "stt": True, "tts": True, "translation": True}, + {"code": "ar-TN", "name": "Arabic (Tunisia)", "stt": True, "tts": True, "translation": True}, + {"code": "ar-LY", "name": "Arabic (Libya)", "stt": True, "tts": True, "translation": True}, + {"code": "ar-SD", "name": "Arabic (Sudan)", "stt": True, "tts": True, "translation": True}, + {"code": "he-IL", "name": "Hebrew (Israel)", "stt": True, "tts": True, "translation": True}, + {"code": "tr-TR", "name": "Turkish (Turkey)", "stt": True, "tts": True, "translation": True}, + {"code": "fa-IR", "name": "Persian (Iran)", "stt": True, "tts": True, "translation": True}, + {"code": "ur-PK", 
"name": "Urdu (Pakistan)", "stt": True, "tts": True, "translation": True}, + {"code": "hi-IN", "name": "Hindi (India)", "stt": True, "tts": True, "translation": True}, + {"code": "bn-BD", "name": "Bengali (Bangladesh)", "stt": True, "tts": True, "translation": True}, + {"code": "ta-IN", "name": "Tamil (India)", "stt": True, "tts": True, "translation": True}, + {"code": "te-IN", "name": "Telugu (India)", "stt": True, "tts": True, "translation": True}, + {"code": "ml-IN", "name": "Malayalam (India)", "stt": True, "tts": True, "translation": True}, + {"code": "kn-IN", "name": "Kannada (India)", "stt": True, "tts": True, "translation": True}, + {"code": "gu-IN", "name": "Gujarati (India)", "stt": True, "tts": True, "translation": True}, + {"code": "pa-IN", "name": "Punjabi (India)", "stt": True, "tts": True, "translation": True}, + {"code": "mr-IN", "name": "Marathi (India)", "stt": True, "tts": True, "translation": True}, + {"code": "ne-NP", "name": "Nepali (Nepal)", "stt": True, "tts": True, "translation": True}, + {"code": "si-LK", "name": "Sinhala (Sri Lanka)", "stt": True, "tts": True, "translation": True}, + {"code": "my-MM", "name": "Burmese (Myanmar)", "stt": True, "tts": True, "translation": True}, + {"code": "km-KH", "name": "Khmer (Cambodia)", "stt": True, "tts": True, "translation": True}, + {"code": "lo-LA", "name": "Lao (Laos)", "stt": True, "tts": True, "translation": True}, + + # African languages + {"code": "sw-KE", "name": "Swahili (Kenya)", "stt": True, "tts": True, "translation": True}, + {"code": "sw-TZ", "name": "Swahili (Tanzania)", "stt": True, "tts": True, "translation": True}, + {"code": "am-ET", "name": "Amharic (Ethiopia)", "stt": True, "tts": True, "translation": True}, + {"code": "zu-ZA", "name": "Zulu (South Africa)", "stt": True, "tts": True, "translation": True}, + {"code": "af-ZA", "name": "Afrikaans (South Africa)", "stt": True, "tts": True, "translation": True}, + {"code": "yo-NG", "name": "Yoruba (Nigeria)", "stt": True, "tts": 
True, "translation": True}, + {"code": "ig-NG", "name": "Igbo (Nigeria)", "stt": True, "tts": True, "translation": True}, + {"code": "ha-NG", "name": "Hausa (Nigeria)", "stt": True, "tts": True, "translation": True}, + + # Other languages + {"code": "is-IS", "name": "Icelandic (Iceland)", "stt": True, "tts": True, "translation": True}, + {"code": "mk-MK", "name": "Macedonian (North Macedonia)", "stt": True, "tts": True, "translation": True}, + {"code": "sq-AL", "name": "Albanian (Albania)", "stt": True, "tts": True, "translation": True}, + {"code": "sr-RS", "name": "Serbian (Serbia)", "stt": True, "tts": True, "translation": True}, + {"code": "bs-BA", "name": "Bosnian (Bosnia and Herzegovina)", "stt": True, "tts": True, "translation": True}, + {"code": "me-ME", "name": "Montenegrin (Montenegro)", "stt": True, "tts": True, "translation": True}, + {"code": "uk-UA", "name": "Ukrainian (Ukraine)", "stt": True, "tts": True, "translation": True}, + {"code": "be-BY", "name": "Belarusian (Belarus)", "stt": True, "tts": True, "translation": True}, + {"code": "ka-GE", "name": "Georgian (Georgia)", "stt": True, "tts": True, "translation": True}, + {"code": "hy-AM", "name": "Armenian (Armenia)", "stt": True, "tts": True, "translation": True}, + {"code": "az-AZ", "name": "Azerbaijani (Azerbaijan)", "stt": True, "tts": True, "translation": True}, + {"code": "kk-KZ", "name": "Kazakh (Kazakhstan)", "stt": True, "tts": True, "translation": True}, + {"code": "ky-KG", "name": "Kyrgyz (Kyrgyzstan)", "stt": True, "tts": True, "translation": True}, + {"code": "uz-UZ", "name": "Uzbek (Uzbekistan)", "stt": True, "tts": True, "translation": True}, + {"code": "tg-TJ", "name": "Tajik (Tajikistan)", "stt": True, "tts": True, "translation": True}, + {"code": "mn-MN", "name": "Mongolian (Mongolia)", "stt": True, "tts": True, "translation": True} + ] + return languages + + except Exception as e: + logger.error(f"Failed to get languages: {str(e)}") + return [] + + async def test_connection(self) 
-> bool: + """Test Azure Speech Services connection.""" + try: + # Test with a simple TTS request + test_audio = await self.text_to_speech("Test", "en-US", "en-US-AriaNeural") + return len(test_audio) > 0 + except Exception as e: + logger.error(f"Connection test failed: {str(e)}") + return False + + async def stream_speech_to_text(self, audio_stream: AsyncGenerator[bytes, None], language: str = "de-DE", format: str = "detailed", audio_format: str = "wav") -> AsyncGenerator[Dict, None]: + """ + Stream speech to text using Azure Speech Services. + + Args: + audio_stream: Async generator yielding audio chunks + language: Language code (e.g., "de-DE") + format: Response format ("simple" or "detailed") + audio_format: Audio format ("wav", "mp3", "ogg") + + Yields: + Dict with partial transcription results + """ + try: + # Check rate limit + if not self._check_rate_limit("stt"): + raise Exception("Rate limit exceeded for speech-to-text service. Please wait before making more requests.") + + url = f"{self.base_url}/speech/recognition/conversation/cognitiveservices/v1" + + # Get appropriate content type + content_type = self._get_audio_content_type(audio_format) + if audio_format == "wav": + content_type = f"{content_type}; codecs=audio/pcm; samplerate=16000" + + headers = { + "Content-Type": content_type, + "Accept": "application/json", + "Ocp-Apim-Subscription-Key": self.subscription_key, + "Ocp-Apim-Subscription-Region": self.region + } + + params = { + "language": language, + "format": "detailed" if format == "detailed" else "simple" + } + + # Process audio stream in chunks + async with aiohttp.ClientSession(timeout=self.timeout) as session: + async for audio_chunk in audio_stream: + try: + # Validate chunk + if not audio_chunk: + continue + + # Make API call for this chunk + async with session.post( + url, + headers=headers, + params=params, + data=audio_chunk + ) as response: + if response.status == 200: + result = await response.json() + + # Yield partial result + 
if format == "detailed": + yield { + "text": result.get("DisplayText", ""), + "confidence": result.get("Confidence", 0.0), + "language": result.get("RecognitionStatus", language), + "format": format, + "is_final": result.get("RecognitionStatus") == "Success", + "raw_result": result + } + else: + yield { + "text": result.get("DisplayText", ""), + "confidence": 1.0, + "language": language, + "format": format, + "is_final": result.get("RecognitionStatus") == "Success" + } + else: + error_text = await response.text() + logger.error(f"Streaming STT API failed: {response.status} - {error_text}") + yield { + "error": f"API error {response.status}: {error_text}", + "is_final": True + } + + except Exception as e: + logger.error(f"Error processing audio chunk: {str(e)}") + yield { + "error": str(e), + "is_final": True + } + + except Exception as e: + logger.error(f"Streaming speech-to-text failed: {str(e)}") + yield { + "error": str(e), + "is_final": True + } + + async def stream_text_to_speech(self, text_stream: AsyncGenerator[str, None], language: str = "de-DE", voice: str = "de-DE-KatjaNeural", format: str = "audio-16khz-128kbitrate-mono-mp3") -> AsyncGenerator[bytes, None]: + """ + Stream text to speech using Azure Speech Services. + + Args: + text_stream: Async generator yielding text chunks + language: Language code + voice: Voice name + format: Audio format + + Yields: + Audio data chunks as bytes + """ + try: + # Check rate limit + if not self._check_rate_limit("tts"): + raise Exception("Rate limit exceeded for text-to-speech service. Please wait before making more requests.") + + # Validate format + if format not in self.supported_tts_formats: + raise Exception(f"Unsupported TTS format: {format}. 
Supported formats: {', '.join(self.supported_tts_formats)}") + + url = f"https://{self.region}.tts.speech.microsoft.com/cognitiveservices/v1" + + headers = { + "Content-Type": "application/ssml+xml", + "X-Microsoft-OutputFormat": format, + "Ocp-Apim-Subscription-Key": self.subscription_key, + "User-Agent": "PowerOn-Voice-Services/1.0" + } + + # Process text stream in chunks + async with aiohttp.ClientSession(timeout=self.timeout) as session: + async for text_chunk in text_stream: + try: + if not text_chunk.strip(): + continue + + # Create SSML for this chunk + escaped_text = text_chunk.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;").replace('"', "&quot;").replace("'", "&apos;") + ssml = f"""<speak version='1.0' xml:lang='{language}'> + <voice xml:lang='{language}' name='{voice}'> + {escaped_text} + </voice> + </speak>""" + + # Make API call for this chunk + async with session.post( + url, + headers=headers, + data=ssml.encode('utf-8') + ) as response: + if response.status == 200: + audio_data = await response.read() + if audio_data: + yield audio_data + else: + error_text = await response.text() + logger.error(f"Streaming TTS API failed: {response.status} - {error_text}") + + except Exception as e: + logger.error(f"Error processing text chunk: {str(e)}") + + except Exception as e: + logger.error(f"Streaming text-to-speech failed: {str(e)}") + + async def stream_realtime_interpreter(self, audio_stream: AsyncGenerator[bytes, None], from_language: str = "de-DE", to_language: str = "en-US") -> AsyncGenerator[Dict, None]: + """ + Stream real-time interpreter: speech to translated text. + + Args: + audio_stream: Async generator yielding audio chunks + from_language: Source language code + to_language: Target language code + + Yields: + Dict with translation results + """ + try: + # Check rate limits + if not self._check_rate_limit("stt") or not self._check_rate_limit("translation"): + raise Exception("Rate limit exceeded for interpreter service. 
Please wait before making more requests.") + + # Process audio stream + async for stt_result in self.stream_speech_to_text(audio_stream, from_language): + if "error" in stt_result: + yield stt_result + continue + + original_text = stt_result.get("text", "") + + # Translate text if different languages + translated_text = original_text + if from_language != to_language and original_text.strip(): + try: + translated_text = await self.translate_text( + text=original_text, + from_language=from_language, + to_language=to_language + ) + except Exception as e: + logger.warning(f"Translation failed: {str(e)}") + translated_text = original_text + + yield { + "original_text": original_text, + "translated_text": translated_text, + "from_language": from_language, + "to_language": to_language, + "confidence": stt_result.get("confidence", 0.0), + "is_final": stt_result.get("is_final", False) + } + + except Exception as e: + logger.error(f"Streaming realtime interpreter failed: {str(e)}") + yield { + "error": str(e), + "is_final": True + } diff --git a/modules/interfaces/interfaceComponentAccess.py b/modules/interfaces/interfaceComponentAccess.py index ca5201bb..e1792c5e 100644 --- a/modules/interfaces/interfaceComponentAccess.py +++ b/modules/interfaces/interfaceComponentAccess.py @@ -6,7 +6,7 @@ Handles user access management and permission checks. 
import logging from typing import Dict, Any, List, Optional from modules.interfaces.interfaceAppModel import User, UserInDB -from modules.interfaces.interfaceComponentModel import Prompt, FileItem, FileData +from modules.interfaces.interfaceComponentModel import Prompt, FileItem, FileData, VoiceSettings from modules.interfaces.interfaceChatModel import ChatWorkflow, ChatMessage, ChatLog # Configure logger @@ -81,6 +81,9 @@ class ComponentAccess: elif table_name == "UserInDB": # For users table, users can only see their own record filtered_records = [r for r in recordset if r.get("id") == self.userId] + elif table_name == "VoiceSettings": + # For voice settings, users can only see their own settings + filtered_records = [r for r in recordset if r.get("userId") == self.userId] else: # Users see only their records for other tables filtered_records = [ @@ -128,6 +131,11 @@ class ComponentAccess: for conn in record["connections"]: conn["_hideEdit"] = record_id != self.userId conn["_hideDelete"] = record_id != self.userId + elif table_name == "VoiceSettings": + # For voice settings, users can only access their own settings + record["_hideView"] = False + record["_hideEdit"] = record.get("userId") != self.userId + record["_hideDelete"] = record.get("userId") != self.userId else: # Default access control for other tables record["_hideView"] = False @@ -168,6 +176,12 @@ class ComponentAccess: return True return False + # Special case for voice settings - users can modify their own settings + if model_class.__name__ == "VoiceSettings": + if record.get("userId") == self.userId: + return True + return False + # Admins can modify anything in their mandate, if mandate is specified for a record if userPrivilege == "admin" and record.get("mandateId","-") == self.mandateId: return True diff --git a/modules/interfaces/interfaceComponentModel.py b/modules/interfaces/interfaceComponentModel.py index a5281ff6..27dbf48b 100644 --- a/modules/interfaces/interfaceComponentModel.py +++ 
b/modules/interfaces/interfaceComponentModel.py @@ -169,3 +169,96 @@ register_model_labels( } ) +class VoiceSettings(BaseModel, ModelMixin): + """Data model for voice service settings per user""" + id: str = Field( + default_factory=lambda: str(uuid.uuid4()), + description="Primary key", + frontend_type="text", + frontend_readonly=True, + frontend_required=False + ) + userId: str = Field( + description="ID of the user these settings belong to", + frontend_type="text", + frontend_readonly=True, + frontend_required=True + ) + mandateId: str = Field( + description="ID of the mandate these settings belong to", + frontend_type="text", + frontend_readonly=True, + frontend_required=False + ) + sttLanguage: str = Field( + default="de-DE", + description="Speech-to-Text language", + frontend_type="select", + frontend_readonly=False, + frontend_required=True + ) + ttsLanguage: str = Field( + default="de-DE", + description="Text-to-Speech language", + frontend_type="select", + frontend_readonly=False, + frontend_required=True + ) + ttsVoice: str = Field( + default="de-DE-KatjaNeural", + description="Text-to-Speech voice", + frontend_type="select", + frontend_readonly=False, + frontend_required=True + ) + translationEnabled: bool = Field( + default=True, + description="Whether translation is enabled", + frontend_type="checkbox", + frontend_readonly=False, + frontend_required=False + ) + targetLanguage: str = Field( + default="en-US", + description="Target language for translation", + frontend_type="select", + frontend_readonly=False, + frontend_required=False + ) + creationDate: float = Field( + default_factory=get_utc_timestamp, + description="Date when the settings were created (UTC timestamp in seconds)", + frontend_type="timestamp", + frontend_readonly=True, + frontend_required=False + ) + lastModified: float = Field( + default_factory=get_utc_timestamp, + description="Date when the settings were last modified (UTC timestamp in seconds)", + frontend_type="timestamp", + 
frontend_readonly=True, + frontend_required=False + ) + + def to_dict(self) -> Dict[str, Any]: + """Convert model to dictionary""" + return super().to_dict() + +# Register labels for VoiceSettings +register_model_labels( + "VoiceSettings", + {"en": "Voice Settings", "fr": "Paramètres vocaux"}, + { + "id": {"en": "ID", "fr": "ID"}, + "userId": {"en": "User ID", "fr": "ID utilisateur"}, + "mandateId": {"en": "Mandate ID", "fr": "ID du mandat"}, + "sttLanguage": {"en": "STT Language", "fr": "Langue STT"}, + "ttsLanguage": {"en": "TTS Language", "fr": "Langue TTS"}, + "ttsVoice": {"en": "TTS Voice", "fr": "Voix TTS"}, + "translationEnabled": {"en": "Translation Enabled", "fr": "Traduction activée"}, + "targetLanguage": {"en": "Target Language", "fr": "Langue cible"}, + "creationDate": {"en": "Creation Date", "fr": "Date de création"}, + "lastModified": {"en": "Last Modified", "fr": "Dernière modification"} + } +) + diff --git a/modules/interfaces/interfaceComponentObjects.py b/modules/interfaces/interfaceComponentObjects.py index 2bdae1a8..7e467869 100644 --- a/modules/interfaces/interfaceComponentObjects.py +++ b/modules/interfaces/interfaceComponentObjects.py @@ -12,7 +12,7 @@ import hashlib from modules.interfaces.interfaceComponentAccess import ComponentAccess from modules.interfaces.interfaceComponentModel import ( - FilePreview, Prompt, FileItem, FileData + FilePreview, Prompt, FileItem, FileData, VoiceSettings ) from modules.interfaces.interfaceAppModel import User, Mandate @@ -869,6 +869,155 @@ class ComponentObjects: logger.error(f"Error in saveUploadedFile for {fileName}: {str(e)}", exc_info=True) raise FileStorageError(f"Error saving file: {str(e)}") + # VoiceSettings methods + + def getVoiceSettings(self, userId: Optional[str] = None) -> Optional[VoiceSettings]: + """Returns voice settings for a user if user has access.""" + try: + targetUserId = userId or self.userId + if not targetUserId: + logger.error("No user ID provided for voice settings") + return 
None + + # Get voice settings for the user + settings = self.db.getRecordset(VoiceSettings, recordFilter={"userId": targetUserId}) + if not settings: + logger.debug(f"No voice settings found for user {targetUserId}") + return None + + # Apply access control + filteredSettings = self._uam(VoiceSettings, settings) + if not filteredSettings: + logger.warning(f"No access to voice settings for user {targetUserId}") + return None + + # Ensure timestamps are set for validation + settings_data = filteredSettings[0] + if not settings_data.get("creationDate"): + from modules.shared.timezoneUtils import get_utc_timestamp + settings_data["creationDate"] = get_utc_timestamp() + if not settings_data.get("lastModified"): + from modules.shared.timezoneUtils import get_utc_timestamp + settings_data["lastModified"] = get_utc_timestamp() + + return VoiceSettings.from_dict(settings_data) + + except Exception as e: + logger.error(f"Error getting voice settings: {str(e)}") + return None + + def createVoiceSettings(self, settingsData: Dict[str, Any]) -> Dict[str, Any]: + """Creates voice settings for a user if user has permission.""" + try: + if not self._canModify(VoiceSettings): + raise PermissionError("No permission to create voice settings") + + # Ensure userId is set + if "userId" not in settingsData: + settingsData["userId"] = self.userId + + # Ensure mandateId is set + if "mandateId" not in settingsData: + settingsData["mandateId"] = self.currentUser.mandateId if self.currentUser else "default" + + # Check if settings already exist for this user + existingSettings = self.getVoiceSettings(settingsData["userId"]) + if existingSettings: + raise ValueError(f"Voice settings already exist for user {settingsData['userId']}") + + # Create voice settings record + createdRecord = self.db.recordCreate(VoiceSettings, settingsData) + if not createdRecord or not createdRecord.get("id"): + raise ValueError("Failed to create voice settings record") + + logger.info(f"Created voice settings for 
user {settingsData['userId']}") + return createdRecord + + except Exception as e: + logger.error(f"Error creating voice settings: {str(e)}") + raise + + def updateVoiceSettings(self, userId: str, updateData: Dict[str, Any]) -> Dict[str, Any]: + """Updates voice settings for a user if user has access.""" + try: + # Get existing settings + existingSettings = self.getVoiceSettings(userId) + if not existingSettings: + raise ValueError(f"Voice settings not found for user {userId}") + + # Update lastModified timestamp + from modules.shared.timezoneUtils import get_utc_timestamp + updateData["lastModified"] = get_utc_timestamp() + + # Update voice settings record + success = self.db.recordModify(VoiceSettings, existingSettings.id, updateData) + if not success: + raise ValueError("Failed to update voice settings record") + + # Get updated settings + updatedSettings = self.getVoiceSettings(userId) + if not updatedSettings: + raise ValueError("Failed to retrieve updated voice settings") + + logger.info(f"Updated voice settings for user {userId}") + return updatedSettings.to_dict() + + except Exception as e: + logger.error(f"Error updating voice settings: {str(e)}") + raise + + def deleteVoiceSettings(self, userId: str) -> bool: + """Deletes voice settings for a user if user has access.""" + try: + # Get existing settings + existingSettings = self.getVoiceSettings(userId) + if not existingSettings: + logger.warning(f"Voice settings not found for user {userId}") + return False + + # Delete voice settings + success = self.db.recordDelete(VoiceSettings, existingSettings.id) + if success: + logger.info(f"Deleted voice settings for user {userId}") + else: + logger.error(f"Failed to delete voice settings for user {userId}") + + return success + + except Exception as e: + logger.error(f"Error deleting voice settings: {str(e)}") + return False + + def getOrCreateVoiceSettings(self, userId: Optional[str] = None) -> VoiceSettings: + """Gets existing voice settings or creates default 
ones for a user.""" + try: + targetUserId = userId or self.userId + if not targetUserId: + raise ValueError("No user ID provided for voice settings") + + # Try to get existing settings + existingSettings = self.getVoiceSettings(targetUserId) + if existingSettings: + return existingSettings + + # Create default settings + defaultSettings = { + "userId": targetUserId, + "mandateId": self.currentUser.mandateId if self.currentUser else "default", + "sttLanguage": "de-DE", + "ttsLanguage": "de-DE", + "ttsVoice": "de-DE-KatjaNeural", + "translationEnabled": True, + "targetLanguage": "en-US" + } + + createdRecord = self.createVoiceSettings(defaultSettings) + return VoiceSettings.from_dict(createdRecord) + + except Exception as e: + logger.error(f"Error getting or creating voice settings: {str(e)}") + raise + def getInterface(currentUser: Optional[User] = None) -> 'ComponentObjects': """ diff --git a/modules/routes/routeVoiceAzure.py b/modules/routes/routeVoiceAzure.py new file mode 100644 index 00000000..90ff237e --- /dev/null +++ b/modules/routes/routeVoiceAzure.py @@ -0,0 +1,813 @@ +""" +Azure Voice Services Route +Provides endpoints for Azure Speech Services integration including: +- Speech-to-Text (STT) +- Text-to-Speech (TTS) +- Translation services +- Real-time conversation +""" + +import logging +import asyncio +import json +from typing import Dict, Any, Optional +from fastapi import APIRouter, HTTPException, Depends, UploadFile, File, Form +from fastapi.responses import StreamingResponse +from pydantic import BaseModel +import io +import base64 +import asyncio +from typing import AsyncGenerator +from datetime import datetime + +from modules.interfaces.interfaceAppObjects import getRootInterface +from modules.interfaces.interfaceAppModel import UserInDB +from modules.security.auth import getCurrentUser +from modules.connectors.connectorAzureSpeech import ConnectorAzureSpeech + +logger = logging.getLogger(__name__) + +router = APIRouter(prefix="/api/voice", 
tags=["voice"]) + +# Pydantic models for request/response +class SpeechToTextRequest(BaseModel): + language: str = "de-DE" + format: str = "detailed" # "simple" or "detailed" + +class TextToSpeechRequest(BaseModel): + text: str + language: str = "de-DE" + voice: str = "de-DE-KatjaNeural" + format: str = "audio-16khz-128kbitrate-mono-mp3" + +class TranslationRequest(BaseModel): + text: str + from_language: str = "de-DE" + to_language: str = "en-US" + +class ConversationRequest(BaseModel): + message: str + language: str = "de-DE" + response_voice: str = "de-DE-KatjaNeural" + +class VoiceSettingsRequest(BaseModel): + stt_language: str = "de-DE" + tts_language: str = "de-DE" + tts_voice: str = "de-DE-KatjaNeural" + translation_enabled: bool = True + target_language: str = "en-US" + +# Get Azure Speech connector for current user +async def get_azure_speech_connector(current_user: UserInDB = Depends(getCurrentUser), connection_id: Optional[str] = None) -> ConnectorAzureSpeech: + """Get Azure Speech connector for the current user.""" + try: + root_interface = getRootInterface() + + # Get user connections + user_connections = root_interface.getUserConnections(current_user.id) + azure_connection = None + + if connection_id: + # Find specific connection by ID + for connection in user_connections: + if connection.id == connection_id and connection.authority == "msft": + azure_connection = connection + break + else: + # Find first Azure connection + for connection in user_connections: + if connection.authority == "msft": + azure_connection = connection + break + + if not azure_connection: + if connection_id: + raise HTTPException( + status_code=400, + detail=f"Azure connection with ID '{connection_id}' not found." + ) + else: + raise HTTPException( + status_code=400, + detail="No Azure connection found. Please connect your Microsoft account first." 
+ ) + + # Get connection token + connection_token = root_interface.getConnectionToken(azure_connection.id) + if not connection_token: + raise HTTPException( + status_code=400, + detail="Azure connection token not found. Please reconnect your Microsoft account." + ) + + # For Azure Speech Services, we need a subscription key, not an access token + # Use Azure Speech Services subscription key from configuration + from modules.shared.configuration import APP_CONFIG + subscription_key = APP_CONFIG.get("Connector_AzureSpeech_SUBSCRIPTION_KEY") + + # Debug: Log subscription key (masked for security) + logger.info(f"Loaded subscription key: {subscription_key[:8]}...{subscription_key[-8:] if subscription_key and len(subscription_key) > 16 else 'INVALID'}") + logger.info(f"Key length: {len(subscription_key) if subscription_key else 'None'}") + + if not subscription_key or subscription_key == "your-azure-speech-subscription-key-here": + raise HTTPException( + status_code=500, + detail="Azure Speech Services subscription key not configured. 
Please set Connector_AzureSpeech_SUBSCRIPTION_KEY in config.ini" + ) + + # Get region from configuration + region = APP_CONFIG.get("Connector_AzureSpeech_REGION", "westeurope") + + # Create Azure Speech connector + connector = ConnectorAzureSpeech(subscription_key=subscription_key, region=region) + + return connector + + except Exception as e: + logger.error(f"Error getting Azure Speech connector: {str(e)}") + raise HTTPException(status_code=500, detail=f"Failed to initialize Azure Speech connector: {str(e)}") + +@router.get("/connections") +async def get_user_connections(current_user: UserInDB = Depends(getCurrentUser)): + """Get all Microsoft connections for the current user.""" + try: + root_interface = getRootInterface() + + # Get user connections + user_connections = root_interface.getUserConnections(current_user.id) + + # Filter for Microsoft connections only + msft_connections = [] + for connection in user_connections: + if connection.authority == "msft": + # Check if this connection has speech services subscription key + has_speech_key = False + if hasattr(connection, 'metadata') and connection.metadata: + try: + metadata = json.loads(connection.metadata) if isinstance(connection.metadata, str) else connection.metadata + has_speech_key = bool(metadata.get('speech_subscription_key') or metadata.get('service_type') == 'speech_services') + except (json.JSONDecodeError, AttributeError): + pass + + msft_connections.append({ + "id": connection.id, + "externalUsername": connection.externalUsername, + "authority": connection.authority, + "isActive": connection.status == "active", + "hasSpeechKey": has_speech_key, + "connectedAt": connection.connectedAt, + "lastChecked": connection.lastChecked + }) + + return { + "success": True, + "connections": msft_connections, + "count": len(msft_connections) + } + + except Exception as e: + logger.error(f"Error getting user connections: {str(e)}") + raise HTTPException(status_code=500, detail=str(e)) + +class 
SpeechSubscriptionRequest(BaseModel): + subscription_key: str + region: str = "westeurope" + connection_id: Optional[str] = None + +@router.post("/subscription") +async def set_speech_subscription( + request: SpeechSubscriptionRequest, + current_user: UserInDB = Depends(getCurrentUser) +): + """Set Azure Speech Services subscription key for the user.""" + try: + root_interface = getRootInterface() + + # Validate subscription key format (basic validation) + if not request.subscription_key or len(request.subscription_key) < 32: + raise HTTPException( + status_code=400, + detail="Invalid subscription key format. Please provide a valid Azure Speech Services subscription key." + ) + + # Validate region + valid_regions = ["westeurope", "eastus", "westus2", "eastus2", "southeastasia", "westcentralus", "eastasia", "northeurope", "southcentralus", "centralus", "australiaeast", "brazilsouth", "canadacentral", "centralindia", "francecentral", "germanywestcentral", "japaneast", "koreacentral", "norwayeast", "southafricanorth", "switzerlandnorth", "uaenorth", "uksouth", "westus3"] + if request.region not in valid_regions: + raise HTTPException( + status_code=400, + detail=f"Invalid region. Supported regions: {', '.join(valid_regions)}" + ) + + # Test the subscription key by creating a temporary connector + try: + test_connector = ConnectorAzureSpeech(subscription_key=request.subscription_key, region=request.region) + # Test with a simple TTS request + test_audio = await test_connector.text_to_speech("Test", "en-US", "en-US-AriaNeural") + if len(test_audio) == 0: + raise Exception("Subscription key test failed") + except Exception as e: + raise HTTPException( + status_code=400, + detail=f"Invalid subscription key or region. 
Test failed: {str(e)}" + ) + + # Get user connections + user_connections = root_interface.getUserConnections(current_user.id) + + # Find the target connection + target_connection = None + if request.connection_id: + # Use specific connection + for connection in user_connections: + if connection.id == request.connection_id and connection.authority == "msft": + target_connection = connection + break + if not target_connection: + raise HTTPException( + status_code=400, + detail=f"Connection with ID '{request.connection_id}' not found." + ) + else: + # Use first Microsoft connection or create a new one + target_connection = None + for connection in user_connections: + if connection.authority == "msft": + target_connection = connection + break + + if not target_connection: + # Create a new connection for speech services + # This would require implementing connection creation in the interface + raise HTTPException( + status_code=400, + detail="No Microsoft connection found. Please connect your Microsoft account first." 
@router.get("/subscription")
async def get_speech_subscription(
    current_user: UserInDB = Depends(getCurrentUser)
):
    """Report whether Azure Speech is configured (from config.ini) and probe it.

    Returns a masked form of the subscription key plus the result of a live
    connection test. A failed probe is reported in the payload, never raised;
    only unexpected errors become HTTP 500.
    """
    try:
        from modules.shared.configuration import APP_CONFIG
        subscription_key = APP_CONFIG.get("Connector_AzureSpeech_SUBSCRIPTION_KEY")
        region = APP_CONFIG.get("Connector_AzureSpeech_REGION", "westeurope")

        # Treat the template placeholder the same as "not configured".
        if not subscription_key or subscription_key == "your-azure-speech-subscription-key-here":
            return {
                "success": True,
                "has_subscription": False,
                "message": "Azure Speech Services subscription key not configured. Please set Connector_AzureSpeech_SUBSCRIPTION_KEY in config.ini"
            }

        # NOTE(review): exposing the first and last 8 characters leaks 16 chars
        # of the secret; consider masking to the last 4 only (kept as-is here
        # to avoid changing the response payload for existing clients).
        subscription_info = {
            "subscription_key": subscription_key[:8] + "..." + subscription_key[-8:],
            "region": region,
            "source": "config.ini",
        }

        # Single response shape for both outcomes (previously duplicated in
        # try/except branches).
        try:
            connector = ConnectorAzureSpeech(subscription_key=subscription_key, region=region)
            subscription_info["connection_test"] = await connector.test_connection()
        except Exception as test_error:
            subscription_info["connection_test"] = False
            subscription_info["error"] = str(test_error)

        return {
            "success": True,
            "has_subscription": True,
            "subscription": subscription_info,
        }

    except Exception as e:
        logger.error(f"Error getting speech subscription: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))

@router.get("/settings")
async def get_voice_settings(
    current_user: UserInDB = Depends(getCurrentUser),
    connection_id: Optional[str] = None
):
    """Return available voices/languages plus the user's saved voice settings.

    Saved settings may come back as a DB model (attribute access) or, when the
    user has none yet, as a plain default dict — both shapes are read through
    one helper instead of a repeated hasattr/get ladder.
    """
    try:
        connector = await get_azure_speech_connector(current_user, connection_id)

        voices = await connector.get_available_voices()
        languages = await connector.get_available_languages()

        from modules.interfaces.interfaceComponentObjects import getInterface
        component_interface = getInterface(current_user)
        user_settings = component_interface.getVoiceSettings(current_user.id)

        defaults = {
            "sttLanguage": "de-DE",
            "ttsLanguage": "de-DE",
            "ttsVoice": "de-DE-KatjaNeural",
            "translationEnabled": True,
            "targetLanguage": "en-US",
        }

        if not user_settings:
            user_settings = dict(defaults)

        def read_setting(key: str):
            # Supports both attribute-style (DB model) and dict-style (defaults).
            if hasattr(user_settings, key):
                return getattr(user_settings, key)
            return user_settings.get(key)

        setting_keys = (
            "sttLanguage", "ttsLanguage", "ttsVoice",
            "translationEnabled", "targetLanguage",
        )

        return {
            "voices": voices,
            "languages": languages,
            "user_settings": {key: read_setting(key) for key in setting_keys},
            "default_settings": defaults,
        }

    except Exception as e:
        logger.error(f"Error getting voice settings: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))

@router.post("/settings")
async def save_voice_settings(
    settings: dict,
    current_user: UserInDB = Depends(getCurrentUser)
):
    """Create or update the current user's voice settings."""
    try:
        from modules.interfaces.interfaceComponentObjects import getInterface
        component_interface = getInterface(current_user)

        existing_settings = component_interface.getVoiceSettings(current_user.id)
        if not existing_settings:
            # First save: stamp ownership on a copy so the caller's request
            # dict is not mutated as a side effect.
            payload = dict(settings)
            payload["userId"] = current_user.id
            payload["mandateId"] = current_user.mandateId
            created_settings = component_interface.createVoiceSettings(payload)
            return {
                "success": True,
                "message": "Voice settings created successfully",
                "settings": created_settings
            }

        updated_settings = component_interface.updateVoiceSettings(current_user.id, settings)
        return {
            "success": True,
            "message": "Voice settings saved successfully",
            "settings": updated_settings
        }

    except Exception as e:
        logger.error(f"Error saving voice settings: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
@router.post("/speech-to-text")
async def speech_to_text(
    audio_file: UploadFile = File(...),
    language: str = Form("de-DE"),
    format: str = Form("detailed"),
    connection_id: str = Form(None),
    current_user: UserInDB = Depends(getCurrentUser)
):
    """Transcribe an uploaded audio file via Azure Speech Services."""
    try:
        connector = await get_azure_speech_connector(current_user, connection_id)

        payload = await audio_file.read()
        recognition = await connector.speech_to_text(
            audio_content=payload,
            language=language,
            format=format
        )

        return {
            "success": True,
            "text": recognition.get("text", ""),
            "confidence": recognition.get("confidence", 0.0),
            "language": recognition.get("language", language),
            "format": format
        }

    except Exception as e:
        logger.error(f"Error in speech-to-text: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))

@router.post("/text-to-speech")
async def text_to_speech(
    request: TextToSpeechRequest,
    current_user: UserInDB = Depends(getCurrentUser),
    connection_id: Optional[str] = None
):
    """Synthesize speech for the given text; audio is returned base64-encoded."""
    try:
        connector = await get_azure_speech_connector(current_user, connection_id)

        synthesized = await connector.text_to_speech(
            text=request.text,
            language=request.language,
            voice=request.voice,
            format=request.format
        )

        return {
            "success": True,
            "audio_data": base64.b64encode(synthesized).decode('utf-8'),
            "format": request.format,
            "voice": request.voice,
            "language": request.language
        }

    except Exception as e:
        logger.error(f"Error in text-to-speech: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))

@router.post("/translate")
async def translate_text(
    request: TranslationRequest,
    current_user: UserInDB = Depends(getCurrentUser),
    connection_id: Optional[str] = None
):
    """Translate text between languages via Azure Translator."""
    try:
        connector = await get_azure_speech_connector(current_user, connection_id)

        translated = await connector.translate_text(
            text=request.text,
            from_language=request.from_language,
            to_language=request.to_language
        )

        return {
            "success": True,
            "original_text": request.text,
            "translated_text": translated,
            "from_language": request.from_language,
            "to_language": request.to_language
        }

    except Exception as e:
        logger.error(f"Error in translation: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))

@router.post("/conversation")
async def conversation(
    request: ConversationRequest,
    current_user: UserInDB = Depends(getCurrentUser),
    connection_id: Optional[str] = None
):
    """Answer a chat message and return the reply as text plus synthesized audio."""
    try:
        connector = await get_azure_speech_connector(current_user, connection_id)

        # Demo-grade rule-based reply; swap for a real LLM integration in production.
        response_text = await _generate_ai_response(request.message, request.language)

        spoken = await connector.text_to_speech(
            text=response_text,
            language=request.language,
            voice=request.response_voice,
            format="audio-16khz-128kbitrate-mono-mp3"
        )

        return {
            "success": True,
            "response_text": response_text,
            "audio_data": base64.b64encode(spoken).decode('utf-8'),
            "voice": request.response_voice,
            "language": request.language
        }

    except Exception as e:
        logger.error(f"Error in conversation: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
service like OpenAI or Azure OpenAI + + message_lower = message.lower() + + # German responses + if language.startswith("de"): + if any(word in message_lower for word in ["hallo", "hi", "hey"]): + return "Hallo! Wie kann ich Ihnen heute helfen?" + elif any(word in message_lower for word in ["wie geht", "wie gehts"]): + return "Mir geht es gut, danke der Nachfrage! Wie geht es Ihnen?" + elif any(word in message_lower for word in ["danke", "dankeschön"]): + return "Gern geschehen! Gibt es noch etwas, womit ich helfen kann?" + elif any(word in message_lower for word in ["zeit", "uhr"]): + from datetime import datetime + current_time = datetime.now().strftime("%H:%M") + return f"Es ist jetzt {current_time} Uhr." + elif any(word in message_lower for word in ["wetter", "temperatur"]): + return "Ich kann leider keine aktuellen Wetterdaten abrufen. Bitte schauen Sie in eine Wetter-App." + else: + return f"Das ist interessant: '{message}'. Können Sie mir mehr darüber erzählen?" + + # English responses + elif language.startswith("en"): + if any(word in message_lower for word in ["hello", "hi", "hey"]): + return "Hello! How can I help you today?" + elif any(word in message_lower for word in ["how are you", "how's it going"]): + return "I'm doing well, thank you for asking! How are you?" + elif any(word in message_lower for word in ["thank you", "thanks"]): + return "You're welcome! Is there anything else I can help you with?" + elif any(word in message_lower for word in ["time", "clock"]): + from datetime import datetime + current_time = datetime.now().strftime("%H:%M") + return f"It's currently {current_time}." + elif any(word in message_lower for word in ["weather", "temperature"]): + return "I'm sorry, I can't retrieve current weather data. Please check a weather app." + else: + return f"That's interesting: '{message}'. Can you tell me more about that?" + + # Default response + else: + return f"I heard: '{message}'. How can I help you with that?" 
@router.post("/realtime-interpreter")
async def realtime_interpreter(
    audio_file: UploadFile = File(...),
    from_language: str = Form("de-DE"),
    to_language: str = Form("en-US"),
    connection_id: str = Form(None),
    current_user: UserInDB = Depends(getCurrentUser)
):
    """One-shot interpreter: transcribe an upload, then translate the transcript."""
    try:
        logger.info(f"Realtime interpreter called with from_language='{from_language}', to_language='{to_language}'")

        connector = await get_azure_speech_connector(current_user, connection_id)

        recording = await audio_file.read()

        stt_result = await connector.speech_to_text(
            audio_content=recording,
            language=from_language,
            format="detailed"
        )
        original_text = stt_result.get("text", "")

        # Translation is best-effort: on any failure we fall back to the
        # untranslated transcript rather than failing the request.
        translated_text = original_text
        if from_language != to_language:
            try:
                translated_text = await connector.translate_text(
                    text=original_text,
                    from_language=from_language,
                    to_language=to_language
                )
            except Exception as e:
                logger.warning(f"Translation failed, using original text: {str(e)}")
                translated_text = original_text

        return {
            "success": True,
            "original_text": original_text,
            "translated_text": translated_text,
            "from_language": from_language,
            "to_language": to_language,
            "confidence": stt_result.get("confidence", 0.0)
        }

    except Exception as e:
        logger.error(f"Error in realtime interpreter: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
@router.post("/stream/speech-to-text")
async def stream_speech_to_text(
    audio_file: UploadFile = File(...),
    language: str = Form("de-DE"),
    format: str = Form("detailed"),
    audio_format: str = Form("wav"),
    connection_id: str = Form(None),
    current_user: UserInDB = Depends(getCurrentUser)
):
    """Stream speech-to-text results to the client as Server-Sent Events."""
    try:
        connector = await get_azure_speech_connector(current_user, connection_id)

        async def audio_chunk_generator():
            """Re-chunk the uploaded file into 4 KB pieces for the connector."""
            chunk_size = 4096  # 4KB chunks
            while True:
                chunk = await audio_file.read(chunk_size)
                if not chunk:
                    break
                yield chunk

        async def response_generator():
            """Emit one SSE `data:` frame per recognition result."""
            try:
                async for result in connector.stream_speech_to_text(
                    audio_chunk_generator(),
                    language=language,
                    format=format,
                    audio_format=audio_format
                ):
                    yield f"data: {json.dumps(result)}\n\n"
            except Exception as e:
                error_result = {"error": str(e), "is_final": True}
                yield f"data: {json.dumps(error_result)}\n\n"

        # FIX: media_type was "text/plain" while a contradictory Content-Type
        # header forced "text/event-stream"; declare the SSE type directly.
        return StreamingResponse(
            response_generator(),
            media_type="text/event-stream",
            headers={
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
            }
        )

    except Exception as e:
        logger.error(f"Error in streaming speech-to-text: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))

@router.post("/stream/text-to-speech")
async def stream_text_to_speech(
    request: TextToSpeechRequest,
    current_user: UserInDB = Depends(getCurrentUser),
    connection_id: Optional[str] = None
):
    """Stream synthesized speech audio, sentence by sentence."""
    try:
        connector = await get_azure_speech_connector(current_user, connection_id)

        async def text_chunk_generator():
            """Split the text into sentences so playback can start early."""
            sentences = request.text.split('. ')
            for sentence in sentences:
                if sentence.strip():
                    yield sentence.strip() + '. '

        async def audio_generator():
            """Relay audio chunks; on failure log and end the stream."""
            try:
                async for audio_chunk in connector.stream_text_to_speech(
                    text_chunk_generator(),
                    language=request.language,
                    voice=request.voice,
                    format=request.format
                ):
                    yield audio_chunk
            except Exception as e:
                logger.error(f"Error in audio generation: {str(e)}")
                # Keep the stream well-formed: emit nothing further.
                yield b""

        # FIX: dropped the Content-Type header that merely duplicated media_type.
        return StreamingResponse(
            audio_generator(),
            media_type="audio/mpeg",
            headers={
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
            }
        )

    except Exception as e:
        logger.error(f"Error in streaming text-to-speech: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))

@router.post("/stream/realtime-interpreter")
async def stream_realtime_interpreter(
    audio_file: UploadFile = File(...),
    from_language: str = Form("de-DE"),
    to_language: str = Form("en-US"),
    connection_id: str = Form(None),
    current_user: UserInDB = Depends(getCurrentUser)
):
    """Stream interpreter results (speech -> translated text) as Server-Sent Events."""
    try:
        connector = await get_azure_speech_connector(current_user, connection_id)

        async def audio_chunk_generator():
            """Re-chunk the uploaded file into 4 KB pieces for the connector."""
            chunk_size = 4096  # 4KB chunks
            while True:
                chunk = await audio_file.read(chunk_size)
                if not chunk:
                    break
                yield chunk

        async def response_generator():
            """Emit one SSE `data:` frame per translation result."""
            try:
                async for result in connector.stream_realtime_interpreter(
                    audio_chunk_generator(),
                    from_language=from_language,
                    to_language=to_language
                ):
                    yield f"data: {json.dumps(result)}\n\n"
            except Exception as e:
                error_result = {"error": str(e), "is_final": True}
                yield f"data: {json.dumps(error_result)}\n\n"

        # FIX: same SSE media-type correction as stream_speech_to_text.
        return StreamingResponse(
            response_generator(),
            media_type="text/event-stream",
            headers={
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
            }
        )

    except Exception as e:
        logger.error(f"Error in streaming realtime interpreter: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
"text/event-stream" + } + ) + + except Exception as e: + logger.error(f"Error in streaming realtime interpreter: {str(e)}") + raise HTTPException(status_code=500, detail=str(e)) + +@router.get("/health") +async def health_check(): + """Health check endpoint for voice services.""" + return { + "status": "healthy", + "service": "azure-voice", + "endpoints": [ + "speech-to-text", + "text-to-speech", + "translate", + "conversation", + "realtime-interpreter", + "stream/speech-to-text", + "stream/text-to-speech", + "stream/realtime-interpreter", + "subscription", + "connections" + ] + } diff --git a/modules/routes/routeVoiceWebSocket.py b/modules/routes/routeVoiceWebSocket.py new file mode 100644 index 00000000..304ccb1c --- /dev/null +++ b/modules/routes/routeVoiceWebSocket.py @@ -0,0 +1,494 @@ +""" +Azure Voice Services WebSocket Routes +Provides real-time WebSocket endpoints for: +- Live microphone audio streaming +- Real-time speech-to-text +- Real-time translation +- Real-time text-to-speech +""" + +import logging +import asyncio +import json +from typing import Dict, Any, Optional, List +from fastapi import APIRouter, WebSocket, WebSocketDisconnect, Depends, Query +from fastapi.websockets import WebSocketState +import base64 +import io +from datetime import datetime + +from modules.interfaces.interfaceAppObjects import getRootInterface +from modules.interfaces.interfaceAppModel import UserInDB +from modules.security.auth import getCurrentUser +from modules.connectors.connectorAzureSpeech import ConnectorAzureSpeech + +logger = logging.getLogger(__name__) + +router = APIRouter(prefix="/api/voice/ws", tags=["voice-websocket"]) + +class ConnectionManager: + """Manages WebSocket connections for real-time voice services.""" + + def __init__(self): + self.active_connections: Dict[str, WebSocket] = {} + self.user_connections: Dict[str, str] = {} # user_id -> connection_id + self.connection_metadata: Dict[str, Dict] = {} + + async def connect(self, websocket: WebSocket, 
class ConnectionManager:
    """Registry of live voice-service WebSocket connections."""

    def __init__(self):
        # connection_id -> live socket
        self.active_connections: Dict[str, WebSocket] = {}
        # user_id -> connection_id (tracks a user's most recent connection)
        self.user_connections: Dict[str, str] = {}
        # connection_id -> service metadata supplied at connect time
        self.connection_metadata: Dict[str, Dict] = {}

    async def connect(self, websocket: WebSocket, connection_id: str, user_id: str, metadata: Dict = None):
        """Accept the socket and register it under both connection and user ids."""
        await websocket.accept()
        self.active_connections[connection_id] = websocket
        self.user_connections[user_id] = connection_id
        self.connection_metadata[connection_id] = metadata or {}
        logger.info(f"WebSocket connected: {connection_id} for user {user_id}")

    def disconnect(self, connection_id: str):
        """Drop every registration held for the given connection id."""
        if self.active_connections.pop(connection_id, None) is not None:
            # Reverse-lookup the owning user for this connection, if any.
            owner = next(
                (uid for uid, cid in self.user_connections.items() if cid == connection_id),
                None,
            )
            if owner:
                del self.user_connections[owner]
            self.connection_metadata.pop(connection_id, None)
        logger.info(f"WebSocket disconnected: {connection_id}")

    async def send_message(self, connection_id: str, message: Dict):
        """JSON-encode and send `message`; prune the connection on any failure."""
        websocket = self.active_connections.get(connection_id)
        if websocket is None:
            return
        try:
            if websocket.client_state != WebSocketState.CONNECTED:
                self.disconnect(connection_id)
                return
            await websocket.send_text(json.dumps(message))
        except Exception as e:
            logger.error(f"Error sending message to {connection_id}: {str(e)}")
            self.disconnect(connection_id)

    async def send_error(self, connection_id: str, error: str, error_code: str = "GENERAL_ERROR"):
        """Send a structured error frame to one connection."""
        await self.send_message(connection_id, {
            "type": "error",
            "error": error,
            "error_code": error_code,
            "timestamp": datetime.now().isoformat()
        })

    def get_connection_metadata(self, connection_id: str) -> Dict:
        """Return the metadata recorded at connect time (empty dict if unknown)."""
        return self.connection_metadata.get(connection_id, {})
async def get_azure_speech_connector_ws(user_id: str) -> Optional[ConnectorAzureSpeech]:
    """Build an Azure Speech connector for a WebSocket client, or return None.

    Access is gated on the user existing and having a linked Microsoft
    connection with a stored token; the speech key itself is read from
    config.ini.
    """
    try:
        root_interface = getRootInterface()

        account = root_interface.getUser(user_id)
        if not account:
            logger.error(f"User with ID {user_id} not found")
            return None

        # First Microsoft-authority connection wins.
        msft_connection = next(
            (c for c in root_interface.getUserConnections(user_id) if c.authority == "msft"),
            None,
        )
        if not msft_connection:
            logger.error(f"No Azure connection found for user {user_id}")
            return None

        # NOTE(review): the token is required as an access gate but is never
        # passed to the connector — presumably intentional; verify.
        if not root_interface.getConnectionToken(msft_connection.id):
            logger.error(f"No connection token found for user {user_id}")
            return None

        from modules.shared.configuration import APP_CONFIG
        speech_key = APP_CONFIG.get("Connector_AzureSpeech_SUBSCRIPTION_KEY")

        # The template placeholder counts as "not configured".
        if not speech_key or speech_key == "your-azure-speech-subscription-key-here":
            logger.error("Azure Speech Services subscription key not configured")
            return None

        speech_region = APP_CONFIG.get("Connector_AzureSpeech_REGION", "westeurope")
        return ConnectorAzureSpeech(subscription_key=speech_key, region=speech_region)

    except Exception as e:
        logger.error(f"Error getting Azure Speech connector for WebSocket: {str(e)}")
        return None
f"interpreter_{user_id}_{datetime.now().strftime('%Y%m%d_%H%M%S')}" + + try: + # Connect WebSocket + await manager.connect(websocket, connection_id, user_id, { + "service": "realtime_interpreter", + "from_language": from_language, + "to_language": to_language, + "audio_format": audio_format + }) + + # Get Azure Speech connector + connector = await get_azure_speech_connector_ws(user_id) + if not connector: + await manager.send_error(connection_id, "Azure Speech Services not configured", "NO_CONNECTOR") + return + + # Send connection confirmation + await manager.send_message(connection_id, { + "type": "connected", + "connection_id": connection_id, + "service": "realtime_interpreter", + "from_language": from_language, + "to_language": to_language, + "timestamp": datetime.now().isoformat() + }) + + # Process incoming audio chunks + audio_buffer = io.BytesIO() + chunk_count = 0 + + while True: + try: + # Receive message from client + data = await websocket.receive_text() + message = json.loads(data) + + if message.get("type") == "audio_chunk": + # Decode base64 audio data + audio_data = base64.b64decode(message.get("data", "")) + audio_buffer.write(audio_data) + chunk_count += 1 + + # Process every 10 chunks or when buffer is large enough + if chunk_count % 10 == 0 or len(audio_data) > 8192: + audio_buffer.seek(0) + audio_content = audio_buffer.read() + + if len(audio_content) > 0: + try: + # Convert speech to text + stt_result = await connector.speech_to_text( + audio_content=audio_content, + language=from_language, + format="detailed", + audio_format=audio_format + ) + + original_text = stt_result.get("text", "") + + # Translate if different languages + translated_text = original_text + if from_language != to_language and original_text.strip(): + try: + translated_text = await connector.translate_text( + text=original_text, + from_language=from_language, + to_language=to_language + ) + except Exception as e: + logger.warning(f"Translation failed: {str(e)}") + 
translated_text = original_text + + # Send result back to client + await manager.send_message(connection_id, { + "type": "translation_result", + "original_text": original_text, + "translated_text": translated_text, + "from_language": from_language, + "to_language": to_language, + "confidence": stt_result.get("confidence", 0.0), + "timestamp": datetime.now().isoformat() + }) + + except Exception as e: + logger.error(f"Error processing audio chunk: {str(e)}") + await manager.send_error(connection_id, f"Audio processing failed: {str(e)}", "PROCESSING_ERROR") + + # Reset buffer + audio_buffer = io.BytesIO() + chunk_count = 0 + + elif message.get("type") == "ping": + # Respond to ping with pong + await manager.send_message(connection_id, { + "type": "pong", + "timestamp": datetime.now().isoformat() + }) + + elif message.get("type") == "disconnect": + # Client requested disconnect + break + + except WebSocketDisconnect: + break + except json.JSONDecodeError: + await manager.send_error(connection_id, "Invalid JSON message", "INVALID_JSON") + except Exception as e: + logger.error(f"Error processing WebSocket message: {str(e)}") + await manager.send_error(connection_id, f"Message processing failed: {str(e)}", "MESSAGE_ERROR") + + except WebSocketDisconnect: + pass + except Exception as e: + logger.error(f"WebSocket error: {str(e)}") + try: + await manager.send_error(connection_id, f"Connection error: {str(e)}", "CONNECTION_ERROR") + except: + pass + finally: + manager.disconnect(connection_id) + +@router.websocket("/speech-to-text") +async def websocket_speech_to_text( + websocket: WebSocket, + user_id: str = Query(...), + language: str = Query("de-DE"), + audio_format: str = Query("wav") +): + """WebSocket endpoint for real-time speech-to-text with live audio streaming.""" + connection_id = f"stt_{user_id}_{datetime.now().strftime('%Y%m%d_%H%M%S')}" + + try: + # Connect WebSocket + await manager.connect(websocket, connection_id, user_id, { + "service": "speech_to_text", + 
"language": language, + "audio_format": audio_format + }) + + # Get Azure Speech connector + connector = await get_azure_speech_connector_ws(user_id) + if not connector: + await manager.send_error(connection_id, "Azure Speech Services not configured", "NO_CONNECTOR") + return + + # Send connection confirmation + await manager.send_message(connection_id, { + "type": "connected", + "connection_id": connection_id, + "service": "speech_to_text", + "language": language, + "timestamp": datetime.now().isoformat() + }) + + # Process incoming audio chunks + audio_buffer = io.BytesIO() + chunk_count = 0 + + while True: + try: + # Receive message from client + data = await websocket.receive_text() + message = json.loads(data) + + if message.get("type") == "audio_chunk": + # Decode base64 audio data + audio_data = base64.b64decode(message.get("data", "")) + audio_buffer.write(audio_data) + chunk_count += 1 + + # Process every 10 chunks or when buffer is large enough + if chunk_count % 10 == 0 or len(audio_data) > 8192: + audio_buffer.seek(0) + audio_content = audio_buffer.read() + + if len(audio_content) > 0: + try: + # Convert speech to text + stt_result = await connector.speech_to_text( + audio_content=audio_content, + language=language, + format="detailed", + audio_format=audio_format + ) + + # Send result back to client + await manager.send_message(connection_id, { + "type": "transcription_result", + "text": stt_result.get("text", ""), + "confidence": stt_result.get("confidence", 0.0), + "language": stt_result.get("language", language), + "is_final": stt_result.get("RecognitionStatus") == "Success", + "timestamp": datetime.now().isoformat() + }) + + except Exception as e: + logger.error(f"Error processing audio chunk: {str(e)}") + await manager.send_error(connection_id, f"Audio processing failed: {str(e)}", "PROCESSING_ERROR") + + # Reset buffer + audio_buffer = io.BytesIO() + chunk_count = 0 + + elif message.get("type") == "ping": + # Respond to ping with pong + await 
@router.websocket("/text-to-speech")
async def websocket_text_to_speech(
    websocket: WebSocket,
    user_id: str = Query(...),
    language: str = Query("de-DE"),
    voice: str = Query("de-DE-KatjaNeural"),
    audio_format: str = Query("audio-16khz-128kbitrate-mono-mp3")
):
    """WebSocket endpoint for real-time text-to-speech.

    Client sends {"type": "text_to_speak", "text": ...} frames; each reply
    carries the synthesized audio base64-encoded. "ping"/"disconnect" frames
    are also supported.
    """
    connection_id = f"tts_{user_id}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

    try:
        await manager.connect(websocket, connection_id, user_id, {
            "service": "text_to_speech",
            "language": language,
            "voice": voice,
            "audio_format": audio_format
        })

        connector = await get_azure_speech_connector_ws(user_id)
        if not connector:
            await manager.send_error(connection_id, "Azure Speech Services not configured", "NO_CONNECTOR")
            return

        await manager.send_message(connection_id, {
            "type": "connected",
            "connection_id": connection_id,
            "service": "text_to_speech",
            "language": language,
            "voice": voice,
            "timestamp": datetime.now().isoformat()
        })

        while True:
            try:
                data = await websocket.receive_text()
                message = json.loads(data)

                if message.get("type") == "text_to_speak":
                    text = message.get("text", "")

                    if text.strip():
                        try:
                            audio_data = await connector.text_to_speech(
                                text=text,
                                language=language,
                                voice=voice,
                                format=audio_format
                            )

                            audio_base64 = base64.b64encode(audio_data).decode('utf-8')
                            await manager.send_message(connection_id, {
                                "type": "audio_data",
                                "audio_data": audio_base64,
                                "format": audio_format,
                                "voice": voice,
                                "text": text,
                                "timestamp": datetime.now().isoformat()
                            })

                        except Exception as e:
                            logger.error(f"Error converting text to speech: {str(e)}")
                            await manager.send_error(connection_id, f"TTS failed: {str(e)}", "TTS_ERROR")

                elif message.get("type") == "ping":
                    await manager.send_message(connection_id, {
                        "type": "pong",
                        "timestamp": datetime.now().isoformat()
                    })

                elif message.get("type") == "disconnect":
                    break

            except WebSocketDisconnect:
                break
            except json.JSONDecodeError:
                await manager.send_error(connection_id, "Invalid JSON message", "INVALID_JSON")
            except Exception as e:
                logger.error(f"Error processing WebSocket message: {str(e)}")
                await manager.send_error(connection_id, f"Message processing failed: {str(e)}", "MESSAGE_ERROR")

    except WebSocketDisconnect:
        pass
    except Exception as e:
        logger.error(f"WebSocket error: {str(e)}")
        try:
            await manager.send_error(connection_id, f"Connection error: {str(e)}", "CONNECTION_ERROR")
        except Exception:
            # FIX: bare `except:` would also swallow CancelledError/KeyboardInterrupt.
            pass
    finally:
        manager.disconnect(connection_id)
"realtime_interpreter"]), + "speech_to_text": len([c for c in manager.connection_metadata.values() if c.get("service") == "speech_to_text"]), + "text_to_speech": len([c for c in manager.connection_metadata.values() if c.get("service") == "text_to_speech"]) + } + } diff --git a/modules/services/serviceDeltaSync.py b/modules/services/serviceDeltaSync.py index a45200cf..d8376db4 100644 --- a/modules/services/serviceDeltaSync.py +++ b/modules/services/serviceDeltaSync.py @@ -249,8 +249,7 @@ async def perform_sync_jira_delta_group() -> bool: bool: True if synchronization was successful, False otherwise """ try: - #TODO: ADAPT to prod - if APP_ENV_TYPE != "dev" and APP_ENV_TYPE != "prod": + if APP_ENV_TYPE != "prod": logger.info("JIRA to SharePoint synchronization: TASK to run only in PROD") return True