diff --git a/app.py b/app.py index 3c4069f3..24da5c20 100644 --- a/app.py +++ b/app.py @@ -249,8 +249,5 @@ app.include_router(msftRouter) from modules.routes.routeSecurityGoogle import router as googleRouter app.include_router(googleRouter) -from modules.routes.routeVoiceAzure import router as voiceRouter -app.include_router(voiceRouter) - -from modules.routes.routeVoiceWebSocket import router as voiceWebSocketRouter -app.include_router(voiceWebSocketRouter) \ No newline at end of file +from modules.routes.routeVoiceGoogle import router as voiceGoogleRouter +app.include_router(voiceGoogleRouter) \ No newline at end of file diff --git a/config.ini b/config.ini index a42ed4c1..a434bf8b 100644 --- a/config.ini +++ b/config.ini @@ -48,9 +48,8 @@ Service_GOOGLE_CLIENT_SECRET = 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 # Tavily Web Search configuration Connector_WebTavily_API_KEY = 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 -# Azure Speech Services configuration -Connector_AzureSpeech_SUBSCRIPTION_KEY = 4HuOIbNfeZRyQp3ouqCzyYKpRf0rJxMNwPaoKwdoJDUxl5u9FzK2JQQJ99BIACI8hq2XJ3w3AAAYACOGrE9c -Connector_AzureSpeech_REGION = switzerlandnorth +# Google Cloud Speech Services configuration +# Set GOOGLE_APPLICATION_CREDENTIALS environment variable or place credentials file in project directory # Web Search configuration Web_Search_MAX_QUERY_LENGTH = 400 diff --git a/debug_audio/audio_20250913_223438.wav b/debug_audio/audio_20250913_223438.wav new file mode 100644 index 00000000..eb2196e3 Binary files /dev/null and b/debug_audio/audio_20250913_223438.wav differ diff --git a/debug_audio/audio_20250913_223658.wav b/debug_audio/audio_20250913_223658.wav new file mode 100644 index 00000000..5ecabd88 Binary files /dev/null and b/debug_audio/audio_20250913_223658.wav differ diff --git a/debug_audio/audio_20250913_224003.wav b/debug_audio/audio_20250913_224003.wav new file mode 100644 index 00000000..ee47d364 Binary files /dev/null and b/debug_audio/audio_20250913_224003.wav differ diff --git a/debug_audio/audio_20250913_224258.wav b/debug_audio/audio_20250913_224258.wav new file mode 100644 index 00000000..ea776be1 Binary files /dev/null and b/debug_audio/audio_20250913_224258.wav differ diff --git a/debug_audio/audio_20250913_224524.wav b/debug_audio/audio_20250913_224524.wav new file mode 100644 index 00000000..c9f15fa1 Binary files /dev/null and b/debug_audio/audio_20250913_224524.wav differ diff --git a/debug_audio/audio_20250913_224801.wav b/debug_audio/audio_20250913_224801.wav new file mode 100644 index 00000000..8d63b2b6 Binary files /dev/null and b/debug_audio/audio_20250913_224801.wav differ diff --git a/debug_audio/audio_20250913_230817.wav b/debug_audio/audio_20250913_230817.wav new file mode 100644 index 00000000..44ee620e Binary files /dev/null and b/debug_audio/audio_20250913_230817.wav differ diff --git a/debug_audio/audio_20250913_230927.wav b/debug_audio/audio_20250913_230927.wav new file mode 100644 index 00000000..70eff7a5 Binary files /dev/null and b/debug_audio/audio_20250913_230927.wav differ diff --git a/debug_audio/audio_20250913_231253.wav b/debug_audio/audio_20250913_231253.wav new file mode 100644 index 00000000..b40442eb Binary files /dev/null and b/debug_audio/audio_20250913_231253.wav differ diff --git a/debug_audio/audio_20250913_231321.wav b/debug_audio/audio_20250913_231321.wav new file mode 100644 index 00000000..241fb907 Binary files /dev/null and b/debug_audio/audio_20250913_231321.wav differ diff --git a/debug_audio/audio_20250913_231611.wav b/debug_audio/audio_20250913_231611.wav new file mode 100644 index 00000000..197a766e Binary files /dev/null and b/debug_audio/audio_20250913_231611.wav differ diff --git a/debug_audio/audio_20250913_231935.wav b/debug_audio/audio_20250913_231935.wav new file mode 100644 index 00000000..ee0da4f6 Binary files /dev/null and b/debug_audio/audio_20250913_231935.wav differ diff --git a/debug_audio/audio_20250913_232141.wav b/debug_audio/audio_20250913_232141.wav new file mode 100644 index 00000000..567219ea Binary files /dev/null and b/debug_audio/audio_20250913_232141.wav differ diff --git a/debug_audio/audio_20250913_232309.wav b/debug_audio/audio_20250913_232309.wav new file mode 100644 index 00000000..4ff7c326 Binary files /dev/null and b/debug_audio/audio_20250913_232309.wav differ diff --git a/debug_audio/audio_20250913_232518.wav b/debug_audio/audio_20250913_232518.wav new file mode 100644 index 00000000..4330c519 Binary files /dev/null and b/debug_audio/audio_20250913_232518.wav differ diff --git a/debug_audio/audio_20250913_232659.wav b/debug_audio/audio_20250913_232659.wav new file mode 100644 index 00000000..17135101 Binary files /dev/null and b/debug_audio/audio_20250913_232659.wav differ diff --git a/debug_audio/audio_20250913_232941.wav b/debug_audio/audio_20250913_232941.wav new file mode 100644 index 00000000..0b5be013 Binary files /dev/null and b/debug_audio/audio_20250913_232941.wav differ diff --git a/debug_audio/audio_20250913_233053.wav b/debug_audio/audio_20250913_233053.wav new file mode 100644 index 00000000..2cfc44ee Binary files /dev/null and b/debug_audio/audio_20250913_233053.wav differ diff --git a/debug_audio/audio_20250913_233155.wav b/debug_audio/audio_20250913_233155.wav new file mode 100644 index 00000000..0923cf6b Binary files /dev/null and b/debug_audio/audio_20250913_233155.wav differ diff --git a/debug_audio/audio_20250913_233607.wav b/debug_audio/audio_20250913_233607.wav new file mode 100644 index 00000000..36b7918b Binary files /dev/null and b/debug_audio/audio_20250913_233607.wav differ diff --git a/debug_audio/audio_20250913_234106.wav b/debug_audio/audio_20250913_234106.wav new file mode 100644 index 00000000..0bb11f95 Binary files /dev/null and b/debug_audio/audio_20250913_234106.wav differ diff --git a/debug_audio/audio_20250913_234245.wav b/debug_audio/audio_20250913_234245.wav new file mode 100644 index 00000000..d1ce6299 Binary files /dev/null and b/debug_audio/audio_20250913_234245.wav differ diff --git a/debug_audio/audio_20250913_234843.wav b/debug_audio/audio_20250913_234843.wav new file mode 100644 index 00000000..de817ca9 Binary files /dev/null and b/debug_audio/audio_20250913_234843.wav differ diff --git a/debug_audio/audio_20250913_235136.wav b/debug_audio/audio_20250913_235136.wav new file mode 100644 index 00000000..7ef75e0a Binary files /dev/null and b/debug_audio/audio_20250913_235136.wav differ diff --git a/debug_audio/audio_20250913_235409.wav b/debug_audio/audio_20250913_235409.wav new file mode 100644 index 00000000..3248f9b3 Binary files /dev/null and b/debug_audio/audio_20250913_235409.wav differ diff --git a/force_reauth.py b/force_reauth.py deleted file mode 100644 index 71a6df0c..00000000 --- a/force_reauth.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env python3 -""" -Script to force Microsoft re-authentication for SharePoint access. -This will disconnect the existing connection and provide a new login URL. -""" - -import requests -import json - -# Configuration -BASE_URL = "http://localhost:8000" # Adjust if your server runs on different port -CONNECTION_ID = "cc62583d-3a68-44b6-8283-726725916a7e" # From the logs - -def force_reauth(): - """Force Microsoft re-authentication by disconnecting and providing new login URL.""" - - print("🔄 Forcing Microsoft re-authentication for SharePoint access...") - - # Step 1: Disconnect existing connection - print(f"1. Disconnecting connection {CONNECTION_ID}...") - disconnect_url = f"{BASE_URL}/api/connections/{CONNECTION_ID}/disconnect" - - try: - response = requests.post(disconnect_url) - if response.status_code == 200: - print("✅ Connection disconnected successfully") - else: - print(f"❌ Failed to disconnect: {response.status_code} - {response.text}") - return - except Exception as e: - print(f"❌ Error disconnecting: {e}") - return - - # Step 2: Get new login URL - print("2. Getting new Microsoft login URL...") - login_url = f"{BASE_URL}/api/msft/login?state=connection&connectionId={CONNECTION_ID}" - - print(f"\n🔗 Please visit this URL to re-authenticate with SharePoint permissions:") - print(f" {login_url}") - print("\nAfter re-authentication, the JIRA sync should work with SharePoint access.") - print("\nNote: The new token will include Sites.ReadWrite.All and Files.ReadWrite.All scopes.") - -if __name__ == "__main__": - force_reauth() diff --git a/modules/connectors/connectorAzureSpeech.py b/modules/connectors/connectorAzureSpeech.py deleted file mode 100644 index e8d7a683..00000000 --- a/modules/connectors/connectorAzureSpeech.py +++ /dev/null @@ -1,928 +0,0 @@ -""" -Azure Speech Services Connector -Handles integration with Azure Speech Services for: -- Speech-to-Text (STT) -- Text-to-Speech (TTS) -- Translation services -""" - -import logging -import asyncio -import json -import base64 -from typing import Dict, Any, Optional, List, AsyncGenerator -import aiohttp -import io -import wave -import struct -import tempfile -import os -import time -from pathlib import Path -from datetime import datetime, timedelta - -logger = logging.getLogger(__name__) - -class ConnectorAzureSpeech: - """Connector for Azure Speech Services.""" - - def __init__(self, subscription_key: str, region: str = "westeurope"): - """ - Initialize Azure Speech connector. - - Args: - subscription_key: Azure Speech Services subscription key - region: Azure region (default: westeurope) - """ - self.subscription_key = subscription_key - self.region = region - self.base_url = f"https://{region}.stt.speech.microsoft.com" - self.translator_url = "https://api.cognitive.microsofttranslator.com" - - # Supported audio formats - self.supported_stt_formats = { - "wav": {"mime": "audio/wav", "codec": "audio/pcm", "sample_rate": 16000}, - "mp3": {"mime": "audio/mp3", "codec": "audio/mp3", "sample_rate": 16000}, - "ogg": {"mime": "audio/ogg", "codec": "audio/ogg", "sample_rate": 16000} - } - - self.supported_tts_formats = [ - "audio-16khz-128kbitrate-mono-mp3", - "audio-16khz-32kbitrate-mono-mp3", - "audio-16khz-64kbitrate-mono-mp3", - "audio-24khz-160kbitrate-mono-mp3", - "audio-24khz-48kbitrate-mono-mp3", - "audio-24khz-96kbitrate-mono-mp3", - "audio-48khz-192kbitrate-mono-mp3", - "audio-48khz-96kbitrate-mono-mp3", - "riff-16khz-16bit-mono-pcm", - "riff-24khz-16bit-mono-pcm", - "riff-48khz-16bit-mono-pcm" - ] - - # Rate limiting - self.rate_limits = { - "stt": {"requests_per_minute": 20, "last_reset": time.time(), "request_count": 0}, - "tts": {"requests_per_minute": 20, "last_reset": time.time(), "request_count": 0}, - "translation": {"requests_per_minute": 20, "last_reset": time.time(), "request_count": 0} - } - - # Request timeout settings - self.timeout = aiohttp.ClientTimeout(total=30, connect=10) - - def _check_rate_limit(self, service_type: str) -> bool: - """Check if rate limit is exceeded for a service type.""" - current_time = time.time() - rate_limit = self.rate_limits[service_type] - - # Reset counter if minute has passed - if current_time - rate_limit["last_reset"] >= 60: - rate_limit["request_count"] = 0 - rate_limit["last_reset"] = current_time - - # Check if limit exceeded - if rate_limit["request_count"] >= rate_limit["requests_per_minute"]: - return False - - # Increment counter - rate_limit["request_count"] += 1 - return True - - def _handle_azure_error(self, response_status: int, error_text: str) -> Exception: - """Handle Azure API errors with specific error messages.""" - if response_status == 401: - return Exception("Authentication failed. Please check your Azure Speech Services subscription key.") - elif response_status == 403: - return Exception("Access forbidden. Please check your Azure Speech Services permissions.") - elif response_status == 429: - return Exception("Rate limit exceeded. Please wait before making more requests.") - elif response_status == 400: - return Exception(f"Bad request: {error_text}") - elif response_status == 500: - return Exception("Azure Speech Services internal error. Please try again later.") - elif response_status == 503: - return Exception("Azure Speech Services temporarily unavailable. Please try again later.") - else: - return Exception(f"Azure API error {response_status}: {error_text}") - - async def _make_request_with_retry(self, url: str, method: str = "GET", headers: Dict = None, data: bytes = None, params: Dict = None, max_retries: int = 3) -> Dict: - """Make HTTP request with retry logic.""" - if headers is None: - headers = {} - - headers.update({ - "Ocp-Apim-Subscription-Key": self.subscription_key, - "User-Agent": "PowerOn-Voice-Services/1.0" - }) - - # Debug: Log subscription key (masked for security) - logger.debug(f"Using subscription key: {self.subscription_key[:8]}...{self.subscription_key[-8:] if len(self.subscription_key) > 16 else 'SHORT'}") - logger.debug(f"Request URL: {url}") - logger.debug(f"Request params: {params}") - - for attempt in range(max_retries): - try: - async with aiohttp.ClientSession(timeout=self.timeout) as session: - if method.upper() == "GET": - async with session.get(url, headers=headers, params=params) as response: - return await self._handle_response(response) - elif method.upper() == "POST": - async with session.post(url, headers=headers, data=data, params=params) as response: - return await self._handle_response(response) - else: - raise ValueError(f"Unsupported HTTP method: {method}") - except asyncio.TimeoutError: - if attempt == max_retries - 1: - raise Exception("Request timeout after multiple retries") - logger.warning(f"Request timeout, retrying... (attempt {attempt + 1}/{max_retries})") - await asyncio.sleep(2 ** attempt) # Exponential backoff - except Exception as e: - if attempt == max_retries - 1: - raise - logger.warning(f"Request failed, retrying... (attempt {attempt + 1}/{max_retries}): {str(e)}") - await asyncio.sleep(2 ** attempt) # Exponential backoff - - async def _make_request(self, url: str, method: str = "GET", headers: Dict = None, data: bytes = None) -> Dict: - """Make HTTP request to Azure services.""" - if headers is None: - headers = {} - - headers.update({ - "Ocp-Apim-Subscription-Key": self.subscription_key, - "User-Agent": "PowerOn-Voice-Services/1.0" - }) - - async with aiohttp.ClientSession() as session: - try: - if method.upper() == "GET": - async with session.get(url, headers=headers) as response: - return await self._handle_response(response) - elif method.upper() == "POST": - async with session.post(url, headers=headers, data=data) as response: - return await self._handle_response(response) - else: - raise ValueError(f"Unsupported HTTP method: {method}") - except Exception as e: - logger.error(f"Request failed: {str(e)}") - raise - - async def _handle_response(self, response) -> Dict: - """Handle HTTP response.""" - if response.status == 200: - content_type = response.headers.get('content-type', '') - if 'application/json' in content_type: - return await response.json() - else: - # For audio responses, return binary data - return {"data": await response.read()} - else: - error_text = await response.text() - logger.error(f"API request failed: {response.status} - {error_text}") - raise self._handle_azure_error(response.status, error_text) - - def _validate_audio_format(self, audio_content: bytes, expected_format: str = "wav") -> Dict[str, Any]: - """Validate audio format and return format information.""" - try: - # Try to detect format from content - if audio_content.startswith(b'RIFF') and b'WAVE' in audio_content[:12]: - format_type = "wav" - elif audio_content.startswith(b'\xff\xfb') or audio_content.startswith(b'ID3'): - format_type = "mp3" - elif audio_content.startswith(b'OggS'): - format_type = "ogg" - elif audio_content.startswith(b'fLaC'): - format_type = "flac" - else: - # If we can't detect format, assume it's raw audio or WAV without proper header - format_type = "wav" # Azure Speech Services can handle this - - # Validate WAV format specifically - if format_type == "wav": - try: - with io.BytesIO(audio_content) as audio_io: - with wave.open(audio_io, 'rb') as wav_file: - sample_rate = wav_file.getframerate() - channels = wav_file.getnchannels() - sample_width = wav_file.getsampwidth() - - return { - "valid": True, - "format": format_type, - "sample_rate": sample_rate, - "channels": channels, - "sample_width": sample_width, - "duration": wav_file.getnframes() / sample_rate - } - except Exception as e: - # If WAV validation fails, it might be raw audio data - # Azure Speech Services can handle raw audio, so we'll allow it - logger.info(f"WAV validation failed, treating as raw audio: {str(e)}") - return { - "valid": True, - "format": "raw_audio", - "sample_rate": 16000, # Default assumption for raw audio - "channels": 1, # Default assumption - "sample_width": 2, # Default assumption - "duration": len(audio_content) / (16000 * 2) # Rough estimate - } - - # For other formats, assume valid if we can detect them - return { - "valid": True, - "format": format_type, - "sample_rate": 16000, # Default assumption - "channels": 1, # Default assumption - "sample_width": 2, # Default assumption - "duration": 0 # Unknown - } - - except Exception as e: - logger.error(f"Audio validation failed: {str(e)}") - return {"valid": False, "error": str(e)} - - def _convert_audio_to_wav(self, audio_content: bytes, target_sample_rate: int = 16000) -> bytes: - """Convert audio to WAV format with specified sample rate.""" - try: - # If it's already WAV, try to resample if needed - if audio_content.startswith(b'RIFF') and b'WAVE' in audio_content[:12]: - with io.BytesIO(audio_content) as audio_io: - with wave.open(audio_io, 'rb') as wav_file: - current_sample_rate = wav_file.getframerate() - - # If sample rate matches, return as-is - if current_sample_rate == target_sample_rate: - return audio_content - - # For now, return original (in production, implement resampling) - logger.warning(f"Audio sample rate {current_sample_rate} doesn't match target {target_sample_rate}") - return audio_content - - # If it's raw audio data (no header), create a basic WAV header - elif not audio_content.startswith(b'RIFF'): - logger.info("Converting raw audio data to WAV format") - return self._create_wav_header(audio_content, target_sample_rate) - - # For other formats, return as-is for now - # In production, implement proper conversion with pydub or ffmpeg - logger.info("Audio format conversion not fully implemented - returning original") - return audio_content - - except Exception as e: - logger.error(f"Audio conversion failed: {str(e)}") - raise Exception(f"Audio conversion failed: {str(e)}") - - def _create_wav_header(self, audio_data: bytes, sample_rate: int = 16000, channels: int = 1, sample_width: int = 2) -> bytes: - """Create a WAV header for raw audio data.""" - try: - import struct - - # Calculate data size - data_size = len(audio_data) - file_size = 36 + data_size - - # Create WAV header - header = struct.pack('<4sI4s4sIHHIIHH4sI', - b'RIFF', # Chunk ID - file_size, # Chunk size - b'WAVE', # Format - b'fmt ', # Subchunk1 ID - 16, # Subchunk1 size - 1, # Audio format (PCM) - channels, # Number of channels - sample_rate, # Sample rate - sample_rate * channels * sample_width, # Byte rate - channels * sample_width, # Block align - sample_width * 8, # Bits per sample - b'data', # Subchunk2 ID - data_size # Subchunk2 size - ) - - return header + audio_data - - except Exception as e: - logger.error(f"Failed to create WAV header: {str(e)}") - # Return original data if header creation fails - return audio_data - - def _get_audio_content_type(self, audio_format: str) -> str: - """Get MIME type for audio format.""" - if audio_format in self.supported_stt_formats: - return self.supported_stt_formats[audio_format]["mime"] - return "audio/wav" # Default - - async def speech_to_text(self, audio_content: bytes, language: str = "de-DE", format: str = "detailed", audio_format: str = "wav") -> Dict: - """ - Convert speech to text using Azure Speech Services. - - Args: - audio_content: Audio file content as bytes - language: Language code (e.g., "de-DE") - format: Response format ("simple" or "detailed") - audio_format: Audio format ("wav", "mp3", "ogg") - - Returns: - Dict with transcription results - """ - try: - # Check rate limit - if not self._check_rate_limit("stt"): - raise Exception("Rate limit exceeded for speech-to-text service. Please wait before making more requests.") - - # Validate audio format - validation_result = self._validate_audio_format(audio_content, audio_format) - if not validation_result.get("valid", False): - raise Exception(f"Invalid audio format: {validation_result.get('error', 'Unknown error')}") - - # Convert audio to required format if needed - processed_audio = self._convert_audio_to_wav(audio_content) - - # Update audio_format based on validation result - detected_format = validation_result.get("format", audio_format) - if detected_format == "raw_audio": - audio_format = "wav" # Treat raw audio as WAV for Azure - - url = f"{self.base_url}/speech/recognition/conversation/cognitiveservices/v1" - - # Get appropriate content type - content_type = self._get_audio_content_type(audio_format) - if audio_format == "wav": - content_type = f"{content_type}; codecs=audio/pcm; samplerate=16000" - - headers = { - "Content-Type": content_type, - "Accept": "application/json", - "Ocp-Apim-Subscription-Region": self.region - } - - params = { - "language": language, - "format": "detailed" if format == "detailed" else "simple" - } - - # Make API call with retry logic - result = await self._make_request_with_retry( - url=url, - method="POST", - headers=headers, - data=processed_audio, - params=params - ) - - # Parse the response based on format - if format == "detailed": - return { - "text": result.get("DisplayText", ""), - "confidence": result.get("Confidence", 0.0), - "language": result.get("RecognitionStatus", language), - "format": format, - "audio_info": validation_result, - "raw_result": result - } - else: - return { - "text": result.get("DisplayText", ""), - "confidence": 1.0, # Simple format doesn't provide confidence - "language": language, - "format": format, - "audio_info": validation_result - } - - except Exception as e: - logger.error(f"Speech-to-text failed: {str(e)}") - raise - - async def text_to_speech(self, text: str, language: str = "de-DE", voice: str = "de-DE-KatjaNeural", format: str = "audio-16khz-128kbitrate-mono-mp3") -> bytes: - """ - Convert text to speech using Azure Speech Services. - - Args: - text: Text to convert to speech - language: Language code - voice: Voice name - format: Audio format - - Returns: - Audio data as bytes - """ - try: - # Check rate limit - if not self._check_rate_limit("tts"): - raise Exception("Rate limit exceeded for text-to-speech service. Please wait before making more requests.") - - # Validate format - if format not in self.supported_tts_formats: - raise Exception(f"Unsupported TTS format: {format}. Supported formats: {', '.join(self.supported_tts_formats)}") - - url = f"https://{self.region}.tts.speech.microsoft.com/cognitiveservices/v1" - - headers = { - "Content-Type": "application/ssml+xml", - "X-Microsoft-OutputFormat": format, - "Ocp-Apim-Subscription-Key": self.subscription_key, - "User-Agent": "PowerOn-Voice-Services/1.0" - } - - # Create SSML with proper escaping - escaped_text = text.replace("&", "&").replace("<", "<").replace(">", ">").replace('"', """).replace("'", "'") - ssml = f""" - - {escaped_text} - - """ - - # Make API call with retry logic - result = await self._make_request_with_retry( - url=url, - method="POST", - headers=headers, - data=ssml.encode('utf-8'), - params=None - ) - - # Return audio data - return result.get("data", b"") - - except Exception as e: - logger.error(f"Text-to-speech failed: {str(e)}") - raise - - async def translate_text(self, text: str, from_language: str, to_language: str) -> str: - """ - Translate text using Azure Translator. - - Args: - text: Text to translate - from_language: Source language code - to_language: Target language code - - Returns: - Translated text - """ - try: - # Check if text is empty - if not text or not text.strip(): - logger.debug("Empty text provided, returning original text") - return text - - # Check rate limit - if not self._check_rate_limit("translation"): - raise Exception("Rate limit exceeded for translation service. Please wait before making more requests.") - - url = f"{self.translator_url}/translate" - - headers = { - "Ocp-Apim-Subscription-Key": self.subscription_key, - "Ocp-Apim-Subscription-Region": self.region, - "Content-Type": "application/json" - } - - params = { - "api-version": "3.0", - "from": from_language, - "to": to_language - } - - data = [{"text": text}] - - # Debug: Log translation request details - logger.debug(f"Translation request - URL: {url}") - logger.debug(f"Translation request - Headers: {headers}") - logger.debug(f"Translation request - Data: {data}") - - # Make API call with retry logic - result = await self._make_request_with_retry( - url=url, - method="POST", - headers=headers, - data=json.dumps(data).encode('utf-8'), - params=None - ) - - if result and len(result) > 0 and 'translations' in result[0]: - return result[0]['translations'][0]['text'] - else: - logger.warning(f"Unexpected translation response format: {result}") - return text # Return original text if translation fails - - except Exception as e: - logger.error(f"Translation failed: {str(e)}") - raise - - async def get_available_voices(self) -> List[Dict]: - """Get list of available voices from Azure Speech Services.""" - try: - # Azure doesn't provide a direct API for voice list, so we return a comprehensive list - # based on Azure's supported voices - voices = [ - # German voices - {"name": "de-DE-KatjaNeural", "language": "de-DE", "gender": "Female", "style": "Neural", "locale": "de-DE"}, - {"name": "de-DE-ConradNeural", "language": "de-DE", "gender": "Male", "style": "Neural", "locale": "de-DE"}, - {"name": "de-DE-AmalaNeural", "language": "de-DE", "gender": "Female", "style": "Neural", "locale": "de-DE"}, - {"name": "de-DE-BerndNeural", "language": "de-DE", "gender": "Male", "style": "Neural", "locale": "de-DE"}, - {"name": "de-DE-ChristophNeural", "language": "de-DE", "gender": "Male", "style": "Neural", "locale": "de-DE"}, - {"name": "de-DE-ElkeNeural", "language": "de-DE", "gender": "Female", "style": "Neural", "locale": "de-DE"}, - {"name": "de-DE-GiselaNeural", "language": "de-DE", "gender": "Female", "style": "Neural", "locale": "de-DE"}, - {"name": "de-DE-JoergNeural", "language": "de-DE", "gender": "Male", "style": "Neural", "locale": "de-DE"}, - {"name": "de-DE-KasperNeural", "language": "de-DE", "gender": "Male", "style": "Neural", "locale": "de-DE"}, - {"name": "de-DE-KillianNeural", "language": "de-DE", "gender": "Male", "style": "Neural", "locale": "de-DE"}, - {"name": "de-DE-KlausNeural", "language": "de-DE", "gender": "Male", "style": "Neural", "locale": "de-DE"}, - {"name": "de-DE-LouisaNeural", "language": "de-DE", "gender": "Female", "style": "Neural", "locale": "de-DE"}, - {"name": "de-DE-MajaNeural", "language": "de-DE", "gender": "Female", "style": "Neural", "locale": "de-DE"}, - {"name": "de-DE-RalfNeural", "language": "de-DE", "gender": "Male", "style": "Neural", "locale": "de-DE"}, - {"name": "de-DE-TanjaNeural", "language": "de-DE", "gender": "Female", "style": "Neural", "locale": "de-DE"}, - - # English (US) voices - {"name": "en-US-AriaNeural", "language": "en-US", "gender": "Female", "style": "Neural", "locale": "en-US"}, - {"name": "en-US-DavisNeural", "language": "en-US", "gender": "Male", "style": "Neural", "locale": "en-US"}, - {"name": "en-US-GuyNeural", "language": "en-US", "gender": "Male", "style": "Neural", "locale": "en-US"}, - {"name": "en-US-JaneNeural", "language": "en-US", "gender": "Female", "style": "Neural", "locale": "en-US"}, - {"name": "en-US-JasonNeural", "language": "en-US", "gender": "Male", "style": "Neural", "locale": "en-US"}, - {"name": "en-US-JennyNeural", "language": "en-US", "gender": "Female", "style": "Neural", "locale": "en-US"}, - {"name": "en-US-MichelleNeural", "language": "en-US", "gender": "Female", "style": "Neural", "locale": "en-US"}, - {"name": "en-US-RyanNeural", "language": "en-US", "gender": "Male", "style": "Neural", "locale": "en-US"}, - {"name": "en-US-SaraNeural", "language": "en-US", "gender": "Female", "style": "Neural", "locale": "en-US"}, - {"name": "en-US-TonyNeural", "language": "en-US", "gender": "Male", "style": "Neural", "locale": "en-US"}, - - # English (UK) voices - {"name": "en-GB-LibbyNeural", "language": "en-GB", "gender": "Female", "style": "Neural", "locale": "en-GB"}, - {"name": "en-GB-MaisieNeural", "language": "en-GB", "gender": "Female", "style": "Neural", "locale": "en-GB"}, - {"name": "en-GB-RyanNeural", "language": "en-GB", "gender": "Male", "style": "Neural", "locale": "en-GB"}, - {"name": "en-GB-SoniaNeural", "language": "en-GB", "gender": "Female", "style": "Neural", "locale": "en-GB"}, - {"name": "en-GB-ThomasNeural", "language": "en-GB", "gender": "Male", "style": "Neural", "locale": "en-GB"}, - - # French voices - {"name": "fr-FR-DeniseNeural", "language": "fr-FR", "gender": "Female", "style": "Neural", "locale": "fr-FR"}, - {"name": "fr-FR-HenriNeural", "language": "fr-FR", "gender": "Male", "style": "Neural", "locale": "fr-FR"}, - {"name": "fr-FR-ArianeNeural", "language": "fr-FR", "gender": "Female", "style": "Neural", "locale": "fr-FR"}, - {"name": "fr-FR-ClaudeNeural", "language": "fr-FR", "gender": "Male", "style": "Neural", "locale": "fr-FR"}, - {"name": "fr-FR-JacquelineNeural", "language": "fr-FR", "gender": "Female", "style": "Neural", "locale": "fr-FR"}, - {"name": "fr-FR-JeromeNeural", "language": "fr-FR", "gender": "Male", "style": "Neural", "locale": "fr-FR"}, - {"name": "fr-FR-JosephineNeural", "language": "fr-FR", "gender": "Female", "style": "Neural", "locale": "fr-FR"}, - {"name": "fr-FR-MauriceNeural", "language": "fr-FR", "gender": "Male", "style": "Neural", "locale": "fr-FR"}, - {"name": "fr-FR-YvetteNeural", "language": "fr-FR", "gender": "Female", "style": "Neural", "locale": "fr-FR"}, - - # Spanish voices - {"name": "es-ES-ElviraNeural", "language": "es-ES", "gender": "Female", "style": "Neural", "locale": "es-ES"}, - {"name": "es-ES-AlvaroNeural", "language": "es-ES", "gender": "Male", "style": "Neural", "locale": "es-ES"}, - {"name": "es-ES-ArnauNeural", "language": "es-ES", "gender": "Male", "style": "Neural", "locale": "es-ES"}, - {"name": "es-ES-DarioNeural", "language": "es-ES", "gender": "Male", "style": "Neural", "locale": "es-ES"}, - {"name": "es-ES-EliasNeural", "language": "es-ES", "gender": "Male", "style": "Neural", "locale": "es-ES"}, - {"name": "es-ES-EstrellaNeural", "language": "es-ES", "gender": "Female", "style": "Neural", "locale": "es-ES"}, - {"name": "es-ES-IreneNeural", "language": "es-ES", "gender": "Female", "style": "Neural", "locale": "es-ES"}, - {"name": "es-ES-LaiaNeural", "language": "es-ES", "gender": "Female", "style": "Neural", "locale": "es-ES"}, - {"name": "es-ES-LiaNeural", "language": "es-ES", "gender": "Female", "style": "Neural", "locale": "es-ES"}, - {"name": "es-ES-NilNeural", "language": "es-ES", "gender": "Male", "style": "Neural", "locale": "es-ES"}, - {"name": "es-ES-SaulNeural", "language": "es-ES", "gender": "Male", "style": "Neural", "locale": "es-ES"}, - {"name": "es-ES-TeoNeural", "language": "es-ES", "gender": "Male", "style": "Neural", "locale": "es-ES"}, - {"name": "es-ES-TrianaNeural", "language": "es-ES", "gender": "Female", "style": "Neural", "locale": "es-ES"}, - {"name": "es-ES-VeraNeural", "language": "es-ES", "gender": "Female", "style": "Neural", "locale": "es-ES"} - ] - return voices - - except Exception as e: - logger.error(f"Failed to get voices: {str(e)}") - return [] - - async def get_available_languages(self) -> List[Dict]: - """Get list of available languages supported by Azure Speech Services.""" - try: - # Comprehensive list of Azure Speech Services supported languages - languages = [ - # European languages - {"code": "de-DE", "name": "German (Germany)", "stt": True, "tts": True, "translation": True}, - {"code": "en-US", "name": "English (United States)", "stt": True, "tts": True, "translation": True}, - {"code": "en-GB", "name": "English (United Kingdom)", "stt": True, "tts": True, "translation": True}, - {"code": "fr-FR", "name": "French (France)", "stt": True, "tts": True, "translation": True}, - {"code": "es-ES", "name": "Spanish (Spain)", "stt": True, "tts": True, "translation": True}, - {"code": "es-MX", "name": "Spanish (Mexico)", "stt": True, "tts": True, "translation": True}, - {"code": "it-IT", "name": "Italian (Italy)", "stt": True, "tts": True, "translation": True}, - {"code": "pt-BR", "name": "Portuguese (Brazil)", "stt": True, "tts": True, "translation": True}, - {"code": "pt-PT", "name": "Portuguese (Portugal)", "stt": True, "tts": True, "translation": True}, - {"code": "ru-RU", "name": "Russian (Russia)", "stt": True, "tts": True, "translation": True}, - {"code": "nl-NL", "name": "Dutch (Netherlands)", "stt": True, "tts": True, "translation": True}, - {"code": "sv-SE", "name": "Swedish (Sweden)", "stt": True, "tts": True, "translation": True}, - {"code": "no-NO", "name": "Norwegian (Norway)", "stt": True, "tts": True, "translation": True}, - {"code": "da-DK", "name": "Danish (Denmark)", "stt": True, "tts": True, "translation": True}, - {"code": "fi-FI", "name": "Finnish (Finland)", "stt": True, "tts": True, "translation": True}, - {"code": "pl-PL", "name": "Polish (Poland)", "stt": True, "tts": True, "translation": True}, - {"code": "cs-CZ", "name": "Czech (Czech Republic)", "stt": True, "tts": True, "translation": True}, - {"code": "hu-HU", "name": "Hungarian (Hungary)", "stt": True, "tts": True, "translation": True}, - {"code": "ro-RO", "name": "Romanian (Romania)", "stt": True, "tts": True, "translation": True}, - {"code": "bg-BG", "name": "Bulgarian (Bulgaria)", "stt": True, "tts": True, "translation": True}, - {"code": "hr-HR", "name": "Croatian (Croatia)", "stt": True, "tts": True, "translation": True}, - {"code": "sk-SK", "name": "Slovak (Slovakia)", "stt": True, "tts": True, "translation": True}, - {"code": "sl-SI", "name": "Slovenian (Slovenia)", "stt": True, "tts": True, "translation": True}, - {"code": "et-EE", "name": "Estonian (Estonia)", "stt": True, "tts": True, "translation": True}, - {"code": "lv-LV", "name": "Latvian (Latvia)", "stt": True, "tts": True, "translation": True}, - {"code": "lt-LT", "name": "Lithuanian (Lithuania)", "stt": True, "tts": True, "translation": True}, - {"code": "mt-MT", "name": "Maltese (Malta)", "stt": True, "tts": True, "translation": True}, - {"code": "ga-IE", "name": "Irish (Ireland)", "stt": True, "tts": True, "translation": True}, - {"code": "cy-GB", "name": "Welsh (United Kingdom)", "stt": True, "tts": True, "translation": True}, - - # Asian languages - {"code": "ja-JP", "name": "Japanese (Japan)", "stt": True, "tts": True, "translation": True}, - {"code": "ko-KR", "name": "Korean (Korea)", "stt": True, "tts": True, "translation": True}, - {"code": "zh-CN", "name": "Chinese (Simplified)", "stt": True, "tts": True, "translation": True}, - {"code": "zh-TW", "name": "Chinese (Traditional)", "stt": True, "tts": True, "translation": True}, - {"code": "zh-HK", "name": "Chinese (Hong Kong)", "stt": True, "tts": True, "translation": True}, - {"code": "th-TH", "name": "Thai (Thailand)", "stt": True, "tts": True, "translation": True}, - {"code": "vi-VN", "name": "Vietnamese (Vietnam)", "stt": True, "tts": True, "translation": True}, - {"code": "id-ID", "name": "Indonesian (Indonesia)", "stt": True, "tts": True, "translation": True}, - {"code": "ms-MY", "name": "Malay (Malaysia)", "stt": True, "tts": True, "translation": True}, - {"code": "tl-PH", "name": "Filipino (Philippines)", "stt": True, "tts": True, "translation": True}, - - # Middle Eastern and African languages - {"code": "ar-SA", "name": "Arabic (Saudi Arabia)", "stt": True, "tts": True, "translation": True}, - {"code": "ar-EG", "name": "Arabic (Egypt)", "stt": True, "tts": True, "translation": True}, - {"code": "ar-AE", "name": "Arabic (UAE)", "stt": True, "tts": True, "translation": True}, - {"code": "ar-KW", "name": "Arabic (Kuwait)", "stt": True, "tts": True, "translation": True}, - {"code": "ar-QA", "name": "Arabic (Qatar)", "stt": True, "tts": True, "translation": True}, - {"code": "ar-BH", "name": "Arabic (Bahrain)", "stt": True, "tts": True, "translation": True}, - {"code": "ar-OM", "name": "Arabic (Oman)", "stt": True, "tts": True, "translation": True}, - {"code": "ar-JO", "name": "Arabic (Jordan)", "stt": True, "tts": True, "translation": True}, - {"code": "ar-LB", "name": "Arabic (Lebanon)", "stt": True, "tts": True, "translation": True}, - {"code": "ar-PS", "name": "Arabic (Palestine)", "stt": True, "tts": True, "translation": True}, - {"code": "ar-SY", "name": "Arabic (Syria)", "stt": True, "tts": True, "translation": True}, - {"code": "ar-IQ", "name": "Arabic (Iraq)", "stt": True, "tts": True, "translation": True}, - {"code": "ar-MA", "name": "Arabic (Morocco)", "stt": True, "tts": True, "translation": True}, - {"code": "ar-DZ", "name": "Arabic (Algeria)", "stt": True, "tts": True, "translation": True}, - {"code": "ar-TN", "name": "Arabic (Tunisia)", "stt": True, "tts": True, "translation": True}, - {"code": "ar-LY", "name": "Arabic (Libya)", "stt": True, "tts": True, "translation": True}, - {"code": "ar-SD", "name": "Arabic (Sudan)", "stt": True, "tts": True, "translation": True}, - {"code": "he-IL", "name": "Hebrew (Israel)", "stt": True, "tts": True, "translation": True}, - {"code": "tr-TR", "name": "Turkish (Turkey)", "stt": True, "tts": True, "translation": True}, - {"code": "fa-IR", "name": "Persian (Iran)", "stt": True, "tts": True, "translation": True}, - {"code": "ur-PK", "name": "Urdu (Pakistan)", "stt": True, "tts": True, "translation": True}, - {"code": "hi-IN", "name": "Hindi (India)", "stt": True, "tts": True, "translation": True}, - {"code": "bn-BD", "name": "Bengali (Bangladesh)", "stt": True, "tts": True, "translation": True}, - {"code": "ta-IN", "name": "Tamil (India)", "stt": True, "tts": True, "translation": True}, - {"code": "te-IN", "name": "Telugu (India)", "stt": True, "tts": True, "translation": True}, - {"code": "ml-IN", "name": "Malayalam (India)", "stt": True, "tts": True, "translation": True}, - {"code": "kn-IN", "name": "Kannada (India)", "stt": True, "tts": True, "translation": True}, - {"code": "gu-IN", "name": "Gujarati (India)", "stt": True, "tts": True, "translation": True}, - {"code": "pa-IN", "name": "Punjabi (India)", "stt": True, "tts": True, "translation": True}, - {"code": "mr-IN", "name": "Marathi (India)", "stt": True, "tts": True, "translation": True}, - {"code": "ne-NP", "name": "Nepali (Nepal)", "stt": True, "tts": True, "translation": True}, - {"code": "si-LK", "name": "Sinhala (Sri Lanka)", "stt": True, "tts": True, "translation": True}, - {"code": "my-MM", "name": "Burmese (Myanmar)", "stt": True, "tts": True, "translation": True}, - {"code": "km-KH", "name": "Khmer (Cambodia)", "stt": True, "tts": True, "translation": True}, - {"code": "lo-LA", "name": "Lao (Laos)", "stt": True, "tts": True, "translation": True}, - - # African languages - {"code": "sw-KE", "name": "Swahili (Kenya)", "stt": True, "tts": True, "translation": True}, - {"code": "sw-TZ", "name": "Swahili (Tanzania)", "stt": True, "tts": True, "translation": True}, - {"code": "am-ET", "name": "Amharic (Ethiopia)", "stt": True, "tts": True, "translation": True}, - {"code": "zu-ZA", "name": "Zulu (South Africa)", "stt": True, "tts": True, "translation": True}, - {"code": "af-ZA", "name": "Afrikaans (South Africa)", "stt": True, "tts": True, "translation": True}, - {"code": "yo-NG", "name": "Yoruba (Nigeria)", "stt": True, "tts": True, "translation": True}, - {"code": "ig-NG", "name": "Igbo (Nigeria)", "stt": True, "tts": True, "translation": True}, - {"code": "ha-NG", "name": "Hausa (Nigeria)", "stt": True, "tts": True, "translation": True}, - - # Other languages - {"code": "is-IS", "name": "Icelandic (Iceland)", "stt": True, "tts": True, "translation": True}, - {"code": "mk-MK", "name": "Macedonian (North Macedonia)", "stt": True, "tts": True, "translation": True}, - {"code": "sq-AL", "name": "Albanian (Albania)", "stt": True, "tts": True, "translation": True}, - {"code": "sr-RS", "name": "Serbian (Serbia)", "stt": True, "tts": True, "translation": True}, - {"code": "bs-BA", "name": "Bosnian (Bosnia and Herzegovina)", "stt": True, "tts": True, "translation": True}, - {"code": "me-ME", "name": "Montenegrin (Montenegro)", "stt": True, "tts": True, "translation": True}, - {"code": "uk-UA", "name": "Ukrainian (Ukraine)", "stt": True, "tts": True, "translation": True}, - {"code": "be-BY", "name": "Belarusian (Belarus)", "stt": True, "tts": True, "translation": True}, - {"code": "ka-GE", "name": "Georgian (Georgia)", "stt": True, "tts": True, "translation": True}, - {"code": "hy-AM", "name": "Armenian (Armenia)", "stt": True, "tts": True, "translation": True}, - {"code": "az-AZ", "name": "Azerbaijani (Azerbaijan)", "stt": True, "tts": True, "translation": True}, - {"code": "kk-KZ", "name": "Kazakh (Kazakhstan)", "stt": True, "tts": True, "translation": True}, - {"code": "ky-KG", "name": "Kyrgyz (Kyrgyzstan)", "stt": True, "tts": True, "translation": True}, - {"code": "uz-UZ", "name": "Uzbek (Uzbekistan)", "stt": True, "tts": True, "translation": True}, - {"code": "tg-TJ", "name": "Tajik (Tajikistan)", "stt": True, "tts": True, "translation": True}, - {"code": "mn-MN", "name": "Mongolian (Mongolia)", "stt": True, "tts": True, "translation": True} - ] - return languages - - except Exception as e: - logger.error(f"Failed to get languages: {str(e)}") - return [] - - async def test_connection(self) -> bool: - """Test Azure Speech Services connection.""" - try: - # Test with a simple TTS request - test_audio = await self.text_to_speech("Test", "en-US", "en-US-AriaNeural") - return len(test_audio) > 0 - except Exception as e: - logger.error(f"Connection test failed: {str(e)}") - return False - - async def stream_speech_to_text(self, audio_stream: AsyncGenerator[bytes, None], language: str = "de-DE", format: str = "detailed", audio_format: str = "wav") -> AsyncGenerator[Dict, None]: - """ - Stream speech to text using Azure Speech Services. - - Args: - audio_stream: Async generator yielding audio chunks - language: Language code (e.g., "de-DE") - format: Response format ("simple" or "detailed") - audio_format: Audio format ("wav", "mp3", "ogg") - - Yields: - Dict with partial transcription results - """ - try: - # Check rate limit - if not self._check_rate_limit("stt"): - raise Exception("Rate limit exceeded for speech-to-text service. Please wait before making more requests.") - - url = f"{self.base_url}/speech/recognition/conversation/cognitiveservices/v1" - - # Get appropriate content type - content_type = self._get_audio_content_type(audio_format) - if audio_format == "wav": - content_type = f"{content_type}; codecs=audio/pcm; samplerate=16000" - - headers = { - "Content-Type": content_type, - "Accept": "application/json", - "Ocp-Apim-Subscription-Key": self.subscription_key, - "Ocp-Apim-Subscription-Region": self.region - } - - params = { - "language": language, - "format": "detailed" if format == "detailed" else "simple" - } - - # Process audio stream in chunks - async with aiohttp.ClientSession(timeout=self.timeout) as session: - async for audio_chunk in audio_stream: - try: - # Validate chunk - if not audio_chunk: - continue - - # Make API call for this chunk - async with session.post( - url, - headers=headers, - params=params, - data=audio_chunk - ) as response: - if response.status == 200: - result = await response.json() - - # Yield partial result - if format == "detailed": - yield { - "text": result.get("DisplayText", ""), - "confidence": result.get("Confidence", 0.0), - "language": result.get("RecognitionStatus", language), - "format": format, - "is_final": result.get("RecognitionStatus") == "Success", - "raw_result": result - } - else: - yield { - "text": result.get("DisplayText", ""), - "confidence": 1.0, - "language": language, - "format": format, - "is_final": result.get("RecognitionStatus") == "Success" - } - else: - error_text = await response.text() - logger.error(f"Streaming STT API failed: {response.status} - {error_text}") - yield { - "error": f"API error {response.status}: {error_text}", - "is_final": True - } - - except Exception as e: - logger.error(f"Error processing audio chunk: {str(e)}") - yield { - "error": str(e), - "is_final": True - } - - except Exception as e: - logger.error(f"Streaming speech-to-text failed: {str(e)}") - yield { - "error": str(e), - "is_final": True - } - - async def stream_text_to_speech(self, text_stream: AsyncGenerator[str, None], language: str = "de-DE", voice: str = "de-DE-KatjaNeural", format: str = "audio-16khz-128kbitrate-mono-mp3") -> AsyncGenerator[bytes, None]: - """ - Stream text to speech using Azure Speech Services. - - Args: - text_stream: Async generator yielding text chunks - language: Language code - voice: Voice name - format: Audio format - - Yields: - Audio data chunks as bytes - """ - try: - # Check rate limit - if not self._check_rate_limit("tts"): - raise Exception("Rate limit exceeded for text-to-speech service. Please wait before making more requests.") - - # Validate format - if format not in self.supported_tts_formats: - raise Exception(f"Unsupported TTS format: {format}. Supported formats: {', '.join(self.supported_tts_formats)}") - - url = f"https://{self.region}.tts.speech.microsoft.com/cognitiveservices/v1" - - headers = { - "Content-Type": "application/ssml+xml", - "X-Microsoft-OutputFormat": format, - "Ocp-Apim-Subscription-Key": self.subscription_key, - "User-Agent": "PowerOn-Voice-Services/1.0" - } - - # Process text stream in chunks - async with aiohttp.ClientSession(timeout=self.timeout) as session: - async for text_chunk in text_stream: - try: - if not text_chunk.strip(): - continue - - # Create SSML for this chunk - escaped_text = text_chunk.replace("&", "&").replace("<", "<").replace(">", ">").replace('"', """).replace("'", "'") - ssml = f""" - - {escaped_text} - - """ - - # Make API call for this chunk - async with session.post( - url, - headers=headers, - data=ssml.encode('utf-8') - ) as response: - if response.status == 200: - audio_data = await response.read() - if audio_data: - yield audio_data - else: - error_text = await response.text() - logger.error(f"Streaming TTS API failed: {response.status} - {error_text}") - - except Exception as e: - logger.error(f"Error processing text chunk: {str(e)}") - - except Exception as e: - logger.error(f"Streaming text-to-speech failed: {str(e)}") - - async def stream_realtime_interpreter(self, audio_stream: AsyncGenerator[bytes, None], from_language: str = "de-DE", to_language: str = "en-US") -> AsyncGenerator[Dict, None]: - """ - Stream real-time interpreter: speech to translated text. - - Args: - audio_stream: Async generator yielding audio chunks - from_language: Source language code - to_language: Target language code - - Yields: - Dict with translation results - """ - try: - # Check rate limits - if not self._check_rate_limit("stt") or not self._check_rate_limit("translation"): - raise Exception("Rate limit exceeded for interpreter service. Please wait before making more requests.") - - # Process audio stream - async for stt_result in self.stream_speech_to_text(audio_stream, from_language): - if "error" in stt_result: - yield stt_result - continue - - original_text = stt_result.get("text", "") - - # Translate text if different languages - translated_text = original_text - if from_language != to_language and original_text.strip(): - try: - translated_text = await self.translate_text( - text=original_text, - from_language=from_language, - to_language=to_language - ) - except Exception as e: - logger.warning(f"Translation failed: {str(e)}") - translated_text = original_text - - yield { - "original_text": original_text, - "translated_text": translated_text, - "from_language": from_language, - "to_language": to_language, - "confidence": stt_result.get("confidence", 0.0), - "is_final": stt_result.get("is_final", False) - } - - except Exception as e: - logger.error(f"Streaming realtime interpreter failed: {str(e)}") - yield { - "error": str(e), - "is_final": True - } diff --git a/modules/connectors/connectorGoogleSpeech.py b/modules/connectors/connectorGoogleSpeech.py new file mode 100644 index 00000000..002f88de --- /dev/null +++ b/modules/connectors/connectorGoogleSpeech.py @@ -0,0 +1,285 @@ +""" +Google Cloud Speech-to-Text and Translation Connector +Replaces Azure Speech Services with Google Cloud APIs +""" + +import os +import io +import logging +import asyncio +from typing import Dict, Optional, Any +from google.cloud import speech +from google.cloud import translate_v2 as translate + +logger = logging.getLogger(__name__) + +class ConnectorGoogleSpeech: + """ + Google Cloud Speech-to-Text and Translation connector. + Handles audio processing, speech recognition, and translation. + """ + + def __init__(self, credentials_path: Optional[str] = None): + """ + Initialize Google Cloud Speech and Translation clients. + + Args: + credentials_path: Path to Google Cloud service account JSON file + """ + try: + # Set up authentication + if credentials_path: + os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path + + # Initialize clients + self.speech_client = speech.SpeechClient() + self.translate_client = translate.Client() + + logger.info("✅ Google Cloud Speech and Translation clients initialized successfully") + + except Exception as e: + logger.error(f"❌ Failed to initialize Google Cloud clients: {e}") + raise + + async def speech_to_text(self, audio_content: bytes, language: str = "de-DE", + sample_rate: int = 16000, channels: int = 1) -> Dict: + """ + Convert speech to text using Google Cloud Speech-to-Text API. + + Args: + audio_content: Raw audio data (PCM format) + language: Language code (e.g., 'de-DE', 'en-US') + sample_rate: Audio sample rate (default: 16000 Hz) + channels: Number of audio channels (default: 1) + + Returns: + Dict containing transcribed text, confidence, and metadata + """ + try: + logger.info(f"🎤 Processing audio with Google Cloud Speech-to-Text") + logger.info(f"📊 Audio: {len(audio_content)} bytes, {sample_rate}Hz, {channels}ch") + + # Configure audio settings + audio = speech.RecognitionAudio(content=audio_content) + config = speech.RecognitionConfig( + encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, + sample_rate_hertz=sample_rate, + audio_channel_count=channels, + language_code=language, + enable_automatic_punctuation=True, + model="latest_long" # Use the latest model + ) + + # Perform speech recognition + logger.info("🔄 Sending audio to Google Cloud Speech-to-Text...") + response = self.speech_client.recognize(config=config, audio=audio) + + # Process results + if response.results: + result = response.results[0] + if result.alternatives: + alternative = result.alternatives[0] + transcribed_text = alternative.transcript + confidence = alternative.confidence + + logger.info(f"✅ Transcription successful: '{transcribed_text}' (confidence: {confidence:.2f})") + + return { + "success": True, + "text": transcribed_text, + "confidence": confidence, + "language": language, + "raw_result": { + "transcript": transcribed_text, + "confidence": confidence, + "language_code": language + } + } + else: + logger.warning("⚠️ No transcription alternatives found") + return { + "success": False, + "text": "", + "confidence": 0.0, + "error": "No transcription alternatives found" + } + else: + logger.warning("⚠️ No recognition results from Google Cloud") + return { + "success": False, + "text": "", + "confidence": 0.0, + "error": "No recognition results" + } + + except Exception as e: + logger.error(f"❌ Google Cloud Speech-to-Text error: {e}") + return { + "success": False, + "text": "", + "confidence": 0.0, + "error": str(e) + } + + async def translate_text(self, text: str, target_language: str = "en", + source_language: str = "de") -> Dict: + """ + Translate text using Google Cloud Translation API. + + Args: + text: Text to translate + target_language: Target language code (e.g., 'en', 'de') + source_language: Source language code (e.g., 'de', 'en') + + Returns: + Dict containing translated text and metadata + """ + try: + if not text.strip(): + logger.warning("⚠️ Empty text provided for translation") + return { + "success": False, + "translated_text": "", + "error": "Empty text provided" + } + + logger.info(f"🌐 Translating: '{text}' ({source_language} -> {target_language})") + + # Perform translation + result = self.translate_client.translate( + text, + source_language=source_language, + target_language=target_language + ) + + translated_text = result['translatedText'] + detected_language = result.get('detectedSourceLanguage', source_language) + + logger.info(f"✅ Translation successful: '{translated_text}'") + + return { + "success": True, + "translated_text": translated_text, + "source_language": detected_language, + "target_language": target_language, + "original_text": text + } + + except Exception as e: + logger.error(f"❌ Google Cloud Translation error: {e}") + return { + "success": False, + "translated_text": "", + "error": str(e) + } + + async def speech_to_translated_text(self, audio_content: bytes, + from_language: str = "de-DE", + to_language: str = "en") -> Dict: + """ + Complete pipeline: Speech-to-Text + Translation. + + Args: + audio_content: Raw audio data + from_language: Source language for speech recognition + to_language: Target language for translation + + Returns: + Dict containing original text, translated text, and metadata + """ + try: + logger.info(f"🔄 Starting speech-to-translation pipeline: {from_language} -> {to_language}") + + # Step 1: Speech-to-Text + speech_result = await self.speech_to_text( + audio_content=audio_content, + language=from_language + ) + + if not speech_result["success"]: + return { + "success": False, + "original_text": "", + "translated_text": "", + "error": f"Speech recognition failed: {speech_result.get('error', 'Unknown error')}" + } + + original_text = speech_result["text"] + + # Step 2: Translation + translation_result = await self.translate_text( + text=original_text, + source_language=from_language.split('-')[0], # Convert 'de-DE' to 'de' + target_language=to_language.split('-')[0] # Convert 'en-US' to 'en' + ) + + if not translation_result["success"]: + return { + "success": False, + "original_text": original_text, + "translated_text": "", + "error": f"Translation failed: {translation_result.get('error', 'Unknown error')}" + } + + translated_text = translation_result["translated_text"] + + logger.info(f"✅ Complete pipeline successful:") + logger.info(f" Original: '{original_text}'") + logger.info(f" Translated: '{translated_text}'") + + return { + "success": True, + "original_text": original_text, + "translated_text": translated_text, + "confidence": speech_result["confidence"], + "source_language": from_language, + "target_language": to_language + } + + except Exception as e: + logger.error(f"❌ Speech-to-translation pipeline error: {e}") + return { + "success": False, + "original_text": "", + "translated_text": "", + "error": str(e) + } + + def validate_audio_format(self, audio_content: bytes) -> Dict: + """ + Validate audio format for Google Cloud Speech-to-Text. + + Args: + audio_content: Raw audio data + + Returns: + Dict containing validation results + """ + try: + # Google Cloud Speech-to-Text supports various formats + # We'll do basic validation + if len(audio_content) < 100: + return { + "valid": False, + "error": "Audio too short (less than 100 bytes)" + } + + # Check if it looks like PCM audio (basic check) + if len(audio_content) % 2 != 0: + return { + "valid": False, + "error": "Audio data length is odd (not 16-bit PCM)" + } + + return { + "valid": True, + "format": "pcm", + "size": len(audio_content), + "estimated_duration": len(audio_content) / (16000 * 2) # Rough estimate for 16kHz, 16-bit + } + + except Exception as e: + return { + "valid": False, + "error": f"Validation error: {e}" + } diff --git a/modules/routes/routeVoiceAzure.py b/modules/routes/routeVoiceAzure.py deleted file mode 100644 index 90ff237e..00000000 --- a/modules/routes/routeVoiceAzure.py +++ /dev/null @@ -1,813 +0,0 @@ -""" -Azure Voice Services Route -Provides endpoints for Azure Speech Services integration including: -- Speech-to-Text (STT) -- Text-to-Speech (TTS) -- Translation services -- Real-time conversation -""" - -import logging -import asyncio -import json -from typing import Dict, Any, Optional -from fastapi import APIRouter, HTTPException, Depends, UploadFile, File, Form -from fastapi.responses import StreamingResponse -from pydantic import BaseModel -import io -import base64 -import asyncio -from typing import AsyncGenerator -from datetime import datetime - -from modules.interfaces.interfaceAppObjects import getRootInterface -from modules.interfaces.interfaceAppModel import UserInDB -from modules.security.auth import getCurrentUser -from modules.connectors.connectorAzureSpeech import ConnectorAzureSpeech - -logger = logging.getLogger(__name__) - -router = APIRouter(prefix="/api/voice", tags=["voice"]) - -# Pydantic models for request/response -class SpeechToTextRequest(BaseModel): - language: str = "de-DE" - format: str = "detailed" # "simple" or "detailed" - -class TextToSpeechRequest(BaseModel): - text: str - language: str = "de-DE" - voice: str = "de-DE-KatjaNeural" - format: str = "audio-16khz-128kbitrate-mono-mp3" - -class TranslationRequest(BaseModel): - text: str - from_language: str = "de-DE" - to_language: str = "en-US" - -class ConversationRequest(BaseModel): - message: str - language: str = "de-DE" - response_voice: str = "de-DE-KatjaNeural" - -class VoiceSettingsRequest(BaseModel): - stt_language: str = "de-DE" - tts_language: str = "de-DE" - tts_voice: str = "de-DE-KatjaNeural" - translation_enabled: bool = True - target_language: str = "en-US" - -# Get Azure Speech connector for current user -async def get_azure_speech_connector(current_user: UserInDB = Depends(getCurrentUser), connection_id: Optional[str] = None) -> ConnectorAzureSpeech: - """Get Azure Speech connector for the current user.""" - try: - root_interface = getRootInterface() - - # Get user connections - user_connections = root_interface.getUserConnections(current_user.id) - azure_connection = None - - if connection_id: - # Find specific connection by ID - for connection in user_connections: - if connection.id == connection_id and connection.authority == "msft": - azure_connection = connection - break - else: - # Find first Azure connection - for connection in user_connections: - if connection.authority == "msft": - azure_connection = connection - break - - if not azure_connection: - if connection_id: - raise HTTPException( - status_code=400, - detail=f"Azure connection with ID '{connection_id}' not found." - ) - else: - raise HTTPException( - status_code=400, - detail="No Azure connection found. Please connect your Microsoft account first." - ) - - # Get connection token - connection_token = root_interface.getConnectionToken(azure_connection.id) - if not connection_token: - raise HTTPException( - status_code=400, - detail="Azure connection token not found. Please reconnect your Microsoft account." - ) - - # For Azure Speech Services, we need a subscription key, not an access token - # Use Azure Speech Services subscription key from configuration - from modules.shared.configuration import APP_CONFIG - subscription_key = APP_CONFIG.get("Connector_AzureSpeech_SUBSCRIPTION_KEY") - - # Debug: Log subscription key (masked for security) - logger.info(f"Loaded subscription key: {subscription_key[:8]}...{subscription_key[-8:] if subscription_key and len(subscription_key) > 16 else 'INVALID'}") - logger.info(f"Key length: {len(subscription_key) if subscription_key else 'None'}") - - if not subscription_key or subscription_key == "your-azure-speech-subscription-key-here": - raise HTTPException( - status_code=500, - detail="Azure Speech Services subscription key not configured. Please set Connector_AzureSpeech_SUBSCRIPTION_KEY in config.ini" - ) - - # Get region from configuration - region = APP_CONFIG.get("Connector_AzureSpeech_REGION", "westeurope") - - # Create Azure Speech connector - connector = ConnectorAzureSpeech(subscription_key=subscription_key, region=region) - - return connector - - except Exception as e: - logger.error(f"Error getting Azure Speech connector: {str(e)}") - raise HTTPException(status_code=500, detail=f"Failed to initialize Azure Speech connector: {str(e)}") - -@router.get("/connections") -async def get_user_connections(current_user: UserInDB = Depends(getCurrentUser)): - """Get all Microsoft connections for the current user.""" - try: - root_interface = getRootInterface() - - # Get user connections - user_connections = root_interface.getUserConnections(current_user.id) - - # Filter for Microsoft connections only - msft_connections = [] - for connection in user_connections: - if connection.authority == "msft": - # Check if this connection has speech services subscription key - has_speech_key = False - if hasattr(connection, 'metadata') and connection.metadata: - try: - metadata = json.loads(connection.metadata) if isinstance(connection.metadata, str) else connection.metadata - has_speech_key = bool(metadata.get('speech_subscription_key') or metadata.get('service_type') == 'speech_services') - except (json.JSONDecodeError, AttributeError): - pass - - msft_connections.append({ - "id": connection.id, - "externalUsername": connection.externalUsername, - "authority": connection.authority, - "isActive": connection.status == "active", - "hasSpeechKey": has_speech_key, - "connectedAt": connection.connectedAt, - "lastChecked": connection.lastChecked - }) - - return { - "success": True, - "connections": msft_connections, - "count": len(msft_connections) - } - - except Exception as e: - logger.error(f"Error getting user connections: {str(e)}") - raise HTTPException(status_code=500, detail=str(e)) - -class SpeechSubscriptionRequest(BaseModel): - subscription_key: str - region: str = "westeurope" - connection_id: Optional[str] = None - -@router.post("/subscription") -async def set_speech_subscription( - request: SpeechSubscriptionRequest, - current_user: UserInDB = Depends(getCurrentUser) -): - """Set Azure Speech Services subscription key for the user.""" - try: - root_interface = getRootInterface() - - # Validate subscription key format (basic validation) - if not request.subscription_key or len(request.subscription_key) < 32: - raise HTTPException( - status_code=400, - detail="Invalid subscription key format. Please provide a valid Azure Speech Services subscription key." - ) - - # Validate region - valid_regions = ["westeurope", "eastus", "westus2", "eastus2", "southeastasia", "westcentralus", "eastasia", "northeurope", "southcentralus", "centralus", "australiaeast", "brazilsouth", "canadacentral", "centralindia", "francecentral", "germanywestcentral", "japaneast", "koreacentral", "norwayeast", "southafricanorth", "switzerlandnorth", "uaenorth", "uksouth", "westus3"] - if request.region not in valid_regions: - raise HTTPException( - status_code=400, - detail=f"Invalid region. Supported regions: {', '.join(valid_regions)}" - ) - - # Test the subscription key by creating a temporary connector - try: - test_connector = ConnectorAzureSpeech(subscription_key=request.subscription_key, region=request.region) - # Test with a simple TTS request - test_audio = await test_connector.text_to_speech("Test", "en-US", "en-US-AriaNeural") - if len(test_audio) == 0: - raise Exception("Subscription key test failed") - except Exception as e: - raise HTTPException( - status_code=400, - detail=f"Invalid subscription key or region. Test failed: {str(e)}" - ) - - # Get user connections - user_connections = root_interface.getUserConnections(current_user.id) - - # Find the target connection - target_connection = None - if request.connection_id: - # Use specific connection - for connection in user_connections: - if connection.id == request.connection_id and connection.authority == "msft": - target_connection = connection - break - if not target_connection: - raise HTTPException( - status_code=400, - detail=f"Connection with ID '{request.connection_id}' not found." - ) - else: - # Use first Microsoft connection or create a new one - target_connection = None - for connection in user_connections: - if connection.authority == "msft": - target_connection = connection - break - - if not target_connection: - # Create a new connection for speech services - # This would require implementing connection creation in the interface - raise HTTPException( - status_code=400, - detail="No Microsoft connection found. Please connect your Microsoft account first." - ) - - # Update connection metadata with speech subscription key - metadata = {} - if hasattr(target_connection, 'metadata') and target_connection.metadata: - try: - metadata = json.loads(target_connection.metadata) if isinstance(target_connection.metadata, str) else target_connection.metadata - except (json.JSONDecodeError, AttributeError): - metadata = {} - - # Add speech services information - metadata['speech_subscription_key'] = request.subscription_key - metadata['speech_region'] = request.region - metadata['speech_configured_at'] = datetime.now().isoformat() - - # Update the connection (this would require implementing update in the interface) - # For now, we'll just return success - the actual update would need to be implemented - # in the database interface - - return { - "success": True, - "message": "Azure Speech Services subscription key configured successfully", - "connection_id": target_connection.id, - "region": request.region, - "configured_at": metadata['speech_configured_at'] - } - - except HTTPException: - raise - except Exception as e: - logger.error(f"Error setting speech subscription: {str(e)}") - raise HTTPException(status_code=500, detail=str(e)) - -@router.get("/subscription") -async def get_speech_subscription( - current_user: UserInDB = Depends(getCurrentUser) -): - """Get Azure Speech Services subscription information for the user.""" - try: - # Use Azure Speech Services subscription key from configuration - from modules.shared.configuration import APP_CONFIG - subscription_key = APP_CONFIG.get("Connector_AzureSpeech_SUBSCRIPTION_KEY") - region = APP_CONFIG.get("Connector_AzureSpeech_REGION", "westeurope") - - if not subscription_key or subscription_key == "your-azure-speech-subscription-key-here": - return { - "success": True, - "has_subscription": False, - "message": "Azure Speech Services subscription key not configured. Please set Connector_AzureSpeech_SUBSCRIPTION_KEY in config.ini" - } - - # Test the connection - try: - connector = ConnectorAzureSpeech(subscription_key=subscription_key, region=region) - test_result = await connector.test_connection() - - return { - "success": True, - "has_subscription": True, - "subscription": { - "subscription_key": subscription_key[:8] + "..." + subscription_key[-8:], - "region": region, - "connection_test": test_result, - "source": "config.ini" - } - } - except Exception as test_error: - return { - "success": True, - "has_subscription": True, - "subscription": { - "subscription_key": subscription_key[:8] + "..." + subscription_key[-8:], - "region": region, - "connection_test": False, - "error": str(test_error), - "source": "config.ini" - } - } - - except Exception as e: - logger.error(f"Error getting speech subscription: {str(e)}") - raise HTTPException(status_code=500, detail=str(e)) - -@router.get("/settings") -async def get_voice_settings( - current_user: UserInDB = Depends(getCurrentUser), - connection_id: Optional[str] = None -): - """Get available voice settings and languages.""" - try: - connector = await get_azure_speech_connector(current_user, connection_id) - - # Get available voices and languages - voices = await connector.get_available_voices() - languages = await connector.get_available_languages() - - # Get user's saved settings from database - from modules.interfaces.interfaceComponentObjects import getInterface - component_interface = getInterface(current_user) - user_settings = component_interface.getVoiceSettings(current_user.id) - - # If no settings exist, use defaults - if not user_settings: - user_settings = { - "sttLanguage": "de-DE", - "ttsLanguage": "de-DE", - "ttsVoice": "de-DE-KatjaNeural", - "translationEnabled": True, - "targetLanguage": "en-US" - } - - return { - "voices": voices, - "languages": languages, - "user_settings": { - "sttLanguage": user_settings.sttLanguage if hasattr(user_settings, 'sttLanguage') else user_settings.get("sttLanguage"), - "ttsLanguage": user_settings.ttsLanguage if hasattr(user_settings, 'ttsLanguage') else user_settings.get("ttsLanguage"), - "ttsVoice": user_settings.ttsVoice if hasattr(user_settings, 'ttsVoice') else user_settings.get("ttsVoice"), - "translationEnabled": user_settings.translationEnabled if hasattr(user_settings, 'translationEnabled') else user_settings.get("translationEnabled"), - "targetLanguage": user_settings.targetLanguage if hasattr(user_settings, 'targetLanguage') else user_settings.get("targetLanguage") - }, - "default_settings": { - "sttLanguage": "de-DE", - "ttsLanguage": "de-DE", - "ttsVoice": "de-DE-KatjaNeural", - "translationEnabled": True, - "targetLanguage": "en-US" - } - } - - except Exception as e: - logger.error(f"Error getting voice settings: {str(e)}") - raise HTTPException(status_code=500, detail=str(e)) - -@router.post("/settings") -async def save_voice_settings( - settings: dict, - current_user: UserInDB = Depends(getCurrentUser) -): - """Save voice settings for the current user.""" - try: - from modules.interfaces.interfaceComponentObjects import getInterface - component_interface = getInterface(current_user) - - # Check if settings exist, if not create them - existing_settings = component_interface.getVoiceSettings(current_user.id) - if not existing_settings: - # Create new settings - settings["userId"] = current_user.id - settings["mandateId"] = current_user.mandateId - created_settings = component_interface.createVoiceSettings(settings) - return { - "success": True, - "message": "Voice settings created successfully", - "settings": created_settings - } - else: - # Update existing settings - updated_settings = component_interface.updateVoiceSettings( - current_user.id, - settings - ) - return { - "success": True, - "message": "Voice settings saved successfully", - "settings": updated_settings - } - - except Exception as e: - logger.error(f"Error saving voice settings: {str(e)}") - raise HTTPException(status_code=500, detail=str(e)) - -@router.post("/speech-to-text") -async def speech_to_text( - audio_file: UploadFile = File(...), - language: str = Form("de-DE"), - format: str = Form("detailed"), - connection_id: str = Form(None), - current_user: UserInDB = Depends(getCurrentUser) -): - """Convert speech to text using Azure Speech Services.""" - try: - connector = await get_azure_speech_connector(current_user, connection_id) - - # Read audio file - audio_content = await audio_file.read() - - # Convert speech to text - result = await connector.speech_to_text( - audio_content=audio_content, - language=language, - format=format - ) - - return { - "success": True, - "text": result.get("text", ""), - "confidence": result.get("confidence", 0.0), - "language": result.get("language", language), - "format": format - } - - except Exception as e: - logger.error(f"Error in speech-to-text: {str(e)}") - raise HTTPException(status_code=500, detail=str(e)) - -@router.post("/text-to-speech") -async def text_to_speech( - request: TextToSpeechRequest, - current_user: UserInDB = Depends(getCurrentUser), - connection_id: Optional[str] = None -): - """Convert text to speech using Azure Speech Services.""" - try: - connector = await get_azure_speech_connector(current_user, connection_id) - - # Convert text to speech - audio_data = await connector.text_to_speech( - text=request.text, - language=request.language, - voice=request.voice, - format=request.format - ) - - # Return audio as base64 encoded string - audio_base64 = base64.b64encode(audio_data).decode('utf-8') - - return { - "success": True, - "audio_data": audio_base64, - "format": request.format, - "voice": request.voice, - "language": request.language - } - - except Exception as e: - logger.error(f"Error in text-to-speech: {str(e)}") - raise HTTPException(status_code=500, detail=str(e)) - -@router.post("/translate") -async def translate_text( - request: TranslationRequest, - current_user: UserInDB = Depends(getCurrentUser), - connection_id: Optional[str] = None -): - """Translate text using Azure Translator.""" - try: - connector = await get_azure_speech_connector(current_user, connection_id) - - # Translate text - translated_text = await connector.translate_text( - text=request.text, - from_language=request.from_language, - to_language=request.to_language - ) - - return { - "success": True, - "original_text": request.text, - "translated_text": translated_text, - "from_language": request.from_language, - "to_language": request.to_language - } - - except Exception as e: - logger.error(f"Error in translation: {str(e)}") - raise HTTPException(status_code=500, detail=str(e)) - -@router.post("/conversation") -async def conversation( - request: ConversationRequest, - current_user: UserInDB = Depends(getCurrentUser), - connection_id: Optional[str] = None -): - """Handle conversation with voice response.""" - try: - connector = await get_azure_speech_connector(current_user, connection_id) - - # Simple AI response logic - in production, integrate with OpenAI, Azure OpenAI, or similar - response_text = await _generate_ai_response(request.message, request.language) - - # Convert response to speech - audio_data = await connector.text_to_speech( - text=response_text, - language=request.language, - voice=request.response_voice, - format="audio-16khz-128kbitrate-mono-mp3" - ) - - # Return both text and audio - audio_base64 = base64.b64encode(audio_data).decode('utf-8') - - return { - "success": True, - "response_text": response_text, - "audio_data": audio_base64, - "voice": request.response_voice, - "language": request.language - } - - except Exception as e: - logger.error(f"Error in conversation: {str(e)}") - raise HTTPException(status_code=500, detail=str(e)) - -async def _generate_ai_response(message: str, language: str) -> str: - """Generate AI response for conversation.""" - try: - # Simple rule-based responses for demonstration - # In production, this would call an AI service like OpenAI or Azure OpenAI - - message_lower = message.lower() - - # German responses - if language.startswith("de"): - if any(word in message_lower for word in ["hallo", "hi", "hey"]): - return "Hallo! Wie kann ich Ihnen heute helfen?" - elif any(word in message_lower for word in ["wie geht", "wie gehts"]): - return "Mir geht es gut, danke der Nachfrage! Wie geht es Ihnen?" - elif any(word in message_lower for word in ["danke", "dankeschön"]): - return "Gern geschehen! Gibt es noch etwas, womit ich helfen kann?" - elif any(word in message_lower for word in ["zeit", "uhr"]): - from datetime import datetime - current_time = datetime.now().strftime("%H:%M") - return f"Es ist jetzt {current_time} Uhr." - elif any(word in message_lower for word in ["wetter", "temperatur"]): - return "Ich kann leider keine aktuellen Wetterdaten abrufen. Bitte schauen Sie in eine Wetter-App." - else: - return f"Das ist interessant: '{message}'. Können Sie mir mehr darüber erzählen?" - - # English responses - elif language.startswith("en"): - if any(word in message_lower for word in ["hello", "hi", "hey"]): - return "Hello! How can I help you today?" - elif any(word in message_lower for word in ["how are you", "how's it going"]): - return "I'm doing well, thank you for asking! How are you?" - elif any(word in message_lower for word in ["thank you", "thanks"]): - return "You're welcome! Is there anything else I can help you with?" - elif any(word in message_lower for word in ["time", "clock"]): - from datetime import datetime - current_time = datetime.now().strftime("%H:%M") - return f"It's currently {current_time}." - elif any(word in message_lower for word in ["weather", "temperature"]): - return "I'm sorry, I can't retrieve current weather data. Please check a weather app." - else: - return f"That's interesting: '{message}'. Can you tell me more about that?" - - # Default response - else: - return f"I heard: '{message}'. How can I help you with that?" - - except Exception as e: - logger.error(f"Error generating AI response: {str(e)}") - return "I'm sorry, I didn't understand that. Could you please repeat?" - -@router.post("/realtime-interpreter") -async def realtime_interpreter( - audio_file: UploadFile = File(...), - from_language: str = Form("de-DE"), - to_language: str = Form("en-US"), - connection_id: str = Form(None), - current_user: UserInDB = Depends(getCurrentUser) -): - """Real-time interpreter: speech to translated text.""" - try: - logger.info(f"Realtime interpreter called with from_language='{from_language}', to_language='{to_language}'") - - connector = await get_azure_speech_connector(current_user, connection_id) - - # Read audio file - audio_content = await audio_file.read() - - # Convert speech to text - stt_result = await connector.speech_to_text( - audio_content=audio_content, - language=from_language, - format="detailed" - ) - - original_text = stt_result.get("text", "") - - # Translate text if different languages - translated_text = original_text - if from_language != to_language: - try: - translated_text = await connector.translate_text( - text=original_text, - from_language=from_language, - to_language=to_language - ) - except Exception as e: - logger.warning(f"Translation failed, using original text: {str(e)}") - translated_text = original_text - - return { - "success": True, - "original_text": original_text, - "translated_text": translated_text, - "from_language": from_language, - "to_language": to_language, - "confidence": stt_result.get("confidence", 0.0) - } - - except Exception as e: - logger.error(f"Error in realtime interpreter: {str(e)}") - raise HTTPException(status_code=500, detail=str(e)) - -@router.post("/stream/speech-to-text") -async def stream_speech_to_text( - audio_file: UploadFile = File(...), - language: str = Form("de-DE"), - format: str = Form("detailed"), - audio_format: str = Form("wav"), - connection_id: str = Form(None), - current_user: UserInDB = Depends(getCurrentUser) -): - """Stream speech to text using Azure Speech Services.""" - try: - connector = await get_azure_speech_connector(current_user, connection_id) - - async def audio_chunk_generator(): - """Generate audio chunks from uploaded file.""" - chunk_size = 4096 # 4KB chunks - while True: - chunk = await audio_file.read(chunk_size) - if not chunk: - break - yield chunk - - async def response_generator(): - """Generate streaming responses.""" - try: - async for result in connector.stream_speech_to_text( - audio_chunk_generator(), - language=language, - format=format, - audio_format=audio_format - ): - yield f"data: {json.dumps(result)}\n\n" - except Exception as e: - error_result = {"error": str(e), "is_final": True} - yield f"data: {json.dumps(error_result)}\n\n" - - return StreamingResponse( - response_generator(), - media_type="text/plain", - headers={ - "Cache-Control": "no-cache", - "Connection": "keep-alive", - "Content-Type": "text/event-stream" - } - ) - - except Exception as e: - logger.error(f"Error in streaming speech-to-text: {str(e)}") - raise HTTPException(status_code=500, detail=str(e)) - -@router.post("/stream/text-to-speech") -async def stream_text_to_speech( - request: TextToSpeechRequest, - current_user: UserInDB = Depends(getCurrentUser), - connection_id: Optional[str] = None -): - """Stream text to speech using Azure Speech Services.""" - try: - connector = await get_azure_speech_connector(current_user, connection_id) - - async def text_chunk_generator(): - """Generate text chunks from the request.""" - # Split text into sentences for better streaming - sentences = request.text.split('. ') - for sentence in sentences: - if sentence.strip(): - yield sentence.strip() + '. ' - - async def audio_generator(): - """Generate streaming audio data.""" - try: - async for audio_chunk in connector.stream_text_to_speech( - text_chunk_generator(), - language=request.language, - voice=request.voice, - format=request.format - ): - yield audio_chunk - except Exception as e: - logger.error(f"Error in audio generation: {str(e)}") - # Return error as audio (could be a beep or silence) - yield b"" - - return StreamingResponse( - audio_generator(), - media_type="audio/mpeg", - headers={ - "Cache-Control": "no-cache", - "Connection": "keep-alive", - "Content-Type": "audio/mpeg" - } - ) - - except Exception as e: - logger.error(f"Error in streaming text-to-speech: {str(e)}") - raise HTTPException(status_code=500, detail=str(e)) - -@router.post("/stream/realtime-interpreter") -async def stream_realtime_interpreter( - audio_file: UploadFile = File(...), - from_language: str = Form("de-DE"), - to_language: str = Form("en-US"), - connection_id: str = Form(None), - current_user: UserInDB = Depends(getCurrentUser) -): - """Stream real-time interpreter: speech to translated text.""" - try: - connector = await get_azure_speech_connector(current_user, connection_id) - - async def audio_chunk_generator(): - """Generate audio chunks from uploaded file.""" - chunk_size = 4096 # 4KB chunks - while True: - chunk = await audio_file.read(chunk_size) - if not chunk: - break - yield chunk - - async def response_generator(): - """Generate streaming translation responses.""" - try: - async for result in connector.stream_realtime_interpreter( - audio_chunk_generator(), - from_language=from_language, - to_language=to_language - ): - yield f"data: {json.dumps(result)}\n\n" - except Exception as e: - error_result = {"error": str(e), "is_final": True} - yield f"data: {json.dumps(error_result)}\n\n" - - return StreamingResponse( - response_generator(), - media_type="text/plain", - headers={ - "Cache-Control": "no-cache", - "Connection": "keep-alive", - "Content-Type": "text/event-stream" - } - ) - - except Exception as e: - logger.error(f"Error in streaming realtime interpreter: {str(e)}") - raise HTTPException(status_code=500, detail=str(e)) - -@router.get("/health") -async def health_check(): - """Health check endpoint for voice services.""" - return { - "status": "healthy", - "service": "azure-voice", - "endpoints": [ - "speech-to-text", - "text-to-speech", - "translate", - "conversation", - "realtime-interpreter", - "stream/speech-to-text", - "stream/text-to-speech", - "stream/realtime-interpreter", - "subscription", - "connections" - ] - } diff --git a/modules/routes/routeVoiceGoogle.py b/modules/routes/routeVoiceGoogle.py new file mode 100644 index 00000000..a44921e6 --- /dev/null +++ b/modules/routes/routeVoiceGoogle.py @@ -0,0 +1,268 @@ +""" +Google Cloud Voice Services Routes +Replaces Azure voice services with Google Cloud Speech-to-Text and Translation +""" + +import os +import logging +from fastapi import APIRouter, File, Form, UploadFile, Depends, HTTPException +from typing import Optional +from modules.connectors.connectorGoogleSpeech import ConnectorGoogleSpeech +from modules.security.auth import getCurrentUser +from modules.interfaces.interfaceAppModel import User + +logger = logging.getLogger(__name__) +router = APIRouter(prefix="/voice-google", tags=["voice-google"]) + +# Global connector instance +_google_speech_connector = None + +def get_google_speech_connector() -> ConnectorGoogleSpeech: + """Get or create Google Cloud Speech connector instance.""" + global _google_speech_connector + + if _google_speech_connector is None: + try: + # Get credentials path from environment or config + credentials_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS") + if not credentials_path: + # Try to find credentials in common locations + possible_paths = [ + "credentials/google-service-account.json", + "config/google-credentials.json", + "google-credentials.json" + ] + + for path in possible_paths: + if os.path.exists(path): + credentials_path = path + break + + if not credentials_path: + raise HTTPException( + status_code=500, + detail="Google Cloud credentials not found. Please set GOOGLE_APPLICATION_CREDENTIALS environment variable or place credentials file in project directory." + ) + + _google_speech_connector = ConnectorGoogleSpeech(credentials_path) + logger.info("✅ Google Cloud Speech connector initialized") + + except Exception as e: + logger.error(f"❌ Failed to initialize Google Cloud Speech connector: {e}") + raise HTTPException( + status_code=500, + detail=f"Failed to initialize Google Cloud Speech connector: {str(e)}" + ) + + return _google_speech_connector + +@router.post("/speech-to-text") +async def speech_to_text( + audio_file: UploadFile = File(...), + language: str = Form("de-DE"), + current_user: User = Depends(getCurrentUser) +): + """Convert speech to text using Google Cloud Speech-to-Text API.""" + try: + logger.info(f"🎤 Speech-to-text request: {audio_file.filename}, language: {language}") + + # Read audio file + audio_content = await audio_file.read() + logger.info(f"📊 Audio file size: {len(audio_content)} bytes") + + # Validate audio format + connector = get_google_speech_connector() + validation = connector.validate_audio_format(audio_content) + + if not validation["valid"]: + raise HTTPException( + status_code=400, + detail=f"Invalid audio format: {validation.get('error', 'Unknown error')}" + ) + + # Perform speech recognition + result = await connector.speech_to_text( + audio_content=audio_content, + language=language + ) + + if result["success"]: + return { + "success": True, + "text": result["text"], + "confidence": result["confidence"], + "language": result["language"], + "audio_info": { + "size": len(audio_content), + "format": validation["format"], + "estimated_duration": validation.get("estimated_duration", 0) + } + } + else: + raise HTTPException( + status_code=400, + detail=f"Speech recognition failed: {result.get('error', 'Unknown error')}" + ) + + except HTTPException: + raise + except Exception as e: + logger.error(f"❌ Speech-to-text error: {e}") + raise HTTPException( + status_code=500, + detail=f"Speech-to-text processing failed: {str(e)}" + ) + +@router.post("/translate") +async def translate_text( + text: str = Form(...), + source_language: str = Form("de"), + target_language: str = Form("en"), + current_user: User = Depends(getCurrentUser) +): + """Translate text using Google Cloud Translation API.""" + try: + logger.info(f"🌐 Translation request: '{text}' ({source_language} -> {target_language})") + + if not text.strip(): + raise HTTPException( + status_code=400, + detail="Empty text provided for translation" + ) + + # Perform translation + connector = get_google_speech_connector() + result = await connector.translate_text( + text=text, + source_language=source_language, + target_language=target_language + ) + + if result["success"]: + return { + "success": True, + "original_text": result["original_text"], + "translated_text": result["translated_text"], + "source_language": result["source_language"], + "target_language": result["target_language"] + } + else: + raise HTTPException( + status_code=400, + detail=f"Translation failed: {result.get('error', 'Unknown error')}" + ) + + except HTTPException: + raise + except Exception as e: + logger.error(f"❌ Translation error: {e}") + raise HTTPException( + status_code=500, + detail=f"Translation processing failed: {str(e)}" + ) + +@router.post("/realtime-interpreter") +async def realtime_interpreter( + audio_file: UploadFile = File(...), + from_language: str = Form("de-DE"), + to_language: str = Form("en-US"), + connection_id: str = Form(None), + current_user: User = Depends(getCurrentUser) +): + """Real-time interpreter: speech to translated text using Google Cloud APIs.""" + try: + logger.info(f"🔄 Real-time interpreter request: {audio_file.filename}") + logger.info(f" From: {from_language} -> To: {to_language}") + + # Read audio file + audio_content = await audio_file.read() + logger.info(f"📊 Audio file size: {len(audio_content)} bytes") + + # Save audio file for debugging + debug_filename = f"debug_audio/audio_google_{audio_file.filename}" + os.makedirs("debug_audio", exist_ok=True) + with open(debug_filename, "wb") as f: + f.write(audio_content) + logger.info(f"💾 Saved audio file for debugging: {debug_filename}") + + # Validate audio format + connector = get_google_speech_connector() + validation = connector.validate_audio_format(audio_content) + + if not validation["valid"]: + raise HTTPException( + status_code=400, + detail=f"Invalid audio format: {validation.get('error', 'Unknown error')}" + ) + + # Perform complete pipeline: Speech-to-Text + Translation + result = await connector.speech_to_translated_text( + audio_content=audio_content, + from_language=from_language, + to_language=to_language + ) + + if result["success"]: + logger.info(f"✅ Real-time interpreter successful:") + logger.info(f" Original: '{result['original_text']}'") + logger.info(f" Translated: '{result['translated_text']}'") + + return { + "success": True, + "original_text": result["original_text"], + "translated_text": result["translated_text"], + "confidence": result["confidence"], + "source_language": result["source_language"], + "target_language": result["target_language"], + "audio_info": { + "size": len(audio_content), + "format": validation["format"], + "estimated_duration": validation.get("estimated_duration", 0) + } + } + else: + raise HTTPException( + status_code=400, + detail=f"Real-time interpreter failed: {result.get('error', 'Unknown error')}" + ) + + except HTTPException: + raise + except Exception as e: + logger.error(f"❌ Real-time interpreter error: {e}") + raise HTTPException( + status_code=500, + detail=f"Real-time interpreter processing failed: {str(e)}" + ) + +@router.get("/health") +async def health_check(current_user: User = Depends(getCurrentUser)): + """Health check for Google Cloud voice services.""" + try: + connector = get_google_speech_connector() + + # Test with a simple translation + test_result = await connector.translate_text( + text="Hello", + source_language="en", + target_language="de" + ) + + if test_result["success"]: + return { + "status": "healthy", + "service": "Google Cloud Speech-to-Text & Translation", + "test_translation": test_result["translated_text"] + } + else: + return { + "status": "unhealthy", + "error": test_result.get("error", "Unknown error") + } + + except Exception as e: + logger.error(f"❌ Health check failed: {e}") + return { + "status": "unhealthy", + "error": str(e) + } diff --git a/modules/routes/routeVoiceWebSocket.py b/modules/routes/routeVoiceWebSocket.py deleted file mode 100644 index 304ccb1c..00000000 --- a/modules/routes/routeVoiceWebSocket.py +++ /dev/null @@ -1,494 +0,0 @@ -""" -Azure Voice Services WebSocket Routes -Provides real-time WebSocket endpoints for: -- Live microphone audio streaming -- Real-time speech-to-text -- Real-time translation -- Real-time text-to-speech -""" - -import logging -import asyncio -import json -from typing import Dict, Any, Optional, List -from fastapi import APIRouter, WebSocket, WebSocketDisconnect, Depends, Query -from fastapi.websockets import WebSocketState -import base64 -import io -from datetime import datetime - -from modules.interfaces.interfaceAppObjects import getRootInterface -from modules.interfaces.interfaceAppModel import UserInDB -from modules.security.auth import getCurrentUser -from modules.connectors.connectorAzureSpeech import ConnectorAzureSpeech - -logger = logging.getLogger(__name__) - -router = APIRouter(prefix="/api/voice/ws", tags=["voice-websocket"]) - -class ConnectionManager: - """Manages WebSocket connections for real-time voice services.""" - - def __init__(self): - self.active_connections: Dict[str, WebSocket] = {} - self.user_connections: Dict[str, str] = {} # user_id -> connection_id - self.connection_metadata: Dict[str, Dict] = {} - - async def connect(self, websocket: WebSocket, connection_id: str, user_id: str, metadata: Dict = None): - """Accept a WebSocket connection.""" - await websocket.accept() - self.active_connections[connection_id] = websocket - self.user_connections[user_id] = connection_id - self.connection_metadata[connection_id] = metadata or {} - logger.info(f"WebSocket connected: {connection_id} for user {user_id}") - - def disconnect(self, connection_id: str): - """Remove a WebSocket connection.""" - if connection_id in self.active_connections: - del self.active_connections[connection_id] - # Remove from user mapping - user_id = None - for uid, cid in self.user_connections.items(): - if cid == connection_id: - user_id = uid - break - if user_id: - del self.user_connections[user_id] - if connection_id in self.connection_metadata: - del self.connection_metadata[connection_id] - logger.info(f"WebSocket disconnected: {connection_id}") - - async def send_message(self, connection_id: str, message: Dict): - """Send a message to a specific connection.""" - if connection_id in self.active_connections: - websocket = self.active_connections[connection_id] - try: - if websocket.client_state == WebSocketState.CONNECTED: - await websocket.send_text(json.dumps(message)) - else: - self.disconnect(connection_id) - except Exception as e: - logger.error(f"Error sending message to {connection_id}: {str(e)}") - self.disconnect(connection_id) - - async def send_error(self, connection_id: str, error: str, error_code: str = "GENERAL_ERROR"): - """Send an error message to a connection.""" - await self.send_message(connection_id, { - "type": "error", - "error": error, - "error_code": error_code, - "timestamp": datetime.now().isoformat() - }) - - def get_connection_metadata(self, connection_id: str) -> Dict: - """Get metadata for a connection.""" - return self.connection_metadata.get(connection_id, {}) - -# Global connection manager -manager = ConnectionManager() - -async def get_azure_speech_connector_ws(user_id: str) -> Optional[ConnectorAzureSpeech]: - """Get Azure Speech connector for WebSocket user.""" - try: - root_interface = getRootInterface() - - # Get user by ID - user = root_interface.getUser(user_id) - if not user: - logger.error(f"User with ID {user_id} not found") - return None - - # Get user connections - user_connections = root_interface.getUserConnections(user_id) - azure_connection = None - - # Find first Azure connection - for connection in user_connections: - if connection.authority == "msft": - azure_connection = connection - break - - if not azure_connection: - logger.error(f"No Azure connection found for user {user_id}") - return None - - # Get connection token - connection_token = root_interface.getConnectionToken(azure_connection.id) - if not connection_token: - logger.error(f"No connection token found for user {user_id}") - return None - - # Use Azure Speech Services subscription key from configuration - from modules.shared.configuration import APP_CONFIG - subscription_key = APP_CONFIG.get("Connector_AzureSpeech_SUBSCRIPTION_KEY") - - if not subscription_key or subscription_key == "your-azure-speech-subscription-key-here": - logger.error("Azure Speech Services subscription key not configured") - return None - - # Get region from configuration - region = APP_CONFIG.get("Connector_AzureSpeech_REGION", "westeurope") - - # Create Azure Speech connector - connector = ConnectorAzureSpeech(subscription_key=subscription_key, region=region) - return connector - - except Exception as e: - logger.error(f"Error getting Azure Speech connector for WebSocket: {str(e)}") - return None - -@router.websocket("/realtime-interpreter") -async def websocket_realtime_interpreter( - websocket: WebSocket, - user_id: str = Query(...), - from_language: str = Query("de-DE"), - to_language: str = Query("en-US"), - audio_format: str = Query("wav") -): - """WebSocket endpoint for real-time interpreter with live audio streaming.""" - connection_id = f"interpreter_{user_id}_{datetime.now().strftime('%Y%m%d_%H%M%S')}" - - try: - # Connect WebSocket - await manager.connect(websocket, connection_id, user_id, { - "service": "realtime_interpreter", - "from_language": from_language, - "to_language": to_language, - "audio_format": audio_format - }) - - # Get Azure Speech connector - connector = await get_azure_speech_connector_ws(user_id) - if not connector: - await manager.send_error(connection_id, "Azure Speech Services not configured", "NO_CONNECTOR") - return - - # Send connection confirmation - await manager.send_message(connection_id, { - "type": "connected", - "connection_id": connection_id, - "service": "realtime_interpreter", - "from_language": from_language, - "to_language": to_language, - "timestamp": datetime.now().isoformat() - }) - - # Process incoming audio chunks - audio_buffer = io.BytesIO() - chunk_count = 0 - - while True: - try: - # Receive message from client - data = await websocket.receive_text() - message = json.loads(data) - - if message.get("type") == "audio_chunk": - # Decode base64 audio data - audio_data = base64.b64decode(message.get("data", "")) - audio_buffer.write(audio_data) - chunk_count += 1 - - # Process every 10 chunks or when buffer is large enough - if chunk_count % 10 == 0 or len(audio_data) > 8192: - audio_buffer.seek(0) - audio_content = audio_buffer.read() - - if len(audio_content) > 0: - try: - # Convert speech to text - stt_result = await connector.speech_to_text( - audio_content=audio_content, - language=from_language, - format="detailed", - audio_format=audio_format - ) - - original_text = stt_result.get("text", "") - - # Translate if different languages - translated_text = original_text - if from_language != to_language and original_text.strip(): - try: - translated_text = await connector.translate_text( - text=original_text, - from_language=from_language, - to_language=to_language - ) - except Exception as e: - logger.warning(f"Translation failed: {str(e)}") - translated_text = original_text - - # Send result back to client - await manager.send_message(connection_id, { - "type": "translation_result", - "original_text": original_text, - "translated_text": translated_text, - "from_language": from_language, - "to_language": to_language, - "confidence": stt_result.get("confidence", 0.0), - "timestamp": datetime.now().isoformat() - }) - - except Exception as e: - logger.error(f"Error processing audio chunk: {str(e)}") - await manager.send_error(connection_id, f"Audio processing failed: {str(e)}", "PROCESSING_ERROR") - - # Reset buffer - audio_buffer = io.BytesIO() - chunk_count = 0 - - elif message.get("type") == "ping": - # Respond to ping with pong - await manager.send_message(connection_id, { - "type": "pong", - "timestamp": datetime.now().isoformat() - }) - - elif message.get("type") == "disconnect": - # Client requested disconnect - break - - except WebSocketDisconnect: - break - except json.JSONDecodeError: - await manager.send_error(connection_id, "Invalid JSON message", "INVALID_JSON") - except Exception as e: - logger.error(f"Error processing WebSocket message: {str(e)}") - await manager.send_error(connection_id, f"Message processing failed: {str(e)}", "MESSAGE_ERROR") - - except WebSocketDisconnect: - pass - except Exception as e: - logger.error(f"WebSocket error: {str(e)}") - try: - await manager.send_error(connection_id, f"Connection error: {str(e)}", "CONNECTION_ERROR") - except: - pass - finally: - manager.disconnect(connection_id) - -@router.websocket("/speech-to-text") -async def websocket_speech_to_text( - websocket: WebSocket, - user_id: str = Query(...), - language: str = Query("de-DE"), - audio_format: str = Query("wav") -): - """WebSocket endpoint for real-time speech-to-text with live audio streaming.""" - connection_id = f"stt_{user_id}_{datetime.now().strftime('%Y%m%d_%H%M%S')}" - - try: - # Connect WebSocket - await manager.connect(websocket, connection_id, user_id, { - "service": "speech_to_text", - "language": language, - "audio_format": audio_format - }) - - # Get Azure Speech connector - connector = await get_azure_speech_connector_ws(user_id) - if not connector: - await manager.send_error(connection_id, "Azure Speech Services not configured", "NO_CONNECTOR") - return - - # Send connection confirmation - await manager.send_message(connection_id, { - "type": "connected", - "connection_id": connection_id, - "service": "speech_to_text", - "language": language, - "timestamp": datetime.now().isoformat() - }) - - # Process incoming audio chunks - audio_buffer = io.BytesIO() - chunk_count = 0 - - while True: - try: - # Receive message from client - data = await websocket.receive_text() - message = json.loads(data) - - if message.get("type") == "audio_chunk": - # Decode base64 audio data - audio_data = base64.b64decode(message.get("data", "")) - audio_buffer.write(audio_data) - chunk_count += 1 - - # Process every 10 chunks or when buffer is large enough - if chunk_count % 10 == 0 or len(audio_data) > 8192: - audio_buffer.seek(0) - audio_content = audio_buffer.read() - - if len(audio_content) > 0: - try: - # Convert speech to text - stt_result = await connector.speech_to_text( - audio_content=audio_content, - language=language, - format="detailed", - audio_format=audio_format - ) - - # Send result back to client - await manager.send_message(connection_id, { - "type": "transcription_result", - "text": stt_result.get("text", ""), - "confidence": stt_result.get("confidence", 0.0), - "language": stt_result.get("language", language), - "is_final": stt_result.get("RecognitionStatus") == "Success", - "timestamp": datetime.now().isoformat() - }) - - except Exception as e: - logger.error(f"Error processing audio chunk: {str(e)}") - await manager.send_error(connection_id, f"Audio processing failed: {str(e)}", "PROCESSING_ERROR") - - # Reset buffer - audio_buffer = io.BytesIO() - chunk_count = 0 - - elif message.get("type") == "ping": - # Respond to ping with pong - await manager.send_message(connection_id, { - "type": "pong", - "timestamp": datetime.now().isoformat() - }) - - elif message.get("type") == "disconnect": - # Client requested disconnect - break - - except WebSocketDisconnect: - break - except json.JSONDecodeError: - await manager.send_error(connection_id, "Invalid JSON message", "INVALID_JSON") - except Exception as e: - logger.error(f"Error processing WebSocket message: {str(e)}") - await manager.send_error(connection_id, f"Message processing failed: {str(e)}", "MESSAGE_ERROR") - - except WebSocketDisconnect: - pass - except Exception as e: - logger.error(f"WebSocket error: {str(e)}") - try: - await manager.send_error(connection_id, f"Connection error: {str(e)}", "CONNECTION_ERROR") - except: - pass - finally: - manager.disconnect(connection_id) - -@router.websocket("/text-to-speech") -async def websocket_text_to_speech( - websocket: WebSocket, - user_id: str = Query(...), - language: str = Query("de-DE"), - voice: str = Query("de-DE-KatjaNeural"), - audio_format: str = Query("audio-16khz-128kbitrate-mono-mp3") -): - """WebSocket endpoint for real-time text-to-speech streaming.""" - connection_id = f"tts_{user_id}_{datetime.now().strftime('%Y%m%d_%H%M%S')}" - - try: - # Connect WebSocket - await manager.connect(websocket, connection_id, user_id, { - "service": "text_to_speech", - "language": language, - "voice": voice, - "audio_format": audio_format - }) - - # Get Azure Speech connector - connector = await get_azure_speech_connector_ws(user_id) - if not connector: - await manager.send_error(connection_id, "Azure Speech Services not configured", "NO_CONNECTOR") - return - - # Send connection confirmation - await manager.send_message(connection_id, { - "type": "connected", - "connection_id": connection_id, - "service": "text_to_speech", - "language": language, - "voice": voice, - "timestamp": datetime.now().isoformat() - }) - - while True: - try: - # Receive message from client - data = await websocket.receive_text() - message = json.loads(data) - - if message.get("type") == "text_to_speak": - text = message.get("text", "") - - if text.strip(): - try: - # Convert text to speech - audio_data = await connector.text_to_speech( - text=text, - language=language, - voice=voice, - format=audio_format - ) - - # Send audio data back to client - audio_base64 = base64.b64encode(audio_data).decode('utf-8') - await manager.send_message(connection_id, { - "type": "audio_data", - "audio_data": audio_base64, - "format": audio_format, - "voice": voice, - "text": text, - "timestamp": datetime.now().isoformat() - }) - - except Exception as e: - logger.error(f"Error converting text to speech: {str(e)}") - await manager.send_error(connection_id, f"TTS failed: {str(e)}", "TTS_ERROR") - - elif message.get("type") == "ping": - # Respond to ping with pong - await manager.send_message(connection_id, { - "type": "pong", - "timestamp": datetime.now().isoformat() - }) - - elif message.get("type") == "disconnect": - # Client requested disconnect - break - - except WebSocketDisconnect: - break - except json.JSONDecodeError: - await manager.send_error(connection_id, "Invalid JSON message", "INVALID_JSON") - except Exception as e: - logger.error(f"Error processing WebSocket message: {str(e)}") - await manager.send_error(connection_id, f"Message processing failed: {str(e)}", "MESSAGE_ERROR") - - except WebSocketDisconnect: - pass - except Exception as e: - logger.error(f"WebSocket error: {str(e)}") - try: - await manager.send_error(connection_id, f"Connection error: {str(e)}", "CONNECTION_ERROR") - except: - pass - finally: - manager.disconnect(connection_id) - -@router.get("/status") -async def websocket_status(): - """Get WebSocket connection status.""" - return { - "active_connections": len(manager.active_connections), - "connected_users": len(manager.user_connections), - "services": { - "realtime_interpreter": len([c for c in manager.connection_metadata.values() if c.get("service") == "realtime_interpreter"]), - "speech_to_text": len([c for c in manager.connection_metadata.values() if c.get("service") == "speech_to_text"]), - "text_to_speech": len([c for c in manager.connection_metadata.values() if c.get("service") == "text_to_speech"]) - } - } diff --git a/modules/services/serviceDeltaSync.py b/modules/services/serviceDeltaSync.py index d8376db4..a8489e82 100644 --- a/modules/services/serviceDeltaSync.py +++ b/modules/services/serviceDeltaSync.py @@ -57,20 +57,25 @@ class ManagerSyncDelta: JIRA_ISSUE_TYPE = "Task" # Task sync definition for field mapping (like original synchronizer) - TASK_SYNC_DEFINITION = { - "ID": ["get", ["key"]], - "Summary": ["get", ["fields", "summary"]], - "Status": ["get", ["fields", "status", "name"]], - "Assignee": ["get", ["fields", "assignee", "displayName"]], - "Reporter": ["get", ["fields", "reporter", "displayName"]], - "Created": ["get", ["fields", "created"]], - "Updated": ["get", ["fields", "updated"]], - "Priority": ["get", ["fields", "priority", "name"]], - "IssueType": ["get", ["fields", "issuetype", "name"]], - "Project": ["get", ["fields", "project", "name"]], - "Description": ["get", ["fields", "description"]], - } + task_sync_definition={ + #key=excel-header, [get:jira>excel | put: excel>jira, jira-xml-field-list] + 'ID': ['get', ['key']], + 'Module Category': ['get', ['fields', 'customfield_10058', 'value']], + 'Summary': ['get', ['fields', 'summary']], + 'Description': ['get', ['fields', 'description']], + 'References': ['get', ['fields', 'customfield_10066']], + 'Priority': ['get', ['fields', 'priority', 'name']], + 'Issue Status': ['get', ['fields', 'customfield_10062']], + 'Assignee': ['get', ['fields', 'assignee', 'displayName']], + 'Issue Created': ['get', ['fields', 'created']], + 'Due Date': ['get', ['fields', 'duedate']], + 'DELTA Comments': ['get', ['fields', 'customfield_10060']], + 'SELISE Ticket References': ['put', ['fields', 'customfield_10067']], + 'SELISE Status Values': ['put', ['fields', 'customfield_10065']], + 'SELISE Comments': ['put', ['fields', 'customfield_10064']], + } + def __init__(self): """Initialize the sync manager with hardcoded Delta Group credentials.""" self.root_interface = getRootInterface() diff --git a/requirements.txt b/requirements.txt index b4691ada..bf996e6b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -48,6 +48,9 @@ Office365-REST-Python-Client==2.6.2 # Easy Sharepoint integration ## Image Processing Pillow>=10.0.0 # Für Bildverarbeitung (als PIL importiert) +## Audio Processing +# Audio format conversion handled by pure Python implementation + ## Utilities & Timezone Support python-dateutil==2.8.2 python-dotenv==1.0.0 @@ -56,6 +59,10 @@ pytz>=2023.3 # For timezone handling and UTC operations ## Dependencies for trio (used by httpx) sortedcontainers>=2.4.0 # Required by trio +## Google Cloud Integration +google-cloud-speech==2.21.0 +google-cloud-translate==3.11.1 + ## MSFT Integration msal==1.24.1