# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Voice / Language Catalog — Single Source of Truth.

Every voice-related component (TTS connector, AI tools, REST routes, frontend
language pickers) consumes this catalog. Hard-coded language lists or ad-hoc
ISO→BCP-47 maps elsewhere are forbidden — extend the catalog instead.

Schema per entry:
    bcp47         BCP-47 locale code, e.g. "de-DE", "ru-RU"
    iso           ISO-639-1 short code, e.g. "de", "ru"
    label         Native display label ("Deutsch", "Русский")
    flag          Emoji flag (or empty string for region-neutral codes)
    defaultVoice  Curated Google TTS voice name; None means "let Google
                  pick automatically based on bcp47 + ssml_gender".
"""

from __future__ import annotations

from dataclasses import asdict, dataclass
from typing import Dict, List, Optional


@dataclass(frozen=True)
class VoiceLanguage:
    """One immutable catalog entry; see the module docstring for the schema."""

    bcp47: str
    iso: str
    label: str
    flag: str
    defaultVoice: Optional[str]


# Order matters for UI: most common languages first, then related languages
# kept together. It also decides which locale is canonical for an ISO code:
# the first entry listed for a given ``iso`` wins (see _BY_ISO below).
VOICE_LANGUAGES: List[VoiceLanguage] = [
    VoiceLanguage("de-DE", "de", "Deutsch", "🇩🇪", "de-DE-Wavenet-A"),
    VoiceLanguage("de-CH", "de", "Deutsch (Schweiz)", "🇨🇭", "de-DE-Wavenet-A"),
    VoiceLanguage("de-AT", "de", "Deutsch (Österreich)", "🇦🇹", "de-DE-Wavenet-A"),
    VoiceLanguage("en-US", "en", "English (US)", "🇺🇸", "en-US-Wavenet-C"),
    VoiceLanguage("en-GB", "en", "English (UK)", "🇬🇧", "en-GB-Wavenet-A"),
    VoiceLanguage("en-AU", "en", "English (Australia)", "🇦🇺", "en-AU-Wavenet-A"),
    VoiceLanguage("fr-FR", "fr", "Français", "🇫🇷", "fr-FR-Wavenet-A"),
    VoiceLanguage("fr-CA", "fr", "Français (Canada)", "🇨🇦", "fr-CA-Wavenet-A"),
    VoiceLanguage("it-IT", "it", "Italiano", "🇮🇹", "it-IT-Wavenet-A"),
    VoiceLanguage("es-ES", "es", "Español", "🇪🇸", "es-ES-Wavenet-B"),
    VoiceLanguage("es-US", "es", "Español (US)", "🇺🇸", "es-US-Wavenet-A"),
    VoiceLanguage("pt-BR", "pt", "Português (Brasil)", "🇧🇷", "pt-BR-Wavenet-A"),
    VoiceLanguage("pt-PT", "pt", "Português (Portugal)", "🇵🇹", "pt-PT-Wavenet-A"),
    VoiceLanguage("nl-NL", "nl", "Nederlands", "🇳🇱", "nl-NL-Wavenet-A"),
    VoiceLanguage("pl-PL", "pl", "Polski", "🇵🇱", "pl-PL-Wavenet-A"),
    VoiceLanguage("ru-RU", "ru", "Русский", "🇷🇺", "ru-RU-Wavenet-A"),
    VoiceLanguage("uk-UA", "uk", "Українська", "🇺🇦", "uk-UA-Wavenet-A"),
    VoiceLanguage("cs-CZ", "cs", "Čeština", "🇨🇿", "cs-CZ-Wavenet-A"),
    VoiceLanguage("sk-SK", "sk", "Slovenčina", "🇸🇰", "sk-SK-Wavenet-A"),
    VoiceLanguage("hu-HU", "hu", "Magyar", "🇭🇺", "hu-HU-Wavenet-A"),
    VoiceLanguage("ro-RO", "ro", "Română", "🇷🇴", "ro-RO-Wavenet-A"),
    VoiceLanguage("el-GR", "el", "Ελληνικά", "🇬🇷", "el-GR-Wavenet-A"),
    VoiceLanguage("sv-SE", "sv", "Svenska", "🇸🇪", "sv-SE-Wavenet-A"),
    VoiceLanguage("da-DK", "da", "Dansk", "🇩🇰", "da-DK-Wavenet-A"),
    VoiceLanguage("nb-NO", "nb", "Norsk", "🇳🇴", "nb-NO-Wavenet-A"),
    VoiceLanguage("fi-FI", "fi", "Suomi", "🇫🇮", "fi-FI-Wavenet-A"),
    VoiceLanguage("tr-TR", "tr", "Türkçe", "🇹🇷", "tr-TR-Wavenet-A"),
    VoiceLanguage("ar-XA", "ar", "العربية", "", "ar-XA-Wavenet-A"),
    VoiceLanguage("hi-IN", "hi", "हिन्दी", "🇮🇳", "hi-IN-Wavenet-A"),
    VoiceLanguage("ja-JP", "ja", "日本語", "🇯🇵", "ja-JP-Wavenet-A"),
    VoiceLanguage("ko-KR", "ko", "한국어", "🇰🇷", "ko-KR-Wavenet-A"),
    VoiceLanguage("zh-CN", "zh", "中文 (简体)", "🇨🇳", "cmn-CN-Wavenet-A"),
    VoiceLanguage("vi-VN", "vi", "Tiếng Việt", "🇻🇳", "vi-VN-Wavenet-A"),
    VoiceLanguage("th-TH", "th", "ไทย", "🇹🇭", "th-TH-Standard-A"),
    VoiceLanguage("id-ID", "id", "Bahasa Indonesia", "🇮🇩", "id-ID-Wavenet-A"),
]


# ---------------------------------------------------------------------------
# Lookup indexes (built once at import).
# ---------------------------------------------------------------------------

# Keys are lower-cased so lookups are case-insensitive.
_BY_BCP47: Dict[str, VoiceLanguage] = {v.bcp47.lower(): v for v in VOICE_LANGUAGES}

# setdefault keeps the FIRST entry per ISO code, so catalog order decides the
# canonical locale of a language (e.g. "de" → de-DE, "pt" → pt-BR).
_BY_ISO: Dict[str, VoiceLanguage] = {}
for _v in VOICE_LANGUAGES:
    _BY_ISO.setdefault(_v.iso.lower(), _v)


def _normalizeLocaleCode(code: Optional[str]) -> str:
    """Trim *code* and unify ``_`` separators to ``-``; empty string if blank.

    Locale identifiers frequently arrive in underscore form ("de_DE");
    mapping them to the hyphenated form lets them hit the same index keys.
    Letter case is left untouched so pass-through values keep their spelling.
    """
    if not code:
        return ""
    return code.strip().replace("_", "-")


def listVoiceLanguages() -> List[VoiceLanguage]:
    """Return the canonical, ordered list of supported voice languages.

    A fresh list is returned on every call, so callers may reorder or filter
    it without affecting the module-level catalog.
    """
    return list(VOICE_LANGUAGES)


def getCatalogPayload() -> List[Dict[str, Optional[str]]]:
    """Return the catalog as plain dicts — ready for JSON serialization."""
    return [asdict(v) for v in VOICE_LANGUAGES]


def getByBcp47(code: Optional[str]) -> Optional[VoiceLanguage]:
    """Look up a catalog entry by BCP-47 code.

    Matching ignores case, surrounding whitespace and the separator style
    ("de-CH", " DE-ch " and "de_CH" all resolve to the de-CH entry).
    Returns None for empty input or a code that is not in the catalog.
    """
    normalized = _normalizeLocaleCode(code)
    if not normalized:
        return None
    return _BY_BCP47.get(normalized.lower())


def getByIso(code: Optional[str]) -> Optional[VoiceLanguage]:
    """Look up the canonical catalog entry for an ISO-639-1 code.

    Matching ignores case and surrounding whitespace. When several locales
    share the ISO code, the one listed first in the catalog is returned.
    Returns None for empty input or an unknown code.
    """
    if not code:
        return None
    return _BY_ISO.get(code.strip().lower())


def getDefaultVoice(bcp47: Optional[str]) -> Optional[str]:
    """Return the curated default Google TTS voice for a BCP-47 code, else None.

    None means: caller must omit `name` in VoiceSelectionParams so Google
    auto-selects a voice for the language code.
    """
    entry = getByBcp47(bcp47)
    return entry.defaultVoice if entry else None


def isoToBcp47(iso: Optional[str]) -> Optional[str]:
    """Map an ISO-639-1 short code to the canonical BCP-47 locale.

    Already-qualified BCP-47 inputs are passed through unchanged
    (canonicalised to the catalog form when known); underscore-separated
    inputs such as "en_GB" are treated as their hyphenated equivalent.
    Unknown ISO codes fall back to ``<iso>-<ISO>`` (e.g. "fa" → "fa-FA") so
    callers always get a parseable locale, but unknown codes carry no
    curated voice. Empty or whitespace-only input yields None.
    """
    normalized = _normalizeLocaleCode(iso)
    if not normalized:
        return None

    # Region-qualified input: prefer the catalog spelling, otherwise hand the
    # (separator-normalised) value back as given.
    if "-" in normalized:
        canonical = getByBcp47(normalized)
        return canonical.bcp47 if canonical else normalized

    isoLower = normalized.lower()
    entry = _BY_ISO.get(isoLower)
    if entry:
        return entry.bcp47
    return f"{isoLower}-{isoLower.upper()}"