# gateway/modules/routes/routeVoiceGoogle.py
# 2026-04-10 12:33:27 +02:00
#
# 682 lines
# 25 KiB
# Python
#
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Google Cloud Voice Services Routes
Replaces Azure voice services with Google Cloud Speech-to-Text and Translation
Includes WebSocket support for real-time voice streaming
"""
import asyncio
import logging
import json
import base64
import secrets
import time
from fastapi import APIRouter, File, Form, UploadFile, Depends, HTTPException, Body, Query, Request, WebSocket, WebSocketDisconnect
from fastapi.responses import Response
from typing import Optional, Dict, Any, List
from modules.auth import getCurrentUser, getRequestContext, RequestContext, limiter
from modules.datamodels.datamodelUam import User
from modules.interfaces.interfaceVoiceObjects import getVoiceInterface, VoiceObjects
from modules.shared.i18nRegistry import apiRouteContext
# Message helper bound to the "routeVoiceGoogle" context; the handlers below call it
# to build the `detail` text of some HTTP errors.
# NOTE(review): presumably resolves localized API messages (it comes from i18nRegistry) — confirm.
routeApiMsg = apiRouteContext("routeVoiceGoogle")
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Every endpoint declared in this file is registered under the /voice-google prefix.
router = APIRouter(prefix="/voice-google", tags=["Voice Google"])
# Store active WebSocket connections, keyed by connection id.
# ConnectionManager.connect() adds entries and ConnectionManager.disconnect() removes them.
activeConnections: Dict[str, WebSocket] = {}
class ConnectionManager:
    """Bookkeeping for accepted WebSocket connections.

    Every connection is tracked twice: in this instance's list and in the
    module-level ``activeConnections`` dict, keyed by connection id.
    """

    def __init__(self):
        self.activeConnections: List[WebSocket] = []

    async def connect(self, websocket: WebSocket, connectionId: str):
        """Accept the handshake and register the socket under ``connectionId``."""
        await websocket.accept()
        self.activeConnections.append(websocket)
        activeConnections[connectionId] = websocket
        logger.info(f"WebSocket connected: {connectionId}")

    def disconnect(self, websocket: WebSocket, connectionId: str):
        """Forget the socket and its id; unknown sockets or ids are ignored."""
        try:
            self.activeConnections.remove(websocket)
        except ValueError:
            # The socket was never registered (or was already removed).
            pass
        activeConnections.pop(connectionId, None)
        logger.info(f"WebSocket disconnected: {connectionId}")

    async def send_personal_message(self, message: dict, websocket: WebSocket):
        """Send ``message`` as JSON text to one socket; failures are only logged."""
        try:
            serialized = json.dumps(message)
            await websocket.send_text(serialized)
        except Exception as sendError:
            logger.error(f"Error sending message: {sendError}")


manager = ConnectionManager()
def _getVoiceInterface(currentUser: User) -> VoiceObjects:
    """Return the voice interface for ``currentUser``.

    Any failure while building the interface is logged and re-raised as an
    HTTP 500 whose detail carries the original error text.
    """
    try:
        voiceInterface = getVoiceInterface(currentUser)
    except Exception as initError:
        failure = f"Failed to initialize voice interface: {initError}"
        logger.error(failure)
        raise HTTPException(status_code=500, detail=failure)
    return voiceInterface
@router.post("/speech-to-text")
async def speech_to_text(
    audioFile: UploadFile = File(...),
    language: str = Form("de-DE"),
    currentUser: User = Depends(getCurrentUser)
):
    """Transcribe an uploaded audio file with the voice interface.

    Responds with the recognized text, its confidence and language, plus
    basic information about the uploaded audio. Validation or recognition
    failures yield HTTP 400; any unexpected error yields HTTP 500.
    """
    try:
        logger.info(f"🎤 Speech-to-text request: {audioFile.filename}, language: {language}")

        audioBytes = await audioFile.read()
        logger.info(f"📊 Audio file size: {len(audioBytes)} bytes")

        voiceInterface = _getVoiceInterface(currentUser)

        # Reject unsupported audio before calling the recognition service.
        formatCheck = voiceInterface.validateAudioFormat(audioBytes)
        if not formatCheck["valid"]:
            raise HTTPException(
                status_code=400,
                detail=f"Invalid audio format: {formatCheck.get('error', 'Unknown error')}"
            )

        recognition = await voiceInterface.speechToText(
            audioContent=audioBytes,
            language=language
        )
        if not recognition["success"]:
            raise HTTPException(
                status_code=400,
                detail=f"Speech recognition failed: {recognition.get('error', 'Unknown error')}"
            )

        return {
            "success": True,
            "text": recognition["text"],
            "confidence": recognition["confidence"],
            "language": recognition["language"],
            "audio_info": {
                "size": len(audioBytes),
                "format": formatCheck["format"],
                "estimated_duration": formatCheck.get("estimated_duration", 0)
            }
        }
    except HTTPException:
        # Deliberate HTTP errors pass through untouched.
        raise
    except Exception as exc:
        logger.error(f"❌ Speech-to-text error: {exc}")
        raise HTTPException(
            status_code=500,
            detail=f"Speech-to-text processing failed: {exc}"
        )
@router.post("/detect-language")
async def detect_language(
    text: str = Form(...),
    currentUser: User = Depends(getCurrentUser)
):
    """Detect the language of the submitted text via the voice interface.

    Blank text and failed detections yield HTTP 400; unexpected errors
    yield HTTP 500.
    """
    try:
        logger.info(f"🔍 Language detection request: '{text[:100]}...'")

        if text.strip() == "":
            raise HTTPException(
                status_code=400,
                detail=routeApiMsg("Empty text provided for language detection")
            )

        voiceInterface = _getVoiceInterface(currentUser)
        detection = await voiceInterface.detectLanguage(text)

        if not detection["success"]:
            raise HTTPException(
                status_code=400,
                detail=f"Language detection failed: {detection.get('error', 'Unknown error')}"
            )

        return {
            "success": True,
            "language": detection["language"],
            "confidence": detection.get("confidence", 1.0)
        }
    except HTTPException:
        # Deliberate HTTP errors pass through untouched.
        raise
    except Exception as exc:
        logger.error(f"❌ Language detection error: {exc}")
        raise HTTPException(
            status_code=500,
            detail=f"Language detection processing failed: {exc}"
        )
@router.post("/translate")
async def translate_text(
    text: str = Form(...),
    sourceLanguage: str = Form("de"),
    targetLanguage: str = Form("en"),
    currentUser: User = Depends(getCurrentUser)
):
    """Translate text using Google Cloud Translation API.

    Form fields:
        text: Text to translate; must not be blank.
        sourceLanguage: Language code of ``text`` (default "de").
        targetLanguage: Language code to translate into (default "en").

    Returns:
        A dict with ``success``, ``original_text``, ``translated_text``,
        ``source_language`` and ``target_language`` taken from the voice
        interface result.

    Raises:
        HTTPException: 400 for blank text or a failed translation,
            500 for any unexpected error.
    """
    try:
        # Log only a bounded preview: the text is user-supplied and of arbitrary
        # length, so logging it whole would bloat the log with user content.
        logger.info(f"🌐 Translation request: '{text[:100]}...' ({sourceLanguage} -> {targetLanguage})")
        if not text.strip():
            raise HTTPException(
                status_code=400,
                detail=routeApiMsg("Empty text provided for translation")
            )
        # Get voice interface
        voiceInterface = _getVoiceInterface(currentUser)
        # Perform translation
        result = await voiceInterface.translateText(
            text=text,
            sourceLanguage=sourceLanguage,
            targetLanguage=targetLanguage
        )
        if result["success"]:
            return {
                "success": True,
                "original_text": result["original_text"],
                "translated_text": result["translated_text"],
                "source_language": result["source_language"],
                "target_language": result["target_language"]
            }
        else:
            raise HTTPException(
                status_code=400,
                detail=f"Translation failed: {result.get('error', 'Unknown error')}"
            )
    except HTTPException:
        # Deliberate HTTP errors (400 above, 500 from _getVoiceInterface) pass through.
        raise
    except Exception as e:
        logger.error(f"❌ Translation error: {e}")
        raise HTTPException(
            status_code=500,
            detail=f"Translation processing failed: {str(e)}"
        )
@router.post("/realtime-interpreter")
async def realtime_interpreter(
    audioFile: UploadFile = File(...),
    fromLanguage: str = Form("de-DE"),
    toLanguage: str = Form("en-US"),
    connectionId: Optional[str] = Form(None),
    currentUser: User = Depends(getCurrentUser)
):
    """Real-time interpreter: speech to translated text using Google Cloud APIs.

    Form fields:
        audioFile: Uploaded audio to transcribe and translate.
        fromLanguage: Language code of the spoken audio (default "de-DE").
        toLanguage: Language code to translate into (default "en-US").
        connectionId: Optional; accepted for client compatibility but not
            used by this handler.

    Returns:
        A dict with ``success``, the original and translated text, the
        confidence, source/target language and an ``audio_info`` section.

    Raises:
        HTTPException: 400 for invalid audio or a failed pipeline run,
            500 for any unexpected error.
    """
    try:
        logger.info(f"🔄 Real-time interpreter request: {audioFile.filename}")
        logger.info(f" From: {fromLanguage} -> To: {toLanguage}")
        logger.info(f" MIME type: {audioFile.content_type}")
        # Read audio file
        audioContent = await audioFile.read()
        logger.info(f"📊 Audio file size: {len(audioContent)} bytes")
        # Get voice interface
        voiceInterface = _getVoiceInterface(currentUser)
        # Validate audio format
        validation = voiceInterface.validateAudioFormat(audioContent)
        if not validation["valid"]:
            raise HTTPException(
                status_code=400,
                detail=f"Invalid audio format: {validation.get('error', 'Unknown error')}"
            )
        # Perform complete pipeline: Speech-to-Text + Translation
        result = await voiceInterface.speechToTranslatedText(
            audioContent=audioContent,
            fromLanguage=fromLanguage,
            toLanguage=toLanguage
        )
        if result["success"]:
            logger.info("✅ Real-time interpreter successful:")
            logger.info(f" Original: '{result['original_text']}'")
            logger.info(f" Translated: '{result['translated_text']}'")
            return {
                "success": True,
                "original_text": result["original_text"],
                "translated_text": result["translated_text"],
                "confidence": result["confidence"],
                "source_language": result["source_language"],
                "target_language": result["target_language"],
                "audio_info": {
                    "size": len(audioContent),
                    "format": validation["format"],
                    "estimated_duration": validation.get("estimated_duration", 0)
                }
            }
        else:
            raise HTTPException(
                status_code=400,
                detail=f"Real-time interpreter failed: {result.get('error', 'Unknown error')}"
            )
    except HTTPException:
        # Deliberate HTTP errors (400 above, 500 from _getVoiceInterface) pass through.
        raise
    except Exception as e:
        logger.error(f"❌ Real-time interpreter error: {e}")
        raise HTTPException(
            status_code=500,
            detail=f"Real-time interpreter processing failed: {str(e)}"
        )
@router.post("/text-to-speech")
async def text_to_speech(
    request: Request,
    text: str = Form(...),
    language: str = Form("de-DE"),
    voice: Optional[str] = Form(None),
    context: RequestContext = Depends(getRequestContext),
):
    """Convert text to speech using Google Cloud Text-to-Speech.

    Form fields:
        text: Text to synthesize; must not be blank.
        language: Language code passed to the synthesizer (default "de-DE").
        voice: Optional voice name; None is passed on when omitted.

    Returns:
        An ``audio/mpeg`` response carrying the synthesized audio, with the
        voice and language reported in ``X-Voice-Name`` / ``X-Language-Code``.

    Raises:
        HTTPException: 400 for blank text or a failed synthesis,
            500 for any unexpected error.
    """
    try:
        logger.info(f"Text-to-Speech request: '{text[:50]}...' in {language}")
        if not text.strip():
            raise HTTPException(
                status_code=400,
                detail=routeApiMsg("Empty text provided for text-to-speech")
            )
        mandateId = str(getattr(context, "mandateId", "") or "")
        voiceInterface = getVoiceInterface(context.user, mandateId)
        # Billing is best-effort: if it cannot be wired up, synthesis still runs.
        try:
            from modules.serviceCenter.services.serviceBilling.mainServiceBilling import getService as getBillingService
            billingService = getBillingService(context.user, mandateId)

            def _billingCb(data):
                # Record usage only when the reported price is positive.
                priceCHF = data.get("priceCHF", 0.0)
                operation = data.get("operation", "voice")
                if priceCHF > 0:
                    billingService.recordUsage(priceCHF=priceCHF, aicoreProvider="google-voice", aicoreModel=operation, description=f"Voice {operation}")

            voiceInterface.billingCallback = _billingCb
        except Exception as e:
            logger.warning(f"TTS billing setup skipped: {e}")
        result = await voiceInterface.textToSpeech(
            text=text,
            languageCode=language,
            voiceName=voice
        )
        if result["success"]:
            return Response(
                content=result["audioContent"],
                media_type="audio/mpeg",
                headers={
                    "Content-Disposition": "attachment; filename=speech.mp3",
                    # Header values must be strings: `or` also covers a key that is
                    # present but None, which dict.get's default would not replace.
                    "X-Voice-Name": result.get("voiceName") or "",
                    "X-Language-Code": result.get("languageCode") or language,
                }
            )
        else:
            raise HTTPException(
                status_code=400,
                detail=f"Text-to-Speech failed: {result.get('error', 'Unknown error')}"
            )
    except HTTPException:
        # Deliberate HTTP errors pass through untouched.
        raise
    except Exception as e:
        logger.error(f"Text-to-Speech error: {e}")
        raise HTTPException(
            status_code=500,
            detail=f"Text-to-Speech processing failed: {str(e)}"
        )
@router.get("/languages")
async def get_available_languages(currentUser: User = Depends(getCurrentUser)):
    """List the languages reported by the voice interface.

    A failed lookup yields HTTP 400; unexpected errors yield HTTP 500.
    """
    try:
        logger.info("🌐 Getting available languages from Google Cloud TTS")
        voiceInterface = _getVoiceInterface(currentUser)
        lookup = await voiceInterface.getAvailableLanguages()

        if not lookup["success"]:
            raise HTTPException(
                status_code=400,
                detail=f"Failed to get languages: {lookup.get('error', 'Unknown error')}"
            )

        return {
            "success": True,
            "languages": lookup["languages"]
        }
    except HTTPException:
        # Deliberate HTTP errors pass through untouched.
        raise
    except Exception as exc:
        logger.error(f"❌ Get languages error: {exc}")
        raise HTTPException(
            status_code=500,
            detail=f"Failed to get available languages: {exc}"
        )
@router.get("/voices")
async def get_available_voices(
    languageCode: Optional[str] = None,
    language_code: Optional[str] = None,  # Accept both camelCase and snake_case
    currentUser: User = Depends(getCurrentUser)
):
    """
    List the voices reported by the voice interface, optionally filtered by language.

    The filter may arrive as languageCode (camelCase) or language_code
    (snake_case); a non-empty language_code takes precedence.
    """
    # The frontend sends language_code, so it wins whenever it is set.
    languageCode = language_code or languageCode
    try:
        logger.info(f"🎤 Getting available voices, language filter: {languageCode}")
        voiceInterface = _getVoiceInterface(currentUser)
        lookup = await voiceInterface.getAvailableVoices(languageCode=languageCode)

        if not lookup["success"]:
            raise HTTPException(
                status_code=400,
                detail=f"Failed to get voices: {lookup.get('error', 'Unknown error')}"
            )

        return {
            "success": True,
            "voices": lookup["voices"],
            "language_filter": languageCode
        }
    except HTTPException:
        # Deliberate HTTP errors pass through untouched.
        raise
    except Exception as exc:
        logger.error(f"❌ Get voices error: {exc}")
        raise HTTPException(
            status_code=500,
            detail=f"Failed to get available voices: {exc}"
        )
@router.get("/health")
async def health_check(currentUser: User = Depends(getCurrentUser)):
    """Report the voice interface's health-check result.

    Any error (including a failure to build the interface) is logged and
    reported as an "unhealthy" payload instead of an HTTP error.
    """
    try:
        voiceInterface = _getVoiceInterface(currentUser)
        return await voiceInterface.healthCheck()
    except Exception as exc:
        logger.error(f"❌ Health check failed: {exc}")
        unhealthy = {
            "status": "unhealthy",
            "error": str(exc)
        }
        return unhealthy
@router.get("/settings")
async def get_voice_settings(currentUser: User = Depends(getCurrentUser)):
    """Return the current user's stored UserVoicePreferences, or defaults if none exist."""
    from modules.datamodels.datamodelUam import UserVoicePreferences
    from modules.interfaces.interfaceDbApp import getRootInterface

    rootInterface = getRootInterface()
    userId = str(currentUser.id)
    storedPrefs = rootInterface.db.getRecordset(
        UserVoicePreferences, recordFilter={"userId": userId}
    )

    if not storedPrefs:
        # Nothing saved yet: answer with a default preferences record for this user.
        return {"success": True, "data": {"user_settings": UserVoicePreferences(userId=userId).model_dump()}}

    # Records may come back as plain dicts or as model instances.
    if isinstance(storedPrefs[0], dict):
        userSettings = storedPrefs[0]
    else:
        userSettings = storedPrefs[0].model_dump()
    return {"success": True, "data": {"user_settings": userSettings}}
@router.post("/settings")
async def save_voice_settings(
    settings: Dict[str, Any] = Body(...),
    currentUser: User = Depends(getCurrentUser)
):
    """Persist the current user's voice settings into UserVoicePreferences.

    Only whitelisted fields from the request body are stored; everything
    else is dropped. The stored subset is echoed back under ``data``.
    """
    from modules.datamodels.datamodelUam import UserVoicePreferences, _normalizeTtsVoiceMap
    from modules.interfaces.interfaceDbApp import getRootInterface

    rootInterface = getRootInterface()
    userId = str(currentUser.id)

    allowedFields = {
        "sttLanguage", "ttsLanguage", "ttsVoice", "ttsVoiceMap",
        "translationSourceLanguage", "translationTargetLanguage",
    }
    updateData = {}
    for fieldName, fieldValue in settings.items():
        if fieldName in allowedFields:
            updateData[fieldName] = fieldValue
    if "ttsVoiceMap" in updateData:
        updateData["ttsVoiceMap"] = _normalizeTtsVoiceMap(updateData["ttsVoiceMap"])

    matches = rootInterface.db.getRecordset(
        UserVoicePreferences, recordFilter={"userId": userId}
    )
    if not matches:
        # First save for this user: create a fresh preferences record.
        freshPrefs = UserVoicePreferences(userId=userId, **updateData)
        rootInterface.db.recordCreate(UserVoicePreferences, freshPrefs.model_dump())
    else:
        # Records may come back as plain dicts or as model instances.
        current = matches[0]
        if isinstance(current, dict):
            recordId = current.get("id")
        else:
            recordId = current.id
        rootInterface.db.recordModify(UserVoicePreferences, recordId, updateData)

    return {"success": True, "message": "Voice settings saved successfully", "data": updateData}
# =========================================================================
# STT Streaming WebSocket — generic, used by all features
# =========================================================================
# Issued STT WebSocket tokens: token string -> payload dict (includes "expiresAt").
_sttTokens: Dict[str, Dict[str, Any]] = {}
# Token lifetime in seconds.
_STT_TOKEN_TTL = 45


def _cleanupSttTokens():
    """Remove every issued STT token whose expiry time has been reached."""
    cutoff = time.time()
    # Iterate over a snapshot so entries can be removed while scanning.
    for issuedToken, payload in list(_sttTokens.items()):
        # A payload without "expiresAt" counts as already expired.
        if payload.get("expiresAt", 0) <= cutoff:
            _sttTokens.pop(issuedToken, None)
@router.post("/stt/token")
@limiter.limit("60/minute")
async def createSttToken(
    request: Request,
    context: RequestContext = Depends(getRequestContext),
):
    """Issue a short-lived single-use token for the STT streaming WebSocket."""
    # Purge expired tokens first so the in-memory store does not keep growing.
    _cleanupSttTokens()

    wsToken = secrets.token_urlsafe(32)
    tokenPayload = {
        "userId": str(context.user.id),
        "mandateId": str(getattr(context, "mandateId", "") or ""),
        "expiresAt": time.time() + _STT_TOKEN_TTL,
    }
    _sttTokens[wsToken] = tokenPayload

    return {"wsToken": wsToken, "expiresInSeconds": _STT_TOKEN_TTL}
@router.websocket("/stt/stream")
async def sttStream(
    websocket: WebSocket,
    wsToken: Optional[str] = Query(None),
):
    """
    Generic STT streaming WebSocket.

    The socket is accepted first so that authentication and billing failures
    can be reported as JSON error messages before closing with code 1008.
    The ``wsToken`` query parameter is looked up in the in-memory token store
    and removed on use, so each token works for one connection only.

    Protocol:
    Client sends JSON:
    {"type": "open", "language": "de-DE"}
    {"type": "audio", "chunk": "<base64>"}
    {"type": "close"}
    {"type": "ping"}
    Server sends JSON:
    {"type": "status", "label": "..."}
    {"type": "interim", "text": "..."}
    {"type": "final", "text": "...", "confidence": 0.95}
    {"type": "reconnect_required"}
    {"type": "pong"}
    {"type": "error", "message": "..."} (may also carry a "code")
    {"type": "closed"}

    A malformed client message (invalid JSON, a non-object payload or an
    undecodable audio chunk) is answered with an error message and skipped;
    it does not end the stream. Messages with an unknown type are ignored.
    """
    await websocket.accept()
    # --- authenticate via wsToken ---
    if not wsToken:
        await websocket.send_json({"type": "error", "code": "ws_token_required", "message": "wsToken query param required"})
        await websocket.close(code=1008)
        return
    _cleanupSttTokens()
    # pop() makes the token single-use.
    tokenPayload = _sttTokens.pop(wsToken, None)
    if not tokenPayload:
        await websocket.send_json({"type": "error", "code": "ws_token_invalid", "message": "Invalid or expired wsToken"})
        await websocket.close(code=1008)
        return
    tokenUserId = tokenPayload["userId"]
    tokenMandateId = tokenPayload.get("mandateId", "")
    # Resolve real user for billing
    from modules.interfaces.interfaceDbApp import getRootInterface
    rootInterface = getRootInterface()
    currentUser = rootInterface.getUser(tokenUserId)
    if not currentUser:
        await websocket.send_json({"type": "error", "code": "user_not_found", "message": "User not found"})
        await websocket.close(code=1008)
        return
    # --- billing pre-flight ---
    # Best-effort: if the billing service cannot be reached, streaming continues.
    billingService = None
    try:
        from modules.serviceCenter.services.serviceBilling.mainServiceBilling import getService as getBillingService
        billingService = getBillingService(currentUser, tokenMandateId)
        billingCheck = billingService.checkBalance(0.0)
        if not billingCheck.allowed:
            await websocket.send_json({"type": "error", "code": "billing_insufficient", "message": "Insufficient balance for voice services"})
            await websocket.close(code=1008)
            return
    except Exception as e:
        logger.warning(f"STT billing pre-flight skipped: {e}")

    # Queue items are (audioBytes, isEndOfStream) tuples consumed by the streaming task.
    audioQueue: asyncio.Queue = asyncio.Queue()
    language = "de-DE"
    streamingTask: Optional[asyncio.Task] = None
    voiceInterface: Optional[VoiceObjects] = None

    async def _sendJson(payload: Dict[str, Any]) -> bool:
        """Send a JSON message; return False instead of raising if sending fails."""
        try:
            await websocket.send_json(payload)
            return True
        except Exception:
            return False

    async def _runStreaming():
        """Feed the current audio queue to the recognizer and relay its events."""
        nonlocal voiceInterface
        voiceInterface = getVoiceInterface(currentUser, tokenMandateId)
        if billingService:
            def _billingCb(data):
                # Record usage only when the reported price is positive.
                priceCHF = data.get("priceCHF", 0.0)
                operation = data.get("operation", "voice")
                if priceCHF > 0:
                    billingService.recordUsage(
                        priceCHF=priceCHF,
                        aicoreProvider="google-voice",
                        aicoreModel=operation,
                        description=f"Voice {operation}",
                    )
            voiceInterface.billingCallback = _billingCb
        try:
            # audioQueue and language are read here, when the task starts running,
            # so a task created by "open" uses the queue and language set just before.
            async for event in voiceInterface.streamingSpeechToText(audioQueue, language):
                if event.get("reconnectRequired"):
                    await _sendJson({"type": "reconnect_required"})
                    return
                if event.get("isFinal"):
                    if event.get("transcript"):
                        await _sendJson({"type": "final", "text": event["transcript"], "confidence": event.get("confidence", 0.0)})
                else:
                    if event.get("transcript"):
                        await _sendJson({"type": "interim", "text": event["transcript"]})
        except asyncio.CancelledError:
            # Cancellation is the normal way this task is stopped.
            pass
        except Exception as e:
            logger.error(f"STT streaming error: {e}")
            await _sendJson({"type": "error", "message": str(e)})

    try:
        await _sendJson({"type": "status", "label": "STT stream connected"})
        while True:
            raw = await websocket.receive_text()
            # One bad frame must not tear down the whole stream: report it and
            # keep listening, as is already done for oversized chunks.
            try:
                msg = json.loads(raw)
            except ValueError:
                await _sendJson({"type": "error", "code": "invalid_message", "message": "Message must be valid JSON"})
                continue
            if not isinstance(msg, dict):
                await _sendJson({"type": "error", "code": "invalid_message", "message": "Message must be a JSON object"})
                continue
            rawType = msg.get("type")
            # A missing or non-string type is treated like an unknown type (ignored).
            msgType = rawType.strip() if isinstance(rawType, str) else ""
            if msgType == "open":
                language = msg.get("language") or "de-DE"
                if streamingTask and not streamingTask.done():
                    # Stop the previous session before starting a new one.
                    await audioQueue.put((b"", True))
                    streamingTask.cancel()
                audioQueue = asyncio.Queue()
                streamingTask = asyncio.create_task(_runStreaming())
                await _sendJson({"type": "status", "label": "Listening..."})
            elif msgType == "audio":
                chunkB64 = msg.get("chunk")
                if not chunkB64:
                    continue
                try:
                    chunkBytes = base64.b64decode(chunkB64)
                except (ValueError, TypeError):
                    # binascii.Error is a ValueError; TypeError covers non-string chunks.
                    await _sendJson({"type": "error", "code": "invalid_chunk", "message": "Audio chunk is not valid base64"})
                    continue
                if len(chunkBytes) > 400_000:
                    await _sendJson({"type": "error", "code": "chunk_too_large", "message": "Audio chunk too large"})
                    continue
                await audioQueue.put((chunkBytes, False))
            elif msgType == "close":
                # Signal end of audio, then give the recognizer up to 10 s to finish.
                await audioQueue.put((b"", True))
                if streamingTask:
                    try:
                        await asyncio.wait_for(streamingTask, timeout=10.0)
                    except (asyncio.TimeoutError, asyncio.CancelledError):
                        pass
                await _sendJson({"type": "closed"})
                await websocket.close()
                break
            elif msgType == "ping":
                await _sendJson({"type": "pong"})
    except WebSocketDisconnect:
        logger.info(f"STT WebSocket disconnected: userId={tokenUserId}")
    except Exception as e:
        logger.error(f"STT WebSocket error: {e}", exc_info=True)
        try:
            await websocket.send_json({"type": "error", "message": str(e)})
        except Exception:
            # The socket may already be unusable; nothing more can be reported.
            pass
    finally:
        # Always end the audio stream and stop a still-running streaming task.
        await audioQueue.put((b"", True))
        if streamingTask and not streamingTask.done():
            streamingTask.cancel()