Enhanced STT/TTS functions

This commit is contained in:
ValueOn AG 2026-03-16 00:47:42 +01:00
parent 7fe6f9bc97
commit 3d49bd9d03
7 changed files with 510 additions and 39 deletions

View file

@ -229,15 +229,15 @@ class OutlookAdapter(_GraphApiMixin, ServiceAdapter):
return [
ExternalEntry(
name=f.get("displayName", ""),
path=f"/{f.get('displayName', '')}",
path=f"/{f.get('id', '')}",
isFolder=True,
metadata={"id": f.get("id"), "totalItemCount": f.get("totalItemCount")},
)
for f in result.get("value", [])
]
folderName = path.strip("/")
endpoint = f"me/mailFolders/{folderName}/messages?$top=25&$orderby=receivedDateTime desc"
folderId = path.strip("/")
endpoint = f"me/mailFolders/{folderId}/messages?$top=25&$orderby=receivedDateTime desc"
result = await self._graphGet(endpoint)
if "error" in result:
return []

View file

@ -2,6 +2,7 @@
# All rights reserved.
"""Voice settings datamodel."""
from typing import Dict, Any, Optional
from pydantic import BaseModel, Field
from modules.shared.attributeUtils import registerModelLabels
from modules.shared.timeUtils import getUtcTimestamp
@ -16,6 +17,7 @@ class VoiceSettings(BaseModel):
sttLanguage: str = Field(default="de-DE", description="Speech-to-Text language", json_schema_extra={"frontend_type": "select", "frontend_readonly": False, "frontend_required": True})
ttsLanguage: str = Field(default="de-DE", description="Text-to-Speech language", json_schema_extra={"frontend_type": "select", "frontend_readonly": False, "frontend_required": True})
ttsVoice: str = Field(default="de-DE-KatjaNeural", description="Text-to-Speech voice", json_schema_extra={"frontend_type": "select", "frontend_readonly": False, "frontend_required": True})
ttsVoiceMap: Dict[str, Any] = Field(default_factory=dict, description="Per-language voice mapping, e.g. {'de-DE': {'voiceName': 'de-DE-Wavenet-A'}, 'en-US': {'voiceName': 'en-US-Wavenet-C'}}", json_schema_extra={"frontend_type": "json", "frontend_readonly": False, "frontend_required": False})
translationEnabled: bool = Field(default=True, description="Whether translation is enabled", json_schema_extra={"frontend_type": "checkbox", "frontend_readonly": False, "frontend_required": False})
targetLanguage: str = Field(default="en-US", description="Target language for translation", json_schema_extra={"frontend_type": "select", "frontend_readonly": False, "frontend_required": False})
creationDate: float = Field(default_factory=getUtcTimestamp, description="Date when the settings were created (UTC timestamp in seconds)", json_schema_extra={"frontend_type": "timestamp", "frontend_readonly": True, "frontend_required": False})
@ -33,6 +35,7 @@ registerModelLabels(
"sttLanguage": {"en": "STT Language", "fr": "Langue STT"},
"ttsLanguage": {"en": "TTS Language", "fr": "Langue TTS"},
"ttsVoice": {"en": "TTS Voice", "fr": "Voix TTS"},
"ttsVoiceMap": {"en": "TTS Voice Map", "fr": "Carte des voix TTS"},
"translationEnabled": {"en": "Translation Enabled", "fr": "Traduction activée"},
"targetLanguage": {"en": "Target Language", "fr": "Langue cible"},
"creationDate": {"en": "Creation Date", "fr": "Date de création"},

View file

@ -718,3 +718,120 @@ async def synthesizeVoice(
if not text:
raise HTTPException(status_code=400, detail="text is required")
return JSONResponse({"audio": None, "note": "TTS via browser Speech Synthesis API recommended"})
# =========================================================================
# Voice Settings Endpoints
# =========================================================================
@router.get("/{instanceId}/settings/voice")
@limiter.limit("30/minute")
async def getVoiceSettings(
    request: Request,
    instanceId: str = Path(...),
    context: RequestContext = Depends(getRequestContext),
):
    """Return the stored voice settings of the requesting user, creating defaults when none exist yet."""
    _validateInstanceAccess(instanceId, context)
    dbMgmt = _getDbManagement(context, instanceId)
    userId = str(context.user.id)
    # Fall back to creation only when no settings record is present.
    settings = dbMgmt.getVoiceSettings(userId) or dbMgmt.getOrCreateVoiceSettings(userId)
    payload = settings.model_dump() if settings else {}
    return JSONResponse(payload)
@router.put("/{instanceId}/settings/voice")
@limiter.limit("30/minute")
async def updateVoiceSettings(
    request: Request,
    instanceId: str = Path(...),
    body: dict = Body(...),
    context: RequestContext = Depends(getRequestContext),
):
    """Create or patch the per-user voice settings for this instance."""
    _validateInstanceAccess(instanceId, context)
    dbMgmt = _getDbManagement(context, instanceId)
    userId = str(context.user.id)
    existing = dbMgmt.getVoiceSettings(userId)
    if existing:
        # Never let the client overwrite identity or audit fields.
        protectedFields = ("id", "userId", "mandateId", "featureInstanceId", "creationDate")
        patch = {key: value for key, value in body.items() if key not in protectedFields}
        return JSONResponse(dbMgmt.updateVoiceSettings(userId, patch))
    # First write for this user: seed the record, client fields win on overlap.
    newSettings = {
        "userId": userId,
        "mandateId": str(context.mandateId) if context.mandateId else "",
        "featureInstanceId": instanceId,
    }
    newSettings.update(body)
    return JSONResponse(dbMgmt.createVoiceSettings(newSettings))
@router.get("/{instanceId}/voice/languages")
@limiter.limit("30/minute")
async def getVoiceLanguages(
    request: Request,
    instanceId: str = Path(...),
    context: RequestContext = Depends(getRequestContext),
):
    """List the languages supported by the TTS backend."""
    mandateId = _validateInstanceAccess(instanceId, context)
    from modules.interfaces.interfaceVoiceObjects import getVoiceInterface
    voiceInterface = getVoiceInterface(context.user, mandateId)
    raw = await voiceInterface.getAvailableLanguages()
    # The interface may answer with either a plain list or a wrapping dict.
    if isinstance(raw, dict):
        languages = raw.get("languages", [])
    else:
        languages = raw
    return JSONResponse({"languages": languages})
@router.get("/{instanceId}/voice/voices")
@limiter.limit("30/minute")
async def getVoiceVoices(
    request: Request,
    instanceId: str = Path(...),
    language: str = Query("de-DE"),
    context: RequestContext = Depends(getRequestContext),
):
    """List the voices the TTS backend offers for one language."""
    mandateId = _validateInstanceAccess(instanceId, context)
    from modules.interfaces.interfaceVoiceObjects import getVoiceInterface
    voiceInterface = getVoiceInterface(context.user, mandateId)
    raw = await voiceInterface.getAvailableVoices(language)
    # Accept both the dict-wrapped and the bare-list response shape.
    voices = raw.get("voices", []) if isinstance(raw, dict) else raw
    return JSONResponse({"voices": voices})
@router.post("/{instanceId}/voice/test")
@limiter.limit("10/minute")
async def testVoice(
    request: Request,
    instanceId: str = Path(...),
    body: dict = Body(...),
    context: RequestContext = Depends(getRequestContext),
):
    """Synthesize a short sample so the user can preview a specific voice."""
    import base64
    mandateId = _validateInstanceAccess(instanceId, context)
    sampleText = body.get("text", "Hallo, das ist ein Stimmtest.")
    language = body.get("language", "de-DE")
    voiceId = body.get("voiceId")
    from modules.interfaces.interfaceVoiceObjects import getVoiceInterface
    voiceInterface = getVoiceInterface(context.user, mandateId)
    try:
        ttsResult = await voiceInterface.textToSpeech(text=sampleText, languageCode=language, voiceName=voiceId)
        audio = ttsResult.get("audioContent") if ttsResult and isinstance(ttsResult, dict) else None
        if audio:
            # Normalize to bytes before base64-encoding for the JSON payload.
            rawBytes = audio if isinstance(audio, bytes) else audio.encode()
            encoded = base64.b64encode(rawBytes).decode()
            return JSONResponse({"success": True, "audio": encoded, "format": "mp3", "text": sampleText})
        return JSONResponse({"success": False, "error": "TTS returned no audio"})
    except Exception as e:
        logger.error(f"Voice test failed: {e}")
        raise HTTPException(status_code=500, detail=f"TTS test failed: {str(e)}")

View file

@ -87,9 +87,10 @@ CLIENT_SECRET = APP_CONFIG.get("Service_GOOGLE_CLIENT_SECRET")
REDIRECT_URI = APP_CONFIG.get("Service_GOOGLE_REDIRECT_URI")
SCOPES = [
"https://www.googleapis.com/auth/gmail.readonly",
"https://www.googleapis.com/auth/drive.readonly",
"https://www.googleapis.com/auth/userinfo.profile",
"https://www.googleapis.com/auth/userinfo.email",
"openid"
"openid",
]
@router.get("/config")

View file

@ -59,6 +59,7 @@ SCOPES = [
"Mail.Send", # Send mail
"Files.ReadWrite.All", # Read and write files (SharePoint/OneDrive)
"Sites.ReadWrite.All", # Read and write SharePoint sites
"Team.ReadBasic.All", # List joined teams and channels
# Teams Bot: Meeting and chat access (requires admin consent)
"OnlineMeetings.Read", # Read user's Teams meeting details (delegated scope)
"Chat.ReadWrite", # Read and write Teams chat messages

View file

@ -208,7 +208,8 @@ async def runAgentLoop(
results = await _executeToolCalls(toolCalls, toolRegistry, {
"workflowId": workflowId,
"userId": userId,
"featureInstanceId": featureInstanceId
"featureInstanceId": featureInstanceId,
"mandateId": mandateId,
})
state.totalToolCalls += len(results)

View file

@ -209,7 +209,8 @@ class AgentService:
"## Attached Files\n"
"These files have been uploaded and processed through the extraction pipeline.\n"
"Use `readFile(fileId)` to read text content, `readContentObjects(fileId)` for structured access, "
"or `describeImage(fileId)` for image analysis.\n\n"
"or `describeImage(fileId)` for image analysis.\n"
"When generating documents with `renderDocument`, embed images using `![alt text](file:fileId)` in the markdown content.\n\n"
)
header += "\n\n".join(fileDescriptions)
return f"{header}\n\n---\n\nUser request: {prompt}"
@ -1226,68 +1227,415 @@ def _registerCoreTools(registry: ToolRegistry, services):
readOnly=True,
)
# ---- Document generation tool ----
# ---- Document rendering tool ----
async def _generateDocument(args: Dict[str, Any], context: Dict[str, Any]):
"""Generate a document in any format using the existing GenerationService + RendererRegistry."""
prompt = args.get("prompt", "")
def _markdownToDocumentJson(markdown: str, title: str, language: str = "de") -> Dict[str, Any]:
"""Convert markdown content to the standard document JSON format expected by renderers."""
import re as _re
sections = []
order = 0
lines = markdown.split("\n")
i = 0
def _nextId():
nonlocal order
order += 1
return f"s_{order}"
while i < len(lines):
line = lines[i]
# --- Headings ---
headingMatch = _re.match(r'^(#{1,6})\s+(.+)', line)
if headingMatch:
level = len(headingMatch.group(1))
text = headingMatch.group(2).strip()
sections.append({
"id": _nextId(), "content_type": "heading", "order": order,
"elements": [{"content": {"text": text, "level": level}}],
})
i += 1
continue
# --- Fenced code blocks ---
codeMatch = _re.match(r'^```(\w*)', line)
if codeMatch:
lang = codeMatch.group(1) or "text"
codeLines = []
i += 1
while i < len(lines) and not lines[i].startswith("```"):
codeLines.append(lines[i])
i += 1
i += 1
sections.append({
"id": _nextId(), "content_type": "code_block", "order": order,
"elements": [{"content": {"code": "\n".join(codeLines), "language": lang}}],
})
continue
# --- Tables ---
tableMatch = _re.match(r'^\|(.+)\|$', line)
if tableMatch and (i + 1) < len(lines) and _re.match(r'^\|[\s\-:|]+\|$', lines[i + 1]):
headerCells = [c.strip() for c in tableMatch.group(1).split("|")]
i += 2
rows = []
while i < len(lines) and _re.match(r'^\|(.+)\|$', lines[i]):
rowCells = [c.strip() for c in lines[i][1:-1].split("|")]
rows.append(rowCells)
i += 1
sections.append({
"id": _nextId(), "content_type": "table", "order": order,
"elements": [{"content": {"headers": headerCells, "rows": rows}}],
})
continue
# --- Bullet / numbered lists ---
listMatch = _re.match(r'^(\s*)([-*+]|\d+[.)]) (.+)', line)
if listMatch:
isNumbered = bool(_re.match(r'\d+[.)]', listMatch.group(2)))
items = []
while i < len(lines) and _re.match(r'^(\s*)([-*+]|\d+[.)]) (.+)', lines[i]):
m = _re.match(r'^(\s*)([-*+]|\d+[.)]) (.+)', lines[i])
items.append({"text": m.group(3).strip()})
i += 1
sections.append({
"id": _nextId(), "content_type": "bullet_list", "order": order,
"elements": [{"content": {"items": items, "list_type": "numbered" if isNumbered else "bullet"}}],
})
continue
# --- Empty lines (skip) ---
if not line.strip():
i += 1
continue
# --- Images: ![alt](file:fileId) or ![alt](url) ---
imgMatch = _re.match(r'^!\[([^\]]*)\]\(([^)]+)\)', line)
if imgMatch:
altText = imgMatch.group(1).strip() or "Image"
src = imgMatch.group(2).strip()
fileId = ""
if src.startswith("file:"):
fileId = src[5:]
sections.append({
"id": _nextId(), "content_type": "image", "order": order,
"elements": [{
"content": {
"altText": altText,
"base64Data": "",
"_fileRef": fileId,
"_srcUrl": src if not fileId else "",
}
}],
})
i += 1
continue
# --- Paragraph (collect consecutive non-empty lines) ---
paraLines = []
while i < len(lines) and lines[i].strip() and not _re.match(r'^(#{1,6}\s|```|\|.+\||!\[|(\s*)([-*+]|\d+[.)]) )', lines[i]):
paraLines.append(lines[i])
i += 1
if paraLines:
sections.append({
"id": _nextId(), "content_type": "paragraph", "order": order,
"elements": [{"content": {"text": " ".join(paraLines)}}],
})
continue
i += 1
if not sections:
sections.append({
"id": _nextId(), "content_type": "paragraph", "order": order,
"elements": [{"content": {"text": markdown.strip() or "(empty)"}}],
})
return {
"metadata": {
"split_strategy": "single_document",
"source_documents": [],
"extraction_method": "agent_rendering",
"title": title,
"language": language,
},
"documents": [{
"id": "doc_1",
"title": title,
"sections": sections,
}],
}
async def _renderDocument(args: Dict[str, Any], context: Dict[str, Any]):
"""Render agent-produced markdown content into any document format via the RendererRegistry."""
import re as _re
content = args.get("content", "")
outputFormat = args.get("outputFormat", "pdf")
title = args.get("title", "Generated Document")
title = args.get("title", "Document")
language = args.get("language", "de")
if not prompt:
return ToolResult(toolCallId="", toolName="generateDocument", success=False, error="prompt is required")
if not content:
return ToolResult(toolCallId="", toolName="renderDocument", success=False, error="content is required")
try:
structuredContent = _markdownToDocumentJson(content, title, language)
# Resolve image file references (file:fileId) to base64 data from Knowledge Store
knowledgeService = None
try:
knowledgeService = services.getService("knowledge")
except Exception:
pass
resolvedImages = 0
for doc in structuredContent.get("documents", []):
for section in doc.get("sections", []):
if section.get("content_type") != "image":
continue
for element in section.get("elements", []):
contentObj = element.get("content", {})
fileRef = contentObj.get("_fileRef", "")
if not fileRef or contentObj.get("base64Data"):
continue
if knowledgeService:
chunks = knowledgeService._knowledgeDb.getContentChunks(fileRef)
imageChunks = [c for c in (chunks or []) if c.get("contentType") == "image"]
if imageChunks:
contentObj["base64Data"] = imageChunks[0].get("data", "")
chunkMime = imageChunks[0].get("contextRef", {}).get("mimeType", "image/png")
contentObj["mimeType"] = chunkMime
resolvedImages += 1
if not contentObj.get("base64Data"):
try:
rawBytes = services.chat.getFileData(fileRef)
if rawBytes:
import base64 as _b64
contentObj["base64Data"] = _b64.b64encode(rawBytes).decode("ascii")
contentObj["mimeType"] = "image/png"
resolvedImages += 1
except Exception:
pass
contentObj.pop("_fileRef", None)
contentObj.pop("_srcUrl", None)
sectionCount = len(structuredContent.get("documents", [{}])[0].get("sections", []))
logger.info(f"renderDocument: parsed {sectionCount} sections from markdown ({len(content)} chars), resolved {resolvedImages} image(s), format={outputFormat}")
generationService = services.getService("generation")
aiService = services.ai
structuredContent = await generationService.generateDocumentWithTwoPhases(userPrompt=prompt)
documents = await generationService.renderReport(
extractedContent=structuredContent,
outputFormat=outputFormat,
language="de",
language=language,
title=title,
userPrompt=prompt,
aiService=aiService,
userPrompt=content,
)
if not documents:
return ToolResult(toolCallId="", toolName="generateDocument", success=False, error="Rendering produced no documents")
return ToolResult(toolCallId="", toolName="renderDocument", success=False, error="Rendering produced no output")
savedFiles = []
sideEvents = []
chatService = services.chat
for doc in documents:
docData = doc.data if hasattr(doc, "data") else doc.get("data", b"")
docName = doc.fileName if hasattr(doc, "fileName") else doc.get("fileName", f"{title}.{outputFormat}")
docMime = doc.mimeType if hasattr(doc, "mimeType") else doc.get("mimeType", "application/octet-stream")
fileItem = chatService.interfaceDbComponent.saveGeneratedFile(
docData, docName, docMime,
) if hasattr(chatService.interfaceDbComponent, "saveGeneratedFile") else None
sanitizedTitle = _re.sub(r'[^a-zA-Z0-9._-]', '_', title).strip('_') or "document"
for doc in documents:
docData = doc.documentData if hasattr(doc, "documentData") else b""
docName = doc.filename if hasattr(doc, "filename") else f"{sanitizedTitle}.{outputFormat}"
docMime = doc.mimeType if hasattr(doc, "mimeType") else "application/octet-stream"
if not docName.lower().endswith(f".{outputFormat}"):
docName = f"{sanitizedTitle}.{outputFormat}"
fileItem = None
if hasattr(chatService.interfaceDbComponent, "saveGeneratedFile"):
fileItem = chatService.interfaceDbComponent.saveGeneratedFile(docData, docName, docMime)
else:
fileItem, _ = chatService.interfaceDbComponent.saveUploadedFile(docData, docName)
if fileItem:
savedFiles.append(f"- {docName} (id: {fileItem.id if hasattr(fileItem, 'id') else fileItem.get('id', '?')})")
else:
savedFiles.append(f"- {docName} (generated, not saved)")
fid = fileItem.id if hasattr(fileItem, "id") else fileItem.get("id", "?")
savedFiles.append(f"- {docName} (id: {fid})")
sideEvents.append({
"type": "fileCreated",
"data": {
"fileId": fid,
"fileName": docName,
"mimeType": docMime,
"fileSize": len(docData),
},
})
result = f"Generated {len(documents)} document(s):\n" + "\n".join(savedFiles)
return ToolResult(toolCallId="", toolName="generateDocument", success=True, data=result)
result = f"Rendered {len(documents)} document(s):\n" + "\n".join(savedFiles)
return ToolResult(toolCallId="", toolName="renderDocument", success=True, data=result, sideEvents=sideEvents)
except Exception as e:
return ToolResult(toolCallId="", toolName="generateDocument", success=False, error=str(e))
logger.error(f"renderDocument failed: {e}")
return ToolResult(toolCallId="", toolName="renderDocument", success=False, error=str(e))
registry.register(
"generateDocument", _generateDocument,
description="Generate a document in any format (PDF, DOCX, XLSX, PPTX, CSV, HTML, MD, JSON, TXT).",
"renderDocument", _renderDocument,
description=(
"Render markdown content into a document file (PDF, DOCX, XLSX, PPTX, CSV, HTML, MD, JSON, TXT). "
"You write the full document content as markdown, then this tool converts and renders it. "
"To embed images from uploaded files, use markdown image syntax with the file ID: ![alt text](file:fileId). "
"The images will be resolved from the Knowledge Store and embedded in the output document."
),
parameters={
"type": "object",
"properties": {
"prompt": {"type": "string", "description": "What the document should contain and how it should look"},
"content": {"type": "string", "description": "Full document content as markdown (headings, tables, lists, code blocks, paragraphs, images via ![alt](file:fileId))"},
"outputFormat": {"type": "string", "description": "Target format: pdf, docx, xlsx, pptx, csv, html, md, json, txt", "default": "pdf"},
"title": {"type": "string", "description": "Document title", "default": "Generated Document"},
"title": {"type": "string", "description": "Document title", "default": "Document"},
"language": {"type": "string", "description": "Document language (ISO 639-1)", "default": "de"},
},
"required": ["prompt"],
"required": ["content"],
},
readOnly=False,
)
# ── textToSpeech tool ──────────────────────────────────────────────
def _stripMarkdownForTts(text: str) -> str:
"""Strip markdown formatting so TTS reads clean speech text."""
import re as _re
t = text
t = _re.sub(r'\*\*(.+?)\*\*', r'\1', t)
t = _re.sub(r'\*(.+?)\*', r'\1', t)
t = _re.sub(r'__(.+?)__', r'\1', t)
t = _re.sub(r'_(.+?)_', r'\1', t)
t = _re.sub(r'`[^`]+`', lambda m: m.group(0)[1:-1], t)
t = _re.sub(r'^#{1,6}\s*', '', t, flags=_re.MULTILINE)
t = _re.sub(r'^\s*[-*+]\s+', '', t, flags=_re.MULTILINE)
t = _re.sub(r'^\s*\d+\.\s+', '', t, flags=_re.MULTILINE)
t = _re.sub(r'\[(.+?)\]\(.+?\)', r'\1', t)
t = _re.sub(r'!\[.*?\]\(.*?\)', '', t)
t = _re.sub(r'\n{3,}', '\n\n', t)
return t.strip()
async def _textToSpeech(args: Dict[str, Any], context: Dict[str, Any]):
    """Convert text to speech using Google Cloud TTS, deliver audio via SSE.

    Args:
        args: Tool-call arguments: ``text`` (required), ``language``
            (BCP-47 tag or "auto"; default "auto"), optional ``voiceName``.
        context: Execution context; reads ``mandateId``, ``featureInstanceId``
            and ``userId`` when present.

    Returns:
        ToolResult whose ``sideEvents`` carry a ``voiceResponse`` event with
        base64-encoded audio on success, or a failed ToolResult on error.
    """
    import base64 as _b64
    text = args.get("text", "")
    language = args.get("language", "auto")
    voiceName = args.get("voiceName")
    if not text:
        return ToolResult(toolCallId="", toolName="textToSpeech", success=False, error="text is required")
    # Markdown markers would be read out loud; strip them before synthesis.
    cleanText = _stripMarkdownForTts(text)
    if not cleanText:
        return ToolResult(toolCallId="", toolName="textToSpeech", success=False, error="text is empty after stripping markdown")
    try:
        from modules.interfaces.interfaceVoiceObjects import getVoiceInterface
        mandateId = context.get("mandateId", "")
        # NOTE(review): currentUser=None — presumably the interface resolves
        # credentials from the mandate alone; confirm against getVoiceInterface.
        voiceInterface = getVoiceInterface(currentUser=None, mandateId=mandateId)
        # Maps ISO 639-1 codes (as produced by detection) to BCP-47 tags.
        _ISO_TO_BCP47 = {
            "de": "de-DE", "en": "en-US", "fr": "fr-FR", "it": "it-IT",
            "es": "es-ES", "pt": "pt-BR", "nl": "nl-NL", "pl": "pl-PL",
            "ru": "ru-RU", "ja": "ja-JP", "zh": "zh-CN", "ko": "ko-KR",
            "ar": "ar-XA", "hi": "hi-IN", "tr": "tr-TR", "sv": "sv-SE",
        }
        if language == "auto":
            # Detect the language from a short prefix; any failure falls
            # back to de-DE rather than aborting the synthesis.
            try:
                snippet = cleanText[:500]
                detectResult = await voiceInterface.detectLanguage(snippet)
                if detectResult and detectResult.get("success"):
                    detected = detectResult.get("language", "de")
                    language = _ISO_TO_BCP47.get(detected, detected)
                    if "-" not in language:
                        # Unknown two-letter code: synthesize an xx-XX style tag.
                        language = _ISO_TO_BCP47.get(language, f"{language}-{language.upper()}")
                    logger.info(f"textToSpeech: auto-detected language '{detected}' -> '{language}'")
                else:
                    language = "de-DE"
            except Exception as detectErr:
                logger.warning(f"textToSpeech: language detection failed: {detectErr}, defaulting to de-DE")
                language = "de-DE"
        if not voiceName:
            # Best effort: look up the user's configured voice for the
            # resolved language (ttsVoiceMap first, then the single ttsVoice).
            try:
                featureInstanceId = context.get("featureInstanceId", "")
                userId = context.get("userId", "")
                if featureInstanceId and userId:
                    dbMgmt = services.chat.interfaceDbApp if hasattr(services.chat, "interfaceDbApp") else None
                    if dbMgmt and hasattr(dbMgmt, "getVoiceSettings"):
                        vs = dbMgmt.getVoiceSettings(userId)
                        if vs:
                            voiceMap = {}
                            if hasattr(vs, "ttsVoiceMap") and vs.ttsVoiceMap:
                                voiceMap = vs.ttsVoiceMap if isinstance(vs.ttsVoiceMap, dict) else {}
                            if language in voiceMap:
                                # Map entries may be {"voiceName": ...} dicts or plain strings.
                                voiceName = voiceMap[language].get("voiceName") if isinstance(voiceMap[language], dict) else voiceMap[language]
                                logger.info(f"textToSpeech: using configured voice '{voiceName}' for {language}")
                            elif hasattr(vs, "ttsVoice") and vs.ttsVoice and hasattr(vs, "ttsLanguage") and vs.ttsLanguage == language:
                                voiceName = vs.ttsVoice
            except Exception as prefErr:
                # Preferences are optional; synthesis proceeds with the default voice.
                logger.debug(f"textToSpeech: could not load voice preferences: {prefErr}")
        ttsResult = await voiceInterface.textToSpeech(
            text=cleanText,
            languageCode=language,
            voiceName=voiceName,
        )
        if not ttsResult or not ttsResult.get("success"):
            errMsg = ttsResult.get("error", "TTS call failed") if ttsResult else "TTS returned None"
            return ToolResult(toolCallId="", toolName="textToSpeech", success=False, error=errMsg)
        audioContent = ttsResult.get("audioContent", "")
        if not audioContent:
            return ToolResult(toolCallId="", toolName="textToSpeech", success=False, error="TTS returned no audio")
        # Normalize the payload to a base64 string; str values are assumed
        # to be base64 already — TODO confirm against the voice interface.
        if isinstance(audioContent, bytes):
            audioB64 = _b64.b64encode(audioContent).decode("ascii")
        elif isinstance(audioContent, str):
            audioB64 = audioContent
        else:
            audioB64 = str(audioContent)
        audioFormat = ttsResult.get("audioFormat", "mp3")
        charCount = len(cleanText)
        usedVoice = voiceName or "default"
        logger.info(f"textToSpeech: generated {audioFormat} audio for {charCount} chars, language={language}, voice={usedVoice}")
        return ToolResult(
            toolCallId="", toolName="textToSpeech", success=True,
            data=f"Audio generated ({charCount} characters, language={language}, voice={usedVoice}). Playing in chat.",
            sideEvents=[{
                "type": "voiceResponse",
                "data": {
                    "audio": audioB64,
                    "format": audioFormat,
                    "language": language,
                    "charCount": charCount,
                },
            }],
        )
    except ImportError:
        # Voice stack not installed in this deployment.
        return ToolResult(toolCallId="", toolName="textToSpeech", success=False,
                          error="Voice interface not available (missing dependency)")
    except Exception as e:
        logger.error(f"textToSpeech failed: {e}")
        return ToolResult(toolCallId="", toolName="textToSpeech", success=False, error=str(e))
# Describe the TTS tool once, then register it.
ttsDescription = (
    "Convert text to speech audio. The audio is played directly in the chat. "
    "Use this when the user asks you to read something aloud, narrate, or speak. "
    "Language is auto-detected from the text content. You do NOT need to specify a language."
)
ttsParameters = {
    "type": "object",
    "properties": {
        "text": {"type": "string", "description": "The text to convert to speech. Can include markdown (will be stripped automatically)."},
        "language": {"type": "string", "description": "BCP-47 language code (e.g. de-DE, en-US) or 'auto' for automatic detection", "default": "auto"},
        "voiceName": {"type": "string", "description": "Optional specific voice name. If omitted, uses the configured voice for the detected language."},
    },
    "required": ["text"],
}
# Not read-only: the tool emits an audio side event into the chat stream.
registry.register(
    "textToSpeech",
    _textToSpeech,
    description=ttsDescription,
    parameters=ttsParameters,
    readOnly=False,
)