enhanced stt/tts functions

This commit is contained in:
ValueOn AG 2026-03-16 00:47:42 +01:00
parent 7fe6f9bc97
commit 3d49bd9d03
7 changed files with 510 additions and 39 deletions

View file

@ -229,15 +229,15 @@ class OutlookAdapter(_GraphApiMixin, ServiceAdapter):
return [ return [
ExternalEntry( ExternalEntry(
name=f.get("displayName", ""), name=f.get("displayName", ""),
path=f"/{f.get('displayName', '')}", path=f"/{f.get('id', '')}",
isFolder=True, isFolder=True,
metadata={"id": f.get("id"), "totalItemCount": f.get("totalItemCount")}, metadata={"id": f.get("id"), "totalItemCount": f.get("totalItemCount")},
) )
for f in result.get("value", []) for f in result.get("value", [])
] ]
folderName = path.strip("/") folderId = path.strip("/")
endpoint = f"me/mailFolders/{folderName}/messages?$top=25&$orderby=receivedDateTime desc" endpoint = f"me/mailFolders/{folderId}/messages?$top=25&$orderby=receivedDateTime desc"
result = await self._graphGet(endpoint) result = await self._graphGet(endpoint)
if "error" in result: if "error" in result:
return [] return []

View file

@ -2,6 +2,7 @@
# All rights reserved. # All rights reserved.
"""Voice settings datamodel.""" """Voice settings datamodel."""
from typing import Dict, Any, Optional
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from modules.shared.attributeUtils import registerModelLabels from modules.shared.attributeUtils import registerModelLabels
from modules.shared.timeUtils import getUtcTimestamp from modules.shared.timeUtils import getUtcTimestamp
@ -16,6 +17,7 @@ class VoiceSettings(BaseModel):
sttLanguage: str = Field(default="de-DE", description="Speech-to-Text language", json_schema_extra={"frontend_type": "select", "frontend_readonly": False, "frontend_required": True}) sttLanguage: str = Field(default="de-DE", description="Speech-to-Text language", json_schema_extra={"frontend_type": "select", "frontend_readonly": False, "frontend_required": True})
ttsLanguage: str = Field(default="de-DE", description="Text-to-Speech language", json_schema_extra={"frontend_type": "select", "frontend_readonly": False, "frontend_required": True}) ttsLanguage: str = Field(default="de-DE", description="Text-to-Speech language", json_schema_extra={"frontend_type": "select", "frontend_readonly": False, "frontend_required": True})
ttsVoice: str = Field(default="de-DE-KatjaNeural", description="Text-to-Speech voice", json_schema_extra={"frontend_type": "select", "frontend_readonly": False, "frontend_required": True}) ttsVoice: str = Field(default="de-DE-KatjaNeural", description="Text-to-Speech voice", json_schema_extra={"frontend_type": "select", "frontend_readonly": False, "frontend_required": True})
ttsVoiceMap: Dict[str, Any] = Field(default_factory=dict, description="Per-language voice mapping, e.g. {'de-DE': {'voiceName': 'de-DE-Wavenet-A'}, 'en-US': {'voiceName': 'en-US-Wavenet-C'}}", json_schema_extra={"frontend_type": "json", "frontend_readonly": False, "frontend_required": False})
translationEnabled: bool = Field(default=True, description="Whether translation is enabled", json_schema_extra={"frontend_type": "checkbox", "frontend_readonly": False, "frontend_required": False}) translationEnabled: bool = Field(default=True, description="Whether translation is enabled", json_schema_extra={"frontend_type": "checkbox", "frontend_readonly": False, "frontend_required": False})
targetLanguage: str = Field(default="en-US", description="Target language for translation", json_schema_extra={"frontend_type": "select", "frontend_readonly": False, "frontend_required": False}) targetLanguage: str = Field(default="en-US", description="Target language for translation", json_schema_extra={"frontend_type": "select", "frontend_readonly": False, "frontend_required": False})
creationDate: float = Field(default_factory=getUtcTimestamp, description="Date when the settings were created (UTC timestamp in seconds)", json_schema_extra={"frontend_type": "timestamp", "frontend_readonly": True, "frontend_required": False}) creationDate: float = Field(default_factory=getUtcTimestamp, description="Date when the settings were created (UTC timestamp in seconds)", json_schema_extra={"frontend_type": "timestamp", "frontend_readonly": True, "frontend_required": False})
@ -33,6 +35,7 @@ registerModelLabels(
"sttLanguage": {"en": "STT Language", "fr": "Langue STT"}, "sttLanguage": {"en": "STT Language", "fr": "Langue STT"},
"ttsLanguage": {"en": "TTS Language", "fr": "Langue TTS"}, "ttsLanguage": {"en": "TTS Language", "fr": "Langue TTS"},
"ttsVoice": {"en": "TTS Voice", "fr": "Voix TTS"}, "ttsVoice": {"en": "TTS Voice", "fr": "Voix TTS"},
"ttsVoiceMap": {"en": "TTS Voice Map", "fr": "Carte des voix TTS"},
"translationEnabled": {"en": "Translation Enabled", "fr": "Traduction activée"}, "translationEnabled": {"en": "Translation Enabled", "fr": "Traduction activée"},
"targetLanguage": {"en": "Target Language", "fr": "Langue cible"}, "targetLanguage": {"en": "Target Language", "fr": "Langue cible"},
"creationDate": {"en": "Creation Date", "fr": "Date de création"}, "creationDate": {"en": "Creation Date", "fr": "Date de création"},

View file

@ -718,3 +718,120 @@ async def synthesizeVoice(
if not text: if not text:
raise HTTPException(status_code=400, detail="text is required") raise HTTPException(status_code=400, detail="text is required")
return JSONResponse({"audio": None, "note": "TTS via browser Speech Synthesis API recommended"}) return JSONResponse({"audio": None, "note": "TTS via browser Speech Synthesis API recommended"})
# =========================================================================
# Voice Settings Endpoints
# =========================================================================
@router.get("/{instanceId}/settings/voice")
@limiter.limit("30/minute")
async def getVoiceSettings(
    request: Request,
    instanceId: str = Path(...),
    context: RequestContext = Depends(getRequestContext),
):
    """Return the calling user's voice settings for this instance as JSON.

    Access to the instance is validated first. When no settings record is
    found, ``getOrCreateVoiceSettings`` is asked for one; if that also yields
    nothing, an empty JSON object is returned.
    """
    _validateInstanceAccess(instanceId, context)
    store = _getDbManagement(context, instanceId)
    ownerId = str(context.user.id)

    # Fall back to get-or-create only when the plain lookup finds nothing.
    settings = store.getVoiceSettings(ownerId) or store.getOrCreateVoiceSettings(ownerId)
    if not settings:
        return JSONResponse({})
    return JSONResponse(settings.model_dump())
@router.put("/{instanceId}/settings/voice")
@limiter.limit("30/minute")
async def updateVoiceSettings(
    request: Request,
    instanceId: str = Path(...),
    body: dict = Body(...),
    context: RequestContext = Depends(getRequestContext),
):
    """Create or update the calling user's voice settings for this instance.

    Args:
        request: Incoming request (required by the rate limiter decorator).
        instanceId: Feature instance the settings belong to.
        body: Client-supplied settings fields.
        context: Authenticated request context (user, mandate).

    Returns:
        JSONResponse wrapping whatever the DB layer returns for the created
        or updated record.

    Ownership and identity fields are never taken from the request body:
    they are stripped before both the create and the update path, so a
    client cannot write settings under another user, mandate or instance.
    """
    _validateInstanceAccess(instanceId, context)
    dbMgmt = _getDbManagement(context, instanceId)
    userId = str(context.user.id)

    # Server-controlled fields; anything the client sends for these is ignored.
    protectedKeys = ("id", "userId", "mandateId", "featureInstanceId", "creationDate")
    safeData = {k: v for k, v in body.items() if k not in protectedKeys}

    vs = dbMgmt.getVoiceSettings(userId)
    if not vs:
        # Identity fields are applied last so they always come from the
        # authenticated context rather than from the payload.
        createData = {
            **safeData,
            "userId": userId,
            "mandateId": str(context.mandateId) if context.mandateId else "",
            "featureInstanceId": instanceId,
        }
        created = dbMgmt.createVoiceSettings(createData)
        return JSONResponse(created)

    updated = dbMgmt.updateVoiceSettings(userId, safeData)
    return JSONResponse(updated)
@router.get("/{instanceId}/voice/languages")
@limiter.limit("30/minute")
async def getVoiceLanguages(
    request: Request,
    instanceId: str = Path(...),
    context: RequestContext = Depends(getRequestContext),
):
    """List the TTS languages offered by the voice interface.

    The voice interface may answer with either a dict carrying a
    ``languages`` key or with the list itself; both are normalised to
    ``{"languages": [...]}``.
    """
    mandateId = _validateInstanceAccess(instanceId, context)
    from modules.interfaces.interfaceVoiceObjects import getVoiceInterface

    provider = getVoiceInterface(context.user, mandateId)
    payload = await provider.getAvailableLanguages()
    if isinstance(payload, dict):
        payload = payload.get("languages", [])
    return JSONResponse({"languages": payload})
@router.get("/{instanceId}/voice/voices")
@limiter.limit("30/minute")
async def getVoiceVoices(
    request: Request,
    instanceId: str = Path(...),
    language: str = Query("de-DE"),
    context: RequestContext = Depends(getRequestContext),
):
    """List the TTS voices the voice interface offers for ``language``.

    The voice interface may answer with either a dict carrying a ``voices``
    key or with the list itself; both are normalised to ``{"voices": [...]}``.
    """
    mandateId = _validateInstanceAccess(instanceId, context)
    from modules.interfaces.interfaceVoiceObjects import getVoiceInterface

    provider = getVoiceInterface(context.user, mandateId)
    payload = await provider.getAvailableVoices(language)
    if isinstance(payload, dict):
        payload = payload.get("voices", [])
    return JSONResponse({"voices": payload})
@router.post("/{instanceId}/voice/test")
@limiter.limit("10/minute")
async def testVoice(
    request: Request,
    instanceId: str = Path(...),
    body: dict = Body(...),
    context: RequestContext = Depends(getRequestContext),
):
    """Synthesize a short sample with a specific voice and return it as base64.

    Args:
        request: Incoming request (required by the rate limiter decorator).
        instanceId: Feature instance used for access validation.
        body: Optional ``text``, ``language`` and ``voiceId`` fields.
        context: Authenticated request context.

    Returns:
        JSONResponse with ``success`` plus either ``audio``/``format``/``text``
        or an ``error`` message.

    Raises:
        HTTPException: 500 when the TTS call itself raises.
    """
    import base64

    mandateId = _validateInstanceAccess(instanceId, context)
    text = body.get("text", "Hallo, das ist ein Stimmtest.")
    language = body.get("language", "de-DE")
    voiceId = body.get("voiceId")

    from modules.interfaces.interfaceVoiceObjects import getVoiceInterface
    voiceInterface = getVoiceInterface(context.user, mandateId)
    try:
        result = await voiceInterface.textToSpeech(text=text, languageCode=language, voiceName=voiceId)
        if result and isinstance(result, dict):
            audioContent = result.get("audioContent")
            if audioContent:
                if isinstance(audioContent, (bytes, bytearray, memoryview)):
                    audioB64 = base64.b64encode(bytes(audioContent)).decode("ascii")
                else:
                    # A string payload is treated as already base64-encoded,
                    # matching the textToSpeech agent tool; re-encoding it
                    # would hand the client double-encoded, unplayable audio.
                    audioB64 = str(audioContent)
                return JSONResponse({
                    "success": True,
                    "audio": audioB64,
                    "format": result.get("audioFormat", "mp3"),
                    "text": text,
                })
            # Prefer the provider's own error text when it supplies one.
            return JSONResponse({"success": False, "error": result.get("error") or "TTS returned no audio"})
        return JSONResponse({"success": False, "error": "TTS returned no audio"})
    except Exception as e:
        logger.error(f"Voice test failed: {e}")
        raise HTTPException(status_code=500, detail=f"TTS test failed: {str(e)}") from e

View file

@ -87,9 +87,10 @@ CLIENT_SECRET = APP_CONFIG.get("Service_GOOGLE_CLIENT_SECRET")
REDIRECT_URI = APP_CONFIG.get("Service_GOOGLE_REDIRECT_URI") REDIRECT_URI = APP_CONFIG.get("Service_GOOGLE_REDIRECT_URI")
SCOPES = [ SCOPES = [
"https://www.googleapis.com/auth/gmail.readonly", "https://www.googleapis.com/auth/gmail.readonly",
"https://www.googleapis.com/auth/drive.readonly",
"https://www.googleapis.com/auth/userinfo.profile", "https://www.googleapis.com/auth/userinfo.profile",
"https://www.googleapis.com/auth/userinfo.email", "https://www.googleapis.com/auth/userinfo.email",
"openid" "openid",
] ]
@router.get("/config") @router.get("/config")

View file

@ -59,6 +59,7 @@ SCOPES = [
"Mail.Send", # Send mail "Mail.Send", # Send mail
"Files.ReadWrite.All", # Read and write files (SharePoint/OneDrive) "Files.ReadWrite.All", # Read and write files (SharePoint/OneDrive)
"Sites.ReadWrite.All", # Read and write SharePoint sites "Sites.ReadWrite.All", # Read and write SharePoint sites
"Team.ReadBasic.All", # List joined teams and channels
# Teams Bot: Meeting and chat access (requires admin consent) # Teams Bot: Meeting and chat access (requires admin consent)
"OnlineMeetings.Read", # Read user's Teams meeting details (delegated scope) "OnlineMeetings.Read", # Read user's Teams meeting details (delegated scope)
"Chat.ReadWrite", # Read and write Teams chat messages "Chat.ReadWrite", # Read and write Teams chat messages

View file

@ -208,7 +208,8 @@ async def runAgentLoop(
results = await _executeToolCalls(toolCalls, toolRegistry, { results = await _executeToolCalls(toolCalls, toolRegistry, {
"workflowId": workflowId, "workflowId": workflowId,
"userId": userId, "userId": userId,
"featureInstanceId": featureInstanceId "featureInstanceId": featureInstanceId,
"mandateId": mandateId,
}) })
state.totalToolCalls += len(results) state.totalToolCalls += len(results)

View file

@ -209,7 +209,8 @@ class AgentService:
"## Attached Files\n" "## Attached Files\n"
"These files have been uploaded and processed through the extraction pipeline.\n" "These files have been uploaded and processed through the extraction pipeline.\n"
"Use `readFile(fileId)` to read text content, `readContentObjects(fileId)` for structured access, " "Use `readFile(fileId)` to read text content, `readContentObjects(fileId)` for structured access, "
"or `describeImage(fileId)` for image analysis.\n\n" "or `describeImage(fileId)` for image analysis.\n"
"When generating documents with `renderDocument`, embed images using `![alt text](file:fileId)` in the markdown content.\n\n"
) )
header += "\n\n".join(fileDescriptions) header += "\n\n".join(fileDescriptions)
return f"{header}\n\n---\n\nUser request: {prompt}" return f"{header}\n\n---\n\nUser request: {prompt}"
@ -1226,68 +1227,415 @@ def _registerCoreTools(registry: ToolRegistry, services):
readOnly=True, readOnly=True,
) )
# ---- Document generation tool ---- # ---- Document rendering tool ----
async def _generateDocument(args: Dict[str, Any], context: Dict[str, Any]): def _markdownToDocumentJson(markdown: str, title: str, language: str = "de") -> Dict[str, Any]:
"""Generate a document in any format using the existing GenerationService + RendererRegistry.""" """Convert markdown content to the standard document JSON format expected by renderers."""
prompt = args.get("prompt", "") import re as _re
sections = []
order = 0
lines = markdown.split("\n")
i = 0
def _nextId():
nonlocal order
order += 1
return f"s_{order}"
while i < len(lines):
line = lines[i]
# --- Headings ---
headingMatch = _re.match(r'^(#{1,6})\s+(.+)', line)
if headingMatch:
level = len(headingMatch.group(1))
text = headingMatch.group(2).strip()
sections.append({
"id": _nextId(), "content_type": "heading", "order": order,
"elements": [{"content": {"text": text, "level": level}}],
})
i += 1
continue
# --- Fenced code blocks ---
codeMatch = _re.match(r'^```(\w*)', line)
if codeMatch:
lang = codeMatch.group(1) or "text"
codeLines = []
i += 1
while i < len(lines) and not lines[i].startswith("```"):
codeLines.append(lines[i])
i += 1
i += 1
sections.append({
"id": _nextId(), "content_type": "code_block", "order": order,
"elements": [{"content": {"code": "\n".join(codeLines), "language": lang}}],
})
continue
# --- Tables ---
tableMatch = _re.match(r'^\|(.+)\|$', line)
if tableMatch and (i + 1) < len(lines) and _re.match(r'^\|[\s\-:|]+\|$', lines[i + 1]):
headerCells = [c.strip() for c in tableMatch.group(1).split("|")]
i += 2
rows = []
while i < len(lines) and _re.match(r'^\|(.+)\|$', lines[i]):
rowCells = [c.strip() for c in lines[i][1:-1].split("|")]
rows.append(rowCells)
i += 1
sections.append({
"id": _nextId(), "content_type": "table", "order": order,
"elements": [{"content": {"headers": headerCells, "rows": rows}}],
})
continue
# --- Bullet / numbered lists ---
listMatch = _re.match(r'^(\s*)([-*+]|\d+[.)]) (.+)', line)
if listMatch:
isNumbered = bool(_re.match(r'\d+[.)]', listMatch.group(2)))
items = []
while i < len(lines) and _re.match(r'^(\s*)([-*+]|\d+[.)]) (.+)', lines[i]):
m = _re.match(r'^(\s*)([-*+]|\d+[.)]) (.+)', lines[i])
items.append({"text": m.group(3).strip()})
i += 1
sections.append({
"id": _nextId(), "content_type": "bullet_list", "order": order,
"elements": [{"content": {"items": items, "list_type": "numbered" if isNumbered else "bullet"}}],
})
continue
# --- Empty lines (skip) ---
if not line.strip():
i += 1
continue
# --- Images: ![alt](file:fileId) or ![alt](url) ---
imgMatch = _re.match(r'^!\[([^\]]*)\]\(([^)]+)\)', line)
if imgMatch:
altText = imgMatch.group(1).strip() or "Image"
src = imgMatch.group(2).strip()
fileId = ""
if src.startswith("file:"):
fileId = src[5:]
sections.append({
"id": _nextId(), "content_type": "image", "order": order,
"elements": [{
"content": {
"altText": altText,
"base64Data": "",
"_fileRef": fileId,
"_srcUrl": src if not fileId else "",
}
}],
})
i += 1
continue
# --- Paragraph (collect consecutive non-empty lines) ---
paraLines = []
while i < len(lines) and lines[i].strip() and not _re.match(r'^(#{1,6}\s|```|\|.+\||!\[|(\s*)([-*+]|\d+[.)]) )', lines[i]):
paraLines.append(lines[i])
i += 1
if paraLines:
sections.append({
"id": _nextId(), "content_type": "paragraph", "order": order,
"elements": [{"content": {"text": " ".join(paraLines)}}],
})
continue
i += 1
if not sections:
sections.append({
"id": _nextId(), "content_type": "paragraph", "order": order,
"elements": [{"content": {"text": markdown.strip() or "(empty)"}}],
})
return {
"metadata": {
"split_strategy": "single_document",
"source_documents": [],
"extraction_method": "agent_rendering",
"title": title,
"language": language,
},
"documents": [{
"id": "doc_1",
"title": title,
"sections": sections,
}],
}
async def _renderDocument(args: Dict[str, Any], context: Dict[str, Any]):
"""Render agent-produced markdown content into any document format via the RendererRegistry."""
import re as _re
content = args.get("content", "")
outputFormat = args.get("outputFormat", "pdf") outputFormat = args.get("outputFormat", "pdf")
title = args.get("title", "Generated Document") title = args.get("title", "Document")
language = args.get("language", "de")
if not prompt: if not content:
return ToolResult(toolCallId="", toolName="generateDocument", success=False, error="prompt is required") return ToolResult(toolCallId="", toolName="renderDocument", success=False, error="content is required")
try: try:
structuredContent = _markdownToDocumentJson(content, title, language)
# Resolve image file references (file:fileId) to base64 data from Knowledge Store
knowledgeService = None
try:
knowledgeService = services.getService("knowledge")
except Exception:
pass
resolvedImages = 0
for doc in structuredContent.get("documents", []):
for section in doc.get("sections", []):
if section.get("content_type") != "image":
continue
for element in section.get("elements", []):
contentObj = element.get("content", {})
fileRef = contentObj.get("_fileRef", "")
if not fileRef or contentObj.get("base64Data"):
continue
if knowledgeService:
chunks = knowledgeService._knowledgeDb.getContentChunks(fileRef)
imageChunks = [c for c in (chunks or []) if c.get("contentType") == "image"]
if imageChunks:
contentObj["base64Data"] = imageChunks[0].get("data", "")
chunkMime = imageChunks[0].get("contextRef", {}).get("mimeType", "image/png")
contentObj["mimeType"] = chunkMime
resolvedImages += 1
if not contentObj.get("base64Data"):
try:
rawBytes = services.chat.getFileData(fileRef)
if rawBytes:
import base64 as _b64
contentObj["base64Data"] = _b64.b64encode(rawBytes).decode("ascii")
contentObj["mimeType"] = "image/png"
resolvedImages += 1
except Exception:
pass
contentObj.pop("_fileRef", None)
contentObj.pop("_srcUrl", None)
sectionCount = len(structuredContent.get("documents", [{}])[0].get("sections", []))
logger.info(f"renderDocument: parsed {sectionCount} sections from markdown ({len(content)} chars), resolved {resolvedImages} image(s), format={outputFormat}")
generationService = services.getService("generation") generationService = services.getService("generation")
aiService = services.ai
structuredContent = await generationService.generateDocumentWithTwoPhases(userPrompt=prompt)
documents = await generationService.renderReport( documents = await generationService.renderReport(
extractedContent=structuredContent, extractedContent=structuredContent,
outputFormat=outputFormat, outputFormat=outputFormat,
language="de", language=language,
title=title, title=title,
userPrompt=prompt, userPrompt=content,
aiService=aiService,
) )
if not documents: if not documents:
return ToolResult(toolCallId="", toolName="generateDocument", success=False, error="Rendering produced no documents") return ToolResult(toolCallId="", toolName="renderDocument", success=False, error="Rendering produced no output")
savedFiles = [] savedFiles = []
sideEvents = []
chatService = services.chat chatService = services.chat
for doc in documents:
docData = doc.data if hasattr(doc, "data") else doc.get("data", b"")
docName = doc.fileName if hasattr(doc, "fileName") else doc.get("fileName", f"{title}.{outputFormat}")
docMime = doc.mimeType if hasattr(doc, "mimeType") else doc.get("mimeType", "application/octet-stream")
fileItem = chatService.interfaceDbComponent.saveGeneratedFile( sanitizedTitle = _re.sub(r'[^a-zA-Z0-9._-]', '_', title).strip('_') or "document"
docData, docName, docMime,
) if hasattr(chatService.interfaceDbComponent, "saveGeneratedFile") else None for doc in documents:
docData = doc.documentData if hasattr(doc, "documentData") else b""
docName = doc.filename if hasattr(doc, "filename") else f"{sanitizedTitle}.{outputFormat}"
docMime = doc.mimeType if hasattr(doc, "mimeType") else "application/octet-stream"
if not docName.lower().endswith(f".{outputFormat}"):
docName = f"{sanitizedTitle}.{outputFormat}"
fileItem = None
if hasattr(chatService.interfaceDbComponent, "saveGeneratedFile"):
fileItem = chatService.interfaceDbComponent.saveGeneratedFile(docData, docName, docMime)
else:
fileItem, _ = chatService.interfaceDbComponent.saveUploadedFile(docData, docName)
if fileItem: if fileItem:
savedFiles.append(f"- {docName} (id: {fileItem.id if hasattr(fileItem, 'id') else fileItem.get('id', '?')})") fid = fileItem.id if hasattr(fileItem, "id") else fileItem.get("id", "?")
else: savedFiles.append(f"- {docName} (id: {fid})")
savedFiles.append(f"- {docName} (generated, not saved)") sideEvents.append({
"type": "fileCreated",
"data": {
"fileId": fid,
"fileName": docName,
"mimeType": docMime,
"fileSize": len(docData),
},
})
result = f"Generated {len(documents)} document(s):\n" + "\n".join(savedFiles) result = f"Rendered {len(documents)} document(s):\n" + "\n".join(savedFiles)
return ToolResult(toolCallId="", toolName="generateDocument", success=True, data=result) return ToolResult(toolCallId="", toolName="renderDocument", success=True, data=result, sideEvents=sideEvents)
except Exception as e: except Exception as e:
return ToolResult(toolCallId="", toolName="generateDocument", success=False, error=str(e)) logger.error(f"renderDocument failed: {e}")
return ToolResult(toolCallId="", toolName="renderDocument", success=False, error=str(e))
registry.register( registry.register(
"generateDocument", _generateDocument, "renderDocument", _renderDocument,
description="Generate a document in any format (PDF, DOCX, XLSX, PPTX, CSV, HTML, MD, JSON, TXT).", description=(
"Render markdown content into a document file (PDF, DOCX, XLSX, PPTX, CSV, HTML, MD, JSON, TXT). "
"You write the full document content as markdown, then this tool converts and renders it. "
"To embed images from uploaded files, use markdown image syntax with the file ID: ![alt text](file:fileId). "
"The images will be resolved from the Knowledge Store and embedded in the output document."
),
parameters={ parameters={
"type": "object", "type": "object",
"properties": { "properties": {
"prompt": {"type": "string", "description": "What the document should contain and how it should look"}, "content": {"type": "string", "description": "Full document content as markdown (headings, tables, lists, code blocks, paragraphs, images via ![alt](file:fileId))"},
"outputFormat": {"type": "string", "description": "Target format: pdf, docx, xlsx, pptx, csv, html, md, json, txt", "default": "pdf"}, "outputFormat": {"type": "string", "description": "Target format: pdf, docx, xlsx, pptx, csv, html, md, json, txt", "default": "pdf"},
"title": {"type": "string", "description": "Document title", "default": "Generated Document"}, "title": {"type": "string", "description": "Document title", "default": "Document"},
"language": {"type": "string", "description": "Document language (ISO 639-1)", "default": "de"},
}, },
"required": ["prompt"], "required": ["content"],
},
readOnly=False,
)
# ── textToSpeech tool ──────────────────────────────────────────────
def _stripMarkdownForTts(text: str) -> str:
"""Strip markdown formatting so TTS reads clean speech text."""
import re as _re
t = text
t = _re.sub(r'\*\*(.+?)\*\*', r'\1', t)
t = _re.sub(r'\*(.+?)\*', r'\1', t)
t = _re.sub(r'__(.+?)__', r'\1', t)
t = _re.sub(r'_(.+?)_', r'\1', t)
t = _re.sub(r'`[^`]+`', lambda m: m.group(0)[1:-1], t)
t = _re.sub(r'^#{1,6}\s*', '', t, flags=_re.MULTILINE)
t = _re.sub(r'^\s*[-*+]\s+', '', t, flags=_re.MULTILINE)
t = _re.sub(r'^\s*\d+\.\s+', '', t, flags=_re.MULTILINE)
t = _re.sub(r'\[(.+?)\]\(.+?\)', r'\1', t)
t = _re.sub(r'!\[.*?\]\(.*?\)', '', t)
t = _re.sub(r'\n{3,}', '\n\n', t)
return t.strip()
async def _textToSpeech(args: Dict[str, Any], context: Dict[str, Any]):
    """Convert text to speech via the voice interface and deliver audio as a side event.

    Args:
        args: Tool arguments: ``text`` (required), ``language`` (BCP-47 code,
            short ISO code, or ``"auto"``; missing/empty means ``"auto"``)
            and optional ``voiceName``.
        context: Tool call context; ``mandateId``, ``featureInstanceId`` and
            ``userId`` are read from it.

    Returns:
        A ``ToolResult``. On success it carries a ``voiceResponse`` side event
        with base64 audio, format, language and character count. On failure
        ``success`` is False and ``error`` describes the problem; this
        function does not raise.
    """
    import base64 as _b64

    text = args.get("text", "")
    # A missing, None or empty language means "detect it".
    language = args.get("language") or "auto"
    voiceName = args.get("voiceName")

    if not text:
        return ToolResult(toolCallId="", toolName="textToSpeech", success=False, error="text is required")

    cleanText = _stripMarkdownForTts(text)
    if not cleanText:
        return ToolResult(toolCallId="", toolName="textToSpeech", success=False, error="text is empty after stripping markdown")

    try:
        from modules.interfaces.interfaceVoiceObjects import getVoiceInterface
        mandateId = context.get("mandateId", "")
        voiceInterface = getVoiceInterface(currentUser=None, mandateId=mandateId)

        _ISO_TO_BCP47 = {
            "de": "de-DE", "en": "en-US", "fr": "fr-FR", "it": "it-IT",
            "es": "es-ES", "pt": "pt-BR", "nl": "nl-NL", "pl": "pl-PL",
            "ru": "ru-RU", "ja": "ja-JP", "zh": "zh-CN", "ko": "ko-KR",
            "ar": "ar-XA", "hi": "hi-IN", "tr": "tr-TR", "sv": "sv-SE",
        }

        if language == "auto":
            try:
                # Only a leading snippet is sent for detection.
                snippet = cleanText[:500]
                detectResult = await voiceInterface.detectLanguage(snippet)
                if detectResult and detectResult.get("success"):
                    detected = detectResult.get("language", "de")
                    language = _ISO_TO_BCP47.get(detected, detected)
                    if "-" not in language:
                        language = _ISO_TO_BCP47.get(language, f"{language}-{language.upper()}")
                    logger.info(f"textToSpeech: auto-detected language '{detected}' -> '{language}'")
                else:
                    language = "de-DE"
            except Exception as detectErr:
                logger.warning(f"textToSpeech: language detection failed: {detectErr}, defaulting to de-DE")
                language = "de-DE"
        elif "-" not in language:
            # Explicit short codes ("en") get the same expansion as detected
            # ones, so the per-language voice map lookup below can match.
            shortCode = language.lower()
            language = _ISO_TO_BCP47.get(shortCode, f"{shortCode}-{shortCode.upper()}")

        if not voiceName:
            # Best effort: a failure to load preferences must not block TTS.
            try:
                featureInstanceId = context.get("featureInstanceId", "")
                userId = context.get("userId", "")
                if featureInstanceId and userId:
                    dbMgmt = services.chat.interfaceDbApp if hasattr(services.chat, "interfaceDbApp") else None
                    if dbMgmt and hasattr(dbMgmt, "getVoiceSettings"):
                        vs = dbMgmt.getVoiceSettings(userId)
                        if vs:
                            voiceMap = {}
                            if hasattr(vs, "ttsVoiceMap") and vs.ttsVoiceMap:
                                voiceMap = vs.ttsVoiceMap if isinstance(vs.ttsVoiceMap, dict) else {}
                            if language in voiceMap:
                                # Entries may be {"voiceName": ...} or a bare voice name.
                                voiceName = voiceMap[language].get("voiceName") if isinstance(voiceMap[language], dict) else voiceMap[language]
                                logger.info(f"textToSpeech: using configured voice '{voiceName}' for {language}")
                            elif hasattr(vs, "ttsVoice") and vs.ttsVoice and hasattr(vs, "ttsLanguage") and vs.ttsLanguage == language:
                                voiceName = vs.ttsVoice
            except Exception as prefErr:
                logger.debug(f"textToSpeech: could not load voice preferences: {prefErr}")

        ttsResult = await voiceInterface.textToSpeech(
            text=cleanText,
            languageCode=language,
            voiceName=voiceName,
        )

        if not ttsResult or not ttsResult.get("success"):
            errMsg = ttsResult.get("error", "TTS call failed") if ttsResult else "TTS returned None"
            return ToolResult(toolCallId="", toolName="textToSpeech", success=False, error=errMsg)

        audioContent = ttsResult.get("audioContent", "")
        if not audioContent:
            return ToolResult(toolCallId="", toolName="textToSpeech", success=False, error="TTS returned no audio")

        if isinstance(audioContent, (bytes, bytearray, memoryview)):
            # Any bytes-like payload is raw audio; str() on a bytearray or
            # memoryview would emit its repr instead of the audio data.
            audioB64 = _b64.b64encode(bytes(audioContent)).decode("ascii")
        elif isinstance(audioContent, str):
            # A string payload is passed through as already base64-encoded.
            audioB64 = audioContent
        else:
            audioB64 = str(audioContent)

        audioFormat = ttsResult.get("audioFormat", "mp3")
        charCount = len(cleanText)
        usedVoice = voiceName or "default"

        logger.info(f"textToSpeech: generated {audioFormat} audio for {charCount} chars, language={language}, voice={usedVoice}")

        return ToolResult(
            toolCallId="", toolName="textToSpeech", success=True,
            data=f"Audio generated ({charCount} characters, language={language}, voice={usedVoice}). Playing in chat.",
            sideEvents=[{
                "type": "voiceResponse",
                "data": {
                    "audio": audioB64,
                    "format": audioFormat,
                    "language": language,
                    "charCount": charCount,
                },
            }],
        )
    except ImportError:
        return ToolResult(toolCallId="", toolName="textToSpeech", success=False,
                          error="Voice interface not available (missing dependency)")
    except Exception as e:
        logger.error(f"textToSpeech failed: {e}")
        return ToolResult(toolCallId="", toolName="textToSpeech", success=False, error=str(e))
registry.register(
"textToSpeech", _textToSpeech,
description=(
"Convert text to speech audio. The audio is played directly in the chat. "
"Use this when the user asks you to read something aloud, narrate, or speak. "
"Language is auto-detected from the text content. You do NOT need to specify a language."
),
parameters={
"type": "object",
"properties": {
"text": {"type": "string", "description": "The text to convert to speech. Can include markdown (will be stripped automatically)."},
"language": {"type": "string", "description": "BCP-47 language code (e.g. de-DE, en-US) or 'auto' for automatic detection", "default": "auto"},
"voiceName": {"type": "string", "description": "Optional specific voice name. If omitted, uses the configured voice for the detected language."},
},
"required": ["text"],
}, },
readOnly=False, readOnly=False,
) )