From 3d49bd9d032a5ee8297fc3c4441d8d8214a5c060 Mon Sep 17 00:00:00 2001
From: ValueOn AG
Date: Mon, 16 Mar 2026 00:47:42 +0100
Subject: [PATCH] enhanced stt/tts functions
---
.../connectors/providerMsft/connectorMsft.py | 6 +-
modules/datamodels/datamodelVoice.py | 3 +
.../workspace/routeFeatureWorkspace.py | 117 +++++
modules/routes/routeSecurityGoogle.py | 3 +-
modules/routes/routeSecurityMsft.py | 1 +
.../services/serviceAgent/agentLoop.py | 3 +-
.../services/serviceAgent/mainServiceAgent.py | 416 ++++++++++++++++--
7 files changed, 510 insertions(+), 39 deletions(-)
diff --git a/modules/connectors/providerMsft/connectorMsft.py b/modules/connectors/providerMsft/connectorMsft.py
index 105ae8fc..26aa3790 100644
--- a/modules/connectors/providerMsft/connectorMsft.py
+++ b/modules/connectors/providerMsft/connectorMsft.py
@@ -229,15 +229,15 @@ class OutlookAdapter(_GraphApiMixin, ServiceAdapter):
return [
ExternalEntry(
name=f.get("displayName", ""),
- path=f"/{f.get('displayName', '')}",
+ path=f"/{f.get('id', '')}",
isFolder=True,
metadata={"id": f.get("id"), "totalItemCount": f.get("totalItemCount")},
)
for f in result.get("value", [])
]
- folderName = path.strip("/")
- endpoint = f"me/mailFolders/{folderName}/messages?$top=25&$orderby=receivedDateTime desc"
+ folderId = path.strip("/")
+ endpoint = f"me/mailFolders/{folderId}/messages?$top=25&$orderby=receivedDateTime desc"
result = await self._graphGet(endpoint)
if "error" in result:
return []
diff --git a/modules/datamodels/datamodelVoice.py b/modules/datamodels/datamodelVoice.py
index 86f4bb1d..2223a3e6 100644
--- a/modules/datamodels/datamodelVoice.py
+++ b/modules/datamodels/datamodelVoice.py
@@ -2,6 +2,7 @@
# All rights reserved.
"""Voice settings datamodel."""
+from typing import Dict, Any, Optional
from pydantic import BaseModel, Field
from modules.shared.attributeUtils import registerModelLabels
from modules.shared.timeUtils import getUtcTimestamp
@@ -16,6 +17,7 @@ class VoiceSettings(BaseModel):
sttLanguage: str = Field(default="de-DE", description="Speech-to-Text language", json_schema_extra={"frontend_type": "select", "frontend_readonly": False, "frontend_required": True})
ttsLanguage: str = Field(default="de-DE", description="Text-to-Speech language", json_schema_extra={"frontend_type": "select", "frontend_readonly": False, "frontend_required": True})
ttsVoice: str = Field(default="de-DE-KatjaNeural", description="Text-to-Speech voice", json_schema_extra={"frontend_type": "select", "frontend_readonly": False, "frontend_required": True})
+ ttsVoiceMap: Dict[str, Any] = Field(default_factory=dict, description="Per-language voice mapping, e.g. {'de-DE': {'voiceName': 'de-DE-Wavenet-A'}, 'en-US': {'voiceName': 'en-US-Wavenet-C'}}", json_schema_extra={"frontend_type": "json", "frontend_readonly": False, "frontend_required": False})
translationEnabled: bool = Field(default=True, description="Whether translation is enabled", json_schema_extra={"frontend_type": "checkbox", "frontend_readonly": False, "frontend_required": False})
targetLanguage: str = Field(default="en-US", description="Target language for translation", json_schema_extra={"frontend_type": "select", "frontend_readonly": False, "frontend_required": False})
creationDate: float = Field(default_factory=getUtcTimestamp, description="Date when the settings were created (UTC timestamp in seconds)", json_schema_extra={"frontend_type": "timestamp", "frontend_readonly": True, "frontend_required": False})
@@ -33,6 +35,7 @@ registerModelLabels(
"sttLanguage": {"en": "STT Language", "fr": "Langue STT"},
"ttsLanguage": {"en": "TTS Language", "fr": "Langue TTS"},
"ttsVoice": {"en": "TTS Voice", "fr": "Voix TTS"},
+ "ttsVoiceMap": {"en": "TTS Voice Map", "fr": "Carte des voix TTS"},
"translationEnabled": {"en": "Translation Enabled", "fr": "Traduction activée"},
"targetLanguage": {"en": "Target Language", "fr": "Langue cible"},
"creationDate": {"en": "Creation Date", "fr": "Date de création"},
diff --git a/modules/features/workspace/routeFeatureWorkspace.py b/modules/features/workspace/routeFeatureWorkspace.py
index 2188f8d5..5b1dc679 100644
--- a/modules/features/workspace/routeFeatureWorkspace.py
+++ b/modules/features/workspace/routeFeatureWorkspace.py
@@ -718,3 +718,120 @@ async def synthesizeVoice(
if not text:
raise HTTPException(status_code=400, detail="text is required")
return JSONResponse({"audio": None, "note": "TTS via browser Speech Synthesis API recommended"})
+
+
+# =========================================================================
+# Voice Settings Endpoints
+# =========================================================================
+
+@router.get("/{instanceId}/settings/voice")
+@limiter.limit("30/minute")
+async def getVoiceSettings(
+ request: Request,
+ instanceId: str = Path(...),
+ context: RequestContext = Depends(getRequestContext),
+):
+ """Load voice settings for the current user and instance."""
+ _validateInstanceAccess(instanceId, context)
+ dbMgmt = _getDbManagement(context, instanceId)
+ userId = str(context.user.id)
+ vs = dbMgmt.getVoiceSettings(userId)
+ if not vs:
+ vs = dbMgmt.getOrCreateVoiceSettings(userId)
+ result = vs.model_dump() if vs else {}
+ return JSONResponse(result)
+
+
+@router.put("/{instanceId}/settings/voice")
+@limiter.limit("30/minute")
+async def updateVoiceSettings(
+ request: Request,
+ instanceId: str = Path(...),
+ body: dict = Body(...),
+ context: RequestContext = Depends(getRequestContext),
+):
+ """Update voice settings for the current user and instance."""
+ _validateInstanceAccess(instanceId, context)
+ dbMgmt = _getDbManagement(context, instanceId)
+ userId = str(context.user.id)
+
+ vs = dbMgmt.getVoiceSettings(userId)
+ if not vs:
+ createData = {
+ "userId": userId,
+ "mandateId": str(context.mandateId) if context.mandateId else "",
+ "featureInstanceId": instanceId,
+ }
+ createData.update(body)
+ created = dbMgmt.createVoiceSettings(createData)
+ return JSONResponse(created)
+
+ updateData = {k: v for k, v in body.items() if k not in ("id", "userId", "mandateId", "featureInstanceId", "creationDate")}
+ updated = dbMgmt.updateVoiceSettings(userId, updateData)
+ return JSONResponse(updated)
+
+
+@router.get("/{instanceId}/voice/languages")
+@limiter.limit("30/minute")
+async def getVoiceLanguages(
+ request: Request,
+ instanceId: str = Path(...),
+ context: RequestContext = Depends(getRequestContext),
+):
+ """Return available TTS languages."""
+ mandateId = _validateInstanceAccess(instanceId, context)
+ from modules.interfaces.interfaceVoiceObjects import getVoiceInterface
+ voiceInterface = getVoiceInterface(context.user, mandateId)
+ languagesResult = await voiceInterface.getAvailableLanguages()
+ languageList = languagesResult.get("languages", []) if isinstance(languagesResult, dict) else languagesResult
+ return JSONResponse({"languages": languageList})
+
+
+@router.get("/{instanceId}/voice/voices")
+@limiter.limit("30/minute")
+async def getVoiceVoices(
+ request: Request,
+ instanceId: str = Path(...),
+ language: str = Query("de-DE"),
+ context: RequestContext = Depends(getRequestContext),
+):
+ """Return available TTS voices for a given language."""
+ mandateId = _validateInstanceAccess(instanceId, context)
+ from modules.interfaces.interfaceVoiceObjects import getVoiceInterface
+ voiceInterface = getVoiceInterface(context.user, mandateId)
+ voicesResult = await voiceInterface.getAvailableVoices(language)
+ voiceList = voicesResult.get("voices", []) if isinstance(voicesResult, dict) else voicesResult
+ return JSONResponse({"voices": voiceList})
+
+
+@router.post("/{instanceId}/voice/test")
+@limiter.limit("10/minute")
+async def testVoice(
+ request: Request,
+ instanceId: str = Path(...),
+ body: dict = Body(...),
+ context: RequestContext = Depends(getRequestContext),
+):
+ """Test a specific voice with a sample text."""
+ import base64
+ mandateId = _validateInstanceAccess(instanceId, context)
+ text = body.get("text", "Hallo, das ist ein Stimmtest.")
+ language = body.get("language", "de-DE")
+ voiceId = body.get("voiceId")
+
+ from modules.interfaces.interfaceVoiceObjects import getVoiceInterface
+ voiceInterface = getVoiceInterface(context.user, mandateId)
+
+ try:
+ result = await voiceInterface.textToSpeech(text=text, languageCode=language, voiceName=voiceId)
+ if result and isinstance(result, dict):
+ audioContent = result.get("audioContent")
+ if audioContent:
+ audioB64 = base64.b64encode(
+ audioContent if isinstance(audioContent, bytes) else audioContent.encode()
+ ).decode()
+ return JSONResponse({"success": True, "audio": audioB64, "format": "mp3", "text": text})
+ return JSONResponse({"success": False, "error": "TTS returned no audio"})
+ except Exception as e:
+ logger.error(f"Voice test failed: {e}")
+ raise HTTPException(status_code=500, detail=f"TTS test failed: {str(e)}")
diff --git a/modules/routes/routeSecurityGoogle.py b/modules/routes/routeSecurityGoogle.py
index 82e7cccd..ad0dcd52 100644
--- a/modules/routes/routeSecurityGoogle.py
+++ b/modules/routes/routeSecurityGoogle.py
@@ -87,9 +87,10 @@ CLIENT_SECRET = APP_CONFIG.get("Service_GOOGLE_CLIENT_SECRET")
REDIRECT_URI = APP_CONFIG.get("Service_GOOGLE_REDIRECT_URI")
SCOPES = [
"https://www.googleapis.com/auth/gmail.readonly",
+ "https://www.googleapis.com/auth/drive.readonly",
"https://www.googleapis.com/auth/userinfo.profile",
"https://www.googleapis.com/auth/userinfo.email",
- "openid"
+ "openid",
]
@router.get("/config")
diff --git a/modules/routes/routeSecurityMsft.py b/modules/routes/routeSecurityMsft.py
index 11d35915..97604e67 100644
--- a/modules/routes/routeSecurityMsft.py
+++ b/modules/routes/routeSecurityMsft.py
@@ -59,6 +59,7 @@ SCOPES = [
"Mail.Send", # Send mail
"Files.ReadWrite.All", # Read and write files (SharePoint/OneDrive)
"Sites.ReadWrite.All", # Read and write SharePoint sites
+ "Team.ReadBasic.All", # List joined teams and channels
# Teams Bot: Meeting and chat access (requires admin consent)
"OnlineMeetings.Read", # Read user's Teams meeting details (delegated scope)
"Chat.ReadWrite", # Read and write Teams chat messages
diff --git a/modules/serviceCenter/services/serviceAgent/agentLoop.py b/modules/serviceCenter/services/serviceAgent/agentLoop.py
index 02d072be..1636db07 100644
--- a/modules/serviceCenter/services/serviceAgent/agentLoop.py
+++ b/modules/serviceCenter/services/serviceAgent/agentLoop.py
@@ -208,7 +208,8 @@ async def runAgentLoop(
results = await _executeToolCalls(toolCalls, toolRegistry, {
"workflowId": workflowId,
"userId": userId,
- "featureInstanceId": featureInstanceId
+ "featureInstanceId": featureInstanceId,
+ "mandateId": mandateId,
})
state.totalToolCalls += len(results)
diff --git a/modules/serviceCenter/services/serviceAgent/mainServiceAgent.py b/modules/serviceCenter/services/serviceAgent/mainServiceAgent.py
index 05ce4da9..59655442 100644
--- a/modules/serviceCenter/services/serviceAgent/mainServiceAgent.py
+++ b/modules/serviceCenter/services/serviceAgent/mainServiceAgent.py
@@ -209,7 +209,8 @@ class AgentService:
"## Attached Files\n"
"These files have been uploaded and processed through the extraction pipeline.\n"
"Use `readFile(fileId)` to read text content, `readContentObjects(fileId)` for structured access, "
- "or `describeImage(fileId)` for image analysis.\n\n"
+ "or `describeImage(fileId)` for image analysis.\n"
+ "When generating documents with `renderDocument`, embed images using `` in the markdown content.\n\n"
)
header += "\n\n".join(fileDescriptions)
return f"{header}\n\n---\n\nUser request: {prompt}"
@@ -1226,68 +1227,415 @@ def _registerCoreTools(registry: ToolRegistry, services):
readOnly=True,
)
- # ---- Document generation tool ----
+ # ---- Document rendering tool ----
- async def _generateDocument(args: Dict[str, Any], context: Dict[str, Any]):
- """Generate a document in any format using the existing GenerationService + RendererRegistry."""
- prompt = args.get("prompt", "")
+ def _markdownToDocumentJson(markdown: str, title: str, language: str = "de") -> Dict[str, Any]:
+ """Convert markdown content to the standard document JSON format expected by renderers."""
+ import re as _re
+
+ sections = []
+ order = 0
+ lines = markdown.split("\n")
+ i = 0
+
+ def _nextId():
+ nonlocal order
+ order += 1
+ return f"s_{order}"
+
+ while i < len(lines):
+ line = lines[i]
+
+ # --- Headings ---
+ headingMatch = _re.match(r'^(#{1,6})\s+(.+)', line)
+ if headingMatch:
+ level = len(headingMatch.group(1))
+ text = headingMatch.group(2).strip()
+ sections.append({
+ "id": _nextId(), "content_type": "heading", "order": order,
+ "elements": [{"content": {"text": text, "level": level}}],
+ })
+ i += 1
+ continue
+
+ # --- Fenced code blocks ---
+ codeMatch = _re.match(r'^```(\w*)', line)
+ if codeMatch:
+ lang = codeMatch.group(1) or "text"
+ codeLines = []
+ i += 1
+ while i < len(lines) and not lines[i].startswith("```"):
+ codeLines.append(lines[i])
+ i += 1
+ i += 1
+ sections.append({
+ "id": _nextId(), "content_type": "code_block", "order": order,
+ "elements": [{"content": {"code": "\n".join(codeLines), "language": lang}}],
+ })
+ continue
+
+ # --- Tables ---
+ tableMatch = _re.match(r'^\|(.+)\|$', line)
+ if tableMatch and (i + 1) < len(lines) and _re.match(r'^\|[\s\-:|]+\|$', lines[i + 1]):
+ headerCells = [c.strip() for c in tableMatch.group(1).split("|")]
+ i += 2
+ rows = []
+ while i < len(lines) and _re.match(r'^\|(.+)\|$', lines[i]):
+ rowCells = [c.strip() for c in lines[i][1:-1].split("|")]
+ rows.append(rowCells)
+ i += 1
+ sections.append({
+ "id": _nextId(), "content_type": "table", "order": order,
+ "elements": [{"content": {"headers": headerCells, "rows": rows}}],
+ })
+ continue
+
+ # --- Bullet / numbered lists ---
+ listMatch = _re.match(r'^(\s*)([-*+]|\d+[.)]) (.+)', line)
+ if listMatch:
+ isNumbered = bool(_re.match(r'\d+[.)]', listMatch.group(2)))
+ items = []
+ while i < len(lines) and _re.match(r'^(\s*)([-*+]|\d+[.)]) (.+)', lines[i]):
+ m = _re.match(r'^(\s*)([-*+]|\d+[.)]) (.+)', lines[i])
+ items.append({"text": m.group(3).strip()})
+ i += 1
+ sections.append({
+ "id": _nextId(), "content_type": "bullet_list", "order": order,
+ "elements": [{"content": {"items": items, "list_type": "numbered" if isNumbered else "bullet"}}],
+ })
+ continue
+
+ # --- Empty lines (skip) ---
+ if not line.strip():
+ i += 1
+ continue
+
+ # --- Images:  or  ---
+ imgMatch = _re.match(r'^!\[([^\]]*)\]\(([^)]+)\)', line)
+ if imgMatch:
+ altText = imgMatch.group(1).strip() or "Image"
+ src = imgMatch.group(2).strip()
+ fileId = ""
+ if src.startswith("file:"):
+ fileId = src[5:]
+ sections.append({
+ "id": _nextId(), "content_type": "image", "order": order,
+ "elements": [{
+ "content": {
+ "altText": altText,
+ "base64Data": "",
+ "_fileRef": fileId,
+ "_srcUrl": src if not fileId else "",
+ }
+ }],
+ })
+ i += 1
+ continue
+
+ # --- Paragraph (collect consecutive non-empty lines) ---
+ paraLines = []
+ while i < len(lines) and lines[i].strip() and not _re.match(r'^(#{1,6}\s|```|\|.+\||!\[|(\s*)([-*+]|\d+[.)]) )', lines[i]):
+ paraLines.append(lines[i])
+ i += 1
+ if paraLines:
+ sections.append({
+ "id": _nextId(), "content_type": "paragraph", "order": order,
+ "elements": [{"content": {"text": " ".join(paraLines)}}],
+ })
+ continue
+
+ i += 1
+
+ if not sections:
+ sections.append({
+ "id": _nextId(), "content_type": "paragraph", "order": order,
+ "elements": [{"content": {"text": markdown.strip() or "(empty)"}}],
+ })
+
+ return {
+ "metadata": {
+ "split_strategy": "single_document",
+ "source_documents": [],
+ "extraction_method": "agent_rendering",
+ "title": title,
+ "language": language,
+ },
+ "documents": [{
+ "id": "doc_1",
+ "title": title,
+ "sections": sections,
+ }],
+ }
+
+ async def _renderDocument(args: Dict[str, Any], context: Dict[str, Any]):
+ """Render agent-produced markdown content into any document format via the RendererRegistry."""
+ import re as _re
+ content = args.get("content", "")
outputFormat = args.get("outputFormat", "pdf")
- title = args.get("title", "Generated Document")
+ title = args.get("title", "Document")
+ language = args.get("language", "de")
- if not prompt:
- return ToolResult(toolCallId="", toolName="generateDocument", success=False, error="prompt is required")
+ if not content:
+ return ToolResult(toolCallId="", toolName="renderDocument", success=False, error="content is required")
try:
+ structuredContent = _markdownToDocumentJson(content, title, language)
+
+ # Resolve image file references (file:fileId) to base64 data from Knowledge Store
+ knowledgeService = None
+ try:
+ knowledgeService = services.getService("knowledge")
+ except Exception:
+ pass
+ resolvedImages = 0
+ for doc in structuredContent.get("documents", []):
+ for section in doc.get("sections", []):
+ if section.get("content_type") != "image":
+ continue
+ for element in section.get("elements", []):
+ contentObj = element.get("content", {})
+ fileRef = contentObj.get("_fileRef", "")
+ if not fileRef or contentObj.get("base64Data"):
+ continue
+ if knowledgeService:
+ chunks = knowledgeService._knowledgeDb.getContentChunks(fileRef)
+ imageChunks = [c for c in (chunks or []) if c.get("contentType") == "image"]
+ if imageChunks:
+ contentObj["base64Data"] = imageChunks[0].get("data", "")
+ chunkMime = imageChunks[0].get("contextRef", {}).get("mimeType", "image/png")
+ contentObj["mimeType"] = chunkMime
+ resolvedImages += 1
+ if not contentObj.get("base64Data"):
+ try:
+ rawBytes = services.chat.getFileData(fileRef)
+ if rawBytes:
+ import base64 as _b64
+ contentObj["base64Data"] = _b64.b64encode(rawBytes).decode("ascii")
+ contentObj["mimeType"] = "image/png"
+ resolvedImages += 1
+ except Exception:
+ pass
+ contentObj.pop("_fileRef", None)
+ contentObj.pop("_srcUrl", None)
+
+ sectionCount = len(structuredContent.get("documents", [{}])[0].get("sections", []))
+ logger.info(f"renderDocument: parsed {sectionCount} sections from markdown ({len(content)} chars), resolved {resolvedImages} image(s), format={outputFormat}")
+
generationService = services.getService("generation")
- aiService = services.ai
-
- structuredContent = await generationService.generateDocumentWithTwoPhases(userPrompt=prompt)
-
documents = await generationService.renderReport(
extractedContent=structuredContent,
outputFormat=outputFormat,
- language="de",
+ language=language,
title=title,
- userPrompt=prompt,
- aiService=aiService,
+ userPrompt=content,
)
if not documents:
- return ToolResult(toolCallId="", toolName="generateDocument", success=False, error="Rendering produced no documents")
+ return ToolResult(toolCallId="", toolName="renderDocument", success=False, error="Rendering produced no output")
savedFiles = []
+ sideEvents = []
chatService = services.chat
- for doc in documents:
- docData = doc.data if hasattr(doc, "data") else doc.get("data", b"")
- docName = doc.fileName if hasattr(doc, "fileName") else doc.get("fileName", f"{title}.{outputFormat}")
- docMime = doc.mimeType if hasattr(doc, "mimeType") else doc.get("mimeType", "application/octet-stream")
- fileItem = chatService.interfaceDbComponent.saveGeneratedFile(
- docData, docName, docMime,
- ) if hasattr(chatService.interfaceDbComponent, "saveGeneratedFile") else None
+ sanitizedTitle = _re.sub(r'[^a-zA-Z0-9._-]', '_', title).strip('_') or "document"
+
+ for doc in documents:
+ docData = doc.documentData if hasattr(doc, "documentData") else b""
+ docName = doc.filename if hasattr(doc, "filename") else f"{sanitizedTitle}.{outputFormat}"
+ docMime = doc.mimeType if hasattr(doc, "mimeType") else "application/octet-stream"
+
+ if not docName.lower().endswith(f".{outputFormat}"):
+ docName = f"{sanitizedTitle}.{outputFormat}"
+
+ fileItem = None
+ if hasattr(chatService.interfaceDbComponent, "saveGeneratedFile"):
+ fileItem = chatService.interfaceDbComponent.saveGeneratedFile(docData, docName, docMime)
+ else:
+ fileItem, _ = chatService.interfaceDbComponent.saveUploadedFile(docData, docName)
if fileItem:
- savedFiles.append(f"- {docName} (id: {fileItem.id if hasattr(fileItem, 'id') else fileItem.get('id', '?')})")
- else:
- savedFiles.append(f"- {docName} (generated, not saved)")
+ fid = fileItem.id if hasattr(fileItem, "id") else fileItem.get("id", "?")
+ savedFiles.append(f"- {docName} (id: {fid})")
+ sideEvents.append({
+ "type": "fileCreated",
+ "data": {
+ "fileId": fid,
+ "fileName": docName,
+ "mimeType": docMime,
+ "fileSize": len(docData),
+ },
+ })
- result = f"Generated {len(documents)} document(s):\n" + "\n".join(savedFiles)
- return ToolResult(toolCallId="", toolName="generateDocument", success=True, data=result)
+ result = f"Rendered {len(documents)} document(s):\n" + "\n".join(savedFiles)
+ return ToolResult(toolCallId="", toolName="renderDocument", success=True, data=result, sideEvents=sideEvents)
except Exception as e:
- return ToolResult(toolCallId="", toolName="generateDocument", success=False, error=str(e))
+ logger.error(f"renderDocument failed: {e}")
+ return ToolResult(toolCallId="", toolName="renderDocument", success=False, error=str(e))
registry.register(
- "generateDocument", _generateDocument,
- description="Generate a document in any format (PDF, DOCX, XLSX, PPTX, CSV, HTML, MD, JSON, TXT).",
+ "renderDocument", _renderDocument,
+ description=(
+ "Render markdown content into a document file (PDF, DOCX, XLSX, PPTX, CSV, HTML, MD, JSON, TXT). "
+ "You write the full document content as markdown, then this tool converts and renders it. "
+ "To embed images from uploaded files, use markdown image syntax with the file ID: . "
+ "The images will be resolved from the Knowledge Store and embedded in the output document."
+ ),
parameters={
"type": "object",
"properties": {
- "prompt": {"type": "string", "description": "What the document should contain and how it should look"},
+ "content": {"type": "string", "description": "Full document content as markdown (headings, tables, lists, code blocks, paragraphs, images via )"},
"outputFormat": {"type": "string", "description": "Target format: pdf, docx, xlsx, pptx, csv, html, md, json, txt", "default": "pdf"},
- "title": {"type": "string", "description": "Document title", "default": "Generated Document"},
+ "title": {"type": "string", "description": "Document title", "default": "Document"},
+ "language": {"type": "string", "description": "Document language (ISO 639-1)", "default": "de"},
},
- "required": ["prompt"],
+ "required": ["content"],
+ },
+ readOnly=False,
+ )
+
+ # ── textToSpeech tool ──────────────────────────────────────────────
+ def _stripMarkdownForTts(text: str) -> str:
+ """Strip markdown formatting so TTS reads clean speech text."""
+ import re as _re
+ t = text
+ t = _re.sub(r'\*\*(.+?)\*\*', r'\1', t)
+ t = _re.sub(r'\*(.+?)\*', r'\1', t)
+ t = _re.sub(r'__(.+?)__', r'\1', t)
+ t = _re.sub(r'_(.+?)_', r'\1', t)
+ t = _re.sub(r'`[^`]+`', lambda m: m.group(0)[1:-1], t)
+ t = _re.sub(r'^#{1,6}\s*', '', t, flags=_re.MULTILINE)
+ t = _re.sub(r'^\s*[-*+]\s+', '', t, flags=_re.MULTILINE)
+ t = _re.sub(r'^\s*\d+\.\s+', '', t, flags=_re.MULTILINE)
+ t = _re.sub(r'\[(.+?)\]\(.+?\)', r'\1', t)
+ t = _re.sub(r'!\[.*?\]\(.*?\)', '', t)
+ t = _re.sub(r'\n{3,}', '\n\n', t)
+ return t.strip()
+
+ async def _textToSpeech(args: Dict[str, Any], context: Dict[str, Any]):
+ """Convert text to speech using Google Cloud TTS, deliver audio via SSE."""
+ import base64 as _b64
+ text = args.get("text", "")
+ language = args.get("language", "auto")
+ voiceName = args.get("voiceName")
+
+ if not text:
+ return ToolResult(toolCallId="", toolName="textToSpeech", success=False, error="text is required")
+
+ cleanText = _stripMarkdownForTts(text)
+ if not cleanText:
+ return ToolResult(toolCallId="", toolName="textToSpeech", success=False, error="text is empty after stripping markdown")
+
+ try:
+ from modules.interfaces.interfaceVoiceObjects import getVoiceInterface
+ mandateId = context.get("mandateId", "")
+ voiceInterface = getVoiceInterface(currentUser=None, mandateId=mandateId)
+
+ _ISO_TO_BCP47 = {
+ "de": "de-DE", "en": "en-US", "fr": "fr-FR", "it": "it-IT",
+ "es": "es-ES", "pt": "pt-BR", "nl": "nl-NL", "pl": "pl-PL",
+ "ru": "ru-RU", "ja": "ja-JP", "zh": "zh-CN", "ko": "ko-KR",
+ "ar": "ar-XA", "hi": "hi-IN", "tr": "tr-TR", "sv": "sv-SE",
+ }
+
+ if language == "auto":
+ try:
+ snippet = cleanText[:500]
+ detectResult = await voiceInterface.detectLanguage(snippet)
+ if detectResult and detectResult.get("success"):
+ detected = detectResult.get("language", "de")
+ language = _ISO_TO_BCP47.get(detected, detected)
+ if "-" not in language:
+ language = _ISO_TO_BCP47.get(language, f"{language}-{language.upper()}")
+ logger.info(f"textToSpeech: auto-detected language '{detected}' -> '{language}'")
+ else:
+ language = "de-DE"
+ except Exception as detectErr:
+ logger.warning(f"textToSpeech: language detection failed: {detectErr}, defaulting to de-DE")
+ language = "de-DE"
+
+ if not voiceName:
+ try:
+ featureInstanceId = context.get("featureInstanceId", "")
+ userId = context.get("userId", "")
+ if featureInstanceId and userId:
+ dbMgmt = services.chat.interfaceDbApp if hasattr(services.chat, "interfaceDbApp") else None
+ if dbMgmt and hasattr(dbMgmt, "getVoiceSettings"):
+ vs = dbMgmt.getVoiceSettings(userId)
+ if vs:
+ voiceMap = {}
+ if hasattr(vs, "ttsVoiceMap") and vs.ttsVoiceMap:
+ voiceMap = vs.ttsVoiceMap if isinstance(vs.ttsVoiceMap, dict) else {}
+ if language in voiceMap:
+ voiceName = voiceMap[language].get("voiceName") if isinstance(voiceMap[language], dict) else voiceMap[language]
+ logger.info(f"textToSpeech: using configured voice '{voiceName}' for {language}")
+ elif hasattr(vs, "ttsVoice") and vs.ttsVoice and hasattr(vs, "ttsLanguage") and vs.ttsLanguage == language:
+ voiceName = vs.ttsVoice
+ except Exception as prefErr:
+ logger.debug(f"textToSpeech: could not load voice preferences: {prefErr}")
+
+ ttsResult = await voiceInterface.textToSpeech(
+ text=cleanText,
+ languageCode=language,
+ voiceName=voiceName,
+ )
+
+ if not ttsResult or not ttsResult.get("success"):
+ errMsg = ttsResult.get("error", "TTS call failed") if ttsResult else "TTS returned None"
+ return ToolResult(toolCallId="", toolName="textToSpeech", success=False, error=errMsg)
+
+ audioContent = ttsResult.get("audioContent", "")
+ if not audioContent:
+ return ToolResult(toolCallId="", toolName="textToSpeech", success=False, error="TTS returned no audio")
+
+ if isinstance(audioContent, bytes):
+ audioB64 = _b64.b64encode(audioContent).decode("ascii")
+ elif isinstance(audioContent, str):
+ audioB64 = audioContent
+ else:
+ audioB64 = str(audioContent)
+
+ audioFormat = ttsResult.get("audioFormat", "mp3")
+ charCount = len(cleanText)
+ usedVoice = voiceName or "default"
+ logger.info(f"textToSpeech: generated {audioFormat} audio for {charCount} chars, language={language}, voice={usedVoice}")
+
+ return ToolResult(
+ toolCallId="", toolName="textToSpeech", success=True,
+ data=f"Audio generated ({charCount} characters, language={language}, voice={usedVoice}). Playing in chat.",
+ sideEvents=[{
+ "type": "voiceResponse",
+ "data": {
+ "audio": audioB64,
+ "format": audioFormat,
+ "language": language,
+ "charCount": charCount,
+ },
+ }],
+ )
+
+ except ImportError:
+ return ToolResult(toolCallId="", toolName="textToSpeech", success=False,
+ error="Voice interface not available (missing dependency)")
+ except Exception as e:
+ logger.error(f"textToSpeech failed: {e}")
+ return ToolResult(toolCallId="", toolName="textToSpeech", success=False, error=str(e))
+
+ registry.register(
+ "textToSpeech", _textToSpeech,
+ description=(
+ "Convert text to speech audio. The audio is played directly in the chat. "
+ "Use this when the user asks you to read something aloud, narrate, or speak. "
+ "Language is auto-detected from the text content. You do NOT need to specify a language."
+ ),
+ parameters={
+ "type": "object",
+ "properties": {
+ "text": {"type": "string", "description": "The text to convert to speech. Can include markdown (will be stripped automatically)."},
+ "language": {"type": "string", "description": "BCP-47 language code (e.g. de-DE, en-US) or 'auto' for automatic detection", "default": "auto"},
+ "voiceName": {"type": "string", "description": "Optional specific voice name. If omitted, uses the configured voice for the detected language."},
+ },
+ "required": ["text"],
},
readOnly=False,
)