feat/893 azure ssml based on env config

This commit is contained in:
vasudevanubrolu
2025-05-08 16:42:27 +05:30
parent 0d98f73c43
commit 59bca302b9
2 changed files with 16 additions and 7 deletions

View File

@@ -2,6 +2,7 @@ const JAMBONES_TTS_TRIM_SILENCE = process.env.JAMBONES_TTS_TRIM_SILENCE;
const JAMBONES_DISABLE_TTS_STREAMING = process.env.JAMBONES_DISABLE_TTS_STREAMING;
const JAMBONES_DISABLE_AZURE_TTS_STREAMING = process.env.JAMBONES_DISABLE_AZURE_TTS_STREAMING;
const JAMBONES_EAGERLY_PRE_CACHE_AUDIO = process.env.JAMBONES_EAGERLY_PRE_CACHE_AUDIO;
const JAMBONES_AZURE_ENABLE_SSML = process.env.JAMBONES_AZURE_ENABLE_SSML;
const JAMBONES_HTTP_PROXY_IP = process.env.JAMBONES_HTTP_PROXY_IP;
const JAMBONES_HTTP_PROXY_PORT = process.env.JAMBONES_HTTP_PROXY_PORT;
@@ -21,5 +22,6 @@ module.exports = {
JAMBONES_TTS_CACHE_DURATION_MINS,
JAMBONES_EAGERLY_PRE_CACHE_AUDIO,
TMP_FOLDER,
HTTP_TIMEOUT
HTTP_TIMEOUT,
JAMBONES_AZURE_ENABLE_SSML
};

View File

@@ -47,6 +47,7 @@ const {
JAMBONES_HTTP_PROXY_PORT,
JAMBONES_TTS_CACHE_DURATION_MINS,
JAMBONES_TTS_TRIM_SILENCE,
JAMBONES_AZURE_ENABLE_SSML
} = require('./config');
const EXPIRES = JAMBONES_TTS_CACHE_DURATION_MINS;
const OpenAI = require('openai');
@@ -466,7 +467,6 @@ async function _synthOnPremMicrosoft(logger, {
}) {
const {use_custom_tts, custom_tts_endpoint_url, api_key} = credentials;
let content = text;
if (use_custom_tts && !content.startsWith('<speak')) {
/**
* Note: it seems that using a custom voice requires SSML with the voice attribute
@@ -516,16 +516,23 @@ const synthMicrosoft = async(logger, {
const {api_key: apiKey, region, use_custom_tts, custom_tts_endpoint, custom_tts_endpoint_url} = credentials;
// let's clean up the text
let content = text;
if (use_custom_tts && !content.startsWith('<speak')) {
/**
if (!JAMBONES_AZURE_ENABLE_SSML) {
if (use_custom_tts && !content.startsWith('<speak')) {
/**
* Note: it seems that using a custom voice requires SSML with the voice attribute.
* Otherwise, when sending plain text, we get "Voice does not match"
*/
content = `<speak>${text}</speak>`;
}
content = `<speak>${text}</speak>`;
}
if (content.startsWith('<speak>')) {
if (content.startsWith('<speak>')) {
/* Microsoft enforces some properties and uses the voice XML element, so if the user did not supply them, add them on the user's behalf */
const words = content.slice(7, -8).trim().replace(/(\r\n|\n|\r)/gm, ' ');
// eslint-disable-next-line max-len
content = `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${language}"><voice name="${voice}">${words}</voice></speak>`;
logger.info({content}, 'synthMicrosoft');
}
} else {
const words = content.slice(7, -8).trim().replace(/(\r\n|\n|\r)/gm, ' ');
// eslint-disable-next-line max-len
content = `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${language}"><voice name="${voice}">${words}</voice></speak>`;