Merge pull request #113 from vasudevanubrolu/feat/893-azure-ssml

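Feat/893 azure ssml: add the JAMBONES_AZURE_ENABLE_SSML environment flag; when it is set, text sent to the Microsoft synth paths that is not already in a <speak> envelope is wrapped in SSML (<speak>/<voice>/<lang>). The existing <speak> rewrite also gains a <lang> element.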
2026-07-04 19:31:49 +00:00 · 2025-05-27 10:08:48 -04:00
parent 0d98f73c43 925bd26a70
commit 2ff8587601
2 changed files with 18 additions and 8 deletions
@@ -2,6 +2,7 @@ const JAMBONES_TTS_TRIM_SILENCE = process.env.JAMBONES_TTS_TRIM_SILENCE;
 const JAMBONES_DISABLE_TTS_STREAMING = process.env.JAMBONES_DISABLE_TTS_STREAMING;
 const JAMBONES_DISABLE_AZURE_TTS_STREAMING = process.env.JAMBONES_DISABLE_AZURE_TTS_STREAMING;
 const JAMBONES_EAGERLY_PRE_CACHE_AUDIO = process.env.JAMBONES_EAGERLY_PRE_CACHE_AUDIO;
+const JAMBONES_AZURE_ENABLE_SSML = process.env.JAMBONES_AZURE_ENABLE_SSML;

 const JAMBONES_HTTP_PROXY_IP = process.env.JAMBONES_HTTP_PROXY_IP;
 const JAMBONES_HTTP_PROXY_PORT = process.env.JAMBONES_HTTP_PROXY_PORT;
@@ -21,5 +22,6 @@ module.exports = {
  JAMBONES_TTS_CACHE_DURATION_MINS,
  JAMBONES_EAGERLY_PRE_CACHE_AUDIO,
  TMP_FOLDER,
-  HTTP_TIMEOUT
+  HTTP_TIMEOUT,
+  JAMBONES_AZURE_ENABLE_SSML
 };
@@ -47,6 +47,7 @@ const {
  JAMBONES_HTTP_PROXY_PORT,
  JAMBONES_TTS_CACHE_DURATION_MINS,
  JAMBONES_TTS_TRIM_SILENCE,
+  JAMBONES_AZURE_ENABLE_SSML
 } = require('./config');
 const EXPIRES = JAMBONES_TTS_CACHE_DURATION_MINS;
 const OpenAI = require('openai');
@@ -466,7 +467,6 @@ async function _synthOnPremMicrosoft(logger, {
 }) {
  const {use_custom_tts, custom_tts_endpoint_url, api_key} = credentials;
  let content = text;
-
  if (use_custom_tts && !content.startsWith('<speak')) {
    /**
     * Note: it seems that to use custom voice ssml is required with the voice attribute
@@ -479,9 +479,13 @@ async function _synthOnPremMicrosoft(logger, {
    /* microsoft enforces some properties and uses voice xml element so if the user did not supply do it for them */
    const words = content.slice(7, -8).trim().replace(/(\r\n|\n|\r)/gm, ' ');
    // eslint-disable-next-line max-len
-    content = `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${language}"><voice name="${voice}">${words}</voice></speak>`;
+    content = `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${language}"><lang xml:lang="${language}"><voice name="${voice}">${words}</voice></lang></speak>`;
    logger.info({content}, 'synthMicrosoft');
  }
+  else if (JAMBONES_AZURE_ENABLE_SSML) {
+    // eslint-disable-next-line max-len
+    content = `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${language}"><voice name="${voice}"><lang xml:lang="${language}">${text}</lang></voice></speak>`;
+  }

  try {
    const trimSilence = JAMBONES_TTS_TRIM_SILENCE;
@@ -518,19 +522,23 @@ const synthMicrosoft = async(logger, {
    let content = text;
    if (use_custom_tts && !content.startsWith('<speak')) {
      /**
-       * Note: it seems that to use custom voice ssml is required with the voice attribute
-       * Otherwise sending plain text we get "Voice does not match"
-       */
+     * Note: it seems that to use custom voice ssml is required with the voice attribute
+     * Otherwise sending plain text we get "Voice does not match"
+     */
      content = `<speak>${text}</speak>`;
    }

    if (content.startsWith('<speak>')) {
-      /* microsoft enforces some properties and uses voice xml element so if the user did not supply do it for them */
+    /* microsoft enforces some properties and uses voice xml element so if the user did not supply do it for them */
      const words = content.slice(7, -8).trim().replace(/(\r\n|\n|\r)/gm, ' ');
      // eslint-disable-next-line max-len
-      content = `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${language}"><voice name="${voice}">${words}</voice></speak>`;
+      content = `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${language}"><lang xml:lang="${language}"><voice name="${voice}">${words}</voice></lang></speak>`;
      logger.info({content}, 'synthMicrosoft');
    }
+    else if (JAMBONES_AZURE_ENABLE_SSML) {
+      // eslint-disable-next-line max-len
+      content = `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${language}"><voice name="${voice}"><lang xml:lang="${language}">${text}</lang></voice></speak>`;
+    }
    if (!JAMBONES_DISABLE_TTS_STREAMING && !JAMBONES_DISABLE_AZURE_TTS_STREAMING &&
      !renderForCaching && !disableTtsStreaming) {
      let params = '';