Merge pull request #59 from jambonz/feat/azure_tts

support azure streaming
2026-07-23 20:51:49 +00:00 · 2024-03-30 09:21:07 -04:00
parent 8f3e930004 2701af102a
commit 4f1685a365
2 changed files with 46 additions and 18 deletions
@@ -183,7 +183,9 @@ async function synthAudio(client, logger, stats, { account_sid,
      case 'azure':
      case 'microsoft':
        vendorLabel = 'microsoft';
-        audioBuffer = await synthMicrosoft(logger, {credentials, stats, language, voice, text, deploymentId, filePath});
+        audioBuffer = await synthMicrosoft(logger, {credentials, stats, language, voice, text, deploymentId,
+          filePath, renderForCaching, disableTtsStreaming});
+        if (audioBuffer?.filePath) return audioBuffer;
        break;
      case 'nuance':
        model = model || 'enhanced';
@@ -381,10 +383,46 @@ const synthMicrosoft = async(logger, {
  language,
  voice,
  text,
-  filePath
+  filePath,
+  renderForCaching,
+  disableTtsStreaming
 }) => {
  try {
    const {api_key: apiKey, region, use_custom_tts, custom_tts_endpoint, custom_tts_endpoint_url} = credentials;
+    // let's clean up the text
+    let content = text;
+    if (use_custom_tts && !content.startsWith('<speak')) {
+      /**
+       * Note: it seems that using a custom voice requires SSML with the voice attribute;
+       * otherwise, when sending plain text, we get "Voice does not match"
+       */
+      content = `<speak>${text}</speak>`;
+    }
+
+    if (content.startsWith('<speak>')) {
+      /* microsoft enforces some properties and uses voice xml element so if the user did not supply do it for them */
+      const words = content.slice(7, -8).trim().replace(/(\r\n|\n|\r)/gm, ' ');
+      // eslint-disable-next-line max-len
+      content = `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${language}"><voice name="${voice}">${words}</voice></speak>`;
+      logger.info({content}, 'synthMicrosoft');
+    }
+    if (!process.env.JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
+      let params = '';
+      params += `{api_key=${apiKey}`;
+      params += `,language=${language}`;
+      params += ',vendor=microsoft';
+      params += `,voice=${voice}`;
+      if (region) params += `,region=${region}`;
+      if (custom_tts_endpoint) params += `,endpointId=${custom_tts_endpoint}`;
+      if (process.env.JAMBONES_HTTP_PROXY_IP) params += `,http_proxy_ip=${process.env.JAMBONES_HTTP_PROXY_IP}`;
+      if (process.env.JAMBONES_HTTP_PROXY_PORT) params += `,http_proxy_port=${process.env.JAMBONES_HTTP_PROXY_PORT}`;
+      params += '}';
+      return {
+        filePath: `say:${params}${content.replace(/\n/g, ' ')}`,
+        servedFromCache: false,
+        rtt: 0
+      };
+    }
    if (use_custom_tts && custom_tts_endpoint_url) {
      return await _synthOnPremMicrosoft(logger, {
        credentials,
@@ -396,20 +434,12 @@ const synthMicrosoft = async(logger, {
      });
    }
    const trimSilence = filePath.endsWith('.r8');
-    let content = text;
    const speechConfig = SpeechConfig.fromSubscription(apiKey, region);
    speechConfig.speechSynthesisLanguage = language;
    speechConfig.speechSynthesisVoiceName = voice;
    if (use_custom_tts && custom_tts_endpoint) {
      speechConfig.endpointId = custom_tts_endpoint;
    }
-    if (use_custom_tts && !content.startsWith('<speak')) {
-      /**
-       * Note: it seems that to use custom voice ssml is required with the voice attribute
-       * Otherwise sending plain text we get "Voice does not match"
-       */
-      content = `<speak>${text}</speak>`;
-    }
    speechConfig.speechSynthesisOutputFormat = trimSilence ?
      SpeechSynthesisOutputFormat.Raw8Khz16BitMonoPcm :
      SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3;
@@ -421,14 +451,6 @@ const synthMicrosoft = async(logger, {
    }
    const synthesizer = new SpeechSynthesizer(speechConfig);

-    if (content.startsWith('<speak>')) {
-      /* microsoft enforces some properties and uses voice xml element so if the user did not supply do it for them */
-      const words = content.slice(7, -8).trim().replace(/(\r\n|\n|\r)/gm, ' ');
-      // eslint-disable-next-line max-len
-      content = `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${language}"><voice name="${voice}">${words}</voice></speak>`;
-      logger.info({content}, 'synthMicrosoft');
-    }
-
    return new Promise((resolve, reject) => {
      const speakAsync = content.startsWith('<speak') ?
        synthesizer.speakSsmlAsync.bind(synthesizer) :
@@ -188,6 +188,7 @@ test('Azure speech synth tests', async(t) => {
      language: 'en-US',
      voice: 'en-US-ChristopherNeural',
      text: longText,
+      renderForCaching: true
    });
    t.ok(!opts.servedFromCache, `successfully synthesized microsoft audio to ${opts.filePath}`);
    if (process.env.JAMBONES_HTTP_PROXY_IP && process.env.JAMBONES_HTTP_PROXY_PORT) {
@@ -203,6 +204,7 @@ test('Azure speech synth tests', async(t) => {
      language: 'en-US',
      voice: 'en-US-ChristopherNeural',
      text: longText,
+      renderForCaching: true
    });
    t.ok(opts.servedFromCache, `successfully retrieved microsoft audio from cache ${opts.filePath}`);
  } catch (err) {
@@ -237,6 +239,7 @@ test('Azure SSML tests', async(t) => {
      language: 'en-US',
      voice: 'en-US-ChristopherNeural',
      text,
+      renderForCaching: true
    });
    t.ok(!opts.servedFromCache, `successfully synthesized microsoft audio to ${opts.filePath}`);
    if (process.env.JAMBONES_HTTP_PROXY_IP && process.env.JAMBONES_HTTP_PROXY_PORT) {
@@ -252,6 +255,7 @@ test('Azure SSML tests', async(t) => {
      language: 'en-US',
      voice: 'en-US-ChristopherNeural',
      text,
+      renderForCaching: true
    });
    t.ok(opts.servedFromCache, `successfully retrieved microsoft audio from cache ${opts.filePath}`);
  } catch (err) {
@@ -283,6 +287,7 @@ test('Azure custom voice speech synth tests', async(t) => {
      language: 'en-US',
      voice: process.env.MICROSOFT_CUSTOM_VOICE,
      text,
+      renderForCaching: true
    });
    t.ok(!opts.servedFromCache, `successfully synthesized microsoft audio to ${opts.filePath}`);

@@ -297,6 +302,7 @@ test('Azure custom voice speech synth tests', async(t) => {
      language: 'en-US',
      voice: process.env.MICROSOFT_CUSTOM_VOICE,
      text,
+      renderForCaching: true
    });
    t.ok(opts.servedFromCache, `successfully retrieved microsoft custom voice audio from cache ${opts.filePath}`);
  } catch (err) {