mirror of https://github.com/jambonz/speech-utils.git
support azure streaming
@@ -388,10 +388,44 @@ const synthMicrosoft = async(logger, {
   language,
   voice,
   text,
-  filePath
+  filePath,
+  renderForCaching
 }) => {
   try {
     const {api_key: apiKey, region, use_custom_tts, custom_tts_endpoint, custom_tts_endpoint_url} = credentials;
+    // let's clean up the text
+    let content = text;
+    if (use_custom_tts && !content.startsWith('<speak')) {
+      /**
+       * Note: it seems that using a custom voice requires SSML with the voice attribute;
+       * otherwise, sending plain text, we get "Voice does not match"
+       */
+      content = `<speak>${text}</speak>`;
+    }
+
+    if (content.startsWith('<speak>')) {
+      /* microsoft enforces some properties and uses the voice xml element, so if the user did not supply them, do it for them */
+      const words = content.slice(7, -8).trim().replace(/(\r\n|\n|\r)/gm, ' ');
+      // eslint-disable-next-line max-len
+      content = `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${language}"><voice name="${voice}">${words}</voice></speak>`;
+      logger.info({content}, 'synthMicrosoft');
+    }
+
+    if (!process.env.JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching) {
+      let params = '';
+      params += `{api_key=${apiKey}`;
+      params += `,region=${region}`;
+      params += `,language=${language}`;
+      if (custom_tts_endpoint) params += `,endpointId=${custom_tts_endpoint}`;
+      if (process.env.JAMBONES_HTTP_PROXY_IP) params += `,http_proxy_ip=${process.env.JAMBONES_HTTP_PROXY_IP}`;
+      if (process.env.JAMBONES_HTTP_PROXY_PORT) params += `,http_proxy_port=${process.env.JAMBONES_HTTP_PROXY_PORT}`;
+      params += '}';
+      return {
+        filePath: `say:${params}${content.replace(/\n/g, ' ')}`,
+        servedFromCache: false,
+        rtt: 0
+      };
+    }
     if (use_custom_tts && custom_tts_endpoint_url) {
       return await _synthOnPremMicrosoft(logger, {
         credentials,
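When streaming is permitted (JAMBONES_DISABLE_TTS_STREAMING is unset and the caller is not rendering for the cache), the new branch skips synthesis entirely and returns a say: marker string in place of a rendered file path, deferring synthesis to whatever consumes the marker downstream. A minimal sketch of that serialization; the buildSayMarker helper and all values below are hypothetical, not part of the commit:

// Illustrative only: buildSayMarker and every value here are hypothetical.
const buildSayMarker = ({apiKey, region, language, customTtsEndpoint}, content) => {
  let params = `{api_key=${apiKey},region=${region},language=${language}`;
  if (customTtsEndpoint) params += `,endpointId=${customTtsEndpoint}`;
  params += '}';
  // newlines are flattened so the marker stays a single line
  return `say:${params}${content.replace(/\n/g, ' ')}`;
};

console.log(buildSayMarker({apiKey: 'abc123', region: 'eastus', language: 'en-US'}, 'hello world'));
// => say:{api_key=abc123,region=eastus,language=en-US}hello world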
@@ -403,20 +437,12 @@ const synthMicrosoft = async(logger, {
       });
     }
     const trimSilence = filePath.endsWith('.r8');
-    let content = text;
     const speechConfig = SpeechConfig.fromSubscription(apiKey, region);
     speechConfig.speechSynthesisLanguage = language;
     speechConfig.speechSynthesisVoiceName = voice;
     if (use_custom_tts && custom_tts_endpoint) {
       speechConfig.endpointId = custom_tts_endpoint;
     }
-    if (use_custom_tts && !content.startsWith('<speak')) {
-      /**
-       * Note: it seems that to use custom voice ssml is required with the voice attribute
-       * Otherwise sending plain text we get "Voice does not match"
-       */
-      content = `<speak>${text}</speak>`;
-    }
     speechConfig.speechSynthesisOutputFormat = trimSilence ?
       SpeechSynthesisOutputFormat.Raw8Khz16BitMonoPcm :
       SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3;
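The non-streaming path still renders audio through the Speech SDK, and the output format keys off the target file extension: a .r8 path gets raw 8 kHz/16-bit mono PCM (suitable for telephony), anything else gets 16 kHz MP3. A minimal sketch of that configuration, with hypothetical subscription values:

const {SpeechConfig, SpeechSynthesisOutputFormat} = require('microsoft-cognitiveservices-speech-sdk');

// Hypothetical subscription key, region, and cache path, for illustration only.
const speechConfig = SpeechConfig.fromSubscription('my-api-key', 'eastus');
const filePath = '/tmp/tts-cache/abcd.r8';

// .r8 => raw 8kHz/16-bit mono PCM; otherwise 16kHz/32kbps mono mp3
speechConfig.speechSynthesisOutputFormat = filePath.endsWith('.r8') ?
  SpeechSynthesisOutputFormat.Raw8Khz16BitMonoPcm :
  SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3;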
@@ -428,14 +454,6 @@ const synthMicrosoft = async(logger, {
     }
     const synthesizer = new SpeechSynthesizer(speechConfig);

-    if (content.startsWith('<speak>')) {
-      /* microsoft enforces some properties and uses voice xml element so if the user did not supply do it for them */
-      const words = content.slice(7, -8).trim().replace(/(\r\n|\n|\r)/gm, ' ');
-      // eslint-disable-next-line max-len
-      content = `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${language}"><voice name="${voice}">${words}</voice></speak>`;
-      logger.info({content}, 'synthMicrosoft');
-    }
-
     return new Promise((resolve, reject) => {
       const speakAsync = content.startsWith('<speak') ?
         synthesizer.speakSsmlAsync.bind(synthesizer) :