feat fallback speech

2025-12-21 09:08:02 +00:00 · 2023-08-17 14:25:26 +07:00
parent f8c5abe9e9
commit b1c0478051
5 changed files with 271 additions and 42 deletions
--- a/lib/session/call-session.js
+++ b/lib/session/call-session.js
@@ -180,6 +180,13 @@ class CallSession extends Emitter {
    this.application.speech_synthesis_vendor = vendor;
  }

+  get fallbackSpeechSynthesisVendor() { // fallback speech synthesis vendor held on the application
+    return this.application.fallback_speech_synthesis_vendor;
+  }
+  set fallbackSpeechSynthesisVendor(vendor) { // stores the value on the application object
+    this.application.fallback_speech_synthesis_vendor = vendor;
+  }
+
  /**
   * default label to use for speech synthesis if not provided in the app
   */
@@ -189,6 +196,13 @@ class CallSession extends Emitter {
  set speechSynthesisLabel(label) {
    this.application.speech_synthesis_label = label;
  }
+
+  get fallbackSpeechSynthesisLabel() { // fallback speech synthesis credential label held on the application
+    return this.application.fallback_speech_synthesis_label;
+  }
+  set fallbackSpeechSynthesisLabel(label) { // stores the value on the application object
+    this.application.fallback_speech_synthesis_label = label;
+  }
  /**
   * default voice to use for speech synthesis if not provided in the app
   */
@@ -198,6 +212,13 @@ class CallSession extends Emitter {
  set speechSynthesisVoice(voice) {
    this.application.speech_synthesis_voice = voice;
  }
+
+  get fallbackSpeechSynthesisVoice() { // fallback speech synthesis voice held on the application
+    return this.application.fallback_speech_synthesis_voice;
+  }
+  set fallbackSpeechSynthesisVoice(voice) { // stores the value on the application object
+    this.application.fallback_speech_synthesis_voice = voice;
+  }
  /**
   * default language to use for speech synthesis if not provided in the app
   */
@@ -208,6 +229,13 @@ class CallSession extends Emitter {
    this.application.speech_synthesis_language = language;
  }

+  get fallbackSpeechSynthesisLanguage() { // fallback speech synthesis language held on the application
+    return this.application.fallback_speech_synthesis_language;
+  }
+  set fallbackSpeechSynthesisLanguage(language) { // stores the value on the application object
+    this.application.fallback_speech_synthesis_language = language;
+  }
+
  /**
   * default vendor to use for speech recognition if not provided in the app
   */
@@ -217,6 +245,13 @@ class CallSession extends Emitter {
  set speechRecognizerVendor(vendor) {
    this.application.speech_recognizer_vendor = vendor;
  }
+
+  get fallbackSpeechRecognizerVendor() { // fallback speech recognition vendor held on the application
+    return this.application.fallback_speech_recognizer_vendor;
+  }
+  set fallbackSpeechRecognizerVendor(vendor) { // stores the value on the application object
+    this.application.fallback_speech_recognizer_vendor = vendor;
+  }
  /**
   * default vendor to use for speech recognition if not provided in the app
   */
@@ -226,6 +261,13 @@ class CallSession extends Emitter {
  set speechRecognizerLabel(label) {
    this.application.speech_recognizer_label = label;
  }
+
+  get fallbackSpeechRecognizerLabel() { // fallback speech recognition credential label held on the application
+    return this.application.fallback_speech_recognizer_label;
+  }
+  set fallbackSpeechRecognizerLabel(label) { // stores the value on the application object
+    this.application.fallback_speech_recognizer_label = label;
+  }
  /**
 * default language to use for speech recognition if not provided in the app
 */
@@ -236,6 +278,13 @@ class CallSession extends Emitter {
    this.application.speech_recognizer_language = language;
  }

+  get fallbackSpeechRecognizerLanguage() { // fallback speech recognition language held on the application
+    return this.application.fallback_speech_recognizer_language;
+  }
+  set fallbackSpeechRecognizerLanguage(language) { // stores the value on the application object
+    this.application.fallback_speech_recognizer_language = language;
+  }
+
  /**
   * indicates whether the call currently in progress
   */
--- a/lib/tasks/config.js
+++ b/lib/tasks/config.js
@@ -114,6 +114,19 @@ class TaskConfig extends Task {
      cs.speechSynthesisVoice = this.synthesizer.voice !== 'default'
        ? this.synthesizer.voice
        : cs.speechSynthesisVoice;
+      // fallback vendor
+      cs.fallbackSpeechSynthesisVendor = this.synthesizer.fallbackVendor !== 'default'
+        ? this.synthesizer.fallbackVendor
+        : cs.fallbackSpeechSynthesisVendor;
+      cs.fallbackSpeechSynthesisLabel = this.synthesizer.fallbackLabel !== 'default'
+        ? this.synthesizer.fallbackLabel
+        : cs.fallbackSpeechSynthesisLabel;
+      cs.fallbackSpeechSynthesisLanguage = this.synthesizer.fallbackLanguage !== 'default'
+        ?  this.synthesizer.fallbackLanguage
+        : cs.fallbackSpeechSynthesisLanguage;
+      cs.fallbackSpeechSynthesisVoice = this.synthesizer.fallbackVoice !== 'default'
+        ? this.synthesizer.fallbackVoice
+        : cs.fallbackSpeechSynthesisVoice;
      this.logger.info({synthesizer: this.synthesizer}, 'Config: updated synthesizer');
    }
    if (this.hasRecognizer) {
@@ -126,6 +139,17 @@ class TaskConfig extends Task {
      cs.speechRecognizerLanguage = this.recognizer.language !== 'default'
        ? this.recognizer.language
        : cs.speechRecognizerLanguage;
+
+      //fallback
+      cs.fallbackSpeechRecognizerVendor = this.recognizer.fallbackVendor !== 'default'
+        ? this.recognizer.fallbackVendor
+        : cs.fallbackSpeechRecognizerVendor;
+      cs.fallbackSpeechRecognizerLabel = this.recognizer.fallbackLabel !== 'default'
+        ? this.recognizer.fallbackLabel
+        : cs.fallbackSpeechRecognizerLabel;
+      cs.fallbackSpeechRecognizerLanguage = this.recognizer.fallbackLanguage !== 'default'
+        ? this.recognizer.fallbackLanguage
+        : cs.fallbackSpeechRecognizerLanguage;
      cs.isContinuousAsr = typeof this.recognizer.asrTimeout === 'number' ? true : false;
      if (cs.isContinuousAsr) {
        cs.asrTimeout = this.recognizer.asrTimeout;
--- a/lib/tasks/dialogflow/index.js
+++ b/lib/tasks/dialogflow/index.js
@@ -59,6 +59,12 @@ class Dialogflow extends Task {
      this.language = this.data.tts.language || 'default';
      this.voice = this.data.tts.voice || 'default';
      this.speechSynthesisLabel = this.data.tts.label || null;
+
+      // fallback tts: 'default' means "take the value from the call session" (resolved in exec)
+      this.fallbackVendor = this.data.tts.fallbackVendor || 'default';
+      this.fallbackLanguage = this.data.tts.fallbackLanguage || 'default';
+      this.fallbackVoice = this.data.tts.fallbackVoice || 'default';
+      this.fallbackLabel = this.data.tts.fallbackLabel || 'default';
    }
    this.bargein = this.data.bargein;
  }
@@ -119,9 +125,15 @@ class Dialogflow extends Task {
        this.vendor = cs.speechSynthesisVendor;
        this.language = cs.speechSynthesisLanguage;
        this.voice = cs.speechSynthesisVoice;
+        this.speechSynthesisLabel = cs.speechSynthesisLabel;
      }
-      this.ttsCredentials = cs.getSpeechCredentials(this.vendor, 'tts',
-        this.speechSynthesisLabel || cs.speechSynthesisLabel);
+      if (this.fallbackVendor === 'default') {
+        this.fallbackVendor = cs.fallbackSpeechSynthesisVendor;
+        this.fallbackLanguage = cs.fallbackSpeechSynthesisLanguage;
+        this.fallbackVoice = cs.fallbackSpeechSynthesisVoice;
+        this.fallbackLabel = cs.fallbackSpeechSynthesisLabel;
+      }
+      this.ttsCredentials = cs.getSpeechCredentials(this.vendor, 'tts',this.speechSynthesisLabel);

      this.ep.addCustomEventListener('dialogflow::intent', this._onIntent.bind(this, ep, cs));
      this.ep.addCustomEventListener('dialogflow::transcription', this._onTranscription.bind(this, ep, cs));
@@ -223,17 +235,7 @@ class Dialogflow extends Task {
      }

      try {
-        const obj = {
-          account_sid: cs.accountSid,
-          text: intent.fulfillmentText,
-          vendor: this.vendor,
-          language: this.language,
-          voice: this.voice,
-          salt: cs.callSid,
-          credentials: this.ttsCredentials
-        };
-        this.logger.debug({obj}, 'Dialogflow:_onIntent - playing message via tts');
-        const {filePath, servedFromCache} = await synthAudio(stats, obj);
+        const {filePath, servedFromCache} = await this.fallbackSynthAudio(cs, intent, stats);
        if (filePath) cs.trackTmpFile(filePath);
        if (!this.ttsCredentials && !servedFromCache) cs.billForTts(intent.fulfillmentText.length);

@@ -279,6 +281,46 @@ class Dialogflow extends Task {
    }
  }

+  /**
+   * Synthesize the intent's fulfillment text, retrying with the fallback vendor on failure.
+   * @param {object} cs - call session (accountSid, callSid and speech credentials are read from it)
+   * @param {object} intent - dialogflow intent; intent.fulfillmentText is the text to speak
+   * @param {object} stats - stats object handed through to synthAudio
+   * @returns {Promise<object>} whatever synthAudio resolves with (caller reads filePath, servedFromCache)
+   * @throws the fallback error if the fallback attempt fails, otherwise the primary error
+   */
+  async fallbackSynthAudio(cs, intent, stats) {
+    const synthesize = (vendor, language, voice, credentials, desc) => {
+      const obj = {
+        account_sid: cs.accountSid,
+        text: intent.fulfillmentText,
+        vendor,
+        language,
+        voice,
+        salt: cs.callSid,
+        credentials
+      };
+      this.logger.debug({obj}, `Dialogflow:_onIntent - playing message via ${desc}`);
+      return synthAudio(stats, obj);
+    };
+
+    try {
+      return await synthesize(this.vendor, this.language, this.voice, this.ttsCredentials, 'tts');
+    } catch (error) {
+      this.logger.info({error}, 'Failed to synthesize audio from primary vendor');
+      // no fallback configured: surface the primary failure unchanged
+      if (!this.fallbackVendor) throw error;
+      try {
+        const credentials = cs.getSpeechCredentials(this.fallbackVendor, 'tts', this.fallbackLabel);
+        return await synthesize(this.fallbackVendor, this.fallbackLanguage, this.fallbackVoice,
+          credentials, 'fallback tts');
+      } catch (err) {
+        this.logger.info({err}, 'Failed to synthesize audio from fallback vendor');
+        throw err;
+      }
+    }
+  }
+
  /**
   * A transcription - either interim or final - has been returned.
   * If we are doing barge-in based on hotword detection, check for the hotword or phrase.
--- a/lib/tasks/lex.js
+++ b/lib/tasks/lex.js
@@ -26,6 +26,12 @@ class Lex extends Task {
      this.language = this.data.tts.language || 'default';
      this.voice = this.data.tts.voice || 'default';
      this.speechCredentialLabel = this.data.tts.label || null;
+
+      // fallback tts: 'default' means "take the value from the call session" (resolved in exec)
+      this.fallbackVendor = this.data.tts.fallbackVendor || 'default';
+      this.fallbackLanguage = this.data.tts.fallbackLanguage || 'default';
+      this.fallbackVoice = this.data.tts.fallbackVoice || 'default';
+      this.fallbackLabel = this.data.tts.fallbackLabel || 'default';
    }

    this.botName = `${this.bot}:${this.alias}:${this.region}`;
@@ -103,9 +109,15 @@ class Lex extends Task {
        this.vendor = cs.speechSynthesisVendor;
        this.language = cs.speechSynthesisLanguage;
        this.voice = cs.speechSynthesisVoice;
+        this.speechCredentialLabel = cs.speechSynthesisLabel;
      }
-      this.ttsCredentials = cs.getSpeechCredentials(this.vendor, 'tts',
-        this.speechCredentialLabel || cs.speechSynthesisVendor);
+      if (this.fallbackVendor === 'default') {
+        this.fallbackVendor = cs.fallbackSpeechSynthesisVendor;
+        this.fallbackLanguage = cs.fallbackSpeechSynthesisLanguage;
+        this.fallbackVoice = cs.fallbackSpeechSynthesisVoice;
+        this.fallbackLabel = cs.fallbackSpeechSynthesisLabel;
+      }
+      this.ttsCredentials = cs.getSpeechCredentials(this.vendor, 'tts', this.speechCredentialLabel);

      this.ep.addCustomEventListener('lex::intent', this._onIntent.bind(this, ep, cs));
      this.ep.addCustomEventListener('lex::transcription', this._onTranscription.bind(this, ep, cs));
@@ -170,6 +182,41 @@ class Lex extends Task {
    }
  }

+  /**
+   * Synthesize msg with the primary tts vendor, retrying once with the fallback vendor.
+   * @param {object} cs - call session supplying accountSid, callSid and speech credentials
+   * @param {string} msg - text to synthesize
+   * @param {object} stats - stats object passed through to synthAudio
+   * @param {Function} synthAudio - async (stats, opts) resolving to an object with filePath
+   * @returns {Promise<string>} filePath reported by the attempt that succeeded
+   * @throws the fallback error if the fallback attempt fails, otherwise the primary error
+   */
+  async fallbackSynthAudio(cs, msg, stats, synthAudio) {
+    const synth = async(vendor, language, voice, credentials) => {
+      const {filePath} = await synthAudio(stats, {
+        account_sid: cs.accountSid, text: msg,
+        vendor, language, voice,
+        salt: cs.callSid, credentials
+      });
+      return filePath;
+    };
+
+    try {
+      return await synth(this.vendor, this.language, this.voice, this.ttsCredentials);
+    } catch (error) {
+      this.logger.info({error}, 'failed to synth audio from primary vendor');
+      // rethrow so the caller's error handling runs instead of continuing with an undefined file
+      if (!this.fallbackVendor) throw error;
+      try {
+        const credential = cs.getSpeechCredentials(this.fallbackVendor, 'tts', this.fallbackLabel);
+        return await synth(this.fallbackVendor, this.fallbackLanguage, this.fallbackVoice, credential);
+      } catch (err) {
+        this.logger.info({err}, 'failed to synth audio from fallback vendor');
+        throw err;
+      }
+    }
+  }
+
  /**
   * @param {*} evt - event data
   */
@@ -189,16 +236,7 @@ class Lex extends Task {

        try {
          this.logger.debug(`tts with ${this.vendor} ${this.voice}`);
-          // eslint-disable-next-line no-unused-vars
-          const {filePath, servedFromCache} = await synthAudio(stats, {
-            account_sid: cs.accountSid,
-            text: msg,
-            vendor: this.vendor,
-            language: this.language,
-            voice: this.voice,
-            salt: cs.callSid,
-            credentials: this.ttsCredentials
-          });
+          const filePath = await this.fallbackSynthAudio(cs, msg, stats, synthAudio);
          if (filePath) cs.trackTmpFile(filePath);

          if (this.events.includes('start-play')) {
--- a/lib/tasks/say.js
+++ b/lib/tasks/say.js
@@ -59,15 +59,28 @@ class TaskSay extends Task {
    const vendor = this.synthesizer.vendor && this.synthesizer.vendor !== 'default' ?
      this.synthesizer.vendor :
      cs.speechSynthesisVendor;
+    const fallbackVendor = this.synthesizer.fallbackVendor && this.synthesizer.fallbackVendor !== 'default' ?
+    this.synthesizer.fallbackVendor :
+    cs.fallbackSpeechSynthesisVendor;
    const language = this.synthesizer.language && this.synthesizer.language !== 'default' ?
      this.synthesizer.language :
      cs.speechSynthesisLanguage ;
+    const fallbackLanguage = this.synthesizer.fallbackLanguage && this.synthesizer.fallbackLanguage !== 'default' ?
+    this.synthesizer.fallbackLanguage :
+    cs.fallbackSpeechSynthesisLanguage ;
    let voice =  this.synthesizer.voice && this.synthesizer.voice !== 'default' ?
      this.synthesizer.voice :
      cs.speechSynthesisVoice;
+    const fallbackVoice = this.synthesizer.fallbackVoice && this.synthesizer.fallbackVoice !== 'default' ?
+    this.synthesizer.fallbackVoice :
+    cs.fallbackSpeechSynthesisVoice;
+    const fallbackLabel = this.synthesizer.fallbackLabel && this.synthesizer.fallbackLabel !== 'default' ?
+    this.synthesizer.fallbackLabel :
+    cs.fallbackSpeechSynthesisLabel;
    const engine = this.synthesizer.engine || 'standard';
    const salt = cs.callSid;
-    let credentials = cs.getSpeechCredentials(vendor, 'tts', this.data.synthesizer?.label || cs.speechSynthesisLabel);
+    let credentials = cs.getSpeechCredentials(vendor, 'tts', this.data.synthesizer ?
+      this.data.synthesizer?.label : cs.speechSynthesisLabel);

    /* parse Nuance voices into name and model */
    let model;
@@ -118,6 +131,8 @@ class TaskSay extends Task {
          'tts.language': language,
          'tts.voice': voice
        });
+        let filePathUrl, isFromCache, roundTripTime;
+        let executedVendor, executedLanguage;
        try {
          const {filePath, servedFromCache, rtt} = await synthAudio(stats, {
            account_sid: cs.accountSid,
@@ -131,37 +146,98 @@ class TaskSay extends Task {
            credentials,
            disableTtsCache : this.disableTtsCache
          });
-          this.logger.debug(`file ${filePath}, served from cache ${servedFromCache}`);
-          if (filePath) cs.trackTmpFile(filePath);
+
+          span.setAttributes({'tts.cached': servedFromCache});
+          span.end();
+
          if (!servedFromCache && !lastUpdated) {
            lastUpdated = true;
            updateSpeechCredentialLastUsed(credentials.speech_credential_sid)
              .catch(() => {/*already logged error */});
          }
-          span.setAttributes({'tts.cached': servedFromCache});
-          span.end();
-          if (!servedFromCache && rtt) {
-            this.notifyStatus({
-              event: 'synthesized-audio',
-              vendor,
-              language,
-              characters: text.length,
-              elapsedTime: rtt
+
+          filePathUrl = filePath;
+          isFromCache = servedFromCache;
+          roundTripTime = rtt;
+          executedVendor = vendor;
+          executedLanguage = language;
+
+        } catch (error) {
+          if (fallbackVendor) {
+            const fallbackcredentials = cs.getSpeechCredentials(fallbackVendor, 'tts', fallbackLabel);
+            const {span: fallbackSpan} = this.startChildSpan('fallback-tts-generation', {
+              'tts.vendor': fallbackVendor,
+              'tts.language': fallbackLanguage,
+              'tts.voice': fallbackVoice
            });
+            // retry the same text with the fallback vendor; executedVendor is only set if this succeeds
+            try {
+              const {filePath, servedFromCache, rtt} = await synthAudio(stats, {
+                account_sid: cs.accountSid,
+                text,
+                vendor: fallbackVendor,
+                language: fallbackLanguage,
+                voice: fallbackVoice,
+                engine,
+                model,
+                salt,
+                credentials: fallbackcredentials,
+                disableTtsCache : this.disableTtsCache
+              });
+
+              fallbackSpan.setAttributes({'tts.cached': servedFromCache});
+              fallbackSpan.end();
+
+              if (!servedFromCache && !lastUpdated) {
+                lastUpdated = true;
+                updateSpeechCredentialLastUsed(fallbackcredentials.speech_credential_sid)
+                  .catch(() => {/*already logged error */});
              }
-          return filePath;
+
+              filePathUrl = filePath;
+              isFromCache = servedFromCache;
+              roundTripTime = rtt;
+              executedVendor = fallbackVendor;
+              executedLanguage = fallbackLanguage;
            } catch (err){
-          this.logger.info({err}, 'Error synthesizing tts');
+              this.logger.info({err}, 'fallback Speech failed to synthesize audio');
+              fallbackSpan.end();
+              writeAlerts({
+                account_sid: cs.accountSid,
+                alert_type: AlertType.TTS_FAILURE,
+                vendor: fallbackVendor,
+                detail: err.message
+              }).catch((err) => this.logger.info({err}, 'Error generating alert for fallback tts failure'));
+            }
+          }
+          this.logger.info({error}, 'Error synthesizing tts');
          span.end();
          writeAlerts({
            account_sid: cs.accountSid,
            alert_type: AlertType.TTS_FAILURE,
            vendor,
-            detail: err.message
+            detail: error.message
          }).catch((err) => this.logger.info({err}, 'Error generating alert for tts failure'));
-          this.notifyError({msg: 'TTS error', details: err.message || err});
-          return;
+          if (!executedVendor) { // no vendor produced audio; otherwise continue with the fallback result
+            this.notifyError({msg: 'TTS error', details: error.message || error});
+            return;
+          }
        }
+
+        this.logger.debug(`file ${filePathUrl}, served from cache ${isFromCache}`);
+        if (filePathUrl) cs.trackTmpFile(filePathUrl);
+
+        if (!isFromCache && roundTripTime) {
+          this.notifyStatus({
+            event: 'synthesized-audio',
+            vendor: executedVendor,
+            language: executedLanguage,
+            characters: text.length,
+            elapsedTime: roundTripTime
+          });
+        }
+
+        return filePathUrl;
      };

      const arr = this.text.map((t) => generateAudio(t));