From b1c0478051ba7340829659fc03d506e703b3e197 Mon Sep 17 00:00:00 2001
From: Quan HL <quan.luuhoang8@gmail.com>
Date: Thu, 17 Aug 2023 14:25:26 +0700
Subject: [PATCH] feat fallback speech

---
 lib/session/call-session.js   |  49 +++++++++++++++
 lib/tasks/config.js           |  24 ++++++++
 lib/tasks/dialogflow/index.js |  68 +++++++++++++++++----
 lib/tasks/lex.js              |  62 +++++++++++++++----
 lib/tasks/say.js              | 110 ++++++++++++++++++++++++++++------
 5 files changed, 271 insertions(+), 42 deletions(-)

diff --git a/lib/session/call-session.js b/lib/session/call-session.js
index b67f095b..2ed0b1ed 100644
--- a/lib/session/call-session.js
+++ b/lib/session/call-session.js
@@ -180,6 +180,13 @@ class CallSession extends Emitter {
     this.application.speech_synthesis_vendor = vendor;
   }
 
+  get fallbackSpeechSynthesisVendor() { // fallback tts vendor, kept on this.application
+    return this.application.fallback_speech_synthesis_vendor;
+  }
+  set fallbackSpeechSynthesisVendor(vendor) { // writes through to this.application
+    this.application.fallback_speech_synthesis_vendor = vendor;
+  }
+
   /**
    * default label to use for speech synthesis if not provided in the app
    */
@@ -189,6 +196,13 @@ class CallSession extends Emitter {
   set speechSynthesisLabel(label) {
     this.application.speech_synthesis_label = label;
   }
+
+  get fallbackSpeechSynthesisLabel() { // credential label for the fallback tts vendor, kept on this.application
+    return this.application.fallback_speech_synthesis_label;
+  }
+  set fallbackSpeechSynthesisLabel(label) { // writes through to this.application
+    this.application.fallback_speech_synthesis_label = label;
+  }
   /**
    * default voice to use for speech synthesis if not provided in the app
    */
@@ -198,6 +212,13 @@ class CallSession extends Emitter {
   set speechSynthesisVoice(voice) {
     this.application.speech_synthesis_voice = voice;
   }
+
+  get fallbackSpeechSynthesisVoice() { // voice for the fallback tts vendor, kept on this.application
+    return this.application.fallback_speech_synthesis_voice;
+  }
+  set fallbackSpeechSynthesisVoice(voice) { // writes through to this.application
+    this.application.fallback_speech_synthesis_voice = voice;
+  }
   /**
    * default language to use for speech synthesis if not provided in the app
    */
@@ -208,6 +229,13 @@ class CallSession extends Emitter {
     this.application.speech_synthesis_language = language;
   }
 
+  get fallbackSpeechSynthesisLanguage() { // language for the fallback tts vendor, kept on this.application
+    return this.application.fallback_speech_synthesis_language;
+  }
+  set fallbackSpeechSynthesisLanguage(language) { // writes through to this.application
+    this.application.fallback_speech_synthesis_language = language;
+  }
+
   /**
    * default vendor to use for speech recognition if not provided in the app
    */
@@ -217,6 +245,13 @@ class CallSession extends Emitter {
   set speechRecognizerVendor(vendor) {
     this.application.speech_recognizer_vendor = vendor;
   }
+
+  get fallbackSpeechRecognizerVendor() { // fallback speech recognition vendor, kept on this.application
+    return this.application.fallback_speech_recognizer_vendor;
+  }
+  set fallbackSpeechRecognizerVendor(vendor) { // writes through to this.application
+    this.application.fallback_speech_recognizer_vendor = vendor;
+  }
   /**
    * default vendor to use for speech recognition if not provided in the app
    */
@@ -226,6 +261,13 @@ class CallSession extends Emitter {
   set speechRecognizerLabel(label) {
     this.application.speech_recognizer_label = label;
   }
+
+  get fallbackSpeechRecognizerLabel() { // credential label for the fallback recognizer, kept on this.application
+    return this.application.fallback_speech_recognizer_label;
+  }
+  set fallbackSpeechRecognizerLabel(label) { // writes through to this.application
+    this.application.fallback_speech_recognizer_label = label;
+  }
   /**
  * default language to use for speech recognition if not provided in the app
  */
@@ -236,6 +278,13 @@ class CallSession extends Emitter {
     this.application.speech_recognizer_language = language;
   }
 
+  get fallbackSpeechRecognizerLanguage() { // language for the fallback recognizer, kept on this.application
+    return this.application.fallback_speech_recognizer_language;
+  }
+  set fallbackSpeechRecognizerLanguage(language) { // writes through to this.application
+    this.application.fallback_speech_recognizer_language = language;
+  }
+
   /**
    * indicates whether the call currently in progress
    */
diff --git a/lib/tasks/config.js b/lib/tasks/config.js
index 826a6ed5..f7c7396d 100644
--- a/lib/tasks/config.js
+++ b/lib/tasks/config.js
@@ -114,6 +114,19 @@ class TaskConfig extends Task {
       cs.speechSynthesisVoice = this.synthesizer.voice !== 'default'
         ? this.synthesizer.voice
         : cs.speechSynthesisVoice;
+      // fallback vendor
+      cs.fallbackSpeechSynthesisVendor = this.synthesizer.fallbackVendor !== 'default'
+        ? this.synthesizer.fallbackVendor
+        : cs.fallbackSpeechSynthesisVendor;
+      cs.fallbackSpeechSynthesisLabel = this.synthesizer.fallbackLabel !== 'default'
+        ? this.synthesizer.fallbackLabel
+        : cs.fallbackSpeechSynthesisLabel;
+      cs.fallbackSpeechSynthesisLanguage = this.synthesizer.fallbackLanguage !== 'default'
+        ?  this.synthesizer.fallbackLanguage
+        : cs.fallbackSpeechSynthesisLanguage;
+      cs.fallbackSpeechSynthesisVoice = this.synthesizer.fallbackVoice !== 'default'
+        ? this.synthesizer.fallbackVoice
+        : cs.fallbackSpeechSynthesisVoice;
       this.logger.info({synthesizer: this.synthesizer}, 'Config: updated synthesizer');
     }
     if (this.hasRecognizer) {
@@ -126,6 +139,17 @@ class TaskConfig extends Task {
       cs.speechRecognizerLanguage = this.recognizer.language !== 'default'
         ? this.recognizer.language
         : cs.speechRecognizerLanguage;
+
+      //fallback
+      cs.fallbackSpeechRecognizerVendor = this.recognizer.fallbackVendor !== 'default'
+        ? this.recognizer.fallbackVendor
+        : cs.fallbackSpeechRecognizerVendor;
+      cs.fallbackSpeechRecognizerLabel = this.recognizer.fallbackLabel !== 'default'
+        ? this.recognizer.fallbackLabel
+        : cs.fallbackSpeechRecognizerLabel;
+      cs.fallbackSpeechRecognizerLanguage = this.recognizer.fallbackLanguage !== 'default'
+        ? this.recognizer.fallbackLanguage
+        : cs.fallbackSpeechRecognizerLanguage;
       cs.isContinuousAsr = typeof this.recognizer.asrTimeout === 'number' ? true : false;
       if (cs.isContinuousAsr) {
         cs.asrTimeout = this.recognizer.asrTimeout;
diff --git a/lib/tasks/dialogflow/index.js b/lib/tasks/dialogflow/index.js
index 8882a338..4ed2ccf7 100644
--- a/lib/tasks/dialogflow/index.js
+++ b/lib/tasks/dialogflow/index.js
@@ -59,6 +59,12 @@ class Dialogflow extends Task {
       this.language = this.data.tts.language || 'default';
       this.voice = this.data.tts.voice || 'default';
       this.speechSynthesisLabel = this.data.tts.label || null;
+
+      // fallback tts: if fallbackVendor stays 'default', all four are taken from the call session later
+      this.fallbackVendor = this.data.tts.fallbackVendor || 'default';
+      this.fallbackLanguage = this.data.tts.fallbackLanguage || 'default';
+      this.fallbackVoice = this.data.tts.fallbackVoice || 'default';
+      this.fallbackLabel = this.data.tts.fallbackLabel || null;
     }
     this.bargein = this.data.bargein;
   }
@@ -119,9 +125,15 @@ class Dialogflow extends Task {
         this.vendor = cs.speechSynthesisVendor;
         this.language = cs.speechSynthesisLanguage;
         this.voice = cs.speechSynthesisVoice;
+        this.speechSynthesisLabel = cs.speechSynthesisLabel;
       }
-      this.ttsCredentials = cs.getSpeechCredentials(this.vendor, 'tts',
-        this.speechSynthesisLabel || cs.speechSynthesisLabel);
+      if (this.fallbackVendor === 'default') {
+        this.fallbackVendor = cs.fallbackSpeechSynthesisVendor;
+        this.fallbackLanguage = cs.fallbackSpeechSynthesisLanguage;
+        this.fallbackVoice = cs.fallbackSpeechSynthesisVoice;
+        this.fallbackLabel = cs.fallbackSpeechSynthesisLabel;
+      }
+      this.ttsCredentials = cs.getSpeechCredentials(this.vendor, 'tts',this.speechSynthesisLabel);
 
       this.ep.addCustomEventListener('dialogflow::intent', this._onIntent.bind(this, ep, cs));
       this.ep.addCustomEventListener('dialogflow::transcription', this._onTranscription.bind(this, ep, cs));
@@ -223,17 +235,7 @@ class Dialogflow extends Task {
       }
 
       try {
-        const obj = {
-          account_sid: cs.accountSid,
-          text: intent.fulfillmentText,
-          vendor: this.vendor,
-          language: this.language,
-          voice: this.voice,
-          salt: cs.callSid,
-          credentials: this.ttsCredentials
-        };
-        this.logger.debug({obj}, 'Dialogflow:_onIntent - playing message via tts');
-        const {filePath, servedFromCache} = await synthAudio(stats, obj);
+        const {filePath, servedFromCache} = await this.fallbackSynthAudio(cs, intent, stats);
         if (filePath) cs.trackTmpFile(filePath);
         if (!this.ttsCredentials && !servedFromCache) cs.billForTts(intent.fulfillmentText.length);
 
@@ -279,6 +281,46 @@ class Dialogflow extends Task {
     }
   }
 
+  /**
+   * Synthesize intent.fulfillmentText with the primary tts vendor and, if that
+   * throws and a fallback vendor is set, retry once with the fallback vendor.
+   * @param {object} cs - call session (accountSid, callSid, getSpeechCredentials)
+   * @param {object} intent - intent whose fulfillmentText is spoken
+   * @param {object} stats - passed through to synthAudio
+   * @returns {Promise<object>} the synthAudio result of whichever vendor succeeded
+   * @throws the fallback error if the fallback fails, else the primary error
+   */
+  async fallbackSynthAudio(cs, intent, stats) {
+    const synth = (label, vendor, language, voice, credentials) => {
+      const obj = {
+        account_sid: cs.accountSid,
+        text: intent.fulfillmentText,
+        vendor,
+        language,
+        voice,
+        salt: cs.callSid,
+        credentials
+      };
+      this.logger.debug({obj}, `Dialogflow:_onIntent - playing message via ${label}`);
+      return synthAudio(stats, obj);
+    };
+
+    try {
+      return await synth('tts', this.vendor, this.language, this.voice, this.ttsCredentials);
+    } catch (error) {
+      this.logger.info({err: error}, 'Failed to synthesize audio from primary vendor');
+      if (!this.fallbackVendor) throw error;
+    }
+
+    try {
+      const credentials = cs.getSpeechCredentials(this.fallbackVendor, 'tts', this.fallbackLabel);
+      return await synth('fallback tts', this.fallbackVendor, this.fallbackLanguage, this.fallbackVoice, credentials);
+    } catch (err) {
+      this.logger.info({err}, 'Failed to synthesize audio from fallback vendor');
+      throw err;
+    }
+  }
+
   /**
    * A transcription - either interim or final - has been returned.
    * If we are doing barge-in based on hotword detection, check for the hotword or phrase.
diff --git a/lib/tasks/lex.js b/lib/tasks/lex.js
index d8dd8fae..3e33ecb9 100644
--- a/lib/tasks/lex.js
+++ b/lib/tasks/lex.js
@@ -26,6 +26,12 @@ class Lex extends Task {
       this.language = this.data.tts.language || 'default';
       this.voice = this.data.tts.voice || 'default';
       this.speechCredentialLabel = this.data.tts.label || null;
+
+      // fallback tts: if fallbackVendor stays 'default', all four are taken from the call session later
+      this.fallbackVendor = this.data.tts.fallbackVendor || 'default';
+      this.fallbackLanguage = this.data.tts.fallbackLanguage || 'default';
+      this.fallbackVoice = this.data.tts.fallbackVoice || 'default';
+      this.fallbackLabel = this.data.tts.fallbackLabel || null;
     }
 
     this.botName = `${this.bot}:${this.alias}:${this.region}`;
@@ -103,9 +109,15 @@ class Lex extends Task {
         this.vendor = cs.speechSynthesisVendor;
         this.language = cs.speechSynthesisLanguage;
         this.voice = cs.speechSynthesisVoice;
+        this.speechCredentialLabel = cs.speechSynthesisLabel;
       }
-      this.ttsCredentials = cs.getSpeechCredentials(this.vendor, 'tts',
-        this.speechCredentialLabel || cs.speechSynthesisVendor);
+      if (this.fallbackVendor === 'default') {
+        this.fallbackVendor = cs.fallbackSpeechSynthesisVendor;
+        this.fallbackLanguage = cs.fallbackSpeechSynthesisLanguage;
+        this.fallbackVoice = cs.fallbackSpeechSynthesisVoice;
+        this.fallbackLabel = cs.fallbackSpeechSynthesisLabel;
+      }
+      this.ttsCredentials = cs.getSpeechCredentials(this.vendor, 'tts', this.speechCredentialLabel);
 
       this.ep.addCustomEventListener('lex::intent', this._onIntent.bind(this, ep, cs));
       this.ep.addCustomEventListener('lex::transcription', this._onTranscription.bind(this, ep, cs));
@@ -170,6 +182,41 @@ class Lex extends Task {
     }
   }
 
+  /**
+   * Synthesize msg with the primary tts vendor; if that throws and a fallback
+   * vendor is set, retry once with the fallback vendor.
+   * @param {object} cs - call session (accountSid, callSid, getSpeechCredentials)
+   * @param {string} msg - text to synthesize
+   * @param {object} stats - passed through to synthAudio
+   * @param {Function} synthAudio - async (stats, opts) => ({filePath, ...})
+   * @returns {Promise<string>} filePath reported by the vendor that succeeded
+   * @throws the fallback error if the fallback fails, else the primary error,
+   * so the caller's own error handling still runs when nothing was synthesized
+   */
+  async fallbackSynthAudio(cs, msg, stats, synthAudio) {
+    const synth = async(vendor, language, voice, credentials) => {
+      const {filePath} = await synthAudio(stats, {
+        account_sid: cs.accountSid, text: msg, vendor, language, voice, salt: cs.callSid, credentials
+      });
+      return filePath;
+    };
+
+    try {
+      return await synth(this.vendor, this.language, this.voice, this.ttsCredentials);
+    } catch (error) {
+      this.logger.info({err: error}, 'failed to synth audio from primary vendor');
+      if (!this.fallbackVendor) throw error;
+    }
+
+    try {
+      const credentials = cs.getSpeechCredentials(this.fallbackVendor, 'tts', this.fallbackLabel);
+      return await synth(this.fallbackVendor, this.fallbackLanguage, this.fallbackVoice, credentials);
+    } catch (err) {
+      this.logger.info({err}, 'failed to synth audio from fallback vendor');
+      throw err;
+    }
+  }
+
   /**
    * @param {*} evt - event data
    */
@@ -189,16 +236,7 @@ class Lex extends Task {
 
         try {
           this.logger.debug(`tts with ${this.vendor} ${this.voice}`);
-          // eslint-disable-next-line no-unused-vars
-          const {filePath, servedFromCache} = await synthAudio(stats, {
-            account_sid: cs.accountSid,
-            text: msg,
-            vendor: this.vendor,
-            language: this.language,
-            voice: this.voice,
-            salt: cs.callSid,
-            credentials: this.ttsCredentials
-          });
+          const filePath = await this.fallbackSynthAudio(cs, msg, stats, synthAudio);
           if (filePath) cs.trackTmpFile(filePath);
 
           if (this.events.includes('start-play')) {
diff --git a/lib/tasks/say.js b/lib/tasks/say.js
index 3e34ddd2..cb82731e 100644
--- a/lib/tasks/say.js
+++ b/lib/tasks/say.js
@@ -59,15 +59,28 @@ class TaskSay extends Task {
     const vendor = this.synthesizer.vendor && this.synthesizer.vendor !== 'default' ?
       this.synthesizer.vendor :
       cs.speechSynthesisVendor;
+    const fallbackVendor = this.synthesizer.fallbackVendor && this.synthesizer.fallbackVendor !== 'default' ?
+    this.synthesizer.fallbackVendor :
+    cs.fallbackSpeechSynthesisVendor;
     const language = this.synthesizer.language && this.synthesizer.language !== 'default' ?
       this.synthesizer.language :
       cs.speechSynthesisLanguage ;
+    const fallbackLanguage = this.synthesizer.fallbackLanguage && this.synthesizer.fallbackLanguage !== 'default' ?
+    this.synthesizer.fallbackLanguage :
+    cs.fallbackSpeechSynthesisLanguage ;
     let voice =  this.synthesizer.voice && this.synthesizer.voice !== 'default' ?
       this.synthesizer.voice :
       cs.speechSynthesisVoice;
+    const fallbackVoice = this.synthesizer.fallbackVoice && this.synthesizer.fallbackVoice !== 'default' ?
+    this.synthesizer.fallbackVoice :
+    cs.fallbackSpeechSynthesisVoice;
+    const fallbackLabel = this.synthesizer.fallbackLabel && this.synthesizer.fallbackLabel !== 'default' ?
+    this.synthesizer.fallbackLabel :
+    cs.fallbackSpeechSynthesisLabel;
     const engine = this.synthesizer.engine || 'standard';
     const salt = cs.callSid;
-    let credentials = cs.getSpeechCredentials(vendor, 'tts', this.data.synthesizer?.label || cs.speechSynthesisLabel);
+    let credentials = cs.getSpeechCredentials(vendor, 'tts', this.data.synthesizer ?
+      this.data.synthesizer?.label : cs.speechSynthesisLabel);
 
     /* parse Nuance voices into name and model */
     let model;
@@ -118,6 +131,8 @@ class TaskSay extends Task {
           'tts.language': language,
           'tts.voice': voice
         });
+        let filePathUrl, isFromCache, roundTripTime;
+        let executedVendor, executedLanguage;
         try {
           const {filePath, servedFromCache, rtt} = await synthAudio(stats, {
             account_sid: cs.accountSid,
@@ -131,37 +146,98 @@ class TaskSay extends Task {
             credentials,
             disableTtsCache : this.disableTtsCache
           });
-          this.logger.debug(`file ${filePath}, served from cache ${servedFromCache}`);
-          if (filePath) cs.trackTmpFile(filePath);
+
+          span.setAttributes({'tts.cached': servedFromCache});
+          span.end();
+
           if (!servedFromCache && !lastUpdated) {
             lastUpdated = true;
             updateSpeechCredentialLastUsed(credentials.speech_credential_sid)
               .catch(() => {/*already logged error */});
           }
-          span.setAttributes({'tts.cached': servedFromCache});
-          span.end();
-          if (!servedFromCache && rtt) {
-            this.notifyStatus({
-              event: 'synthesized-audio',
-              vendor,
-              language,
-              characters: text.length,
-              elapsedTime: rtt
+
+          filePathUrl = filePath;
+          isFromCache = servedFromCache;
+          roundTripTime = rtt;
+          executedVendor = vendor;
+          executedLanguage = language;
+
+        } catch (error) {
+          if (fallbackVendor) {
+            const fallbackCredentials = cs.getSpeechCredentials(fallbackVendor, 'tts', fallbackLabel);
+            const {span: fallbackSpan} = this.startChildSpan('fallback-tts-generation', {
+              'tts.vendor': fallbackVendor,
+              'tts.language': fallbackLanguage,
+              'tts.voice': fallbackVoice
             });
+            try {
+              const {filePath, servedFromCache, rtt} = await synthAudio(stats, {
+                account_sid: cs.accountSid,
+                text,
+                vendor: fallbackVendor,
+                language: fallbackLanguage,
+                voice: fallbackVoice,
+                engine,
+                model,
+                salt,
+                credentials: fallbackCredentials,
+                disableTtsCache : this.disableTtsCache
+              });
+              fallbackSpan.setAttributes({'tts.cached': servedFromCache});
+              fallbackSpan.end();
+
+              if (!servedFromCache && !lastUpdated) {
+                lastUpdated = true;
+                updateSpeechCredentialLastUsed(fallbackCredentials.speech_credential_sid)
+                  .catch(() => {/*already logged error */});
+              }
+
+              /* executedVendor doubles as the "fallback produced audio" marker checked below */
+              filePathUrl = filePath;
+              isFromCache = servedFromCache;
+              roundTripTime = rtt;
+              executedVendor = fallbackVendor;
+              executedLanguage = fallbackLanguage;
+            } catch (err) {
+              this.logger.info({err}, 'fallback Speech failed to synthesize audio');
+              fallbackSpan.end();
+              writeAlerts({
+                account_sid: cs.accountSid,
+                alert_type: AlertType.TTS_FAILURE,
+                vendor: fallbackVendor,
+                detail: err.message
+              }).catch((err) => this.logger.info({err}, 'Error generating alert for fallback tts failure'));
+            }
           }
-          return filePath;
-        } catch (err) {
-          this.logger.info({err}, 'Error synthesizing tts');
+          /* primary vendor failed regardless of the fallback outcome: log it, end its span, raise the alert */
+          this.logger.info({err: error}, 'Error synthesizing tts');
           span.end();
           writeAlerts({
             account_sid: cs.accountSid,
             alert_type: AlertType.TTS_FAILURE,
             vendor,
-            detail: err.message
+            detail: error.message
           }).catch((err) => this.logger.info({err}, 'Error generating alert for tts failure'));
-          this.notifyError({msg: 'TTS error', details: err.message || err});
-          return;
+          if (!executedVendor) {
+            this.notifyError({msg: 'TTS error', details: error.message || error});
+            return;
+          }
         }
+
+        this.logger.debug(`file ${filePathUrl}, served from cache ${isFromCache}`);
+        if (filePathUrl) cs.trackTmpFile(filePathUrl);
+
+        if (!isFromCache && roundTripTime) {
+          this.notifyStatus({
+            event: 'synthesized-audio',
+            vendor: executedVendor,
+            language: executedLanguage,
+            characters: text.length,
+            elapsedTime: roundTripTime
+          });
+        }
+
+        return filePathUrl;
       };
 
       const arr = this.text.map((t) => generateAudio(t));