feat fallback speech

2025-12-21 09:08:02 +00:00 · 2023-08-17 14:25:26 +07:00
parent f8c5abe9e9
commit b1c0478051
5 changed files with 271 additions and 42 deletions
--- a/lib/session/call-session.js
+++ b/lib/session/call-session.js
@@ -180,6 +180,13 @@ class CallSession extends Emitter {
    this.application.speech_synthesis_vendor = vendor;
  }
  get fallbackSpeechSynthesisVendor() {
    return this.application.fallback_speech_synthesis_vendor;
  }
  set fallbackSpeechSynthesisVendor(vendor) {
    this.application.fallback_speech_synthesis_vendor = vendor;
  }
  /**
   * default label to use for speech synthesis if not provided in the app
   */
@@ -189,6 +196,13 @@ class CallSession extends Emitter {
  set speechSynthesisLabel(label) {
    this.application.speech_synthesis_label = label;
  }
  get fallbackSpeechSynthesisLabel() {
    return this.application.fallback_speech_synthesis_label;
  }
  set fallbackSpeechSynthesisLabel(label) {
    this.application.fallback_speech_synthesis_label = label;
  }
  /**
   * default voice to use for speech synthesis if not provided in the app
   */
@@ -198,6 +212,13 @@ class CallSession extends Emitter {
  set speechSynthesisVoice(voice) {
    this.application.speech_synthesis_voice = voice;
  }
  get fallbackSpeechSynthesisVoice() {
    return this.application.fallback_speech_synthesis_voice;
  }
  set fallbackSpeechSynthesisVoice(voice) {
    this.application.fallback_speech_synthesis_voice = voice;
  }
  /**
   * default language to use for speech synthesis if not provided in the app
   */
@@ -208,6 +229,13 @@ class CallSession extends Emitter {
    this.application.speech_synthesis_language = language;
  }
  get fallbackSpeechSynthesisLanguage() {
    return this.application.fallback_speech_synthesis_language;
  }
  set fallbackSpeechSynthesisLanguage(language) {
    this.application.fallback_speech_synthesis_language = language;
  }
  /**
   * default vendor to use for speech recognition if not provided in the app
   */
@@ -217,6 +245,13 @@ class CallSession extends Emitter {
  set speechRecognizerVendor(vendor) {
    this.application.speech_recognizer_vendor = vendor;
  }
  get fallbackSpeechRecognizerVendor() {
    return this.application.fallback_speech_recognizer_vendor;
  }
  set fallbackSpeechRecognizerVendor(vendor) {
    this.application.fallback_speech_recognizer_vendor = vendor;
  }
  /**
   * default label to use for speech recognition if not provided in the app
   */
@@ -226,6 +261,13 @@ class CallSession extends Emitter {
  set speechRecognizerLabel(label) {
    this.application.speech_recognizer_label = label;
  }
  get fallbackSpeechRecognizerLabel() {
    return this.application.fallback_speech_recognizer_label;
  }
  set fallbackSpeechRecognizerLabel(label) {
    this.application.fallback_speech_recognizer_label = label;
  }
  /**
   * default language to use for speech recognition if not provided in the app
   */
@@ -236,6 +278,13 @@ class CallSession extends Emitter {
    this.application.speech_recognizer_language = language;
  }
  get fallbackSpeechRecognizerLanguage() {
    return this.application.fallback_speech_recognizer_language;
  }
  set fallbackSpeechRecognizerLanguage(language) {
    this.application.fallback_speech_recognizer_language = language;
  }
  /**
   * indicates whether the call currently in progress
   */
--- a/lib/tasks/config.js
+++ b/lib/tasks/config.js
@@ -114,6 +114,19 @@ class TaskConfig extends Task {
      cs.speechSynthesisVoice = this.synthesizer.voice !== 'default'
        ? this.synthesizer.voice
        : cs.speechSynthesisVoice;
      // fallback vendor
      cs.fallbackSpeechSynthesisVendor = this.synthesizer.fallbackVendor !== 'default'
        ? this.synthesizer.fallbackVendor
        : cs.fallbackSpeechSynthesisVendor;
      cs.fallbackSpeechSynthesisLabel = this.synthesizer.fallbackLabel !== 'default'
        ? this.synthesizer.fallbackLabel
        : cs.fallbackSpeechSynthesisLabel;
      cs.fallbackSpeechSynthesisLanguage = this.synthesizer.fallbackLanguage !== 'default'
        ?  this.synthesizer.fallbackLanguage
        : cs.fallbackSpeechSynthesisLanguage;
      cs.fallbackSpeechSynthesisVoice = this.synthesizer.fallbackVoice !== 'default'
        ? this.synthesizer.fallbackVoice
        : cs.fallbackSpeechSynthesisVoice;
      this.logger.info({synthesizer: this.synthesizer}, 'Config: updated synthesizer');
    }
    if (this.hasRecognizer) {
@@ -126,6 +139,17 @@ class TaskConfig extends Task {
      cs.speechRecognizerLanguage = this.recognizer.language !== 'default'
        ? this.recognizer.language
        : cs.speechRecognizerLanguage;
      //fallback
      cs.fallbackSpeechRecognizerVendor = this.recognizer.fallbackVendor !== 'default'
        ? this.recognizer.fallbackVendor
        : cs.fallbackSpeechRecognizerVendor;
      cs.fallbackSpeechRecognizerLabel = this.recognizer.fallbackLabel !== 'default'
        ? this.recognizer.fallbackLabel
        : cs.fallbackSpeechRecognizerLabel;
      cs.fallbackSpeechRecognizerLanguage = this.recognizer.fallbackLanguage !== 'default'
        ? this.recognizer.fallbackLanguage
        : cs.fallbackSpeechRecognizerLanguage;
      cs.isContinuousAsr = typeof this.recognizer.asrTimeout === 'number' ? true : false;
      if (cs.isContinuousAsr) {
        cs.asrTimeout = this.recognizer.asrTimeout;
--- a/lib/tasks/dialogflow/index.js
+++ b/lib/tasks/dialogflow/index.js
@@ -59,6 +59,12 @@ class Dialogflow extends Task {
      this.language = this.data.tts.language || 'default';
      this.voice = this.data.tts.voice || 'default';
      this.speechSynthesisLabel = this.data.tts.label || null;
      // fallback tts: any setting not supplied in the verb is marked 'default'
      this.fallbackVendor = this.data.tts.fallbackVendor || 'default';
      this.fallbackLanguage = this.data.tts.fallbackLanguage || 'default';
      // the voice must come from tts.fallbackVoice, not from the fallback language
      this.fallbackVoice = this.data.tts.fallbackVoice || 'default';
      this.fallbackLabel = this.data.tts.fallbackLabel || 'default';
    }
    this.bargein = this.data.bargein;
  }
@@ -119,9 +125,15 @@ class Dialogflow extends Task {
        this.vendor = cs.speechSynthesisVendor;
        this.language = cs.speechSynthesisLanguage;
        this.voice = cs.speechSynthesisVoice;
        this.speechSynthesisLabel = cs.speechSynthesisLabel;
      }
-      this.ttsCredentials = cs.getSpeechCredentials(this.vendor, 'tts',
+      if (this.fallbackVendor === 'default') {
-        this.speechSynthesisLabel || cs.speechSynthesisLabel);
+        this.fallbackVendor = cs.fallbackSpeechSynthesisVendor;
        this.fallbackLanguage = cs.fallbackSpeechSynthesisLanguage;
        this.fallbackVoice = cs.fallbackSpeechSynthesisVoice;
        this.fallbackLabel = cs.fallbackSpeechSynthesisLabel;
      }
      this.ttsCredentials = cs.getSpeechCredentials(this.vendor, 'tts',this.speechSynthesisLabel);
      this.ep.addCustomEventListener('dialogflow::intent', this._onIntent.bind(this, ep, cs));
      this.ep.addCustomEventListener('dialogflow::transcription', this._onTranscription.bind(this, ep, cs));
@@ -223,17 +235,7 @@ class Dialogflow extends Task {
      }
      try {
-        const obj = {
+        const {filePath, servedFromCache} = await this.fallbackSynthAudio(cs, intent, stats);
          account_sid: cs.accountSid,
          text: intent.fulfillmentText,
          vendor: this.vendor,
          language: this.language,
          voice: this.voice,
          salt: cs.callSid,
          credentials: this.ttsCredentials
        };
        this.logger.debug({obj}, 'Dialogflow:_onIntent - playing message via tts');
        const {filePath, servedFromCache} = await synthAudio(stats, obj);
        if (filePath) cs.trackTmpFile(filePath);
        if (!this.ttsCredentials && !servedFromCache) cs.billForTts(intent.fulfillmentText.length);
@@ -279,6 +281,46 @@ class Dialogflow extends Task {
    }
  }
  async fallbackSynthAudio(cs, intent, stats) {
    try {
      const obj = {
        account_sid: cs.accountSid,
        text: intent.fulfillmentText,
        vendor: this.vendor,
        language: this.language,
        voice: this.voice,
        salt: cs.callSid,
        credentials: this.ttsCredentials
      };
      this.logger.debug({obj}, 'Dialogflow:_onIntent - playing message via tts');
      return await synthAudio(stats, obj);
    } catch (error) {
      this.logger.info({error}, 'Failed to synthesize audio from primary vendor');
      try {
        if(this.fallbackVendor) {
          const credentials = cs.getSpeechCredentials(this.fallbackVendor, 'tts', this.fallbackLabel);
          const obj = {
            account_sid: cs.accountSid,
            text: intent.fulfillmentText,
            vendor: this.fallbackVendor,
            language: this.fallbackLanguage,
            voice: this.fallbackVoice,
            salt: cs.callSid,
            credentials
          };
          this.logger.debug({obj}, 'Dialogflow:_onIntent - playing message via fallback tts');
          return await synthAudio(stats, obj);
        }
      } catch(err) {
        this.logger.info({err}, 'Failed to synthesize audio from falllback vendor');
        throw err;
      }
      throw error;
    }
  }
  /**
   * A transcription - either interim or final - has been returned.
   * If we are doing barge-in based on hotword detection, check for the hotword or phrase.
--- a/lib/tasks/lex.js
+++ b/lib/tasks/lex.js
@@ -26,6 +26,12 @@ class Lex extends Task {
      this.language = this.data.tts.language || 'default';
      this.voice = this.data.tts.voice || 'default';
      this.speechCredentialLabel = this.data.tts.label || null;
      // fallback tts: any setting not supplied in the verb is marked 'default'
      this.fallbackVendor = this.data.tts.fallbackVendor || 'default';
      this.fallbackLanguage = this.data.tts.fallbackLanguage || 'default';
      // the voice must come from tts.fallbackVoice, not from the fallback language
      this.fallbackVoice = this.data.tts.fallbackVoice || 'default';
      this.fallbackLabel = this.data.tts.fallbackLabel || 'default';
    }
    this.botName = `${this.bot}:${this.alias}:${this.region}`;
@@ -103,9 +109,15 @@ class Lex extends Task {
        this.vendor = cs.speechSynthesisVendor;
        this.language = cs.speechSynthesisLanguage;
        this.voice = cs.speechSynthesisVoice;
        this.speechCredentialLabel = cs.speechSynthesisLabel;
      }
-      this.ttsCredentials = cs.getSpeechCredentials(this.vendor, 'tts',
+      if (this.fallbackVendor === 'default') {
-        this.speechCredentialLabel || cs.speechSynthesisVendor);
+        this.fallbackVendor = cs.fallbackSpeechSynthesisVendor;
        this.fallbackLanguage = cs.fallbackSpeechSynthesisLanguage;
        this.fallbackVoice = cs.fallbackSpeechSynthesisVoice;
        this.fallbackLabel = cs.fallbackSpeechSynthesisLabel;
      }
      this.ttsCredentials = cs.getSpeechCredentials(this.vendor, 'tts', this.speechCredentialLabel);
      this.ep.addCustomEventListener('lex::intent', this._onIntent.bind(this, ep, cs));
      this.ep.addCustomEventListener('lex::transcription', this._onTranscription.bind(this, ep, cs));
@@ -170,6 +182,41 @@ class Lex extends Task {
    }
  }
  async fallbackSynthAudio(cs, msg, stats, synthAudio) {
    try {
      const {filePath} = await synthAudio(stats, {
        account_sid: cs.accountSid,
        text: msg,
        vendor: this.vendor,
        language: this.language,
        voice: this.voice,
        salt: cs.callSid,
        credentials: this.ttsCredentials
      });
      return filePath;
    } catch (error) {
      this.logger.info({error}, 'failed to synth audio from primary vendor');
      if (this.fallbackVendor) {
        try {
          const credential = cs.getSpeechCredentials(this.fallbackVendor, 'tts', this.fallbackLabel);
          const {filePath} = await synthAudio(stats, {
            account_sid: cs.accountSid,
            text: msg,
            vendor: this.fallbackVendor,
            language: this.fallbackLanguage,
            voice: this.fallbackVoice,
            salt: cs.callSid,
            credentials: credential
          });
          return filePath;
        } catch(err) {
          this.logger.info({err}, 'failed to synth audio from fallback vendor');
        }
      }
    }
  }
  /**
   * @param {*} evt - event data
   */
@@ -189,16 +236,7 @@ class Lex extends Task {
        try {
          this.logger.debug(`tts with ${this.vendor} ${this.voice}`);
-          // eslint-disable-next-line no-unused-vars
+          const filePath = await this.fallbackSynthAudio(cs, msg, stats, synthAudio);
          const {filePath, servedFromCache} = await synthAudio(stats, {
            account_sid: cs.accountSid,
            text: msg,
            vendor: this.vendor,
            language: this.language,
            voice: this.voice,
            salt: cs.callSid,
            credentials: this.ttsCredentials
          });
          if (filePath) cs.trackTmpFile(filePath);
          if (this.events.includes('start-play')) {
--- a/lib/tasks/say.js
+++ b/lib/tasks/say.js
@@ -59,15 +59,28 @@ class TaskSay extends Task {
    const vendor = this.synthesizer.vendor && this.synthesizer.vendor !== 'default' ?
      this.synthesizer.vendor :
      cs.speechSynthesisVendor;
    const fallbackVendor = this.synthesizer.fallbackVendor && this.synthesizer.fallbackVendor !== 'default' ?
    this.synthesizer.fallbackVendor :
    cs.fallbackSpeechSynthesisVendor;
    const language = this.synthesizer.language && this.synthesizer.language !== 'default' ?
      this.synthesizer.language :
      cs.speechSynthesisLanguage ;
    const fallbackLanguage = this.synthesizer.fallbackLanguage && this.synthesizer.fallbackLanguage !== 'default' ?
    this.synthesizer.fallbackLanguage :
    cs.fallbackSpeechSynthesisLanguage ;
    let voice =  this.synthesizer.voice && this.synthesizer.voice !== 'default' ?
      this.synthesizer.voice :
      cs.speechSynthesisVoice;
    const fallbackVoice = this.synthesizer.fallbackVoice && this.synthesizer.fallbackVoice !== 'default' ?
    this.synthesizer.fallbackVoice :
    cs.fallbackSpeechSynthesisVoice;
    const fallbackLabel = this.synthesizer.fallbackLabel && this.synthesizer.fallbackLabel !== 'default' ?
    this.synthesizer.fallbackLabel :
    cs.fallbackSpeechSynthesisLabel;
    const engine = this.synthesizer.engine || 'standard';
    const salt = cs.callSid;
-    let credentials = cs.getSpeechCredentials(vendor, 'tts', this.data.synthesizer?.label || cs.speechSynthesisLabel);
+    let credentials = cs.getSpeechCredentials(vendor, 'tts', this.data.synthesizer ?
      this.data.synthesizer?.label : cs.speechSynthesisLabel);
    /* parse Nuance voices into name and model */
    let model;
@@ -118,6 +131,8 @@ class TaskSay extends Task {
          'tts.language': language,
          'tts.voice': voice
        });
        let filePathUrl, isFromCache, roundTripTime;
        let executedVendor, executedLanguage;
        try {
          const {filePath, servedFromCache, rtt} = await synthAudio(stats, {
            account_sid: cs.accountSid,
@@ -131,37 +146,98 @@ class TaskSay extends Task {
            credentials,
            disableTtsCache : this.disableTtsCache
          });
-          this.logger.debug(`file ${filePath}, served from cache ${servedFromCache}`);
+
-          if (filePath) cs.trackTmpFile(filePath);
+          span.setAttributes({'tts.cached': servedFromCache});
          span.end();
          if (!servedFromCache && !lastUpdated) {
            lastUpdated = true;
            updateSpeechCredentialLastUsed(credentials.speech_credential_sid)
              .catch(() => {/*already logged error */});
          }
-          span.setAttributes({'tts.cached': servedFromCache});
+
-          span.end();
+          filePathUrl = filePath;
-          if (!servedFromCache && rtt) {
+          isFromCache = servedFromCache;
-            this.notifyStatus({
+          roundTripTime = rtt;
-              event: 'synthesized-audio',
+          executedVendor = vendor;
-              vendor,
+          executedLanguage = language;
-              language,
+
-              characters: text.length,
+        } catch (error) {
-              elapsedTime: rtt
+          if (fallbackVendor) {
            const fallbackcredentials = cs.getSpeechCredentials(fallbackVendor, 'tts', fallbackLabel); 
            const {span: fallbackSpan} = this.startChildSpan('fallback-tts-generation', {
              'tts.vendor': fallbackVendor,
              'tts.language': fallbackLanguage,
              'tts.voice': fallbackVoice
            });
            try {
              const {filePath, servedFromCache, rtt} = await synthAudio(stats, {
                account_sid: cs.accountSid,
                text,
                fallbackVendor,
                fallbackLanguage,
                fallbackVoice,
                engine,
                model,
                salt,
                credentials: fallbackcredentials,
                disableTtsCache : this.disableTtsCache
              });
              fallbackSpan.setAttributes({'tts.cached': servedFromCache});
              fallbackSpan.end();
              if (!servedFromCache && !lastUpdated) {
                lastUpdated = true;
                updateSpeechCredentialLastUsed(credentials.speech_credential_sid)
                  .catch(() => {/*already logged error */});
              }
-          return filePath;
+
-        } catch (err) {
+              filePathUrl = filePath;
-          this.logger.info({err}, 'Error synthesizing tts');
+              isFromCache = servedFromCache;
              roundTripTime = rtt;
              executedVendor = fallbackVendor;
              executedLanguage = fallbackLanguage;
            } catch (err){
              this.logger.info({err}, 'fallback Speech failed to synthesize audio');
              fallbackSpan.end();
              writeAlerts({
                account_sid: cs.accountSid,
                alert_type: AlertType.TTS_FAILURE,
                vendor: fallbackVendor,
                detail: err.message
              }).catch((err) => this.logger.info({err}, 'Error generating alert for fallback tts failure'));
            }
          }
          this.logger.info({error}, 'Error synthesizing tts');
          span.end();
          writeAlerts({
            account_sid: cs.accountSid,
            alert_type: AlertType.TTS_FAILURE,
            vendor,
-            detail: err.message
+            detail: error.message
          }).catch((err) => this.logger.info({err}, 'Error generating alert for tts failure'));
-          this.notifyError({msg: 'TTS error', details: err.message || err});
+          this.notifyError({msg: 'TTS error', details: error.message || error});
          return;
        }
        this.logger.debug(`file ${filePathUrl}, served from cache ${isFromCache}`);
        if (filePathUrl) cs.trackTmpFile(filePathUrl);
        if (!isFromCache && roundTripTime) {
          this.notifyStatus({
            event: 'synthesized-audio',
            vendor: executedVendor,
            language: executedLanguage,
            characters: text.length,
            elapsedTime: roundTripTime
          });
        }
        return filePathUrl;
      };
      const arr = this.text.map((t) => generateAudio(t));