From b1c0478051ba7340829659fc03d506e703b3e197 Mon Sep 17 00:00:00 2001 From: Quan HL Date: Thu, 17 Aug 2023 14:25:26 +0700 Subject: [PATCH] feat fallback speech --- lib/session/call-session.js | 49 +++++++++++++++ lib/tasks/config.js | 24 ++++++++ lib/tasks/dialogflow/index.js | 68 +++++++++++++++++---- lib/tasks/lex.js | 62 +++++++++++++++---- lib/tasks/say.js | 110 ++++++++++++++++++++++++++++------ 5 files changed, 271 insertions(+), 42 deletions(-) diff --git a/lib/session/call-session.js b/lib/session/call-session.js index b67f095b..2ed0b1ed 100644 --- a/lib/session/call-session.js +++ b/lib/session/call-session.js @@ -180,6 +180,13 @@ class CallSession extends Emitter { this.application.speech_synthesis_vendor = vendor; } + get fallbackSpeechSynthesisVendor() { + return this.application.fallback_speech_synthesis_vendor; + } + set fallbackSpeechSynthesisVendor(vendor) { + this.application.fallback_speech_synthesis_vendor = vendor; + } + /** * default label to use for speech synthesis if not provided in the app */ @@ -189,6 +196,13 @@ class CallSession extends Emitter { set speechSynthesisLabel(label) { this.application.speech_synthesis_label = label; } + + get fallbackSpeechSynthesisLabel() { + return this.application.fallback_speech_synthesis_label; + } + set fallbackSpeechSynthesisLabel(label) { + this.application.fallback_speech_synthesis_label = label; + } /** * default voice to use for speech synthesis if not provided in the app */ @@ -198,6 +212,13 @@ class CallSession extends Emitter { set speechSynthesisVoice(voice) { this.application.speech_synthesis_voice = voice; } + + get fallbackSpeechSynthesisVoice() { + return this.application.fallback_speech_synthesis_voice; + } + set fallbackSpeechSynthesisVoice(voice) { + this.application.fallback_speech_synthesis_voice = voice; + } /** * default language to use for speech synthesis if not provided in the app */ @@ -208,6 +229,13 @@ class CallSession extends Emitter { 
this.application.speech_synthesis_language = language; } + get fallbackSpeechSynthesisLanguage() { + return this.application.fallback_speech_synthesis_language; + } + set fallbackSpeechSynthesisLanguage(language) { + this.application.fallback_speech_synthesis_language = language; + } + /** * default vendor to use for speech recognition if not provided in the app */ @@ -217,6 +245,13 @@ class CallSession extends Emitter { set speechRecognizerVendor(vendor) { this.application.speech_recognizer_vendor = vendor; } + + get fallbackSpeechRecognizerVendor() { + return this.application.fallback_speech_recognizer_vendor; + } + set fallbackSpeechRecognizerVendor(vendor) { + this.application.fallback_speech_recognizer_vendor = vendor; + } /** * default vendor to use for speech recognition if not provided in the app */ @@ -226,6 +261,13 @@ class CallSession extends Emitter { set speechRecognizerLabel(label) { this.application.speech_recognizer_label = label; } + + get fallbackSpeechRecognizerLabel() { + return this.application.fallback_speech_recognizer_label; + } + set fallbackSpeechRecognizerLabel(label) { + this.application.fallback_speech_recognizer_label = label; + } /** * default language to use for speech recognition if not provided in the app */ @@ -236,6 +278,13 @@ class CallSession extends Emitter { this.application.speech_recognizer_language = language; } + get fallbackSpeechRecognizerLanguage() { + return this.application.fallback_speech_recognizer_language; + } + set fallbackSpeechRecognizerLanguage(language) { + this.application.fallback_speech_recognizer_language = language; + } + /** * indicates whether the call currently in progress */ diff --git a/lib/tasks/config.js b/lib/tasks/config.js index 826a6ed5..f7c7396d 100644 --- a/lib/tasks/config.js +++ b/lib/tasks/config.js @@ -114,6 +114,19 @@ class TaskConfig extends Task { cs.speechSynthesisVoice = this.synthesizer.voice !== 'default' ? 
this.synthesizer.voice : cs.speechSynthesisVoice; + // fallback vendor + cs.fallbackSpeechSynthesisVendor = this.synthesizer.fallbackVendor !== 'default' + ? this.synthesizer.fallbackVendor + : cs.fallbackSpeechSynthesisVendor; + cs.fallbackSpeechSynthesisLabel = this.synthesizer.fallbackLabel !== 'default' + ? this.synthesizer.fallbackLabel + : cs.fallbackSpeechSynthesisLabel; + cs.fallbackSpeechSynthesisLanguage = this.synthesizer.fallbackLanguage !== 'default' + ? this.synthesizer.fallbackLanguage + : cs.fallbackSpeechSynthesisLanguage; + cs.fallbackSpeechSynthesisVoice = this.synthesizer.fallbackVoice !== 'default' + ? this.synthesizer.fallbackVoice + : cs.fallbackSpeechSynthesisVoice; this.logger.info({synthesizer: this.synthesizer}, 'Config: updated synthesizer'); } if (this.hasRecognizer) { @@ -126,6 +139,17 @@ class TaskConfig extends Task { cs.speechRecognizerLanguage = this.recognizer.language !== 'default' ? this.recognizer.language : cs.speechRecognizerLanguage; + + //fallback + cs.fallbackSpeechRecognizerVendor = this.recognizer.fallbackVendor !== 'default' + ? this.recognizer.fallbackVendor + : cs.fallbackSpeechRecognizerVendor; + cs.fallbackSpeechRecognizerLabel = this.recognizer.fallbackLabel !== 'default' + ? this.recognizer.fallbackLabel + : cs.fallbackSpeechRecognizerLabel; + cs.fallbackSpeechRecognizerLanguage = this.recognizer.fallbackLanguage !== 'default' + ? this.recognizer.fallbackLanguage + : cs.fallbackSpeechRecognizerLanguage; cs.isContinuousAsr = typeof this.recognizer.asrTimeout === 'number' ? 
true : false; if (cs.isContinuousAsr) { cs.asrTimeout = this.recognizer.asrTimeout; diff --git a/lib/tasks/dialogflow/index.js b/lib/tasks/dialogflow/index.js index 8882a338..4ed2ccf7 100644 --- a/lib/tasks/dialogflow/index.js +++ b/lib/tasks/dialogflow/index.js @@ -59,6 +59,12 @@ class Dialogflow extends Task { this.language = this.data.tts.language || 'default'; this.voice = this.data.tts.voice || 'default'; this.speechSynthesisLabel = this.data.tts.label || null; + + // fallback tts: used only if the primary vendor fails; label defaults to null like the primary label + this.fallbackVendor = this.data.tts.fallbackVendor || 'default'; + this.fallbackLanguage = this.data.tts.fallbackLanguage || 'default'; + this.fallbackVoice = this.data.tts.fallbackVoice || 'default'; + this.fallbackLabel = this.data.tts.fallbackLabel || null; } this.bargein = this.data.bargein; } @@ -119,9 +125,15 @@ class Dialogflow extends Task { this.vendor = cs.speechSynthesisVendor; this.language = cs.speechSynthesisLanguage; this.voice = cs.speechSynthesisVoice; + this.speechSynthesisLabel = cs.speechSynthesisLabel; } - this.ttsCredentials = cs.getSpeechCredentials(this.vendor, 'tts', - this.speechSynthesisLabel || cs.speechSynthesisLabel); + if (this.fallbackVendor === 'default') { + this.fallbackVendor = cs.fallbackSpeechSynthesisVendor; + this.fallbackLanguage = cs.fallbackSpeechSynthesisLanguage; + this.fallbackVoice = cs.fallbackSpeechSynthesisVoice; + this.fallbackLabel = cs.fallbackSpeechSynthesisLabel; + } + this.ttsCredentials = cs.getSpeechCredentials(this.vendor, 'tts',this.speechSynthesisLabel); this.ep.addCustomEventListener('dialogflow::intent', this._onIntent.bind(this, ep, cs)); this.ep.addCustomEventListener('dialogflow::transcription', this._onTranscription.bind(this, ep, cs)); @@ -223,17 +235,7 @@ class Dialogflow extends Task { } try { - const obj = { - account_sid: cs.accountSid, - text: intent.fulfillmentText, - vendor: this.vendor, - language: this.language, - voice: this.voice, - salt: cs.callSid, - credentials: this.ttsCredentials - }; - 
this.logger.debug({obj}, 'Dialogflow:_onIntent - playing message via tts'); - const {filePath, servedFromCache} = await synthAudio(stats, obj); + const {filePath, servedFromCache} = await this.fallbackSynthAudio(cs, intent, stats); if (filePath) cs.trackTmpFile(filePath); if (!this.ttsCredentials && !servedFromCache) cs.billForTts(intent.fulfillmentText.length); @@ -279,6 +281,46 @@ class Dialogflow extends Task { } } + async fallbackSynthAudio(cs, intent, stats) { /* tts via primary vendor, else via fallback vendor */ + try { + const obj = { + account_sid: cs.accountSid, + text: intent.fulfillmentText, + vendor: this.vendor, + language: this.language, + voice: this.voice, + salt: cs.callSid, + credentials: this.ttsCredentials + }; + this.logger.debug({obj}, 'Dialogflow:_onIntent - playing message via tts'); + + return await synthAudio(stats, obj); + } catch (error) { + this.logger.info({error}, 'Failed to synthesize audio from primary vendor'); + + try { + if(this.fallbackVendor) { + const credentials = cs.getSpeechCredentials(this.fallbackVendor, 'tts', this.fallbackLabel); + const obj = { + account_sid: cs.accountSid, + text: intent.fulfillmentText, + vendor: this.fallbackVendor, + language: this.fallbackLanguage, + voice: this.fallbackVoice, + salt: cs.callSid, + credentials + }; + this.logger.debug({obj}, 'Dialogflow:_onIntent - playing message via fallback tts'); + return await synthAudio(stats, obj); + } + } catch(err) { + this.logger.info({err}, 'Failed to synthesize audio from fallback vendor'); + throw err; /* both vendors failed: the fallback error is the one propagated */ + } + throw error; /* no fallback vendor configured: rethrow the primary failure */ + } + } + /** * A transcription - either interim or final - has been returned. * If we are doing barge-in based on hotword detection, check for the hotword or phrase. 
diff --git a/lib/tasks/lex.js b/lib/tasks/lex.js index d8dd8fae..3e33ecb9 100644 --- a/lib/tasks/lex.js +++ b/lib/tasks/lex.js @@ -26,6 +26,12 @@ class Lex extends Task { this.language = this.data.tts.language || 'default'; this.voice = this.data.tts.voice || 'default'; this.speechCredentialLabel = this.data.tts.label || null; + + // fallback tts: used only if the primary vendor fails; label defaults to null like the primary label + this.fallbackVendor = this.data.tts.fallbackVendor || 'default'; + this.fallbackLanguage = this.data.tts.fallbackLanguage || 'default'; + this.fallbackVoice = this.data.tts.fallbackVoice || 'default'; + this.fallbackLabel = this.data.tts.fallbackLabel || null; } this.botName = `${this.bot}:${this.alias}:${this.region}`; @@ -103,9 +109,15 @@ class Lex extends Task { this.vendor = cs.speechSynthesisVendor; this.language = cs.speechSynthesisLanguage; this.voice = cs.speechSynthesisVoice; + this.speechCredentialLabel = cs.speechSynthesisLabel; } - this.ttsCredentials = cs.getSpeechCredentials(this.vendor, 'tts', - this.speechCredentialLabel || cs.speechSynthesisVendor); + if (this.fallbackVendor === 'default') { + this.fallbackVendor = cs.fallbackSpeechSynthesisVendor; + this.fallbackLanguage = cs.fallbackSpeechSynthesisLanguage; + this.fallbackVoice = cs.fallbackSpeechSynthesisVoice; + this.fallbackLabel = cs.fallbackSpeechSynthesisLabel; + } + this.ttsCredentials = cs.getSpeechCredentials(this.vendor, 'tts', this.speechCredentialLabel); this.ep.addCustomEventListener('lex::intent', this._onIntent.bind(this, ep, cs)); this.ep.addCustomEventListener('lex::transcription', this._onTranscription.bind(this, ep, cs)); @@ -170,6 +182,41 @@ class Lex extends Task { } } + async fallbackSynthAudio(cs, msg, stats, synthAudio) { + try { + const {filePath} = await synthAudio(stats, { + account_sid: cs.accountSid, + text: msg, + vendor: this.vendor, + language: this.language, + voice: this.voice, + salt: cs.callSid, + credentials: this.ttsCredentials + }); + + return filePath; + } catch (error) { + this.logger.info({error},
'failed to synth audio from primary vendor'); + if (!this.fallbackVendor) throw error; /* no fallback configured: surface the primary failure */ + try { + const credential = cs.getSpeechCredentials(this.fallbackVendor, 'tts', this.fallbackLabel); + const {filePath} = await synthAudio(stats, { + account_sid: cs.accountSid, + text: msg, + vendor: this.fallbackVendor, + language: this.fallbackLanguage, + voice: this.fallbackVoice, + salt: cs.callSid, + credentials: credential + }); + return filePath; + } catch(err) { + this.logger.info({err}, 'failed to synth audio from fallback vendor'); + throw err; /* both vendors failed: do not return undefined as if synthesis succeeded */ + } + } + } + /** * @param {*} evt - event data */ @@ -189,16 +236,7 @@ class Lex extends Task { try { this.logger.debug(`tts with ${this.vendor} ${this.voice}`); - // eslint-disable-next-line no-unused-vars - const {filePath, servedFromCache} = await synthAudio(stats, { - account_sid: cs.accountSid, - text: msg, - vendor: this.vendor, - language: this.language, - voice: this.voice, - salt: cs.callSid, - credentials: this.ttsCredentials - }); + const filePath = await this.fallbackSynthAudio(cs, msg, stats, synthAudio); if (filePath) cs.trackTmpFile(filePath); if (this.events.includes('start-play')) { diff --git a/lib/tasks/say.js b/lib/tasks/say.js index 3e34ddd2..cb82731e 100644 --- a/lib/tasks/say.js +++ b/lib/tasks/say.js @@ -59,15 +59,28 @@ class TaskSay extends Task { const vendor = this.synthesizer.vendor && this.synthesizer.vendor !== 'default' ? this.synthesizer.vendor : cs.speechSynthesisVendor; + const fallbackVendor = this.synthesizer.fallbackVendor && this.synthesizer.fallbackVendor !== 'default' ? + this.synthesizer.fallbackVendor : + cs.fallbackSpeechSynthesisVendor; const language = this.synthesizer.language && this.synthesizer.language !== 'default' ? this.synthesizer.language : cs.speechSynthesisLanguage ; + const fallbackLanguage = this.synthesizer.fallbackLanguage && this.synthesizer.fallbackLanguage !== 'default' ? 
+ this.synthesizer.fallbackLanguage : + cs.fallbackSpeechSynthesisLanguage ; let voice = this.synthesizer.voice && this.synthesizer.voice !== 'default' ? this.synthesizer.voice : cs.speechSynthesisVoice; + const fallbackVoice = this.synthesizer.fallbackVoice && this.synthesizer.fallbackVoice !== 'default' ? + this.synthesizer.fallbackVoice : + cs.fallbackSpeechSynthesisVoice; + const fallbackLabel = this.synthesizer.fallbackLabel && this.synthesizer.fallbackLabel !== 'default' ? + this.synthesizer.fallbackLabel : + cs.fallbackSpeechSynthesisLabel; const engine = this.synthesizer.engine || 'standard'; const salt = cs.callSid; - let credentials = cs.getSpeechCredentials(vendor, 'tts', this.data.synthesizer?.label || cs.speechSynthesisLabel); + let credentials = cs.getSpeechCredentials(vendor, 'tts', this.data.synthesizer ? + this.data.synthesizer?.label : cs.speechSynthesisLabel); /* parse Nuance voices into name and model */ let model; @@ -118,6 +131,8 @@ class TaskSay extends Task { 'tts.language': language, 'tts.voice': voice }); + let filePathUrl, isFromCache, roundTripTime; + let executedVendor, executedLanguage; try { const {filePath, servedFromCache, rtt} = await synthAudio(stats, { account_sid: cs.accountSid, @@ -131,37 +146,98 @@ class TaskSay extends Task { credentials, disableTtsCache : this.disableTtsCache }); - this.logger.debug(`file ${filePath}, served from cache ${servedFromCache}`); - if (filePath) cs.trackTmpFile(filePath); + + span.setAttributes({'tts.cached': servedFromCache}); + span.end(); + if (!servedFromCache && !lastUpdated) { lastUpdated = true; updateSpeechCredentialLastUsed(credentials.speech_credential_sid) .catch(() => {/*already logged error */}); } - span.setAttributes({'tts.cached': servedFromCache}); - span.end(); - if (!servedFromCache && rtt) { - this.notifyStatus({ - event: 'synthesized-audio', - vendor, - language, - characters: text.length, - elapsedTime: rtt + + filePathUrl = filePath; + isFromCache = servedFromCache; + 
roundTripTime = rtt; + executedVendor = vendor; + executedLanguage = language; + + } catch (error) { + if (fallbackVendor) { + const fallbackcredentials = cs.getSpeechCredentials(fallbackVendor, 'tts', fallbackLabel); + const {span: fallbackSpan} = this.startChildSpan('fallback-tts-generation', { + 'tts.vendor': fallbackVendor, + 'tts.language': fallbackLanguage, + 'tts.voice': fallbackVoice }); + + try { + const {filePath, servedFromCache, rtt} = await synthAudio(stats, { + account_sid: cs.accountSid, + text, + vendor: fallbackVendor, /* synthAudio reads vendor/language/voice */ + language: fallbackLanguage, + voice: fallbackVoice, + engine, + model, + salt, + credentials: fallbackcredentials, + disableTtsCache : this.disableTtsCache + }); + + fallbackSpan.setAttributes({'tts.cached': servedFromCache}); + fallbackSpan.end(); + + if (!servedFromCache && !lastUpdated) { + lastUpdated = true; + updateSpeechCredentialLastUsed(fallbackcredentials.speech_credential_sid) + .catch(() => {/*already logged error */}); + } + + filePathUrl = filePath; + isFromCache = servedFromCache; + roundTripTime = rtt; + executedVendor = fallbackVendor; + executedLanguage = fallbackLanguage; + } catch (err){ + this.logger.info({err}, 'fallback Speech failed to synthesize audio'); + fallbackSpan.end(); + writeAlerts({ + account_sid: cs.accountSid, + alert_type: AlertType.TTS_FAILURE, + vendor: fallbackVendor, + detail: err.message + }).catch((err) => this.logger.info({err}, 'Error generating alert for fallback tts failure')); + } } - return filePath; - } catch (err) { - this.logger.info({err}, 'Error synthesizing tts'); + this.logger.info({error}, 'Error synthesizing tts'); span.end(); writeAlerts({ account_sid: cs.accountSid, alert_type: AlertType.TTS_FAILURE, vendor, - detail: err.message + detail: error.message }).catch((err) => this.logger.info({err}, 'Error generating alert for tts failure')); - this.notifyError({msg: 'TTS error', details: err.message || err}); - return; + if (!filePathUrl) { /* neither vendor produced audio */ + this.notifyError({msg: 'TTS error', details: error.message || error}); + return; + } } + + 
this.logger.debug(`file ${filePathUrl}, served from cache ${isFromCache}`); + if (filePathUrl) cs.trackTmpFile(filePathUrl); + + if (!isFromCache && roundTripTime) { + this.notifyStatus({ + event: 'synthesized-audio', + vendor: executedVendor, + language: executedLanguage, + characters: text.length, + elapsedTime: roundTripTime + }); + } + + return filePathUrl; }; const arr = this.text.map((t) => generateAudio(t));