feat fallback speech

This commit is contained in:
Quan HL
2023-08-17 14:25:26 +07:00
parent f8c5abe9e9
commit b1c0478051
5 changed files with 271 additions and 42 deletions

View File

@@ -180,6 +180,13 @@ class CallSession extends Emitter {
this.application.speech_synthesis_vendor = vendor; this.application.speech_synthesis_vendor = vendor;
} }
get fallbackSpeechSynthesisVendor() {
return this.application.fallback_speech_synthesis_vendor;
}
set fallbackSpeechSynthesisVendor(vendor) {
this.application.fallback_speech_synthesis_vendor = vendor;
}
/** /**
* default label to use for speech synthesis if not provided in the app * default label to use for speech synthesis if not provided in the app
*/ */
@@ -189,6 +196,13 @@ class CallSession extends Emitter {
set speechSynthesisLabel(label) { set speechSynthesisLabel(label) {
this.application.speech_synthesis_label = label; this.application.speech_synthesis_label = label;
} }
get fallbackSpeechSynthesisLabel() {
return this.application.fallback_speech_synthesis_label;
}
set fallbackSpeechSynthesisLabel(label) {
this.application.fallback_speech_synthesis_label = label;
}
/** /**
* default voice to use for speech synthesis if not provided in the app * default voice to use for speech synthesis if not provided in the app
*/ */
@@ -198,6 +212,13 @@ class CallSession extends Emitter {
set speechSynthesisVoice(voice) { set speechSynthesisVoice(voice) {
this.application.speech_synthesis_voice = voice; this.application.speech_synthesis_voice = voice;
} }
get fallbackSpeechSynthesisVoice() {
return this.application.fallback_speech_synthesis_voice;
}
set fallbackSpeechSynthesisVoice(voice) {
this.application.fallback_speech_synthesis_voice = voice;
}
/** /**
* default language to use for speech synthesis if not provided in the app * default language to use for speech synthesis if not provided in the app
*/ */
@@ -208,6 +229,13 @@ class CallSession extends Emitter {
this.application.speech_synthesis_language = language; this.application.speech_synthesis_language = language;
} }
get fallbackSpeechSynthesisLanguage() {
return this.application.fallback_speech_synthesis_language;
}
set fallbackSpeechSynthesisLanguage(language) {
this.application.fallback_speech_synthesis_language = language;
}
/** /**
* default vendor to use for speech recognition if not provided in the app * default vendor to use for speech recognition if not provided in the app
*/ */
@@ -217,6 +245,13 @@ class CallSession extends Emitter {
set speechRecognizerVendor(vendor) { set speechRecognizerVendor(vendor) {
this.application.speech_recognizer_vendor = vendor; this.application.speech_recognizer_vendor = vendor;
} }
get fallbackSpeechRecognizerVendor() {
return this.application.fallback_speech_recognizer_vendor;
}
set fallbackSpeechRecognizerVendor(vendor) {
this.application.fallback_speech_recognizer_vendor = vendor;
}
/** /**
* default vendor to use for speech recognition if not provided in the app * default vendor to use for speech recognition if not provided in the app
*/ */
@@ -226,6 +261,13 @@ class CallSession extends Emitter {
set speechRecognizerLabel(label) { set speechRecognizerLabel(label) {
this.application.speech_recognizer_label = label; this.application.speech_recognizer_label = label;
} }
get fallbackSpeechRecognizerLabel() {
return this.application.fallback_speech_recognizer_label;
}
set fallbackSpeechRecognizerLabel(label) {
this.application.fallback_speech_recognizer_label = label;
}
/** /**
* default language to use for speech recognition if not provided in the app * default language to use for speech recognition if not provided in the app
*/ */
@@ -236,6 +278,13 @@ class CallSession extends Emitter {
this.application.speech_recognizer_language = language; this.application.speech_recognizer_language = language;
} }
get fallbackSpeechRecognizerLanguage() {
return this.application.fallback_speech_recognizer_language;
}
set fallbackSpeechRecognizerLanguage(language) {
this.application.fallback_speech_recognizer_language = language;
}
/** /**
* indicates whether the call currently in progress * indicates whether the call currently in progress
*/ */

View File

@@ -114,6 +114,19 @@ class TaskConfig extends Task {
cs.speechSynthesisVoice = this.synthesizer.voice !== 'default' cs.speechSynthesisVoice = this.synthesizer.voice !== 'default'
? this.synthesizer.voice ? this.synthesizer.voice
: cs.speechSynthesisVoice; : cs.speechSynthesisVoice;
// fallback vendor
cs.fallbackSpeechSynthesisVendor = this.synthesizer.fallbackVendor !== 'default'
? this.synthesizer.fallbackVendor
: cs.fallbackSpeechSynthesisVendor;
cs.fallbackSpeechSynthesisLabel = this.synthesizer.fallbackLabel !== 'default'
? this.synthesizer.fallbackLabel
: cs.fallbackSpeechSynthesisLabel;
cs.fallbackSpeechSynthesisLanguage = this.synthesizer.fallbackLanguage !== 'default'
? this.synthesizer.fallbackLanguage
: cs.fallbackSpeechSynthesisLanguage;
cs.fallbackSpeechSynthesisVoice = this.synthesizer.fallbackVoice !== 'default'
? this.synthesizer.fallbackVoice
: cs.fallbackSpeechSynthesisVoice;
this.logger.info({synthesizer: this.synthesizer}, 'Config: updated synthesizer'); this.logger.info({synthesizer: this.synthesizer}, 'Config: updated synthesizer');
} }
if (this.hasRecognizer) { if (this.hasRecognizer) {
@@ -126,6 +139,17 @@ class TaskConfig extends Task {
cs.speechRecognizerLanguage = this.recognizer.language !== 'default' cs.speechRecognizerLanguage = this.recognizer.language !== 'default'
? this.recognizer.language ? this.recognizer.language
: cs.speechRecognizerLanguage; : cs.speechRecognizerLanguage;
//fallback
cs.fallbackSpeechRecognizerVendor = this.recognizer.fallbackVendor !== 'default'
? this.recognizer.fallbackVendor
: cs.fallbackSpeechRecognizerVendor;
cs.fallbackSpeechRecognizerLabel = this.recognizer.fallbackLabel !== 'default'
? this.recognizer.fallbackLabel
: cs.fallbackSpeechRecognizerLabel;
cs.fallbackSpeechRecognizerLanguage = this.recognizer.fallbackLanguage !== 'default'
? this.recognizer.fallbackLanguage
: cs.fallbackSpeechRecognizerLanguage;
cs.isContinuousAsr = typeof this.recognizer.asrTimeout === 'number' ? true : false; cs.isContinuousAsr = typeof this.recognizer.asrTimeout === 'number' ? true : false;
if (cs.isContinuousAsr) { if (cs.isContinuousAsr) {
cs.asrTimeout = this.recognizer.asrTimeout; cs.asrTimeout = this.recognizer.asrTimeout;

View File

@@ -59,6 +59,12 @@ class Dialogflow extends Task {
this.language = this.data.tts.language || 'default'; this.language = this.data.tts.language || 'default';
this.voice = this.data.tts.voice || 'default'; this.voice = this.data.tts.voice || 'default';
this.speechSynthesisLabel = this.data.tts.label || null; this.speechSynthesisLabel = this.data.tts.label || null;
// fallback tts: each setting defaults to the 'default' sentinel when the
// verb's tts options do not supply it
this.fallbackVendor = this.data.tts.fallbackVendor || 'default';
this.fallbackLanguage = this.data.tts.fallbackLanguage || 'default';
// read the voice from tts.fallbackVoice (it was previously copied from tts.fallbackLanguage)
this.fallbackVoice = this.data.tts.fallbackVoice || 'default';
this.fallbackLabel = this.data.tts.fallbackLabel || 'default';
} }
this.bargein = this.data.bargein; this.bargein = this.data.bargein;
} }
@@ -119,9 +125,15 @@ class Dialogflow extends Task {
this.vendor = cs.speechSynthesisVendor; this.vendor = cs.speechSynthesisVendor;
this.language = cs.speechSynthesisLanguage; this.language = cs.speechSynthesisLanguage;
this.voice = cs.speechSynthesisVoice; this.voice = cs.speechSynthesisVoice;
this.speechSynthesisLabel = cs.speechSynthesisLabel;
} }
this.ttsCredentials = cs.getSpeechCredentials(this.vendor, 'tts', if (this.fallbackVendor === 'default') {
this.speechSynthesisLabel || cs.speechSynthesisLabel); this.fallbackVendor = cs.fallbackSpeechSynthesisVendor;
this.fallbackLanguage = cs.fallbackSpeechSynthesisLanguage;
this.fallbackVoice = cs.fallbackSpeechSynthesisVoice;
this.fallbackLabel = cs.fallbackSpeechSynthesisLabel;
}
this.ttsCredentials = cs.getSpeechCredentials(this.vendor, 'tts',this.speechSynthesisLabel);
this.ep.addCustomEventListener('dialogflow::intent', this._onIntent.bind(this, ep, cs)); this.ep.addCustomEventListener('dialogflow::intent', this._onIntent.bind(this, ep, cs));
this.ep.addCustomEventListener('dialogflow::transcription', this._onTranscription.bind(this, ep, cs)); this.ep.addCustomEventListener('dialogflow::transcription', this._onTranscription.bind(this, ep, cs));
@@ -223,17 +235,7 @@ class Dialogflow extends Task {
} }
try { try {
const obj = { const {filePath, servedFromCache} = await this.fallbackSynthAudio(cs, intent, stats);
account_sid: cs.accountSid,
text: intent.fulfillmentText,
vendor: this.vendor,
language: this.language,
voice: this.voice,
salt: cs.callSid,
credentials: this.ttsCredentials
};
this.logger.debug({obj}, 'Dialogflow:_onIntent - playing message via tts');
const {filePath, servedFromCache} = await synthAudio(stats, obj);
if (filePath) cs.trackTmpFile(filePath); if (filePath) cs.trackTmpFile(filePath);
if (!this.ttsCredentials && !servedFromCache) cs.billForTts(intent.fulfillmentText.length); if (!this.ttsCredentials && !servedFromCache) cs.billForTts(intent.fulfillmentText.length);
@@ -279,6 +281,46 @@ class Dialogflow extends Task {
} }
} }
async fallbackSynthAudio(cs, intent, stats) {
try {
const obj = {
account_sid: cs.accountSid,
text: intent.fulfillmentText,
vendor: this.vendor,
language: this.language,
voice: this.voice,
salt: cs.callSid,
credentials: this.ttsCredentials
};
this.logger.debug({obj}, 'Dialogflow:_onIntent - playing message via tts');
return await synthAudio(stats, obj);
} catch (error) {
this.logger.info({error}, 'Failed to synthesize audio from primary vendor');
try {
if(this.fallbackVendor) {
const credentials = cs.getSpeechCredentials(this.fallbackVendor, 'tts', this.fallbackLabel);
const obj = {
account_sid: cs.accountSid,
text: intent.fulfillmentText,
vendor: this.fallbackVendor,
language: this.fallbackLanguage,
voice: this.fallbackVoice,
salt: cs.callSid,
credentials
};
this.logger.debug({obj}, 'Dialogflow:_onIntent - playing message via fallback tts');
return await synthAudio(stats, obj);
}
} catch(err) {
this.logger.info({err}, 'Failed to synthesize audio from falllback vendor');
throw err;
}
throw error;
}
}
/** /**
* A transcription - either interim or final - has been returned. * A transcription - either interim or final - has been returned.
* If we are doing barge-in based on hotword detection, check for the hotword or phrase. * If we are doing barge-in based on hotword detection, check for the hotword or phrase.

View File

@@ -26,6 +26,12 @@ class Lex extends Task {
this.language = this.data.tts.language || 'default'; this.language = this.data.tts.language || 'default';
this.voice = this.data.tts.voice || 'default'; this.voice = this.data.tts.voice || 'default';
this.speechCredentialLabel = this.data.tts.label || null; this.speechCredentialLabel = this.data.tts.label || null;
// fallback tts: each setting defaults to the 'default' sentinel when the
// verb's tts options do not supply it
this.fallbackVendor = this.data.tts.fallbackVendor || 'default';
this.fallbackLanguage = this.data.tts.fallbackLanguage || 'default';
// read the voice from tts.fallbackVoice (it was previously copied from tts.fallbackLanguage)
this.fallbackVoice = this.data.tts.fallbackVoice || 'default';
this.fallbackLabel = this.data.tts.fallbackLabel || 'default';
} }
this.botName = `${this.bot}:${this.alias}:${this.region}`; this.botName = `${this.bot}:${this.alias}:${this.region}`;
@@ -103,9 +109,15 @@ class Lex extends Task {
this.vendor = cs.speechSynthesisVendor; this.vendor = cs.speechSynthesisVendor;
this.language = cs.speechSynthesisLanguage; this.language = cs.speechSynthesisLanguage;
this.voice = cs.speechSynthesisVoice; this.voice = cs.speechSynthesisVoice;
this.speechCredentialLabel = cs.speechSynthesisLabel;
} }
this.ttsCredentials = cs.getSpeechCredentials(this.vendor, 'tts', if (this.fallbackVendor === 'default') {
this.speechCredentialLabel || cs.speechSynthesisVendor); this.fallbackVendor = cs.fallbackSpeechSynthesisVendor;
this.fallbackLanguage = cs.fallbackSpeechSynthesisLanguage;
this.fallbackVoice = cs.fallbackSpeechSynthesisVoice;
this.fallbackLabel = cs.fallbackSpeechSynthesisLabel;
}
this.ttsCredentials = cs.getSpeechCredentials(this.vendor, 'tts', this.speechCredentialLabel);
this.ep.addCustomEventListener('lex::intent', this._onIntent.bind(this, ep, cs)); this.ep.addCustomEventListener('lex::intent', this._onIntent.bind(this, ep, cs));
this.ep.addCustomEventListener('lex::transcription', this._onTranscription.bind(this, ep, cs)); this.ep.addCustomEventListener('lex::transcription', this._onTranscription.bind(this, ep, cs));
@@ -170,6 +182,41 @@ class Lex extends Task {
} }
} }
async fallbackSynthAudio(cs, msg, stats, synthAudio) {
try {
const {filePath} = await synthAudio(stats, {
account_sid: cs.accountSid,
text: msg,
vendor: this.vendor,
language: this.language,
voice: this.voice,
salt: cs.callSid,
credentials: this.ttsCredentials
});
return filePath;
} catch (error) {
this.logger.info({error}, 'failed to synth audio from primary vendor');
if (this.fallbackVendor) {
try {
const credential = cs.getSpeechCredentials(this.fallbackVendor, 'tts', this.fallbackLabel);
const {filePath} = await synthAudio(stats, {
account_sid: cs.accountSid,
text: msg,
vendor: this.fallbackVendor,
language: this.fallbackLanguage,
voice: this.fallbackVoice,
salt: cs.callSid,
credentials: credential
});
return filePath;
} catch(err) {
this.logger.info({err}, 'failed to synth audio from fallback vendor');
}
}
}
}
/** /**
* @param {*} evt - event data * @param {*} evt - event data
*/ */
@@ -189,16 +236,7 @@ class Lex extends Task {
try { try {
this.logger.debug(`tts with ${this.vendor} ${this.voice}`); this.logger.debug(`tts with ${this.vendor} ${this.voice}`);
// eslint-disable-next-line no-unused-vars const filePath = await this.fallbackSynthAudio(cs, msg, stats, synthAudio);
const {filePath, servedFromCache} = await synthAudio(stats, {
account_sid: cs.accountSid,
text: msg,
vendor: this.vendor,
language: this.language,
voice: this.voice,
salt: cs.callSid,
credentials: this.ttsCredentials
});
if (filePath) cs.trackTmpFile(filePath); if (filePath) cs.trackTmpFile(filePath);
if (this.events.includes('start-play')) { if (this.events.includes('start-play')) {

View File

@@ -59,15 +59,28 @@ class TaskSay extends Task {
const vendor = this.synthesizer.vendor && this.synthesizer.vendor !== 'default' ? const vendor = this.synthesizer.vendor && this.synthesizer.vendor !== 'default' ?
this.synthesizer.vendor : this.synthesizer.vendor :
cs.speechSynthesisVendor; cs.speechSynthesisVendor;
const fallbackVendor = this.synthesizer.fallbackVendor && this.synthesizer.fallbackVendor !== 'default' ?
this.synthesizer.fallbackVendor :
cs.fallbackSpeechSynthesisVendor;
const language = this.synthesizer.language && this.synthesizer.language !== 'default' ? const language = this.synthesizer.language && this.synthesizer.language !== 'default' ?
this.synthesizer.language : this.synthesizer.language :
cs.speechSynthesisLanguage ; cs.speechSynthesisLanguage ;
const fallbackLanguage = this.synthesizer.fallbackLanguage && this.synthesizer.fallbackLanguage !== 'default' ?
this.synthesizer.fallbackLanguage :
cs.fallbackSpeechSynthesisLanguage ;
let voice = this.synthesizer.voice && this.synthesizer.voice !== 'default' ? let voice = this.synthesizer.voice && this.synthesizer.voice !== 'default' ?
this.synthesizer.voice : this.synthesizer.voice :
cs.speechSynthesisVoice; cs.speechSynthesisVoice;
const fallbackVoice = this.synthesizer.fallbackVoice && this.synthesizer.fallbackVoice !== 'default' ?
this.synthesizer.fallbackVoice :
cs.fallbackSpeechSynthesisVoice;
const fallbackLabel = this.synthesizer.fallbackLabel && this.synthesizer.fallbackLabel !== 'default' ?
this.synthesizer.fallbackLabel :
cs.fallbackSpeechSynthesisLabel;
const engine = this.synthesizer.engine || 'standard'; const engine = this.synthesizer.engine || 'standard';
const salt = cs.callSid; const salt = cs.callSid;
let credentials = cs.getSpeechCredentials(vendor, 'tts', this.data.synthesizer?.label || cs.speechSynthesisLabel); let credentials = cs.getSpeechCredentials(vendor, 'tts', this.data.synthesizer ?
this.data.synthesizer?.label : cs.speechSynthesisLabel);
/* parse Nuance voices into name and model */ /* parse Nuance voices into name and model */
let model; let model;
@@ -118,6 +131,8 @@ class TaskSay extends Task {
'tts.language': language, 'tts.language': language,
'tts.voice': voice 'tts.voice': voice
}); });
let filePathUrl, isFromCache, roundTripTime;
let executedVendor, executedLanguage;
try { try {
const {filePath, servedFromCache, rtt} = await synthAudio(stats, { const {filePath, servedFromCache, rtt} = await synthAudio(stats, {
account_sid: cs.accountSid, account_sid: cs.accountSid,
@@ -131,37 +146,98 @@ class TaskSay extends Task {
credentials, credentials,
disableTtsCache : this.disableTtsCache disableTtsCache : this.disableTtsCache
}); });
this.logger.debug(`file ${filePath}, served from cache ${servedFromCache}`);
if (filePath) cs.trackTmpFile(filePath); span.setAttributes({'tts.cached': servedFromCache});
span.end();
if (!servedFromCache && !lastUpdated) { if (!servedFromCache && !lastUpdated) {
lastUpdated = true; lastUpdated = true;
updateSpeechCredentialLastUsed(credentials.speech_credential_sid) updateSpeechCredentialLastUsed(credentials.speech_credential_sid)
.catch(() => {/*already logged error */}); .catch(() => {/*already logged error */});
} }
span.setAttributes({'tts.cached': servedFromCache});
span.end(); filePathUrl = filePath;
if (!servedFromCache && rtt) { isFromCache = servedFromCache;
this.notifyStatus({ roundTripTime = rtt;
event: 'synthesized-audio', executedVendor = vendor;
vendor, executedLanguage = language;
language,
characters: text.length, } catch (error) {
elapsedTime: rtt if (fallbackVendor) {
const fallbackcredentials = cs.getSpeechCredentials(fallbackVendor, 'tts', fallbackLabel);
const {span: fallbackSpan} = this.startChildSpan('fallback-tts-generation', {
'tts.vendor': fallbackVendor,
'tts.language': fallbackLanguage,
'tts.voice': fallbackVoice
}); });
try {
// The fallback settings must be sent under the vendor/language/voice keys.
// Shorthand properties would create keys named fallbackVendor, fallbackLanguage
// and fallbackVoice and leave vendor, language and voice unset.
const {filePath, servedFromCache, rtt} = await synthAudio(stats, {
  account_sid: cs.accountSid,
  text,
  vendor: fallbackVendor,
  language: fallbackLanguage,
  voice: fallbackVoice,
  engine,
  model,
  salt,
  credentials: fallbackcredentials,
  disableTtsCache : this.disableTtsCache
});
fallbackSpan.setAttributes({'tts.cached': servedFromCache});
fallbackSpan.end();
if (!servedFromCache && !lastUpdated) {
lastUpdated = true;
// this synthesis used the fallback credential, so record that one as last used
updateSpeechCredentialLastUsed(fallbackcredentials.speech_credential_sid)
  .catch(() => {/*already logged error */});
} }
return filePath;
filePathUrl = filePath;
isFromCache = servedFromCache;
roundTripTime = rtt;
executedVendor = fallbackVendor;
executedLanguage = fallbackLanguage;
} catch (err){ } catch (err){
this.logger.info({err}, 'Error synthesizing tts'); this.logger.info({err}, 'fallback Speech failed to synthesize audio');
fallbackSpan.end();
writeAlerts({
account_sid: cs.accountSid,
alert_type: AlertType.TTS_FAILURE,
vendor: fallbackVendor,
detail: err.message
}).catch((err) => this.logger.info({err}, 'Error generating alert for fallback tts failure'));
}
}
this.logger.info({error}, 'Error synthesizing tts');
span.end(); span.end();
writeAlerts({ writeAlerts({
account_sid: cs.accountSid, account_sid: cs.accountSid,
alert_type: AlertType.TTS_FAILURE, alert_type: AlertType.TTS_FAILURE,
vendor, vendor,
detail: err.message detail: error.message
}).catch((err) => this.logger.info({err}, 'Error generating alert for tts failure')); }).catch((err) => this.logger.info({err}, 'Error generating alert for tts failure'));
this.notifyError({msg: 'TTS error', details: err.message || err}); this.notifyError({msg: 'TTS error', details: error.message || error});
return; return;
} }
this.logger.debug(`file ${filePathUrl}, served from cache ${isFromCache}`);
if (filePathUrl) cs.trackTmpFile(filePathUrl);
if (!isFromCache && roundTripTime) {
this.notifyStatus({
event: 'synthesized-audio',
vendor: executedVendor,
language: executedLanguage,
characters: text.length,
elapsedTime: roundTripTime
});
}
return filePathUrl;
}; };
const arr = this.text.map((t) => generateAudio(t)); const arr = this.text.map((t) => generateAudio(t));