Feat/fallback speech 02 (#429)

* feat fallback speech

* feat fallback speech

* feat fallback speech

* gather verb

* fix

* wip

* wip

* wip

* wip

* wip

* transcribe

* transcribe

* fix

* wip

* wip

* wip

* wip

* fix

* wip

* fix
This commit is contained in:
Hoan Luu Huu
2023-08-22 20:22:01 +07:00
committed by GitHub
parent 6f0dbef433
commit b1ecf069bf
13 changed files with 501 additions and 223 deletions

View File

@@ -180,6 +180,13 @@ class CallSession extends Emitter {
this.application.speech_synthesis_vendor = vendor;
}
/**
 * fallback vendor to use for speech synthesis;
 * kept on the application as fallback_speech_synthesis_vendor
 */
get fallbackSpeechSynthesisVendor() {
return this.application.fallback_speech_synthesis_vendor;
}
set fallbackSpeechSynthesisVendor(vendor) {
this.application.fallback_speech_synthesis_vendor = vendor;
}
/**
* default label to use for speech synthesis if not provided in the app
*/
@@ -189,6 +196,13 @@ class CallSession extends Emitter {
set speechSynthesisLabel(label) {
this.application.speech_synthesis_label = label;
}
/**
 * fallback credential label to use for speech synthesis;
 * kept on the application as fallback_speech_synthesis_label
 */
get fallbackSpeechSynthesisLabel() {
return this.application.fallback_speech_synthesis_label;
}
set fallbackSpeechSynthesisLabel(label) {
this.application.fallback_speech_synthesis_label = label;
}
/**
* default voice to use for speech synthesis if not provided in the app
*/
@@ -198,6 +212,13 @@ class CallSession extends Emitter {
set speechSynthesisVoice(voice) {
this.application.speech_synthesis_voice = voice;
}
/**
 * fallback voice to use for speech synthesis;
 * kept on the application as fallback_speech_synthesis_voice
 */
get fallbackSpeechSynthesisVoice() {
return this.application.fallback_speech_synthesis_voice;
}
set fallbackSpeechSynthesisVoice(voice) {
this.application.fallback_speech_synthesis_voice = voice;
}
/**
* default language to use for speech synthesis if not provided in the app
*/
@@ -208,6 +229,13 @@ class CallSession extends Emitter {
this.application.speech_synthesis_language = language;
}
/**
 * fallback language to use for speech synthesis;
 * kept on the application as fallback_speech_synthesis_language
 */
get fallbackSpeechSynthesisLanguage() {
return this.application.fallback_speech_synthesis_language;
}
set fallbackSpeechSynthesisLanguage(language) {
this.application.fallback_speech_synthesis_language = language;
}
/**
* default vendor to use for speech recognition if not provided in the app
*/
@@ -217,6 +245,13 @@ class CallSession extends Emitter {
set speechRecognizerVendor(vendor) {
this.application.speech_recognizer_vendor = vendor;
}
/**
 * fallback vendor to use for speech recognition;
 * kept on the application as fallback_speech_recognizer_vendor
 */
get fallbackSpeechRecognizerVendor() {
return this.application.fallback_speech_recognizer_vendor;
}
set fallbackSpeechRecognizerVendor(vendor) {
this.application.fallback_speech_recognizer_vendor = vendor;
}
/**
* default label to use for speech recognition if not provided in the app
*/
@@ -226,6 +261,13 @@ class CallSession extends Emitter {
set speechRecognizerLabel(label) {
this.application.speech_recognizer_label = label;
}
/**
 * fallback credential label to use for speech recognition;
 * kept on the application as fallback_speech_recognizer_label
 */
get fallbackSpeechRecognizerLabel() {
return this.application.fallback_speech_recognizer_label;
}
set fallbackSpeechRecognizerLabel(label) {
this.application.fallback_speech_recognizer_label = label;
}
/**
* default language to use for speech recognition if not provided in the app
*/
@@ -236,6 +278,13 @@ class CallSession extends Emitter {
this.application.speech_recognizer_language = language;
}
/**
 * fallback language to use for speech recognition;
 * kept on the application as fallback_speech_recognizer_language
 */
get fallbackSpeechRecognizerLanguage() {
return this.application.fallback_speech_recognizer_language;
}
set fallbackSpeechRecognizerLanguage(language) {
this.application.fallback_speech_recognizer_language = language;
}
/**
* indicates whether the call currently in progress
*/
@@ -679,6 +728,10 @@ class CallSession extends Emitter {
this.logger.info(`Speech credential vendor: ${credential.vendor}
${credential.label ? `, label: ${credential.label}` : ''} is chosen`);
if ('google' === vendor) {
if (type === 'tts' && !credential.tts_tested_ok ||
type === 'stt' && !credential.stt_tested_ok) {
return;
}
try {
const cred = JSON.parse(credential.service_key.replace(/\n/g, '\\n'));
return {

View File

@@ -121,6 +121,20 @@ class TaskConfig extends Task {
cs.speechSynthesisVoice = this.synthesizer.voice !== 'default'
? this.synthesizer.voice
: cs.speechSynthesisVoice;
// fallback vendor
cs.fallbackSpeechSynthesisVendor = this.synthesizer.fallbackVendor !== 'default'
? this.synthesizer.fallbackVendor
: cs.fallbackSpeechSynthesisVendor;
cs.fallbackSpeechSynthesisLabel = this.synthesizer.fallbackLabel !== 'default'
? this.synthesizer.fallbackLabel
: cs.fallbackSpeechSynthesisLabel;
cs.fallbackSpeechSynthesisLanguage = this.synthesizer.fallbackLanguage !== 'default'
? this.synthesizer.fallbackLanguage
: cs.fallbackSpeechSynthesisLanguage;
cs.fallbackSpeechSynthesisVoice = this.synthesizer.fallbackVoice !== 'default'
? this.synthesizer.fallbackVoice
: cs.fallbackSpeechSynthesisVoice;
this.logger.info({synthesizer: this.synthesizer}, 'Config: updated synthesizer');
}
if (this.hasRecognizer) {
@@ -133,6 +147,18 @@ class TaskConfig extends Task {
cs.speechRecognizerLanguage = this.recognizer.language !== 'default'
? this.recognizer.language
: cs.speechRecognizerLanguage;
//fallback
cs.fallbackSpeechRecognizerVendor = this.recognizer.fallbackVendor !== 'default'
? this.recognizer.fallbackVendor
: cs.fallbackSpeechRecognizerVendor;
cs.fallbackSpeechRecognizerLabel = this.recognizer.fallbackLabel !== 'default'
? this.recognizer.fallbackLabel
: cs.fallbackSpeechRecognizerLabel;
cs.fallbackSpeechRecognizerLanguage = this.recognizer.fallbackLanguage !== 'default'
? this.recognizer.fallbackLanguage
: cs.fallbackSpeechRecognizerLanguage;
cs.isContinuousAsr = typeof this.recognizer.asrTimeout === 'number' ? true : false;
if (cs.isContinuousAsr) {
cs.asrTimeout = this.recognizer.asrTimeout;

View File

@@ -58,7 +58,13 @@ class Dialogflow extends Task {
this.vendor = this.data.tts.vendor || 'default';
this.language = this.data.tts.language || 'default';
this.voice = this.data.tts.voice || 'default';
this.speechSynthesisLabel = this.data.tts.label || null;
this.speechSynthesisLabel = this.data.tts.label || 'default';
// fallback tts: every setting is read from its own property of data.tts;
// a missing value becomes the placeholder 'default'
this.fallbackVendor = this.data.tts.fallbackVendor || 'default';
this.fallbackLanguage = this.data.tts.fallbackLanguage || 'default';
// the voice has its own setting - it must not reuse the fallback language value
this.fallbackVoice = this.data.tts.fallbackVoice || 'default';
this.fallbackLabel = this.data.tts.fallbackLabel || 'default';
}
this.bargein = this.data.bargein;
}
@@ -119,9 +125,15 @@ class Dialogflow extends Task {
this.vendor = cs.speechSynthesisVendor;
this.language = cs.speechSynthesisLanguage;
this.voice = cs.speechSynthesisVoice;
this.speechSynthesisLabel = cs.speechSynthesisLabel;
}
this.ttsCredentials = cs.getSpeechCredentials(this.vendor, 'tts',
this.speechSynthesisLabel || cs.speechSynthesisLabel);
if (this.fallbackVendor === 'default') {
this.fallbackVendor = cs.fallbackSpeechSynthesisVendor;
this.fallbackLanguage = cs.fallbackSpeechSynthesisLanguage;
this.fallbackVoice = cs.fallbackSpeechSynthesisVoice;
this.fallbackLabel = cs.fallbackSpeechSynthesisLabel;
}
this.ttsCredentials = cs.getSpeechCredentials(this.vendor, 'tts', this.speechSynthesisLabel);
this.ep.addCustomEventListener('dialogflow::intent', this._onIntent.bind(this, ep, cs));
this.ep.addCustomEventListener('dialogflow::transcription', this._onTranscription.bind(this, ep, cs));
@@ -223,17 +235,7 @@ class Dialogflow extends Task {
}
try {
const obj = {
account_sid: cs.accountSid,
text: intent.fulfillmentText,
vendor: this.vendor,
language: this.language,
voice: this.voice,
salt: cs.callSid,
credentials: this.ttsCredentials
};
this.logger.debug({obj}, 'Dialogflow:_onIntent - playing message via tts');
const {filePath, servedFromCache} = await synthAudio(stats, obj);
const {filePath, servedFromCache} = await this._fallbackSynthAudio(cs, intent, stats, synthAudio);
if (filePath) cs.trackTmpFile(filePath);
if (!this.ttsCredentials && !servedFromCache) cs.billForTts(intent.fulfillmentText.length);
@@ -279,6 +281,46 @@ class Dialogflow extends Task {
}
}
async _fallbackSynthAudio(cs, intent, stats, synthAudio) {
try {
const obj = {
account_sid: cs.accountSid,
text: intent.fulfillmentText,
vendor: this.vendor,
language: this.language,
voice: this.voice,
salt: cs.callSid,
credentials: this.ttsCredentials
};
this.logger.debug({obj}, 'Dialogflow:_onIntent - playing message via tts');
return await synthAudio(stats, obj);
} catch (error) {
this.logger.info({error}, 'Failed to synthesize audio from primary vendor');
try {
if (this.fallbackVendor) {
const credentials = cs.getSpeechCredentials(this.fallbackVendor, 'tts', this.fallbackLabel);
const obj = {
account_sid: cs.accountSid,
text: intent.fulfillmentText,
vendor: this.fallbackVendor,
language: this.fallbackLanguage,
voice: this.fallbackVoice,
salt: cs.callSid,
credentials
};
this.logger.debug({obj}, 'Dialogflow:_onIntent - playing message via fallback tts');
return await synthAudio(stats, obj);
}
} catch (err) {
this.logger.info({err}, 'Failed to synthesize audio from falllback vendor');
throw err;
}
throw error;
}
}
/**
* A transcription - either interim or final - has been returned.
* If we are doing barge-in based on hotword detection, check for the hotword or phrase.

View File

@@ -1,7 +1,5 @@
const Task = require('./task');
const {
TaskName,
TaskPreconditions,
GoogleTranscriptionEvents,
NuanceTranscriptionEvents,
AwsTranscriptionEvents,
@@ -19,6 +17,7 @@ const {
} = require('../config');
const makeTask = require('./make_task');
const assert = require('assert');
const SttTask = require('./stt-task');
const compileTranscripts = (logger, evt, arr) => {
if (!Array.isArray(arr) || arr.length === 0) return;
@@ -30,23 +29,9 @@ const compileTranscripts = (logger, evt, arr) => {
evt.alternatives[0].transcript = t.trim();
};
class TaskGather extends Task {
class TaskGather extends SttTask {
constructor(logger, opts, parentTask) {
super(logger, opts);
this.preconditions = TaskPreconditions.Endpoint;
const {
setChannelVarsForStt,
normalizeTranscription,
removeSpeechListeners,
setSpeechCredentialsAtRuntime,
compileSonioxTranscripts
} = require('../utils/transcription-utils')(logger);
this.setChannelVarsForStt = setChannelVarsForStt;
this.normalizeTranscription = normalizeTranscription;
this.removeSpeechListeners = removeSpeechListeners;
this.compileSonioxTranscripts = compileSonioxTranscripts;
super(logger, opts, parentTask);
[
'finishOnKey', 'input', 'numDigits', 'minDigits', 'maxDigits',
'interDigitTimeout', 'partialResultHook', 'bargein', 'dtmfBargein',
@@ -62,16 +47,10 @@ class TaskGather extends Task {
this.listenDuringPrompt = this.data.listenDuringPrompt === false ? false : true;
this.minBargeinWordCount = this.data.minBargeinWordCount || 1;
if (this.data.recognizer) {
const recognizer = this.data.recognizer;
this.vendor = recognizer.vendor;
this.language = recognizer.language;
/* let credentials be supplied in the recognizer object at runtime */
this.sttCredentials = setSpeechCredentialsAtRuntime(recognizer);
/* continuous ASR (i.e. compile transcripts until a special timeout or dtmf key) */
this.asrTimeout = typeof recognizer.asrTimeout === 'number' ? recognizer.asrTimeout * 1000 : 0;
if (this.asrTimeout > 0) this.asrDtmfTerminationDigit = recognizer.asrDtmfTerminationDigit;
this.asrTimeout = typeof this.data.recognizer.asrTimeout === 'number' ?
this.data.recognizer.asrTimeout * 1000 : 0;
if (this.asrTimeout > 0) this.asrDtmfTerminationDigit = this.data.recognizer.asrDtmfTerminationDigit;
this.isContinuousAsr = this.asrTimeout > 0;
if (Array.isArray(this.data.recognizer.hints) &&
@@ -79,10 +58,7 @@ class TaskGather extends Task {
logger.debug('Gather: an empty hints array was supplied, so we will mask global hints');
this.maskGlobalSttHints = true;
}
this.data.recognizer.hints = this.data.recognizer.hints || [];
this.data.recognizer.altLanguages = this.data.recognizer.altLanguages || [];
}
else this.data.recognizer = {hints: [], altLanguages: []};
this.digitBuffer = '';
this._earlyMedia = this.data.earlyMedia === true;
@@ -97,11 +73,6 @@ class TaskGather extends Task {
/* buffer speech for continuous asr */
this._bufferedTranscripts = [];
/* buffer for soniox transcripts */
this._sonioxTranscripts = [];
this.parentTask = parentTask;
this.partialTranscriptsCount = 0;
}
@@ -137,7 +108,6 @@ class TaskGather extends Task {
this.logger.debug({options: this.data}, 'Gather:exec');
await super.exec(cs);
const {updateSpeechCredentialLastUsed} = require('../utils/db-utils')(this.logger, cs.srf);
const {getNuanceAccessToken, getIbmAccessToken} = cs.srf.locals.dbHelpers;
if (cs.hasGlobalSttHints && !this.maskGlobalSttHints) {
const {hints, hintsBoost} = cs.globalSttHints;
@@ -184,58 +154,59 @@ class TaskGather extends Task {
this.language = cs.speechRecognizerLanguage;
if (this.data.recognizer) this.data.recognizer.language = this.language;
}
if ('default' === this.label || !this.label) {
this.label = cs.speechRecognizerLabel;
if (this.data.recognizer) this.data.recognizer.label = this.label;
}
// Fallback options
if ('default' === this.fallbackVendor || !this.fallbackVendor) {
this.fallbackVendor = cs.fallbackSpeechRecognizerVendor;
if (this.data.recognizer) this.data.recognizer.fallbackVendor = this.fallbackVendor;
}
if ('default' === this.fallbackLanguage || !this.fallbackLanguage) {
this.fallbackLanguage = cs.fallbackSpeechRecognizerLanguage;
if (this.data.recognizer) this.data.recognizer.fallbackLanguage = this.fallbackLanguage;
}
if ('default' === this.fallbackLabel || !this.fallbackLabel) {
this.fallbackLabel = cs.fallbackSpeechRecognizerLabel;
if (this.data.recognizer) this.data.recognizer.fallbackLabel = this.fallbackLabel;
}
if (!this.data.recognizer.vendor) {
this.data.recognizer.vendor = this.vendor;
}
if (this.needsStt && !this.sttCredentials) this.sttCredentials =
cs.getSpeechCredentials(this.vendor, 'stt', this.data.recognizer?.label || cs.speechRecognizerLabel);
if (this.needsStt && !this.sttCredentials) {
const {writeAlerts, AlertType} = cs.srf.locals;
this.logger.info(`TaskGather:exec - ERROR stt using ${this.vendor} requested but creds not supplied`);
writeAlerts({
account_sid: cs.accountSid,
alert_type: AlertType.STT_NOT_PROVISIONED,
vendor: this.vendor
}).catch((err) => this.logger.info({err}, 'Error generating alert for no stt'));
// Notify application that STT vender is wrong.
this.notifyError({
msg: 'ASR error',
details: `No speech-to-text service credentials for ${this.vendor} have been configured`
});
this.notifyTaskDone();
throw new Error(`No speech-to-text service credentials for ${this.vendor} have been configured`);
try {
this.sttCredentials = await this._initSpeechCredentials(cs, this.vendor, this.label);
} catch (error) {
if (this.fallbackVendor && this.isHandledByPrimaryProvider) {
await this._fallback();
} else {
throw error;
}
}
}
if (this.vendor === 'nuance' && this.sttCredentials.client_id) {
/* get nuance access token */
const {client_id, secret} = this.sttCredentials;
const {access_token, servedFromCache} = await getNuanceAccessToken(client_id, secret, 'asr tts');
this.logger.debug({client_id}, `Gather:exec - got nuance access token ${servedFromCache ? 'from cache' : ''}`);
this.sttCredentials = {...this.sttCredentials, access_token};
}
else if (this.vendor == 'ibm' && this.sttCredentials.stt_api_key) {
/* get ibm access token */
const {stt_api_key, stt_region} = this.sttCredentials;
const {access_token, servedFromCache} = await getIbmAccessToken(stt_api_key);
this.logger.debug({stt_api_key}, `Gather:exec - got ibm access token ${servedFromCache ? 'from cache' : ''}`);
this.sttCredentials = {...this.sttCredentials, access_token, stt_region};
}
const startListening = (cs, ep) => {
const startListening = async(cs, ep) => {
this._startTimer();
if (this.isContinuousAsr && 0 === this.timeout) this._startAsrTimer();
if (this.input.includes('speech') && !this.listenDuringPrompt) {
this._initSpeech(cs, ep)
.then(() => {
if (this.killed) {
this.logger.info('Gather:exec - task was quickly killed so do not transcribe');
return;
}
this._startTranscribing(ep);
return updateSpeechCredentialLastUsed(this.sttCredentials.speech_credential_sid);
})
.catch((err) => {
this.logger.error({err}, 'error in initSpeech');
});
try {
await this._initSpeech(cs, ep);
if (this.killed) {
this.logger.info('Gather:exec - task was quickly killed so do not transcribe');
return;
}
this._startTranscribing(ep);
return updateSpeechCredentialLastUsed(this.sttCredentials.speech_credential_sid);
} catch (e) {
if (this.fallbackVendor && this.isHandledByPrimaryProvider) {
await this._fallback();
startListening(cs, ep);
} else {
this.logger.error({error: e}, 'error in initSpeech');
}
}
}
};
@@ -705,8 +676,22 @@ class TaskGather extends Task {
_onJambonzConnect(_cs, _ep) {
this.logger.debug('TaskGather:_onJambonzConnect');
}
_onJambonzError(cs, _ep, evt) {
async _onJambonzError(cs, _ep, evt) {
this.logger.info({evt}, 'TaskGather:_onJambonzError');
if (this.isHandledByPrimaryProvider && this.fallbackVendor) {
_ep.stopTranscription({vendor: this.vendor})
.catch((err) => this.logger.error({err}, `Error stopping transcription for primary vendor ${this.vendor}`));
const {updateSpeechCredentialLastUsed} = require('../utils/db-utils')(this.logger, cs.srf);
try {
await this._fallback();
await this._initSpeech(cs, _ep);
this._startTranscribing(_ep);
updateSpeechCredentialLastUsed(this.sttCredentials.speech_credential_sid);
return;
} catch (error) {
this.logger.info({error}, `There is error while falling back to ${this.fallbackVendor}`);
}
}
const {writeAlerts, AlertType} = cs.srf.locals;
if (this.vendor === 'nuance') {

View File

@@ -25,7 +25,13 @@ class Lex extends Task {
this.vendor = this.data.tts.vendor || 'default';
this.language = this.data.tts.language || 'default';
this.voice = this.data.tts.voice || 'default';
this.speechCredentialLabel = this.data.tts.label || null;
this.speechCredentialLabel = this.data.tts.label || 'default';
// fallback tts: every setting is read from its own property of data.tts;
// a missing value becomes the placeholder 'default'
this.fallbackVendor = this.data.tts.fallbackVendor || 'default';
this.fallbackLanguage = this.data.tts.fallbackLanguage || 'default';
// the voice has its own setting - it must not reuse the fallback language value
this.fallbackVoice = this.data.tts.fallbackVoice || 'default';
this.fallbackLabel = this.data.tts.fallbackLabel || 'default';
}
this.botName = `${this.bot}:${this.alias}:${this.region}`;
@@ -103,9 +109,16 @@ class Lex extends Task {
this.vendor = cs.speechSynthesisVendor;
this.language = cs.speechSynthesisLanguage;
this.voice = cs.speechSynthesisVoice;
this.speechCredentialLabel = cs.speechSynthesisLabel;
}
this.ttsCredentials = cs.getSpeechCredentials(this.vendor, 'tts',
this.speechCredentialLabel || cs.speechSynthesisVendor);
if (this.fallbackVendor === 'default') {
this.fallbackVendor = cs.fallbackSpeechSynthesisVendor;
this.fallbackLanguage = cs.fallbackSpeechSynthesisLanguage;
this.fallbackVoice = cs.fallbackSpeechSynthesisVoice;
this.fallbackLabel = cs.fallbackSpeechSynthesisLabel;
}
this.ttsCredentials = cs.getSpeechCredentials(this.vendor, 'tts', this.speechCredentialLabel);
this.ep.addCustomEventListener('lex::intent', this._onIntent.bind(this, ep, cs));
this.ep.addCustomEventListener('lex::transcription', this._onTranscription.bind(this, ep, cs));
@@ -170,6 +183,41 @@ class Lex extends Task {
}
}
async _fallbackSynthAudio(cs, msg, stats, synthAudio) {
try {
const {filePath} = await synthAudio(stats, {
account_sid: cs.accountSid,
text: msg,
vendor: this.vendor,
language: this.language,
voice: this.voice,
salt: cs.callSid,
credentials: this.ttsCredentials
});
return filePath;
} catch (error) {
this.logger.info({error}, 'failed to synth audio from primary vendor');
if (this.fallbackVendor) {
try {
const credential = cs.getSpeechCredentials(this.fallbackVendor, 'tts', this.fallbackLabel);
const {filePath} = await synthAudio(stats, {
account_sid: cs.accountSid,
text: msg,
vendor: this.fallbackVendor,
language: this.fallbackLanguage,
voice: this.fallbackVoice,
salt: cs.callSid,
credentials: credential
});
return filePath;
} catch (err) {
this.logger.info({err}, 'failed to synth audio from fallback vendor');
}
}
}
}
/**
* @param {*} evt - event data
*/
@@ -189,16 +237,7 @@ class Lex extends Task {
try {
this.logger.debug(`tts with ${this.vendor} ${this.voice}`);
// eslint-disable-next-line no-unused-vars
const {filePath, servedFromCache} = await synthAudio(stats, {
account_sid: cs.accountSid,
text: msg,
vendor: this.vendor,
language: this.language,
voice: this.voice,
salt: cs.callSid,
credentials: this.ttsCredentials
});
const filePath = await this._fallbackSynthAudio(cs, msg, stats, synthAudio);
if (filePath) cs.trackTmpFile(filePath);
if (this.events.includes('start-play')) {

View File

@@ -37,6 +37,7 @@ class TaskSay extends Task {
this.synthesizer = this.data.synthesizer || {};
this.disableTtsCache = this.data.disableTtsCache;
this.options = this.synthesizer.options || {};
this.isHandledByPrimaryProvider = true;
}
get name() { return TaskName.Say; }
@@ -49,26 +50,15 @@ class TaskSay extends Task {
return `${this.name}{${this.text[0]}}`;
}
async exec(cs, {ep}) {
await super.exec(cs);
async _synthesizeWithSpecificVendor(cs, ep, {vendor, language, voice, label}) {
const {srf} = cs;
const {updateSpeechCredentialLastUsed} = require('../utils/db-utils')(this.logger, srf);
const {writeAlerts, AlertType, stats} = srf.locals;
const {synthAudio} = srf.locals.dbHelpers;
const vendor = this.synthesizer.vendor && this.synthesizer.vendor !== 'default' ?
this.synthesizer.vendor :
cs.speechSynthesisVendor;
const language = this.synthesizer.language && this.synthesizer.language !== 'default' ?
this.synthesizer.language :
cs.speechSynthesisLanguage ;
let voice = this.synthesizer.voice && this.synthesizer.voice !== 'default' ?
this.synthesizer.voice :
cs.speechSynthesisVoice;
const engine = this.synthesizer.engine || 'standard';
const salt = cs.callSid;
let credentials = cs.getSpeechCredentials(vendor, 'tts', this.data.synthesizer?.label || cs.speechSynthesisLabel);
let credentials = cs.getSpeechCredentials(vendor, 'tts', label);
/* parse Nuance voices into name and model */
let model;
if (vendor === 'nuance' && voice) {
@@ -90,7 +80,6 @@ class TaskSay extends Task {
}
this.logger.info({vendor, language, voice, model}, 'TaskSay:exec');
this.ep = ep;
try {
if (!credentials) {
writeAlerts({
@@ -160,31 +149,83 @@ class TaskSay extends Task {
detail: err.message
}).catch((err) => this.logger.info({err}, 'Error generating alert for tts failure'));
this.notifyError({msg: 'TTS error', details: err.message || err});
return;
throw err;
}
};
const arr = this.text.map((t) => generateAudio(t));
const filepath = (await Promise.all(arr)).filter((fp) => fp && fp.length);
this.notifyStatus({event: 'start-playback'});
while (!this.killed && (this.loop === 'forever' || this.loop--) && this.ep?.connected) {
let segment = 0;
while (!this.killed && segment < filepath.length) {
if (cs.isInConference) {
const {memberId, confName, confUuid} = cs;
await this.playToConfMember(this.ep, memberId, confName, confUuid, filepath[segment]);
}
else {
this.logger.debug(`Say:exec sending command to play file ${filepath[segment]}`);
await ep.play(filepath[segment]);
this.logger.debug(`Say:exec completed play file ${filepath[segment]}`);
}
segment++;
}
}
return (await Promise.all(arr)).filter((fp) => fp && fp.length);
} catch (err) {
this.logger.info(err, 'TaskSay:exec error');
throw err;
}
}
async exec(cs, {ep}) {
await super.exec(cs);
this.ep = ep;
const vendor = this.synthesizer.vendor && this.synthesizer.vendor !== 'default' ?
this.synthesizer.vendor :
cs.speechSynthesisVendor;
const language = this.synthesizer.language && this.synthesizer.language !== 'default' ?
this.synthesizer.language :
cs.speechSynthesisLanguage ;
const voice = this.synthesizer.voice && this.synthesizer.voice !== 'default' ?
this.synthesizer.voice :
cs.speechSynthesisVoice;
const label = this.synthesizer.label && this.synthesizer.label !== 'default' ?
this.synthesizer.label :
cs.speechSynthesisLabel;
const fallbackVendor = this.synthesizer.fallbackVendor && this.synthesizer.fallbackVendor !== 'default' ?
this.synthesizer.fallbackVendor :
cs.fallbackSpeechSynthesisVendor;
const fallbackLanguage = this.synthesizer.fallbackLanguage && this.synthesizer.fallbackLanguage !== 'default' ?
this.synthesizer.fallbackLanguage :
cs.fallbackSpeechSynthesisLanguage ;
const fallbackVoice = this.synthesizer.fallbackVoice && this.synthesizer.fallbackVoice !== 'default' ?
this.synthesizer.fallbackVoice :
cs.fallbackSpeechSynthesisVoice;
const fallbackLabel = this.synthesizer.fallbackLabel && this.synthesizer.fallbackLabel !== 'default' ?
this.synthesizer.fallbackLabel :
cs.fallbackSpeechSynthesisLabel;
let filepath;
try {
filepath = await this._synthesizeWithSpecificVendor(cs, ep, {vendor, language, voice, label});
} catch (error) {
if (fallbackVendor && this.isHandledByPrimaryProvider) {
this.isHandledByPrimaryProvider = false;
this.logger.info(`Synthesize error, fallback to ${fallbackVendor}`);
filepath = await this._synthesizeWithSpecificVendor(cs, ep,
{
vendor: fallbackVendor,
language: fallbackLanguage,
voice: fallbackVoice,
label: fallbackLabel
});
} else {
throw error;
}
}
this.notifyStatus({event: 'start-playback'});
while (!this.killed && (this.loop === 'forever' || this.loop--) && this.ep?.connected) {
let segment = 0;
while (!this.killed && segment < filepath.length) {
if (cs.isInConference) {
const {memberId, confName, confUuid} = cs;
await this.playToConfMember(this.ep, memberId, confName, confUuid, filepath[segment]);
}
else {
this.logger.debug(`Say:exec sending command to play file ${filepath[segment]}`);
await ep.play(filepath[segment]);
this.logger.debug(`Say:exec completed play file ${filepath[segment]}`);
}
segment++;
}
}
this.emit('playDone');
}

100
lib/tasks/stt-task.js Normal file
View File

@@ -0,0 +1,100 @@
const Task = require('./task');
const assert = require('assert');
const { TaskPreconditions } = require('../utils/constants');
/**
 * Base class for tasks that use speech-to-text (extended by TaskGather and
 * TaskTranscribe). It holds the recognizer settings (primary and fallback),
 * resolves speech credentials, and switches the task over to the fallback
 * vendor on request.
 */
class SttTask extends Task {
  /**
   * @param {object} logger
   * @param {object} data - task payload; data.recognizer, when present, carries the stt settings
   * @param {object} [parentTask] - task that created this one, if any
   */
  constructor(logger, data, parentTask) {
    super(logger, data);
    this.parentTask = parentTask;

    this.preconditions = TaskPreconditions.Endpoint;

    const {
      setChannelVarsForStt,
      normalizeTranscription,
      removeSpeechListeners,
      setSpeechCredentialsAtRuntime,
      compileSonioxTranscripts
    } = require('../utils/transcription-utils')(logger);
    this.setChannelVarsForStt = setChannelVarsForStt;
    this.normalizeTranscription = normalizeTranscription;
    this.removeSpeechListeners = removeSpeechListeners;
    this.compileSonioxTranscripts = compileSonioxTranscripts;

    /* flips to false once _fallback() has switched this task to the fallback vendor */
    this.isHandledByPrimaryProvider = true;

    if (this.data.recognizer) {
      const recognizer = this.data.recognizer;
      this.vendor = recognizer.vendor;
      this.language = recognizer.language;
      this.label = recognizer.label;

      /* fallback settings; 'default' is a placeholder that subclasses replace with session values */
      this.fallbackVendor = recognizer.fallbackVendor || 'default';
      this.fallbackLanguage = recognizer.fallbackLanguage || 'default';
      this.fallbackLabel = recognizer.fallbackLabel || 'default';

      /* let credentials be supplied in the recognizer object at runtime */
      this.sttCredentials = setSpeechCredentialsAtRuntime(recognizer);

      // NOTE(review): recognizer.hints / recognizer.altLanguages are not defaulted to []
      // here when a recognizer is supplied - confirm subclasses tolerate undefined.
    } else {
      this.data.recognizer = {hints: [], altLanguages: []};
    }

    /* buffer for soniox transcripts */
    this._sonioxTranscripts = [];
  }

  /**
   * Look up stt credentials for a vendor/label and, for nuance and ibm, add an
   * access token to them.
   *
   * When no credentials are found an STT_NOT_PROVISIONED alert is written, the
   * application is notified of the error, the task is marked done, and an
   * Error is thrown.
   *
   * @param {object} cs - call session; every lookup goes through this argument
   * @param {string} vendor - stt vendor name
   * @param {string} [label] - credential label
   * @returns {Promise<object>} credentials, including access_token for nuance/ibm
   * @throws {Error} when no credentials are configured for the vendor
   */
  async _initSpeechCredentials(cs, vendor, label) {
    /* read helpers from the cs argument so this does not depend on this.cs being set already */
    const {getNuanceAccessToken, getIbmAccessToken} = cs.srf.locals.dbHelpers;
    let credentials = cs.getSpeechCredentials(vendor, 'stt', label);

    if (!credentials) {
      const {writeAlerts, AlertType} = cs.srf.locals;
      this.logger.info(`ERROR stt using ${vendor} requested but creds not supplied`);
      writeAlerts({
        account_sid: cs.accountSid,
        alert_type: AlertType.STT_NOT_PROVISIONED,
        vendor
      }).catch((err) => this.logger.info({err}, 'Error generating alert for no stt'));
      /* notify the application that the stt vendor has no usable credentials */
      this.notifyError({
        msg: 'ASR error',
        details: `No speech-to-text service credentials for ${vendor} have been configured`
      });
      this.notifyTaskDone();
      throw new Error(`No speech-to-text service credentials for ${vendor} have been configured`);
    }

    if (vendor === 'nuance' && credentials.client_id) {
      /* get nuance access token */
      const {client_id, secret} = credentials;
      const {access_token, servedFromCache} = await getNuanceAccessToken(client_id, secret, 'asr tts');
      this.logger.debug({client_id}, `got nuance access token ${servedFromCache ? 'from cache' : ''}`);
      credentials = {...credentials, access_token};
    }
    else if (vendor === 'ibm' && credentials.stt_api_key) {
      /* get ibm access token */
      const {stt_api_key, stt_region} = credentials;
      const {access_token, servedFromCache} = await getIbmAccessToken(stt_api_key);
      this.logger.debug({stt_api_key}, `got ibm access token ${servedFromCache ? 'from cache' : ''}`);
      credentials = {...credentials, access_token, stt_region};
    }
    return credentials;
  }

  /**
   * Switch this task to the fallback stt vendor: the fallback vendor, language
   * and label become the active ones (also mirrored into data.recognizer) and
   * credentials are re-resolved for them using this.cs.
   *
   * @throws {AssertionError} when no fallback vendor is set
   * @throws {Error} when credentials for the fallback vendor cannot be resolved
   */
  async _fallback() {
    assert(this.fallbackVendor, 'fallback failed without fallbackVendor configuration');
    this.isHandledByPrimaryProvider = false;
    this.logger.info(`Failed to use primary STT provider, fallback to ${this.fallbackVendor}`);
    this.vendor = this.fallbackVendor;
    this.language = this.fallbackLanguage;
    this.label = this.fallbackLabel;
    this.data.recognizer.vendor = this.vendor;
    this.data.recognizer.language = this.language;
    this.data.recognizer.label = this.label;
    this.sttCredentials = await this._initSpeechCredentials(this.cs, this.vendor, this.label);
  }
}
module.exports = SttTask;

View File

@@ -1,8 +1,6 @@
const Task = require('./task');
const assert = require('assert');
const {
TaskName,
TaskPreconditions,
GoogleTranscriptionEvents,
NuanceTranscriptionEvents,
AwsTranscriptionEvents,
@@ -14,47 +12,21 @@ const {
JambonzTranscriptionEvents
} = require('../utils/constants');
const { normalizeJambones } = require('@jambonz/verb-specifications');
const SttTask = require('./stt-task');
const STT_LISTEN_SPAN_NAME = 'stt-listen';
class TaskTranscribe extends Task {
class TaskTranscribe extends SttTask {
constructor(logger, opts, parentTask) {
super(logger, opts);
this.preconditions = TaskPreconditions.Endpoint;
this.parentTask = parentTask;
const {
setChannelVarsForStt,
normalizeTranscription,
removeSpeechListeners,
setSpeechCredentialsAtRuntime,
compileSonioxTranscripts
} = require('../utils/transcription-utils')(logger);
this.setChannelVarsForStt = setChannelVarsForStt;
this.normalizeTranscription = normalizeTranscription;
this.removeSpeechListeners = removeSpeechListeners;
this.compileSonioxTranscripts = compileSonioxTranscripts;
super(logger, opts, parentTask);
this.transcriptionHook = this.data.transcriptionHook;
this.earlyMedia = this.data.earlyMedia === true || (parentTask && parentTask.earlyMedia);
if (this.data.recognizer) {
const recognizer = this.data.recognizer;
this.vendor = recognizer.vendor;
this.language = recognizer.language;
/* let credentials be supplied in the recognizer object at runtime */
this.sttCredentials = setSpeechCredentialsAtRuntime(recognizer);
this.interim = !!recognizer.interim;
this.separateRecognitionPerChannel = recognizer.separateRecognitionPerChannel;
this.data.recognizer.hints = this.data.recognizer.hints || [];
this.data.recognizer.altLanguages = this.data.recognizer.altLanguages || [];
this.interim = !!this.data.recognizer.interim;
this.separateRecognitionPerChannel = this.data.recognizer.separateRecognitionPerChannel;
}
else this.data.recognizer = {hints: [], altLanguages: []};
/* buffer for soniox transcripts */
this._sonioxTranscripts = [];
this.childSpan = [null, null];
@@ -70,7 +42,6 @@ class TaskTranscribe extends Task {
async exec(cs, {ep, ep2}) {
super.exec(cs);
const {updateSpeechCredentialLastUsed} = require('../utils/db-utils')(this.logger, cs.srf);
const {getNuanceAccessToken, getIbmAccessToken} = cs.srf.locals.dbHelpers;
if (cs.hasGlobalSttHints) {
const {hints, hintsBoost} = cs.globalSttHints;
@@ -98,39 +69,40 @@ class TaskTranscribe extends Task {
this.language = cs.speechRecognizerLanguage;
if (this.data.recognizer) this.data.recognizer.language = this.language;
}
if ('default' === this.label || !this.label) {
this.label = cs.speechRecognizerLabel;
if (this.data.recognizer) this.data.recognizer.label = this.label;
}
// Fallback options
if ('default' === this.fallbackVendor || !this.fallbackVendor) {
this.fallbackVendor = cs.fallbackSpeechRecognizerVendor;
if (this.data.recognizer) this.data.recognizer.fallbackVendor = this.fallbackVendor;
}
if ('default' === this.fallbackLanguage || !this.fallbackLanguage) {
this.fallbackLanguage = cs.fallbackSpeechRecognizerLanguage;
if (this.data.recognizer) this.data.recognizer.fallbackLanguage = this.fallbackLanguage;
}
if ('default' === this.fallbackLabel || !this.fallbackLabel) {
this.fallbackLabel = cs.fallbackSpeechRecognizerLabel;
if (this.data.recognizer) this.data.recognizer.fallbackLabel = this.fallbackLabel;
}
if (!this.data.recognizer.vendor) {
this.data.recognizer.vendor = this.vendor;
}
if (!this.sttCredentials) this.sttCredentials =
cs.getSpeechCredentials(this.vendor, 'stt', this.data.recognizer?.label || cs.speechRecognizerLabel);
if (!this.sttCredentials) {
try {
this.sttCredentials = await this._initSpeechCredentials(cs, this.vendor, this.label);
} catch (error) {
if (this.fallbackVendor && this.isHandledByPrimaryProvider) {
await this._fallback();
} else {
throw error;
}
}
}
try {
if (!this.sttCredentials) {
const {writeAlerts, AlertType} = cs.srf.locals;
this.logger.info(`TaskTranscribe:exec - ERROR stt using ${this.vendor} requested but creds not supplied`);
writeAlerts({
account_sid: cs.accountSid,
alert_type: AlertType.STT_NOT_PROVISIONED,
vendor: this.vendor
}).catch((err) => this.logger.info({err}, 'Error generating alert for no stt'));
throw new Error('no provisioned speech credentials for TTS');
}
if (this.vendor === 'nuance' && this.sttCredentials.client_id) {
/* get nuance access token */
const {client_id, secret} = this.sttCredentials;
const {access_token, servedFromCache} = await getNuanceAccessToken(client_id, secret, 'asr tts');
this.logger.debug({client_id},
`Transcribe:exec - got nuance access token ${servedFromCache ? 'from cache' : ''}`);
this.sttCredentials = {...this.sttCredentials, access_token};
}
else if (this.vendor == 'ibm' && this.sttCredentials.stt_api_key) {
/* get ibm access token */
const {stt_api_key, stt_region} = this.sttCredentials;
const {access_token, servedFromCache} = await getIbmAccessToken(stt_api_key);
this.logger.debug({stt_api_key}, `Gather:exec - got ibm access token ${servedFromCache ? 'from cache' : ''}`);
this.sttCredentials = {...this.sttCredentials, access_token, stt_region};
}
await this._startTranscribing(cs, ep, 1);
if (this.separateRecognitionPerChannel && ep2) {
await this._startTranscribing(cs, ep2, 2);
@@ -494,23 +466,41 @@ class TaskTranscribe extends Task {
_onIbmError(cs, _ep, _channel, evt) {
this.logger.info({evt}, 'TaskTranscribe:_onIbmError');
}
_onJambonzError(cs, _ep, evt) {
async _onJambonzError(cs, _ep, evt) {
this.logger.info({evt}, 'TaskTranscribe:_onJambonzError');
const {writeAlerts, AlertType} = cs.srf.locals;
if (this.isHandledByPrimaryProvider && this.fallbackVendor) {
_ep.stopTranscription({vendor: this.vendor})
.catch((err) => this.logger.error({err}, `Error stopping transcription for primary vendor ${this.vendor}`));
const {updateSpeechCredentialLastUsed} = require('../utils/db-utils')(this.logger, cs.srf);
try {
await this._fallback();
let channel = 1;
if (this.ep !== _ep) {
channel = 2;
}
this._startTranscribing(cs, _ep, channel);
updateSpeechCredentialLastUsed(this.sttCredentials.speech_credential_sid);
return;
} catch (error) {
this.logger.info({error}, `There is error while falling back to ${this.fallbackVendor}`);
}
} else {
const {writeAlerts, AlertType} = cs.srf.locals;
if (this.vendor === 'nuance') {
const {code, error} = evt;
if (code === 404 && error === 'No speech') return this._resolve('timeout');
if (code === 413 && error === 'Too much speech') return this._resolve('timeout');
if (this.vendor === 'nuance') {
const {code, error} = evt;
if (code === 404 && error === 'No speech') return this._resolve('timeout');
if (code === 413 && error === 'Too much speech') return this._resolve('timeout');
}
this.logger.info({evt}, 'TaskTranscribe:_onJambonzError');
writeAlerts({
account_sid: cs.accountSid,
alert_type: AlertType.STT_FAILURE,
message: `Custom speech vendor ${this.vendor} error: ${evt.error}`,
vendor: this.vendor,
}).catch((err) => this.logger.info({err}, 'Error generating alert for jambonz custom connection failure'));
this.notifyError({msg: 'ASR error', details:`Custom speech vendor ${this.vendor} error: ${evt.error}`});
}
this.logger.info({evt}, 'TaskTranscribe:_onJambonzError');
writeAlerts({
account_sid: cs.accountSid,
alert_type: AlertType.STT_FAILURE,
message: `Custom speech vendor ${this.vendor} error: ${evt.error}`,
vendor: this.vendor,
}).catch((err) => this.logger.info({err}, 'Error generating alert for jambonz custom connection failure'));
this.notifyError({msg: 'ASR error', details:`Custom speech vendor ${this.vendor} error: ${evt.error}`});
}
_startAsrTimer(channel) {

View File

@@ -356,12 +356,12 @@ module.exports = (logger) => {
...(rOpts.punctuation === false && {GOOGLE_SPEECH_ENABLE_AUTOMATIC_PUNCTUATION: 0}),
...(rOpts.words == false && {GOOGLE_SPEECH_ENABLE_WORD_TIME_OFFSETS: 0}),
...(rOpts.diarization === false && {GOOGLE_SPEECH_SPEAKER_DIARIZATION: 0}),
...(rOpts.hints.length > 0 && typeof rOpts.hints[0] === 'string' &&
...(rOpts.hints?.length > 0 && typeof rOpts.hints[0] === 'string' &&
{GOOGLE_SPEECH_HINTS: rOpts.hints.join(',')}),
...(rOpts.hints.length > 0 && typeof rOpts.hints[0] === 'object' &&
...(rOpts.hints?.length > 0 && typeof rOpts.hints[0] === 'object' &&
{GOOGLE_SPEECH_HINTS: JSON.stringify(rOpts.hints)}),
...(typeof rOpts.hintsBoost === 'number' && {GOOGLE_SPEECH_HINTS_BOOST: rOpts.hintsBoost}),
...(rOpts.altLanguages.length > 0 &&
...(rOpts.altLanguages?.length > 0 &&
{GOOGLE_SPEECH_ALTERNATIVE_LANGUAGE_CODES: [...new Set(rOpts.altLanguages)].join(',')}),
...(rOpts.interactionType &&
{GOOGLE_SPEECH_METADATA_INTERACTION_TYPE: rOpts.interactionType}),

View File

@@ -88,8 +88,8 @@ test('test create-call call-hook basic authentication', async(t) => {
let verbs = [
{
"verb": "say",
"text": "hello"
"verb": "pause",
"length": 1
}
];
await provisionCallHook(from, verbs);

View File

@@ -1063,7 +1063,7 @@ CREATE TABLE `speech_credentials` (
LOCK TABLES `speech_credentials` WRITE;
/*!40000 ALTER TABLE `speech_credentials` DISABLE KEYS */;
INSERT INTO `speech_credentials` VALUES ('2add163c-34f2-45c6-a016-f955d218ffb6',NULL,'bb845d4b-83a9-4cde-a6e9-50f3743bab3f','google','credential-goes-here',1,1,NULL,'2021-04-03 15:42:10',1,1,'2023-05-31 03:44:21'),('2add347f-34f2-45c6-a016-f955d218ffb6',NULL,'bb845d4b-83a9-4cde-a6e9-50f3743bab3f','microsoft','credential-goes-here',1,1,NULL,'2021-04-03 15:42:10',1,1,'2023-05-31 03:44:21'),('84154212-5c99-4c94-8993-bc2a46288daa',NULL,'bb845d4b-83a9-4cde-a6e9-50f3743bab3f','aws','credential-goes-here',1,1,NULL,NULL,NULL,NULL,'2023-05-31 03:44:21');
INSERT INTO `speech_credentials` VALUES ('2add163c-34f2-45c6-a016-f955d218ffb6',NULL,'bb845d4b-83a9-4cde-a6e9-50f3743bab3f','google','credential-goes-here',1,1,NULL,'2021-04-03 15:42:10',1,1,'2023-05-31 03:44:21'),('2add347f-34f2-45c6-a016-f955d218ffb6',NULL,'bb845d4b-83a9-4cde-a6e9-50f3743bab3f','microsoft','credential-goes-here',1,1,NULL,'2021-04-03 15:42:10',1,1,'2023-05-31 03:44:21'),('84154212-5c99-4c94-8993-bc2a46288daa',NULL,'bb845d4b-83a9-4cde-a6e9-50f3743bab3f','aws','credential-goes-here',1,1,NULL,NULL,1,1,'2023-05-31 03:44:21');
/*!40000 ALTER TABLE `speech_credentials` ENABLE KEYS */;
UNLOCK TABLES;

View File

@@ -479,8 +479,10 @@ app_json TEXT,
speech_synthesis_vendor VARCHAR(64) NOT NULL DEFAULT 'google',
speech_synthesis_language VARCHAR(12) NOT NULL DEFAULT 'en-US',
speech_synthesis_voice VARCHAR(64),
speech_synthesis_label VARCHAR(64),
speech_recognizer_vendor VARCHAR(64) NOT NULL DEFAULT 'google',
speech_recognizer_language VARCHAR(64) NOT NULL DEFAULT 'en-US',
speech_recognizer_label VARCHAR(64),
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
record_all_calls BOOLEAN NOT NULL DEFAULT false,
PRIMARY KEY (application_sid)

View File

@@ -7,7 +7,7 @@ require('./dial-tests');
require('./webhooks-tests');
require('./say-tests');
require('./gather-tests');
// require('./transcribe-tests');
require('./transcribe-tests');
require('./sip-request-tests');
require('./create-call-test');
require('./play-tests');