|
|
|
|
@@ -9,6 +9,7 @@ const {
|
|
|
|
|
|
|
|
|
|
const makeTask = require('./make_task');
|
|
|
|
|
const assert = require('assert');
|
|
|
|
|
const GATHER_STABILITY_THRESHOLD = Number(process.env.JAMBONZ_GATHER_STABILITY_THRESHOLD || 0.7);
|
|
|
|
|
|
|
|
|
|
class TaskGather extends Task {
|
|
|
|
|
constructor(logger, opts, parentTask) {
|
|
|
|
|
@@ -16,12 +17,15 @@ class TaskGather extends Task {
|
|
|
|
|
this.preconditions = TaskPreconditions.Endpoint;
|
|
|
|
|
|
|
|
|
|
[
|
|
|
|
|
'finishOnKey', 'hints', 'input', 'numDigits',
|
|
|
|
|
'partialResultHook',
|
|
|
|
|
'finishOnKey', 'hints', 'input', 'numDigits', 'minDigits', 'maxDigits',
|
|
|
|
|
'interDigitTimeout', 'partialResultHook', 'bargein', 'dtmfBargein',
|
|
|
|
|
'speechTimeout', 'timeout', 'say', 'play'
|
|
|
|
|
].forEach((k) => this[k] = this.data[k]);
|
|
|
|
|
|
|
|
|
|
this.timeout = (this.timeout || 5) * 1000;
|
|
|
|
|
/* when collecting dtmf, bargein on dtmf is true unless explicitly set to false */
|
|
|
|
|
if (this.dtmfBargein !== false && this.input.includes('digits')) this.dtmfBargein = true;
|
|
|
|
|
|
|
|
|
|
this.timeout = (this.timeout || 15) * 1000;
|
|
|
|
|
this.interim = this.partialResultCallback;
|
|
|
|
|
if (this.data.recognizer) {
|
|
|
|
|
const recognizer = this.data.recognizer;
|
|
|
|
|
@@ -44,6 +48,10 @@ class TaskGather extends Task {
|
|
|
|
|
this.profanityOption = recognizer.profanityOption || 'raw';
|
|
|
|
|
this.requestSnr = recognizer.requestSnr || false;
|
|
|
|
|
this.initialSpeechTimeoutMs = recognizer.initialSpeechTimeoutMs || 0;
|
|
|
|
|
|
|
|
|
|
/* barge in configuration */
|
|
|
|
|
this.listenDuringPrompt = this.data.listenDuringPrompt === false ? false : true;
|
|
|
|
|
this.minBargeinWordCount = this.data.minBargeinWordCount || 1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
this.digitBuffer = '';
|
|
|
|
|
@@ -52,6 +60,12 @@ class TaskGather extends Task {
|
|
|
|
|
if (this.say) this.sayTask = makeTask(this.logger, {say: this.say}, this);
|
|
|
|
|
if (this.play) this.playTask = makeTask(this.logger, {play: this.play}, this);
|
|
|
|
|
|
|
|
|
|
if (this.sayTask || this.playTask) {
|
|
|
|
|
// this is specifically for barge-in, where we want to play a bargeable prompt
|
|
|
|
|
// to a user without listening after the say task has finished
|
|
|
|
|
this.listenAfterSpeech = typeof this.data.listenAfterSpeech === 'boolean' ? this.data.listenAfterSpeech : true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
this.parentTask = parentTask;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@@ -84,29 +98,56 @@ class TaskGather extends Task {
|
|
|
|
|
throw new Error(`no speech-to-text service credentials for ${this.vendor} have been configured`);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
const startListening = (cs, ep) => {
|
|
|
|
|
this._startTimer();
|
|
|
|
|
if (this.input.includes('speech') && !this.listenDuringPrompt) {
|
|
|
|
|
this._initSpeech(cs, ep)
|
|
|
|
|
.then(() => {
|
|
|
|
|
this._startTranscribing(ep);
|
|
|
|
|
return updateSpeechCredentialLastUsed(this.sttCredentials.speech_credential_sid);
|
|
|
|
|
})
|
|
|
|
|
.catch(() => {});
|
|
|
|
|
}
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
try {
|
|
|
|
|
if (this.sayTask) {
|
|
|
|
|
this.sayTask.exec(cs, ep); // kicked off, _not_ waiting for it to complete
|
|
|
|
|
this.sayTask.on('playDone', (err) => {
|
|
|
|
|
if (!this.killed) this._startTimer();
|
|
|
|
|
if (err) return this.logger.error({err}, 'Gather:exec Error playing tts');
|
|
|
|
|
this.logger.info('Gather: say task completed');
|
|
|
|
|
if (!this.killed) {
|
|
|
|
|
if (this.listenAfterSpeech === true) {
|
|
|
|
|
startListening(cs, ep);
|
|
|
|
|
} else {
|
|
|
|
|
this.notifyTaskDone();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
});
|
|
|
|
|
}
|
|
|
|
|
else if (this.playTask) {
|
|
|
|
|
this.playTask.exec(cs, ep); // kicked off, _not_ waiting for it to complete
|
|
|
|
|
this.playTask.on('playDone', (err) => {
|
|
|
|
|
if (!this.killed) this._startTimer();
|
|
|
|
|
if (err) return this.logger.error({err}, 'Gather:exec Error playing url');
|
|
|
|
|
if (!this.killed) {
|
|
|
|
|
if (this.listenAfterSpeech === true) {
|
|
|
|
|
startListening(cs, ep);
|
|
|
|
|
} else {
|
|
|
|
|
this.notifyTaskDone();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
});
|
|
|
|
|
}
|
|
|
|
|
else this._startTimer();
|
|
|
|
|
else startListening(cs, ep);
|
|
|
|
|
|
|
|
|
|
if (this.input.includes('speech')) {
|
|
|
|
|
if (this.input.includes('speech') && this.listenDuringPrompt) {
|
|
|
|
|
await this._initSpeech(cs, ep);
|
|
|
|
|
this._startTranscribing(ep);
|
|
|
|
|
updateSpeechCredentialLastUsed(this.sttCredentials.speech_credential_sid)
|
|
|
|
|
.catch(() => {/*already logged error */});
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (this.input.includes('digits')) {
|
|
|
|
|
if (this.input.includes('digits') || this.dtmfBargein) {
|
|
|
|
|
ep.on('dtmf', this._onDtmf.bind(this, cs, ep));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@@ -125,89 +166,111 @@ class TaskGather extends Task {
|
|
|
|
|
super.kill(cs);
|
|
|
|
|
this._killAudio(cs);
|
|
|
|
|
this.ep.removeAllListeners('dtmf');
|
|
|
|
|
clearTimeout(this.interDigitTimer);
|
|
|
|
|
this._resolve('killed');
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
_onDtmf(cs, ep, evt) {
|
|
|
|
|
this.logger.debug(evt, 'TaskGather:_onDtmf');
|
|
|
|
|
if (evt.dtmf === this.finishOnKey) this._resolve('dtmf-terminator-key');
|
|
|
|
|
clearTimeout(this.interDigitTimer);
|
|
|
|
|
let resolved = false;
|
|
|
|
|
if (this.dtmfBargein) this._killAudio(cs);
|
|
|
|
|
if (evt.dtmf === this.finishOnKey && this.input.includes('digits')) {
|
|
|
|
|
resolved = true;
|
|
|
|
|
this._resolve('dtmf-terminator-key');
|
|
|
|
|
}
|
|
|
|
|
else {
|
|
|
|
|
this.digitBuffer += evt.dtmf;
|
|
|
|
|
if (this.digitBuffer.length === this.numDigits) this._resolve('dtmf-num-digits');
|
|
|
|
|
const len = this.digitBuffer.length;
|
|
|
|
|
if (len === this.numDigits || len === this.maxDigits) {
|
|
|
|
|
resolved = true;
|
|
|
|
|
this._resolve('dtmf-num-digits');
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if (!resolved && this.interDigitTimeout > 0 && this.digitBuffer.length >= this.minDigits) {
|
|
|
|
|
/* start interDigitTimer */
|
|
|
|
|
const ms = this.interDigitTimeout * 1000;
|
|
|
|
|
this.logger.debug(`starting interdigit timer of ${ms}`);
|
|
|
|
|
this.interDigitTimer = setTimeout(() => this._resolve('dtmf-interdigit-timeout'), ms);
|
|
|
|
|
}
|
|
|
|
|
this._killAudio(cs);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async _initSpeech(cs, ep) {
|
|
|
|
|
const opts = {};
|
|
|
|
|
try {
|
|
|
|
|
const opts = {};
|
|
|
|
|
|
|
|
|
|
if (this.vad.enable) {
|
|
|
|
|
opts.START_RECOGNIZING_ON_VAD = 1;
|
|
|
|
|
if (this.vad.voiceMs) opts.RECOGNIZER_VAD_VOICE_MS = this.vad.voiceMs;
|
|
|
|
|
if (this.vad.mode >= 0 && this.vad.mode <= 3) opts.RECOGNIZER_VAD_MODE = this.vad.mode;
|
|
|
|
|
}
|
|
|
|
|
if (this.vad.enable) {
|
|
|
|
|
opts.START_RECOGNIZING_ON_VAD = 1;
|
|
|
|
|
if (this.vad.voiceMs) opts.RECOGNIZER_VAD_VOICE_MS = this.vad.voiceMs;
|
|
|
|
|
if (this.vad.mode >= 0 && this.vad.mode <= 3) opts.RECOGNIZER_VAD_MODE = this.vad.mode;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if ('google' === this.vendor) {
|
|
|
|
|
if (this.sttCredentials) opts.GOOGLE_APPLICATION_CREDENTIALS = JSON.stringify(this.sttCredentials.credentials);
|
|
|
|
|
Object.assign(opts, {
|
|
|
|
|
GOOGLE_SPEECH_USE_ENHANCED: true,
|
|
|
|
|
GOOGLE_SPEECH_SINGLE_UTTERANCE: true,
|
|
|
|
|
GOOGLE_SPEECH_MODEL: 'command_and_search'
|
|
|
|
|
});
|
|
|
|
|
if (this.hints && this.hints.length > 1) {
|
|
|
|
|
opts.GOOGLE_SPEECH_HINTS = this.hints.map((h) => h.trim()).join(',');
|
|
|
|
|
}
|
|
|
|
|
if (this.altLanguages && this.altLanguages.length > 1) {
|
|
|
|
|
opts.GOOGLE_SPEECH_ALTERNATIVE_LANGUAGE_CODES = this.altLanguages.join(',');
|
|
|
|
|
}
|
|
|
|
|
if (this.profanityFilter === true) {
|
|
|
|
|
Object.assign(opts, {'GOOGLE_SPEECH_PROFANITY_FILTER': true});
|
|
|
|
|
}
|
|
|
|
|
ep.addCustomEventListener(GoogleTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
|
|
|
|
|
ep.addCustomEventListener(GoogleTranscriptionEvents.EndOfUtterance, this._onEndOfUtterance.bind(this, cs, ep));
|
|
|
|
|
}
|
|
|
|
|
else if (['aws', 'polly'].includes(this.vendor)) {
|
|
|
|
|
if (this.vocabularyName) opts.AWS_VOCABULARY_NAME = this.vocabularyName;
|
|
|
|
|
if (this.vocabularyFilterName) {
|
|
|
|
|
opts.AWS_VOCABULARY_NAME = this.vocabularyFilterName;
|
|
|
|
|
opts.AWS_VOCABULARY_FILTER_METHOD = this.filterMethod || 'mask';
|
|
|
|
|
}
|
|
|
|
|
if (this.sttCredentials) {
|
|
|
|
|
if ('google' === this.vendor) {
|
|
|
|
|
if (this.sttCredentials) opts.GOOGLE_APPLICATION_CREDENTIALS = JSON.stringify(this.sttCredentials.credentials);
|
|
|
|
|
Object.assign(opts, {
|
|
|
|
|
AWS_ACCESS_KEY_ID: this.sttCredentials.accessKeyId,
|
|
|
|
|
AWS_SECRET_ACCESS_KEY: this.sttCredentials.secretAccessKey,
|
|
|
|
|
AWS_REGION: this.sttCredentials.region
|
|
|
|
|
GOOGLE_SPEECH_USE_ENHANCED: true,
|
|
|
|
|
GOOGLE_SPEECH_SINGLE_UTTERANCE: true,
|
|
|
|
|
GOOGLE_SPEECH_MODEL: 'command_and_search'
|
|
|
|
|
});
|
|
|
|
|
if (this.hints && this.hints.length > 1) {
|
|
|
|
|
opts.GOOGLE_SPEECH_HINTS = this.hints.map((h) => h.trim()).join(',');
|
|
|
|
|
}
|
|
|
|
|
if (this.altLanguages && this.altLanguages.length > 1) {
|
|
|
|
|
opts.GOOGLE_SPEECH_ALTERNATIVE_LANGUAGE_CODES = this.altLanguages.join(',');
|
|
|
|
|
}
|
|
|
|
|
if (this.profanityFilter === true) {
|
|
|
|
|
Object.assign(opts, {'GOOGLE_SPEECH_PROFANITY_FILTER': true});
|
|
|
|
|
}
|
|
|
|
|
ep.addCustomEventListener(GoogleTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
|
|
|
|
|
ep.addCustomEventListener(GoogleTranscriptionEvents.EndOfUtterance, this._onEndOfUtterance.bind(this, cs, ep));
|
|
|
|
|
}
|
|
|
|
|
ep.addCustomEventListener(AwsTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
|
|
|
|
|
}
|
|
|
|
|
else if ('microsoft' === this.vendor) {
|
|
|
|
|
if (this.sttCredentials) {
|
|
|
|
|
Object.assign(opts, {
|
|
|
|
|
'AZURE_SUBSCRIPTION_KEY': this.sttCredentials.api_key,
|
|
|
|
|
'AZURE_REGION': this.sttCredentials.region
|
|
|
|
|
});
|
|
|
|
|
else if (['aws', 'polly'].includes(this.vendor)) {
|
|
|
|
|
if (this.vocabularyName) opts.AWS_VOCABULARY_NAME = this.vocabularyName;
|
|
|
|
|
if (this.vocabularyFilterName) {
|
|
|
|
|
opts.AWS_VOCABULARY_NAME = this.vocabularyFilterName;
|
|
|
|
|
opts.AWS_VOCABULARY_FILTER_METHOD = this.filterMethod || 'mask';
|
|
|
|
|
}
|
|
|
|
|
if (this.sttCredentials) {
|
|
|
|
|
Object.assign(opts, {
|
|
|
|
|
AWS_ACCESS_KEY_ID: this.sttCredentials.accessKeyId,
|
|
|
|
|
AWS_SECRET_ACCESS_KEY: this.sttCredentials.secretAccessKey,
|
|
|
|
|
AWS_REGION: this.sttCredentials.region
|
|
|
|
|
});
|
|
|
|
|
}
|
|
|
|
|
ep.addCustomEventListener(AwsTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
|
|
|
|
|
}
|
|
|
|
|
if (this.hints && this.hints.length > 1) {
|
|
|
|
|
opts.AZURE_SPEECH_HINTS = this.hints.map((h) => h.trim()).join(',');
|
|
|
|
|
}
|
|
|
|
|
//if (this.requestSnr) opts.AZURE_REQUEST_SNR = 1;
|
|
|
|
|
//if (this.profanityOption !== 'raw') opts.AZURE_PROFANITY_OPTION = this.profanityOption;
|
|
|
|
|
if (this.initialSpeechTimeoutMs > 0) opts.AZURE_INITIAL_SPEECH_TIMEOUT_MS = this.initialSpeechTimeoutMs;
|
|
|
|
|
opts.AZURE_USE_OUTPUT_FORMAT_DETAILED = 1;
|
|
|
|
|
else if ('microsoft' === this.vendor) {
|
|
|
|
|
if (this.sttCredentials) {
|
|
|
|
|
Object.assign(opts, {
|
|
|
|
|
'AZURE_SUBSCRIPTION_KEY': this.sttCredentials.api_key,
|
|
|
|
|
'AZURE_REGION': this.sttCredentials.region
|
|
|
|
|
});
|
|
|
|
|
}
|
|
|
|
|
if (this.hints && this.hints.length > 1) {
|
|
|
|
|
opts.AZURE_SPEECH_HINTS = this.hints.map((h) => h.trim()).join(',');
|
|
|
|
|
}
|
|
|
|
|
//if (this.requestSnr) opts.AZURE_REQUEST_SNR = 1;
|
|
|
|
|
//if (this.profanityOption !== 'raw') opts.AZURE_PROFANITY_OPTION = this.profanityOption;
|
|
|
|
|
if (this.initialSpeechTimeoutMs > 0) opts.AZURE_INITIAL_SPEECH_TIMEOUT_MS = this.initialSpeechTimeoutMs;
|
|
|
|
|
opts.AZURE_USE_OUTPUT_FORMAT_DETAILED = 1;
|
|
|
|
|
|
|
|
|
|
ep.addCustomEventListener(AzureTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
|
|
|
|
|
ep.addCustomEventListener(AzureTranscriptionEvents.NoSpeechDetected, this._onNoSpeechDetected.bind(this, cs, ep));
|
|
|
|
|
ep.addCustomEventListener(AzureTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
|
|
|
|
|
ep.addCustomEventListener(AzureTranscriptionEvents.NoSpeechDetected,
|
|
|
|
|
this._onNoSpeechDetected.bind(this, cs, ep));
|
|
|
|
|
}
|
|
|
|
|
await ep.set(opts)
|
|
|
|
|
.catch((err) => this.logger.error(err, 'Error setting channel variables'));
|
|
|
|
|
} catch (err) {
|
|
|
|
|
this.logger.error(err, 'could not init speech for listening');
|
|
|
|
|
}
|
|
|
|
|
await ep.set(opts)
|
|
|
|
|
.catch((err) => this.logger.info(err, 'Error setting channel variables'));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_startTranscribing(ep) {
|
|
|
|
|
ep.startTranscription({
|
|
|
|
|
vendor: this.vendor,
|
|
|
|
|
locale: this.language,
|
|
|
|
|
interim: this.partialResultCallback ? true : false,
|
|
|
|
|
interim: this.partialResultCallback || this.bargein,
|
|
|
|
|
}).catch((err) => {
|
|
|
|
|
const {writeAlerts, AlertType} = this.cs.srf.locals;
|
|
|
|
|
this.logger.error(err, 'TaskGather:_startTranscribing error');
|
|
|
|
|
@@ -249,25 +312,50 @@ class TaskGather extends Task {
|
|
|
|
|
_onTranscription(cs, ep, evt) {
|
|
|
|
|
if ('aws' === this.vendor && Array.isArray(evt) && evt.length > 0) evt = evt[0];
|
|
|
|
|
if ('microsoft' === this.vendor) {
|
|
|
|
|
const nbest = evt.NBest;
|
|
|
|
|
const newEvent = {
|
|
|
|
|
is_final: evt.RecognitionStatus === 'Success',
|
|
|
|
|
alternatives: [
|
|
|
|
|
{
|
|
|
|
|
confidence: nbest[0].Confidence,
|
|
|
|
|
transcript: nbest[0].Display
|
|
|
|
|
}
|
|
|
|
|
]
|
|
|
|
|
};
|
|
|
|
|
evt = newEvent;
|
|
|
|
|
const final = evt.RecognitionStatus === 'Success';
|
|
|
|
|
if (final) {
|
|
|
|
|
const nbest = evt.NBest;
|
|
|
|
|
evt = {
|
|
|
|
|
is_final: true,
|
|
|
|
|
alternatives: [
|
|
|
|
|
{
|
|
|
|
|
confidence: nbest[0].Confidence,
|
|
|
|
|
transcript: nbest[0].Display
|
|
|
|
|
}
|
|
|
|
|
]
|
|
|
|
|
};
|
|
|
|
|
}
|
|
|
|
|
else {
|
|
|
|
|
evt = {
|
|
|
|
|
is_final: false,
|
|
|
|
|
alternatives: [
|
|
|
|
|
{
|
|
|
|
|
transcript: evt.Text
|
|
|
|
|
}
|
|
|
|
|
]
|
|
|
|
|
};
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
this.logger.debug(evt, 'TaskGather:_onTranscription');
|
|
|
|
|
if (evt.is_final) this._resolve('speech', evt);
|
|
|
|
|
else if (this.partialResultHook) {
|
|
|
|
|
this.cs.requestor.request('verb:hook', this.partialResultHook, Object.assign({speech: evt}, this.cs.callInfo))
|
|
|
|
|
.catch((err) => this.logger.info(err, 'GatherTask:_onTranscription error'));
|
|
|
|
|
else {
|
|
|
|
|
/* google has a measure of stability:
|
|
|
|
|
https://cloud.google.com/speech-to-text/docs/basics#streaming_responses
|
|
|
|
|
others do not.
|
|
|
|
|
*/
|
|
|
|
|
const isStableEnough = typeof evt.stability === 'undefined' || evt.stability > GATHER_STABILITY_THRESHOLD;
|
|
|
|
|
|
|
|
|
|
if (this.bargein && isStableEnough &&
|
|
|
|
|
evt.alternatives[0].transcript.split(' ').length >= this.minBargeinWordCount) {
|
|
|
|
|
this.logger.debug('Gather:_onTranscription - killing audio due to speech bargein');
|
|
|
|
|
this._killAudio(cs);
|
|
|
|
|
}
|
|
|
|
|
if (this.partialResultHook) {
|
|
|
|
|
this.cs.requestor.request(this.partialResultHook, Object.assign({speech: evt}, this.cs.callInfo))
|
|
|
|
|
.catch((err) => this.logger.info(err, 'GatherTask:_onTranscription error'));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
_onEndOfUtterance(cs, ep) {
|
|
|
|
|
this.logger.info('TaskGather:_onEndOfUtterance');
|
|
|
|
|
if (!this.resolved && !this.killed) {
|
|
|
|
|
@@ -283,6 +371,7 @@ class TaskGather extends Task {
|
|
|
|
|
if (this.resolved) return;
|
|
|
|
|
this.resolved = true;
|
|
|
|
|
this.logger.debug(`TaskGather:resolve with reason ${reason}`);
|
|
|
|
|
clearTimeout(this.interDigitTimer);
|
|
|
|
|
|
|
|
|
|
if (this.ep && this.ep.connected) {
|
|
|
|
|
this.ep.stopTranscription({vendor: this.vendor})
|
|
|
|
|
|