const Task = require('./task');
const {
  TaskName,
  TaskPreconditions,
  GoogleTranscriptionEvents,
  AwsTranscriptionEvents,
  AzureTranscriptionEvents
} = require('../utils/constants');
const makeTask = require('./make_task');
const assert = require('assert');

/**
 * Gather task: collects DTMF digits and/or a speech transcript from the caller,
 * optionally while a nested say/play prompt is running, and reports the outcome
 * either through performAction() or, when a parent task was supplied, by
 * emitting 'transcription' / 'timeout' events on that parent.
 */
class TaskGather extends Task {
  /**
   * @param {object} logger - logger handed to the base Task
   * @param {object} opts - verb options; read back through this.data
   * @param {object} [parentTask] - when provided, speech and timeout outcomes
   *   are emitted on it instead of being sent through performAction()
   */
  constructor(logger, opts, parentTask) {
    super(logger, opts);
    this.preconditions = TaskPreconditions.Endpoint;

    [
      'finishOnKey', 'hints', 'input', 'numDigits',
      'partialResultHook', 'speechTimeout', 'timeout',
      'say', 'play'
    ].forEach((k) => this[k] = this.data[k]);

    /* timeout is supplied in seconds (default 5) and kept in milliseconds */
    this.timeout = (this.timeout || 5) * 1000;

    /* interim results are only useful when there is a hook to deliver them to */
    this.interim = !!this.partialResultHook;

    /* safe default so _initSpeech can always read this.vad, even when no recognizer is supplied */
    this.vad = {enable: false, voiceMs: 0, mode: -1};

    if (this.data.recognizer) {
      const recognizer = this.data.recognizer;
      this.vendor = recognizer.vendor;
      this.language = recognizer.language;
      /* note: recognizer-level hints replace any top-level hints */
      this.hints = recognizer.hints || [];
      this.altLanguages = recognizer.altLanguages || [];

      /* vad: if provided, we don't connect to recognizer until voice activity is detected */
      const {enable, voiceMs = 0, mode = -1} = recognizer.vad || {};
      this.vad = {enable, voiceMs, mode};

      /* google options */
      this.profanityFilter = recognizer.profanityFilter;

      /* aws options */
      this.vocabularyName = recognizer.vocabularyName;
      this.vocabularyFilterName = recognizer.vocabularyFilterName;
      this.filterMethod = recognizer.filterMethod;

      /* microsoft options */
      this.outputFormat = recognizer.outputFormat || 'simple';
      this.profanityOption = recognizer.profanityOption || 'raw';
      this.requestSnr = recognizer.requestSnr || false;
      this.initialSpeechTimeoutMs = recognizer.initialSpeechTimeoutMs || 0;
    }

    this.digitBuffer = '';
    this._earlyMedia = this.data.earlyMedia === true;

    if (this.say) this.sayTask = makeTask(this.logger, {say: this.say}, this);
    if (this.play) this.playTask = makeTask(this.logger, {play: this.play}, this);

    this.parentTask = parentTask;
  }

  get name() { return TaskName.Gather; }

  /** True when speech is one of the requested input types. */
  get needsStt() { return this.input.includes('speech'); }

  /** Truthy when the nested say or play task reports early media. */
  get earlyMedia() {
    return (this.sayTask && this.sayTask.earlyMedia) ||
      (this.playTask && this.playTask.earlyMedia);
  }

  /**
   * Run the gather: start the prompt (if any), arm speech and/or dtmf
   * collection, then wait until the task is resolved or killed.
   *
   * @param {object} cs - call session
   * @param {object} ep - media endpoint
   * @throws {Error} when speech input is requested but no stt credentials
   *   exist for the selected vendor
   */
  async exec(cs, ep) {
    await super.exec(cs);
    const {updateSpeechCredentialLastUsed} = require('../utils/db-utils')(this.logger, cs.srf);

    this.ep = ep;
    if ('default' === this.vendor || !this.vendor) this.vendor = cs.speechRecognizerVendor;
    if ('default' === this.language || !this.language) this.language = cs.speechRecognizerLanguage;
    this.sttCredentials = cs.getSpeechCredentials(this.vendor, 'stt');

    if (this.needsStt && !this.sttCredentials) {
      const {writeAlerts, AlertType} = cs.srf.locals;
      this.logger.info(`TaskGather:exec - ERROR stt using ${this.vendor} requested but creds not supplied`);
      writeAlerts({
        account_sid: cs.accountSid,
        alert_type: AlertType.STT_NOT_PROVISIONED,
        vendor: this.vendor
      }).catch((err) => this.logger.info({err}, 'Error generating alert for no stt'));

      throw new Error(`no speech-to-text service credentials for ${this.vendor} have been configured`);
    }

    try {
      /* say takes precedence over play when both are configured */
      const promptTask = this.sayTask || this.playTask;
      if (promptTask) {
        promptTask.exec(cs, ep); // kicked off, _not_ waiting for it to complete
        /* the no-input timer only starts once the prompt has finished */
        promptTask.on('playDone', () => {
          if (!this.killed) this._startTimer();
        });
      }
      else this._startTimer();

      if (this.input.includes('speech')) {
        await this._initSpeech(cs, ep);
        this._startTranscribing(ep);
        updateSpeechCredentialLastUsed(this.sttCredentials.speech_credential_sid)
          .catch(() => {/*already logged error */});
      }

      if (this.input.includes('digits')) {
        /* keep a reference so this exact listener can be detached afterwards */
        this._dtmfHandler = this._onDtmf.bind(this, cs, ep);
        ep.on('dtmf', this._dtmfHandler);
      }

      await this.awaitTaskDone();
    } catch (err) {
      this.logger.error(err, 'TaskGather:exec error');
    }

    /* detach everything this task attached to the endpoint */
    if (this._dtmfHandler) {
      ep.removeListener('dtmf', this._dtmfHandler);
      this._dtmfHandler = null;
    }
    ep.removeCustomEventListener(GoogleTranscriptionEvents.Transcription);
    ep.removeCustomEventListener(GoogleTranscriptionEvents.EndOfUtterance);
    ep.removeCustomEventListener(AwsTranscriptionEvents.Transcription);
    ep.removeCustomEventListener(AzureTranscriptionEvents.Transcription);
    ep.removeCustomEventListener(AzureTranscriptionEvents.NoSpeechDetected);
  }

  /**
   * Stop the task: kills any running prompt, drops dtmf listeners and
   * resolves with reason 'killed'.
   */
  kill(cs) {
    super.kill(cs);
    this._killAudio(cs);
    /* ep is only assigned in exec(); kill may arrive before that */
    if (this.ep) this.ep.removeAllListeners('dtmf');
    this._resolve('killed');
  }

  /**
   * dtmf handler: the finish key resolves immediately, any other digit is
   * buffered until numDigits have been collected. Any key stops the prompt.
   */
  _onDtmf(cs, ep, evt) {
    this.logger.debug(evt, 'TaskGather:_onDtmf');
    if (evt.dtmf === this.finishOnKey) this._resolve('dtmf-terminator-key');
    else {
      this.digitBuffer += evt.dtmf;
      if (this.digitBuffer.length === this.numDigits) this._resolve('dtmf-num-digits');
    }
    this._killAudio(cs);
  }

  /**
   * Build the vendor-specific channel variables, register the matching
   * transcription event listeners on the endpoint and apply the variables.
   * A failure to set the variables is logged, not thrown.
   */
  async _initSpeech(cs, ep) {
    const opts = {};

    if (this.vad.enable) {
      opts.START_RECOGNIZING_ON_VAD = 1;
      if (this.vad.voiceMs) opts.RECOGNIZER_VAD_VOICE_MS = this.vad.voiceMs;
      if (this.vad.mode >= 0 && this.vad.mode <= 3) opts.RECOGNIZER_VAD_MODE = this.vad.mode;
    }

    if ('google' === this.vendor) {
      if (this.sttCredentials) {
        opts.GOOGLE_APPLICATION_CREDENTIALS = JSON.stringify(this.sttCredentials.credentials);
      }
      Object.assign(opts, {
        GOOGLE_SPEECH_USE_ENHANCED: true,
        GOOGLE_SPEECH_SINGLE_UTTERANCE: true,
        GOOGLE_SPEECH_MODEL: 'command_and_search'
      });
      /* "> 0": a single hint / alternate language must be forwarded too */
      if (this.hints && this.hints.length > 0) {
        opts.GOOGLE_SPEECH_HINTS = this.hints.map((h) => h.trim()).join(',');
      }
      if (this.altLanguages && this.altLanguages.length > 0) {
        opts.GOOGLE_SPEECH_ALTERNATIVE_LANGUAGE_CODES = this.altLanguages.join(',');
      }
      if (this.profanityFilter === true) {
        opts.GOOGLE_SPEECH_PROFANITY_FILTER = true;
      }
      ep.addCustomEventListener(GoogleTranscriptionEvents.Transcription,
        this._onTranscription.bind(this, cs, ep));
      ep.addCustomEventListener(GoogleTranscriptionEvents.EndOfUtterance,
        this._onEndOfUtterance.bind(this, cs, ep));
    }
    else if (['aws', 'polly'].includes(this.vendor)) {
      if (this.vocabularyName) opts.AWS_VOCABULARY_NAME = this.vocabularyName;
      if (this.vocabularyFilterName) {
        /* the filter has its own variable; it must not overwrite the vocabulary name */
        opts.AWS_VOCABULARY_FILTER_NAME = this.vocabularyFilterName;
        opts.AWS_VOCABULARY_FILTER_METHOD = this.filterMethod || 'mask';
      }
      if (this.sttCredentials) {
        Object.assign(opts, {
          AWS_ACCESS_KEY_ID: this.sttCredentials.accessKeyId,
          AWS_SECRET_ACCESS_KEY: this.sttCredentials.secretAccessKey,
          AWS_REGION: this.sttCredentials.region
        });
      }
      ep.addCustomEventListener(AwsTranscriptionEvents.Transcription,
        this._onTranscription.bind(this, cs, ep));
    }
    else if ('microsoft' === this.vendor) {
      if (this.sttCredentials) {
        Object.assign(opts, {
          'AZURE_SUBSCRIPTION_KEY': this.sttCredentials.api_key,
          'AZURE_REGION': this.sttCredentials.region
        });
      }
      if (this.hints && this.hints.length > 0) {
        opts.AZURE_SPEECH_HINTS = this.hints.map((h) => h.trim()).join(',');
      }
      //if (this.requestSnr) opts.AZURE_REQUEST_SNR = 1;
      //if (this.profanityOption !== 'raw') opts.AZURE_PROFANITY_OPTION = this.profanityOption;
      if (this.initialSpeechTimeoutMs > 0) {
        opts.AZURE_INITIAL_SPEECH_TIMEOUT_MS = this.initialSpeechTimeoutMs;
      }
      /* _onTranscription reads the NBest list, so detailed output is requested */
      opts.AZURE_USE_OUTPUT_FORMAT_DETAILED = 1;

      ep.addCustomEventListener(AzureTranscriptionEvents.Transcription,
        this._onTranscription.bind(this, cs, ep));
      ep.addCustomEventListener(AzureTranscriptionEvents.NoSpeechDetected,
        this._onNoSpeechDetected.bind(this, cs, ep));
    }

    await ep.set(opts)
      .catch((err) => this.logger.info(err, 'Error setting channel variables'));
  }

  /**
   * Start transcription on the endpoint (fire and forget). Interim results
   * are requested only when a partialResultHook is configured. A failure is
   * logged and raised as an STT_FAILURE alert.
   */
  _startTranscribing(ep) {
    ep.startTranscription({
      vendor: this.vendor,
      locale: this.language,
      interim: this.partialResultHook ? true : false,
    }).catch((err) => {
      const {writeAlerts, AlertType} = this.cs.srf.locals;
      this.logger.error(err, 'TaskGather:_startTranscribing error');
      /* returned so that a failure to write the alert reaches the catch below */
      return writeAlerts({
        account_sid: this.cs.accountSid,
        alert_type: AlertType.STT_FAILURE,
        vendor: this.vendor,
        detail: err.message
      });
    }).catch((err) => this.logger.info({err}, 'Error generating alert for stt failure'));
  }

  /** Arm the no-input timer; must not already be running. */
  _startTimer() {
    assert(!this._timeoutTimer);
    this.logger.debug(`Gather:_startTimer: timeout ${this.timeout}`);
    this._timeoutTimer = setTimeout(() => this._resolve('timeout'), this.timeout);
  }

  /** Cancel the no-input timer if it is running. */
  _clearTimer() {
    if (this._timeoutTimer) {
      clearTimeout(this._timeoutTimer);
      this._timeoutTimer = null;
    }
  }

  /** Kill the nested say/play prompt (if still alive) and forget it. */
  _killAudio(cs) {
    if (this.sayTask && !this.sayTask.killed) {
      this.sayTask.removeAllListeners('playDone');
      this.sayTask.kill(cs);
      this.sayTask = null;
    }
    if (this.playTask && !this.playTask.killed) {
      this.playTask.removeAllListeners('playDone');
      this.playTask.kill(cs);
      this.playTask = null;
    }
  }

  /**
   * Transcription handler shared by all vendors. Normalizes the payload to
   * {is_final, alternatives: [{confidence, transcript}]}; a final result
   * resolves the task, an interim one is posted to partialResultHook.
   */
  _onTranscription(cs, ep, evt) {
    /* aws payloads may arrive as an array; same vendor test as _initSpeech */
    if (['aws', 'polly'].includes(this.vendor) && Array.isArray(evt) && evt.length > 0) evt = evt[0];

    if ('microsoft' === this.vendor) {
      /* NBest may be missing from the payload; never index into undefined */
      const best = Array.isArray(evt.NBest) ? evt.NBest[0] : undefined;
      evt = {
        is_final: evt.RecognitionStatus === 'Success',
        alternatives: best ?
          [
            {
              confidence: best.Confidence,
              transcript: best.Display
            }
          ] :
          []
      };
    }

    this.logger.debug(evt, 'TaskGather:_onTranscription');
    if (evt.is_final) this._resolve('speech', evt);
    else if (this.partialResultHook) {
      this.cs.requestor.request(this.partialResultHook, Object.assign({speech: evt}, this.cs.callInfo))
        .catch((err) => this.logger.info(err, 'GatherTask:_onTranscription error'));
    }
  }

  /**
   * End-of-utterance handler: restart transcription if the gather is still
   * waiting for a result.
   */
  _onEndOfUtterance(cs, ep) {
    this.logger.info('TaskGather:_onEndOfUtterance');
    if (!this.resolved && !this.killed) {
      this._startTranscribing(ep);
    }
  }

  /** The recognizer's "no speech detected" event is treated as a timeout. */
  _onNoSpeechDetected(cs, ep) {
    this._resolve('timeout');
  }

  /**
   * Finish the gather exactly once: stop transcription, clear the timer,
   * report the outcome and mark the task done.
   *
   * @param {string} reason - 'dtmf-*', 'speech', 'timeout' or 'killed'
   * @param {object} [evt] - normalized transcription, for speech outcomes
   */
  async _resolve(reason, evt) {
    if (this.resolved) return;
    this.resolved = true;
    this.logger.debug(`TaskGather:resolve with reason ${reason}`);

    if (this.ep && this.ep.connected) {
      this.ep.stopTranscription({vendor: this.vendor})
        .catch((err) => this.logger.error({err}, 'Error stopping transcription'));
    }

    this._clearTimer();

    if (reason.startsWith('dtmf')) {
      await this.performAction({digits: this.digitBuffer, reason: 'dtmfDetected'});
    }
    else if (reason.startsWith('speech')) {
      if (this.parentTask) this.parentTask.emit('transcription', evt);
      else await this.performAction({speech: evt, reason: 'speechDetected'});
    }
    else if (reason.startsWith('timeout')) {
      if (this.parentTask) this.parentTask.emit('timeout', evt);
      else await this.performAction({reason: 'timeout'});
    }
    /* 'killed' reports nothing; it only marks the task done */
    this.notifyTaskDone();
  }
}

module.exports = TaskGather;