Feature/nuance stt (#185)

* initial changes to gather to support nuance stt
* updateSpeechCredentialLastUsed could be called without a speech_credential_sid if credentials are passed in the flow
* fix bugname
* typo
* added handlers for nuance
* logging
* major refactor of parsing transcriptions
* initial support for nuance in transcribe verb
* updates from testing
* cleanup some tests
* update action
* typo
* gather: start nuance timers after say/play completes
* update drachtio-fsrmf
* refactor some code
* typo
* log nuance error detail
* timeout handling
* typo
* handle nuance 413 response when recognition times out
* typo in specs.json
* add support for nuance resources
* fixes and tests for transcribe
* remove logging from test
* initial support for kryptonEndpoint
* try getting access token even when using krypton
* typo in kryptonEndpoint property
* add support for Nuance tts
* parse nuance voice and model for tts
* use nuance credentials from db
* update to db-helpers@0.7.0 with caching option
* add support for azure audio logging in gather/transcribe
* sync package-lock.json
2026-02-14 18:30:59 +00:00 · 2022-11-01 12:23:49 -04:00
parent 203b9774ca
commit 509bb065bb
20 changed files with 1100 additions and 974 deletions
--- a/lib/tasks/gather.js
+++ b/lib/tasks/gather.js
@@ -3,25 +3,22 @@ const {
  TaskName,
  TaskPreconditions,
  GoogleTranscriptionEvents,
+  NuanceTranscriptionEvents,
  AwsTranscriptionEvents,
  AzureTranscriptionEvents
 } = require('../utils/constants');

 const makeTask = require('./make_task');
 const assert = require('assert');
-//const GATHER_STABILITY_THRESHOLD =  Number(process.env.JAMBONZ_GATHER_STABILITY_THRESHOLD || 0.7);

 const compileTranscripts = (logger, evt, arr) => {
-  //logger.debug({arr, evt}, 'compile transcripts');
  if (!Array.isArray(arr) || arr.length === 0) return;
  let t = '';
  for (const a of arr) {
-    //logger.debug(`adding ${a.alternatives[0].transcript}`);
    t += ` ${a.alternatives[0].transcript}`;
  }
  t += ` ${evt.alternatives[0].transcript}`;
  evt.alternatives[0].transcript = t.trim();
-  //logger.debug(`compiled transcript: ${evt.alternatives[0].transcript}`);
 };

 class TaskGather extends Task {
@@ -29,6 +26,15 @@ class TaskGather extends Task {
    super(logger, opts);
    this.preconditions = TaskPreconditions.Endpoint;

+    const {
+      setChannelVarsForStt,
+      normalizeTranscription,
+      removeSpeechListeners
+    } = require('../utils/transcription-utils')(logger);
+    this.setChannelVarsForStt = setChannelVarsForStt;
+    this.normalizeTranscription = normalizeTranscription;
+    this.removeSpeechListeners = removeSpeechListeners;
+
    [
      'finishOnKey', 'hints', 'input', 'numDigits', 'minDigits', 'maxDigits',
      'interDigitTimeout', 'partialResultHook', 'bargein', 'dtmfBargein',
@@ -47,47 +53,23 @@ class TaskGather extends Task {
      const recognizer = this.data.recognizer;
      this.vendor = recognizer.vendor;
      this.language = recognizer.language;
-      this.hints = recognizer.hints || [];
-      this.hintsBoost = recognizer.hintsBoost;
-      this.profanityFilter = recognizer.profanityFilter;
-      this.punctuation = !!recognizer.punctuation;
-      this.enhancedModel = !!recognizer.enhancedModel;
-      this.model = recognizer.model || 'command_and_search';
-      this.words = !!recognizer.words;
-      this.singleUtterance = recognizer.singleUtterance || true;
-      this.diarization = !!recognizer.diarization;
-      this.diarizationMinSpeakers = recognizer.diarizationMinSpeakers || 0;
-      this.diarizationMaxSpeakers = recognizer.diarizationMaxSpeakers || 0;
-      this.interactionType = recognizer.interactionType || 'unspecified';
-      this.naicsCode = recognizer.naicsCode || 0;
-      this.altLanguages = recognizer.altLanguages || [];
+
+      if (recognizer.vendor === 'nuance') {
+        const {clientId, secret} = recognizer.nuanceOptions;
+        if (clientId && secret) {
+          this.sttCredentials = {client_id: clientId, secret};
+        }
+      }

      /* continuous ASR (i.e. compile transcripts until a special timeout or dtmf key) */
      this.asrTimeout = typeof recognizer.asrTimeout === 'number' ? recognizer.asrTimeout * 1000 : 0;
      if (this.asrTimeout > 0) this.asrDtmfTerminationDigit = recognizer.asrDtmfTerminationDigit;
      this.isContinuousAsr = this.asrTimeout > 0;

-      /* vad: if provided, we dont connect to recognizer until voice activity is detected */
-      const {enable, voiceMs = 0, mode = -1} = recognizer.vad || {};
-      this.vad = {enable, voiceMs, mode};
-
-      /* aws options */
-      this.vocabularyName = recognizer.vocabularyName;
-      this.vocabularyFilterName = recognizer.vocabularyFilterName;
-      this.filterMethod = recognizer.filterMethod;
-
-      /* microsoft options */
-      this.outputFormat = recognizer.outputFormat || 'simple';
-      this.profanityOption = recognizer.profanityOption || 'raw';
-      this.requestSnr = recognizer.requestSnr || false;
-      this.initialSpeechTimeoutMs = recognizer.initialSpeechTimeoutMs || 0;
-      this.azureServiceEndpoint = recognizer.azureServiceEndpoint;
-      this.azureSttEndpointId = recognizer.azureSttEndpointId;
-    }
-    else {
-      this.hints = [];
-      this.altLanguages = [];
+      this.data.recognizer.hints = this.data.recognizer.hints || [];
+      this.data.recognizer.altLanguages = this.data.recognizer.altLanguages || [];
    }
+    else this.data.recognizer = {hints: [], altLanguages: []};

    this.digitBuffer = '';
    this._earlyMedia = this.data.earlyMedia === true;
@@ -134,21 +116,22 @@ class TaskGather extends Task {
    this.logger.debug('Gather:exec');
    await super.exec(cs);
    const {updateSpeechCredentialLastUsed} = require('../utils/db-utils')(this.logger, cs.srf);
+    const {getNuanceAccessToken} = cs.srf.locals.dbHelpers;

    if (cs.hasGlobalSttHints) {
      const {hints, hintsBoost} = cs.globalSttHints;
-      this.hints = this.hints.concat(hints);
-      if (!this.hintsBoost && hintsBoost) this.hintsBoost = hintsBoost;
-      this.logger.debug({hints: this.hints, hintsBoost: this.hintsBoost},
+      this.data.recognizer.hints = this.data.recognizer.hints.concat(hints);
+      if (!this.data.recognizer.hintsBoost && hintsBoost) this.data.recognizer.hintsBoost = hintsBoost;
+      this.logger.debug({hints: this.data.recognizer.hints, hintsBoost: this.data.recognizer.hintsBoost},
        'Gather:exec - applying global sttHints');
    }
    if (cs.hasAltLanguages) {
-      this.altLanguages = this.altLanguages.concat(cs.altLanguages);
+      this.data.recognizer.altLanguages = this.data.recognizer.altLanguages.concat(cs.altLanguages);
      this.logger.debug({altLanguages: this.altLanguages},
        'Gather:exec - applying altLanguages');
    }
-    if (cs.hasGlobalSttPunctuation) {
-      this.punctuation = cs.globalSttPunctuation;
+    if (cs.hasGlobalSttPunctuation && !this.data.recognizer.punctuation) {
+      this.data.recognizer.punctuation = cs.globalSttPunctuation;
    }
    if (!this.isContinuousAsr && cs.isContinuousAsr) {
      this.isContinuousAsr = true;
@@ -162,7 +145,8 @@ class TaskGather extends Task {
    this.ep = ep;
    if ('default' === this.vendor || !this.vendor) this.vendor = cs.speechRecognizerVendor;
    if ('default' === this.language || !this.language) this.language = cs.speechRecognizerLanguage;
-    this.sttCredentials = cs.getSpeechCredentials(this.vendor, 'stt');
+
+    if (this.needsStt && !this.sttCredentials) this.sttCredentials = cs.getSpeechCredentials(this.vendor, 'stt');
    if (this.needsStt && !this.sttCredentials) {
      const {writeAlerts, AlertType} = cs.srf.locals;
      this.logger.info(`TaskGather:exec - ERROR stt using ${this.vendor} requested but creds not supplied`);
@@ -175,16 +159,27 @@ class TaskGather extends Task {
      throw new Error(`no speech-to-text service credentials for ${this.vendor} have been configured`);
    }

+    this.logger.info({sttCredentials: this.sttCredentials}, 'Gather:exec - sttCredentials');
+    if (this.vendor === 'nuance' && this.sttCredentials.client_id) {
+      /* get nuance access token */
+      const {client_id, secret} = this.sttCredentials;
+      const {access_token, servedFromCache} = await getNuanceAccessToken(client_id, secret, 'asr tts');
+      this.logger.debug({client_id}, `Gather:exec - got nuance access token ${servedFromCache ? 'from cache' : ''}`);
+      this.sttCredentials = {...this.sttCredentials, access_token};
+    }
    const startListening = (cs, ep) => {
      this._startTimer();
      if (this.isContinuousAsr && 0 === this.timeout) this._startAsrTimer();
      if (this.input.includes('speech') && !this.listenDuringPrompt) {
+        this.logger.debug('Gather:exec - calling _initSpeech');
        this._initSpeech(cs, ep)
          .then(() => {
            this._startTranscribing(ep);
            return updateSpeechCredentialLastUsed(this.sttCredentials.speech_credential_sid);
          })
-          .catch(() => {});
+          .catch((err) => {
+            this.logger.error({err}, 'error in initSpeech');
+          });
      }
    };

@@ -198,7 +193,15 @@ class TaskGather extends Task {
          span.end();
          if (err) this.logger.error({err}, 'Gather:exec Error playing tts');
          this.logger.debug('Gather: nested say task completed');
-          if (!this.killed) startListening(cs, ep);
+          if (!this.killed) {
+            startListening(cs, ep);
+            if (this.input.includes('speech') && this.vendor === 'nuance' && this.listenDuringPrompt) {
+              this.logger.debug('Gather:exec - starting transcription timers after say completes');
+              ep.startTranscriptionTimers((err) => {
+                if (err) this.logger.error({err}, 'Gather:exec - error starting transcription timers');
+              });
+            }
+          }
        });
      }
      else if (this.playTask) {
@@ -210,7 +213,15 @@ class TaskGather extends Task {
          span.end();
          if (err) this.logger.error({err}, 'Gather:exec Error playing url');
          this.logger.debug('Gather: nested play task completed');
-          if (!this.killed) startListening(cs, ep);
+          if (!this.killed) {
+            startListening(cs, ep);
+            if (this.input.includes('speech') && this.vendor === 'nuance' && this.listenDuringPrompt) {
+              this.logger.debug('Gather:exec - starting transcription timers after play completes');
+              ep.startTranscriptionTimers((err) => {
+                if (err) this.logger.error({err}, 'Gather:exec - error starting transcription timers');
+              });
+            }
+          }
        });
      }
      else startListening(cs, ep);
@@ -230,14 +241,7 @@ class TaskGather extends Task {
    } catch (err) {
      this.logger.error(err, 'TaskGather:exec error');
    }
-    ep.removeCustomEventListener(GoogleTranscriptionEvents.Transcription);
-    ep.removeCustomEventListener(GoogleTranscriptionEvents.EndOfUtterance);
-    ep.removeCustomEventListener(GoogleTranscriptionEvents.VadDetected);
-    ep.removeCustomEventListener(AwsTranscriptionEvents.Transcription);
-    ep.removeCustomEventListener(AwsTranscriptionEvents.VadDetected);
-    ep.removeCustomEventListener(AzureTranscriptionEvents.Transcription);
-    ep.removeCustomEventListener(AzureTranscriptionEvents.NoSpeechDetected);
-    ep.removeCustomEventListener(AzureTranscriptionEvents.VadDetected);
+    this.removeSpeechListeners(ep);
  }

  kill(cs) {
@@ -292,106 +296,52 @@ class TaskGather extends Task {
  }

  async _initSpeech(cs, ep) {
-    const opts = {};
+    const opts = this.setChannelVarsForStt(this, this.sttCredentials, this.data.recognizer);
+    this.logger.debug(opts, 'TaskGather:_initSpeech - channel vars');
+    switch (this.vendor) {
+      case 'google':
+        this.bugname = 'google_transcribe';
+        ep.addCustomEventListener(GoogleTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
+        ep.addCustomEventListener(GoogleTranscriptionEvents.EndOfUtterance, this._onEndOfUtterance.bind(this, cs, ep));
+        ep.addCustomEventListener(GoogleTranscriptionEvents.VadDetected, this._onVadDetected.bind(this, cs, ep));
+        break;

-    if (this.vad?.enable) {
-      opts.START_RECOGNIZING_ON_VAD = 1;
-      if (this.vad.voiceMs) opts.RECOGNIZER_VAD_VOICE_MS = this.vad.voiceMs;
-      else opts.RECOGNIZER_VAD_VOICE_MS = 125;
-      if (this.vad.mode >= 0 && this.vad.mode <= 3) opts.RECOGNIZER_VAD_MODE = this.vad.mode;
-    }
+      case 'aws':
+      case 'polly':
+        this.bugname = 'aws_transcribe';
+        ep.addCustomEventListener(AwsTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
+        ep.addCustomEventListener(AwsTranscriptionEvents.VadDetected, this._onVadDetected.bind(this, cs, ep));
+        break;
+      case 'microsoft':
+        this.bugname = 'azure_transcribe';
+        ep.addCustomEventListener(AzureTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
+        ep.addCustomEventListener(AzureTranscriptionEvents.NoSpeechDetected,
+          this._onNoSpeechDetected.bind(this, cs, ep));
+        ep.addCustomEventListener(AzureTranscriptionEvents.VadDetected, this._onVadDetected.bind(this, cs, ep));
+        break;
+      case 'nuance':
+        this.bugname = 'nuance_transcribe';
+        ep.addCustomEventListener(NuanceTranscriptionEvents.Transcription,
+          this._onTranscription.bind(this, cs, ep));
+        ep.addCustomEventListener(NuanceTranscriptionEvents.StartOfSpeech,
+          this._onStartOfSpeech.bind(this, cs, ep));
+        ep.addCustomEventListener(NuanceTranscriptionEvents.TranscriptionComplete,
+          this._onTranscriptionComplete.bind(this, cs, ep));
+        ep.addCustomEventListener(NuanceTranscriptionEvents.VadDetected,
+          this._onVadDetected.bind(this, cs, ep));
+        ep.addCustomEventListener(NuanceTranscriptionEvents.Error,
+          this._onNuanceError.bind(this, cs, ep));

-    if ('google' === this.vendor) {
-      this.bugname = 'google_transcribe';
-      if (this.sttCredentials) opts.GOOGLE_APPLICATION_CREDENTIALS = JSON.stringify(this.sttCredentials.credentials);
-      [
-        ['enhancedModel', 'GOOGLE_SPEECH_USE_ENHANCED'],
-        ['separateRecognitionPerChannel', 'GOOGLE_SPEECH_SEPARATE_RECOGNITION_PER_CHANNEL'],
-        ['profanityFilter', 'GOOGLE_SPEECH_PROFANITY_FILTER'],
-        ['punctuation', 'GOOGLE_SPEECH_ENABLE_AUTOMATIC_PUNCTUATION'],
-        ['words', 'GOOGLE_SPEECH_ENABLE_WORD_TIME_OFFSETS'],
-        ['singleUtterance', 'GOOGLE_SPEECH_SINGLE_UTTERANCE'],
-        ['diarization', 'GOOGLE_SPEECH_PROFANITY_FILTER']
-      ].forEach((arr) => {
-        if (this[arr[0]]) opts[arr[1]] = true;
-        else if (this[arr[0]] === false) opts[arr[1]] = false;
-      });
-      if (this.hints.length > 0) {
-        opts.GOOGLE_SPEECH_HINTS = this.hints.join(',');
-        if (typeof this.hintsBoost === 'number') {
-          opts.GOOGLE_SPEECH_HINTS_BOOST = this.hintsBoost;
+        /* stall timers until prompt finishes playing */
+        if ((this.sayTask || this.playTask) && this.listenDuringPrompt) {
+          opts.NUANCE_STALL_TIMERS = 1;
        }
-      }
-      if (this.altLanguages.length > 0) opts.GOOGLE_SPEECH_ALTERNATIVE_LANGUAGE_CODES = this.altLanguages.join(',');
-      else opts.GOOGLE_SPEECH_ALTERNATIVE_LANGUAGE_CODES = '';
-      if ('unspecified' !== this.interactionType) {
-        opts.GOOGLE_SPEECH_METADATA_INTERACTION_TYPE = this.interactionType;
-      }
-      opts.GOOGLE_SPEECH_MODEL = this.model;
-      if (this.diarization && this.diarizationMinSpeakers > 0) {
-        opts.GOOGLE_SPEECH_SPEAKER_DIARIZATION_MIN_SPEAKER_COUNT = this.diarizationMinSpeakers;
-      }
-      if (this.diarization && this.diarizationMaxSpeakers > 0) {
-        opts.GOOGLE_SPEECH_SPEAKER_DIARIZATION_MAX_SPEAKER_COUNT = this.diarizationMaxSpeakers;
-      }
-      if (this.naicsCode > 0) opts.GOOGLE_SPEECH_METADATA_INDUSTRY_NAICS_CODE = this.naicsCode;
-      ep.addCustomEventListener(GoogleTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
-      ep.addCustomEventListener(GoogleTranscriptionEvents.EndOfUtterance, this._onEndOfUtterance.bind(this, cs, ep));
-      ep.addCustomEventListener(GoogleTranscriptionEvents.VadDetected, this._onVadDetected.bind(this, cs, ep));
-    }
-    else if (['aws', 'polly'].includes(this.vendor)) {
-      this.bugname = 'aws_transcribe';
-      if (this.vocabularyName) opts.AWS_VOCABULARY_NAME = this.vocabularyName;
-      if (this.vocabularyFilterName) {
-        opts.AWS_VOCABULARY_NAME = this.vocabularyFilterName;
-        opts.AWS_VOCABULARY_FILTER_METHOD = this.filterMethod || 'mask';
-      }
-      if (this.sttCredentials) {
-        Object.assign(opts, {
-          AWS_ACCESS_KEY_ID: this.sttCredentials.accessKeyId,
-          AWS_SECRET_ACCESS_KEY: this.sttCredentials.secretAccessKey,
-          AWS_REGION: this.sttCredentials.region
-        });
-      }
-      ep.addCustomEventListener(AwsTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
-      ep.addCustomEventListener(AwsTranscriptionEvents.VadDetected, this._onVadDetected.bind(this, cs, ep));
-    }
-    else if ('microsoft' === this.vendor) {
-      this.bugname = 'azure_transcribe';
-      if (this.sttCredentials) {
-        const {api_key, region, use_custom_stt, custom_stt_endpoint} = this.sttCredentials;

-        Object.assign(opts, {
-          'AZURE_SUBSCRIPTION_KEY': api_key,
-          'AZURE_REGION': region
-        });
-        if (this.azureSttEndpointId) {
-          Object.assign(opts, {'AZURE_SERVICE_ENDPOINT_ID': this.azureSttEndpointId});
-        }
-        else if (use_custom_stt && custom_stt_endpoint) {
-          Object.assign(opts, {'AZURE_SERVICE_ENDPOINT_ID': custom_stt_endpoint});
-        }
-      }
-      if (this.hints && this.hints.length > 0) {
-        opts.AZURE_SPEECH_HINTS = this.hints.map((h) => h.trim()).join(',');
-      }
-      if (this.altLanguages && this.altLanguages.length > 0) {
-        opts.AZURE_SPEECH_ALTERNATIVE_LANGUAGE_CODES = this.altLanguages.join(',');
-      }
-      else {
-        opts.AZURE_SPEECH_ALTERNATIVE_LANGUAGE_CODES = '';
-      }
-      if (this.requestSnr) opts.AZURE_REQUEST_SNR = 1;
-      if (this.profanityOption && this.profanityOption !== 'raw') opts.AZURE_PROFANITY_OPTION = this.profanityOption;
-      if (this.azureServiceEndpoint) opts.AZURE_SERVICE_ENDPOINT = this.azureServiceEndpoint;
-      if (this.initialSpeechTimeoutMs > 0) opts.AZURE_INITIAL_SPEECH_TIMEOUT_MS = this.initialSpeechTimeoutMs;
-      else if (this.timeout === 0) opts.AZURE_INITIAL_SPEECH_TIMEOUT_MS = 120000;  // lengthy
-      opts.AZURE_USE_OUTPUT_FORMAT_DETAILED = 1;
-
-      ep.addCustomEventListener(AzureTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
-      ep.addCustomEventListener(AzureTranscriptionEvents.NoSpeechDetected, this._onNoSpeechDetected.bind(this, cs, ep));
-      ep.addCustomEventListener(AzureTranscriptionEvents.VadDetected, this._onVadDetected.bind(this, cs, ep));
+        break;
+      default:
+        throw new Error(`Invalid vendor ${this.vendor}`);
    }
+
    await ep.set(opts)
      .catch((err) => this.logger.info(err, 'Error setting channel variables'));
  }
@@ -489,40 +439,12 @@ class TaskGather extends Task {

  _onTranscription(cs, ep, evt, fsEvent) {
    // make sure this is not a transcript from answering machine detection
+    this.logger.debug({evt}, 'Gather:_onTranscription');
    const bugname = fsEvent.getHeader('media-bugname');
    const finished = fsEvent.getHeader('transcription-session-finished');
    if (bugname && this.bugname !== bugname) return;

-    if ('aws' === this.vendor && Array.isArray(evt) && evt.length > 0) evt = evt[0];
-    if ('microsoft' === this.vendor) {
-      const final = evt.RecognitionStatus === 'Success';
-      if (final) {
-        // don't sort based on confidence: https://github.com/Azure-Samples/cognitive-services-speech-sdk/issues/1463
-        //const nbest = evt.NBest.sort((a, b) => b.Confidence - a.Confidence);
-        const nbest = evt.NBest;
-        const language_code = evt.PrimaryLanguage?.Language || this.language;
-        evt = {
-          is_final: true,
-          language_code,
-          alternatives: [
-            {
-              confidence: nbest[0].Confidence,
-              transcript: nbest[0].Display
-            }
-          ]
-        };
-      }
-      else {
-        evt = {
-          is_final: false,
-          alternatives: [
-            {
-              transcript: evt.Text
-            }
-          ]
-        };
-      }
-    }
+    evt = this.normalizeTranscription(evt, this.vendor, 1, this.language);

    /* count words for bargein feature */
    const words = evt.alternatives[0].transcript.split(' ').length;
@@ -607,6 +529,24 @@ class TaskGather extends Task {
    }
  }

+  _onStartOfSpeech(cs, ep) {
+    this.logger.debug('TaskGather:_onStartOfSpeech');
+  }
+  _onTranscriptionComplete(cs, ep) {
+    this.logger.debug('TaskGather:_onTranscriptionComplete');
+  }
+  _onNuanceError(cs, ep, evt) {
+    const {code, error, details} = evt;
+    if (code === 404 && error === 'No speech') {
+      this.logger.debug({code, error, details}, 'TaskGather:_onNuanceError');
+      return this._resolve('timeout');
+    }
+    this.logger.info({code, error, details}, 'TaskGather:_onNuanceError');
+    if (code === 413 && error === 'Too much speech') {
+      return this._resolve('timeout');
+    }
+  }
+
  _onVadDetected(cs, ep) {
    if (this.bargein && this.minBargeinWordCount === 0) {
      this.logger.debug('TaskGather:_onVadDetected');