Feature/nuance stt (#185)

* initial changes to gather to support nuance stt * updateSpeechCredentialLastUsed could be called without a speech_credential_sid if credentials are passed in the flow * fix bugname * typo * added handlers for nuance * logging * major refactor of parsing transcriptions * initial support for nuance in transcribe verb * updates from testing * cleanup some tests * update action * typo * gather: start nuance timers after say/play completes * update drachtio-fsrmf * refactor some code * typo * log nuance error detail * timeout handling * typo * handle nuance 413 response when recognition times out * typo in specs.json * add support for nuance resources * fixes and tests for transcribe * remove logging from test * initial support for kryptonEndpoint * try getting access token even when using krypton * typo in kryptonEndpoint property * add support for Nuance tts * parse nuance voice and model for tts * use nuance credentials from db * update to db-helpers@0.7.0 with caching option * add support for azure audio logging in gather/transcribe * sync package-lock.json
2026-02-14 18:30:59 +00:00 · 2022-11-01 12:23:49 -04:00
parent 203b9774ca
commit 509bb065bb
20 changed files with 1100 additions and 974 deletions
--- a/lib/tasks/transcribe.js
+++ b/lib/tasks/transcribe.js
@@ -4,8 +4,10 @@ const {
  TaskPreconditions,
  GoogleTranscriptionEvents,
  AzureTranscriptionEvents,
-  AwsTranscriptionEvents
+  AwsTranscriptionEvents,
+  NuanceTranscriptionEvents
 } = require('../utils/constants');
+const normalizeJambones = require('../utils/normalize-jambones');

 class TaskTranscribe extends Task {
  constructor(logger, opts, parentTask) {
@@ -13,6 +15,10 @@ class TaskTranscribe extends Task {
    this.preconditions = TaskPreconditions.Endpoint;
    this.parentTask = parentTask;

+    const {setChannelVarsForStt, normalizeTranscription} = require('../utils/transcription-utils')(logger);
+    this.setChannelVarsForStt = setChannelVarsForStt;
+    this.normalizeTranscription = normalizeTranscription;
+
    this.transcriptionHook = this.data.transcriptionHook;
    this.earlyMedia = this.data.earlyMedia === true || (parentTask && parentTask.earlyMedia);

@@ -22,39 +28,15 @@ class TaskTranscribe extends Task {
    this.interim = !!recognizer.interim;
    this.separateRecognitionPerChannel = recognizer.separateRecognitionPerChannel;

-    /* vad: if provided, we dont connect to recognizer until voice activity is detected */
-    const {enable, voiceMs = 0, mode = -1} = recognizer.vad || {};
-    this.vad = {enable, voiceMs, mode};
+    if (recognizer.vendor === 'nuance') {
+      const {clientId, secret} = recognizer.nuanceOptions;
+      if (clientId && secret) {
+        this.sttCredentials = {client_id: clientId, secret};
+      }
+    }

-    /* google-specific options */
-    this.hints = recognizer.hints || [];
-    this.hintsBoost = recognizer.hintsBoost;
-    this.profanityFilter = recognizer.profanityFilter;
-    this.punctuation = !!recognizer.punctuation;
-    this.enhancedModel = !!recognizer.enhancedModel;
-    this.model = recognizer.model || 'phone_call';
-    this.words = !!recognizer.words;
-    this.singleUtterance = recognizer.singleUtterance || false;
-    this.diarization = !!recognizer.diarization;
-    this.diarizationMinSpeakers = recognizer.diarizationMinSpeakers || 0;
-    this.diarizationMaxSpeakers = recognizer.diarizationMaxSpeakers || 0;
-    this.interactionType = recognizer.interactionType || 'unspecified';
-    this.naicsCode = recognizer.naicsCode || 0;
-    this.altLanguages = recognizer.altLanguages || [];
-
-    /* aws-specific options */
-    this.identifyChannels = !!recognizer.identifyChannels;
-    this.vocabularyName = recognizer.vocabularyName;
-    this.vocabularyFilterName = recognizer.vocabularyFilterName;
-    this.filterMethod = recognizer.filterMethod;
-
-    /* microsoft options */
-    this.outputFormat = recognizer.outputFormat || 'simple';
-    this.profanityOption = recognizer.profanityOption || 'raw';
-    this.requestSnr = recognizer.requestSnr || false;
-    this.initialSpeechTimeoutMs = recognizer.initialSpeechTimeoutMs || 0;
-    this.azureServiceEndpoint = recognizer.azureServiceEndpoint;
-    this.azureSttEndpointId = recognizer.azureSttEndpointId;
+    recognizer.hints = recognizer.hints || [];
+    recognizer.altLanguages = recognizer.altLanguages || [];
  }

  get name() { return TaskName.Transcribe; }
@@ -62,21 +44,22 @@ class TaskTranscribe extends Task {
  async exec(cs, {ep, ep2}) {
    super.exec(cs);
    const {updateSpeechCredentialLastUsed} = require('../utils/db-utils')(this.logger, cs.srf);
+    const {getNuanceAccessToken} = cs.srf.locals.dbHelpers;

    if (cs.hasGlobalSttHints) {
      const {hints, hintsBoost} = cs.globalSttHints;
-      this.hints = this.hints.concat(hints);
-      if (!this.hintsBoost && hintsBoost) this.hintsBoost = hintsBoost;
-      this.logger.debug({hints: this.hints, hintsBoost: this.hintsBoost},
-        'Transcribe:exec - applying global `sttHints');
+      this.data.recognizer.hints = this.data.recognizer.hints.concat(hints);
+      if (!this.data.recognizer.hintsBoost && hintsBoost) this.data.recognizer.hintsBoost = hintsBoost;
+      this.logger.debug({hints: this.data.recognizer.hints, hintsBoost: this.data.recognizer.hintsBoost},
+        'Transcribe:exec - applying global sttHints');
    }
    if (cs.hasAltLanguages) {
-      this.altLanguages = this.altLanguages.concat(cs.altLanguages);
+      this.data.recognizer.altLanguages = this.data.recognizer.altLanguages.concat(cs.altLanguages);
      this.logger.debug({altLanguages: this.altLanguages},
-        'Gather:exec - applying altLanguages');
+        'Transcribe:exec - applying altLanguages');
    }
-    if (cs.hasGlobalSttPunctuation) {
-      this.punctuation = cs.globalSttPunctuation;
+    if (cs.hasGlobalSttPunctuation && !this.data.recognizer.punctuation) {
+      this.data.recognizer.punctuation = cs.globalSttPunctuation;
    }

    this.ep = ep;
@@ -96,6 +79,16 @@ class TaskTranscribe extends Task {
        }).catch((err) => this.logger.info({err}, 'Error generating alert for no stt'));
        throw new Error('no provisioned speech credentials for TTS');
      }
+
+      if (this.vendor === 'nuance' && this.sttCredentials.client_id) {
+        /* get nuance access token */
+        const {client_id, secret} = this.sttCredentials;
+        const {access_token, servedFromCache} = await getNuanceAccessToken(client_id, secret, 'asr tts');
+        this.logger.debug({client_id},
+          `Transcribe:exec - got nuance access token ${servedFromCache ? 'from cache' : ''}`);
+        this.sttCredentials = {...this.sttCredentials, access_token};
+      }
+
      await this._startTranscribing(cs, ep, 1);
      if (this.separateRecognitionPerChannel && ep2) {
        await this._startTranscribing(cs, ep2, 2);
@@ -110,13 +103,21 @@ class TaskTranscribe extends Task {
      this.parentTask && this.parentTask.emit('error', err);
    }
    ep.removeCustomEventListener(GoogleTranscriptionEvents.Transcription);
-    ep.removeCustomEventListener(GoogleTranscriptionEvents.NoAudioDetected);
-    ep.removeCustomEventListener(GoogleTranscriptionEvents.MaxDurationExceeded);
+    ep.removeCustomEventListener(GoogleTranscriptionEvents.EndOfUtterance);
+    ep.removeCustomEventListener(GoogleTranscriptionEvents.VadDetected);
+
    ep.removeCustomEventListener(AwsTranscriptionEvents.Transcription);
-    ep.removeCustomEventListener(AwsTranscriptionEvents.NoAudioDetected);
-    ep.removeCustomEventListener(AwsTranscriptionEvents.MaxDurationExceeded);
+    ep.removeCustomEventListener(AwsTranscriptionEvents.VadDetected);
+
    ep.removeCustomEventListener(AzureTranscriptionEvents.Transcription);
    ep.removeCustomEventListener(AzureTranscriptionEvents.NoSpeechDetected);
+    ep.removeCustomEventListener(AzureTranscriptionEvents.VadDetected);
+
+    ep.removeCustomEventListener(NuanceTranscriptionEvents.Transcription);
+    ep.removeCustomEventListener(NuanceTranscriptionEvents.TranscriptionComplete);
+    ep.removeCustomEventListener(NuanceTranscriptionEvents.StartOfSpeech);
+    ep.removeCustomEventListener(NuanceTranscriptionEvents.Error);
+    ep.removeCustomEventListener(NuanceTranscriptionEvents.VadDetected);
  }

  async kill(cs) {
@@ -140,124 +141,53 @@ class TaskTranscribe extends Task {
  }

  async _startTranscribing(cs, ep, channel) {
-    const opts = {};
+    const opts = this.setChannelVarsForStt(this, this.sttCredentials, this.data.recognizer);
+    switch (this.vendor) {
+      case 'google':
+        this.bugname = 'google_transcribe';
+        ep.addCustomEventListener(GoogleTranscriptionEvents.Transcription,
+          this._onTranscription.bind(this, cs, ep, channel));
+        ep.addCustomEventListener(GoogleTranscriptionEvents.NoAudioDetected,
+          this._onNoAudio.bind(this, cs, ep, channel));
+        ep.addCustomEventListener(GoogleTranscriptionEvents.MaxDurationExceeded,
+          this._onMaxDurationExceeded.bind(this, cs, ep, channel));
+        break;

-    if (this.vad.enable) {
-      opts.START_RECOGNIZING_ON_VAD = 1;
-      if (this.vad.voiceMs) opts.RECOGNIZER_VAD_VOICE_MS = this.vad.voiceMs;
-      if (this.vad.mode >= 0 && this.vad.mode <= 3) opts.RECOGNIZER_VAD_MODE = this.vad.mode;
+      case 'aws':
+      case 'polly':
+        this.bugname = 'aws_transcribe';
+        ep.addCustomEventListener(AwsTranscriptionEvents.Transcription,
+          this._onTranscription.bind(this, cs, ep, channel));
+        ep.addCustomEventListener(AwsTranscriptionEvents.NoAudioDetected,
+          this._onNoAudio.bind(this, cs, ep, channel));
+        ep.addCustomEventListener(AwsTranscriptionEvents.MaxDurationExceeded,
+          this._onMaxDurationExceeded.bind(this, cs, ep, channel));
+        break;
+      case 'microsoft':
+        this.bugname = 'azure_transcribe';
+        ep.addCustomEventListener(AzureTranscriptionEvents.Transcription,
+          this._onTranscription.bind(this, cs, ep, channel));
+        ep.addCustomEventListener(AzureTranscriptionEvents.NoSpeechDetected,
+          this._onNoAudio.bind(this, cs, ep, channel));
+        break;
+      case 'nuance': /* every listener here is a Nuance event, including Error */
+        this.bugname = 'nuance_transcribe';
+        ep.addCustomEventListener(NuanceTranscriptionEvents.Transcription,
+          this._onTranscription.bind(this, cs, ep, channel));
+        ep.addCustomEventListener(NuanceTranscriptionEvents.StartOfSpeech,
+          this._onStartOfSpeech.bind(this, cs, ep, channel));
+        ep.addCustomEventListener(NuanceTranscriptionEvents.TranscriptionComplete,
+          this._onTranscriptionComplete.bind(this, cs, ep, channel));
+        ep.addCustomEventListener(NuanceTranscriptionEvents.Error,
+          this._onNuanceError.bind(this, cs, ep, channel));
+        break;
+      default:
+        throw new Error(`Invalid vendor ${this.vendor}`);
    }

-    ep.addCustomEventListener(GoogleTranscriptionEvents.Transcription,
-      this._onTranscription.bind(this, cs, ep, channel));
-    ep.addCustomEventListener(GoogleTranscriptionEvents.NoAudioDetected, this._onNoAudio.bind(this, cs, ep, channel));
-    ep.addCustomEventListener(GoogleTranscriptionEvents.MaxDurationExceeded,
-      this._onMaxDurationExceeded.bind(this, cs, ep, channel));
-    ep.addCustomEventListener(AwsTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep, channel));
-    ep.addCustomEventListener(AwsTranscriptionEvents.NoAudioDetected, this._onNoAudio.bind(this, cs, ep, channel));
-    ep.addCustomEventListener(AwsTranscriptionEvents.MaxDurationExceeded,
-      this._onMaxDurationExceeded.bind(this, cs, ep, channel));
-    ep.addCustomEventListener(AzureTranscriptionEvents.Transcription,
-      this._onTranscription.bind(this, cs, ep, channel));
-    ep.addCustomEventListener(AzureTranscriptionEvents.NoSpeechDetected, this._onNoAudio.bind(this, cs, ep, channel));
+    await ep.set(opts)
+      .catch((err) => this.logger.info(err, 'Error setting channel variables'));

-    if (this.vendor === 'google') {
-      this.bugname = 'google_transcribe';
-      if (this.sttCredentials) opts.GOOGLE_APPLICATION_CREDENTIALS = JSON.stringify(this.sttCredentials.credentials);
-      [
-        ['enhancedModel', 'GOOGLE_SPEECH_USE_ENHANCED'],
-        //['separateRecognitionPerChannel', 'GOOGLE_SPEECH_SEPARATE_RECOGNITION_PER_CHANNEL'],
-        ['profanityFilter', 'GOOGLE_SPEECH_PROFANITY_FILTER'],
-        ['punctuation', 'GOOGLE_SPEECH_ENABLE_AUTOMATIC_PUNCTUATION'],
-        ['words', 'GOOGLE_SPEECH_ENABLE_WORD_TIME_OFFSETS'],
-        ['singleUtterance', 'GOOGLE_SPEECH_SINGLE_UTTERANCE'],
-        ['diarization', 'GOOGLE_SPEECH_PROFANITY_FILTER']
-      ].forEach((arr) => {
-        if (this[arr[0]]) opts[arr[1]] = true;
-        else if (this[arr[0]] === false) opts[arr[1]] = false;
-      });
-      if (this.hints.length > 0) {
-        opts.GOOGLE_SPEECH_HINTS = this.hints.join(',');
-        if (typeof this.hintsBoost === 'number') {
-          opts.GOOGLE_SPEECH_HINTS_BOOST = this.hintsBoost;
-        }
-      }
-      if (this.altLanguages.length > 0) opts.GOOGLE_SPEECH_ALTERNATIVE_LANGUAGE_CODES = this.altLanguages.join(',');
-      else opts.GOOGLE_SPEECH_ALTERNATIVE_LANGUAGE_CODES = '';
-      if ('unspecified' !== this.interactionType) {
-        opts.GOOGLE_SPEECH_METADATA_INTERACTION_TYPE = this.interactionType;
-      }
-      opts.GOOGLE_SPEECH_MODEL = this.model;
-      if (this.diarization && this.diarizationMinSpeakers > 0) {
-        opts.GOOGLE_SPEECH_SPEAKER_DIARIZATION_MIN_SPEAKER_COUNT = this.diarizationMinSpeakers;
-      }
-      if (this.diarization && this.diarizationMaxSpeakers > 0) {
-        opts.GOOGLE_SPEECH_SPEAKER_DIARIZATION_MAX_SPEAKER_COUNT = this.diarizationMaxSpeakers;
-      }
-      if (this.naicsCode > 0) opts.GOOGLE_SPEECH_METADATA_INDUSTRY_NAICS_CODE = this.naicsCode;
-
-      await ep.set(opts)
-        .catch((err) => this.logger.info(err, 'TaskTranscribe:_startTranscribing with google'));
-    }
-    else if (this.vendor === 'aws') {
-      this.bugname = 'aws_transcribe';
-      [
-        ['diarization', 'AWS_SHOW_SPEAKER_LABEL'],
-        ['identifyChannels', 'AWS_ENABLE_CHANNEL_IDENTIFICATION']
-      ].forEach((arr) => {
-        if (this[arr[0]]) opts[arr[1]] = true;
-      });
-      if (this.vocabularyName) opts.AWS_VOCABULARY_NAME = this.vocabularyName;
-      if (this.vocabularyFilterName) {
-        opts.AWS_VOCABULARY_NAME = this.vocabularyFilterName;
-        opts.AWS_VOCABULARY_FILTER_METHOD = this.filterMethod || 'mask';
-      }
-
-      if (this.sttCredentials) {
-        Object.assign(opts, {
-          AWS_ACCESS_KEY_ID: this.sttCredentials.accessKeyId,
-          AWS_SECRET_ACCESS_KEY: this.sttCredentials.secretAccessKey,
-          AWS_REGION: this.sttCredentials.region
-        });
-      }
-      else {
-        Object.assign(opts, {
-          AWS_ACCESS_KEY_ID: process.env.AWS_ACCESS_KEY_ID,
-          AWS_SECRET_ACCESS_KEY: process.env.AWS_SECRET_ACCESS_KEY,
-          AWS_REGION: process.env.AWS_REGION
-        });
-      }
-
-      await ep.set(opts)
-        .catch((err) => this.logger.info(err, 'TaskTranscribe:_startTranscribing with aws'));
-    }
-    else if (this.vendor === 'microsoft') {
-      this.bugname = 'azure_transcribe';
-      const {api_key, region, use_custom_stt, custom_stt_endpoint} = this.sttCredentials;
-      Object.assign(opts, {
-        'AZURE_SUBSCRIPTION_KEY': api_key,
-        'AZURE_REGION': region
-      });
-      if (this.azureSttEndpointId) {
-        Object.assign(opts, {'AZURE_SERVICE_ENDPOINT_ID': this.azureSttEndpointId});
-      }
-      else if (use_custom_stt && custom_stt_endpoint) {
-        Object.assign(opts, {'AZURE_SERVICE_ENDPOINT_ID': custom_stt_endpoint});
-      }
-      if (this.hints && this.hints.length > 0) {
-        opts.AZURE_SPEECH_HINTS = this.hints.map((h) => h.trim()).join(',');
-      }
-      if (this.altLanguages.length > 0) opts.AZURE_SPEECH_ALTERNATIVE_LANGUAGE_CODES = this.altLanguages.join(',');
-      else opts.AZURE_SPEECH_ALTERNATIVE_LANGUAGE_CODES = '';
-      if (this.requestSnr) opts.AZURE_REQUEST_SNR = 1;
-      if (this.profanityOption !== 'raw') opts.AZURE_PROFANITY_OPTION = this.profanityOption;
-      if (this.initialSpeechTimeoutMs > 0) opts.AZURE_INITIAL_SPEECH_TIMEOUT_MS = this.initialSpeechTimeoutMs;
-      if (this.outputFormat !== 'simple') opts.AZURE_USE_OUTPUT_FORMAT_DETAILED = 1;
-      if (this.azureServiceEndpoint) opts.AZURE_SERVICE_ENDPOINT = this.azureServiceEndpoint;
-
-      await ep.set(opts)
-        .catch((err) => this.logger.info(err, 'TaskTranscribe:_startTranscribing with azure'));
-    }
    await this._transcribe(ep);
  }

@@ -271,50 +201,43 @@ class TaskTranscribe extends Task {
    });
  }

-  _onTranscription(cs, ep, channel, evt, fsEvent) {
+  async _onTranscription(cs, ep, channel, evt, fsEvent) {
    // make sure this is not a transcript from answering machine detection
    const bugname = fsEvent.getHeader('media-bugname');
    if (bugname && this.bugname !== bugname) return;

-    this.logger.debug({evt, channel}, 'TaskTranscribe:_onTranscription');
-    if ('aws' === this.vendor && Array.isArray(evt) && evt.length > 0) evt = evt[0];
-    if ('microsoft' === this.vendor) {
-      const nbest = evt.NBest;
-      const language_code = evt.PrimaryLanguage?.Language || this.language;
-      const alternatives = nbest ? nbest.map((n) => {
-        return {
-          confidence: n.Confidence,
-          transcript: n.Display
-        };
-      }) :
-        [
-          {
-            transcript: evt.DisplayText || evt.Text
-          }
-        ];
+    this.logger.debug({evt}, 'TaskTranscribe:_onTranscription - before normalization');

-      const newEvent = {
-        is_final: evt.RecognitionStatus === 'Success',
-        channel,
-        language_code,
-        alternatives
-      };
-      evt = newEvent;
-    }
+    evt = this.normalizeTranscription(evt, this.vendor, channel, this.language);
+
+    this.logger.debug({evt}, 'TaskTranscribe:_onTranscription');

    if (evt.alternatives[0].transcript === '' && !cs.callGone && !this.killed) {
      this.logger.info({evt}, 'TaskGather:_onTranscription - got empty transcript, listen again');
      return this._transcribe(ep);
    }

-    evt.channel_tag = channel;
-
    if (this.transcriptionHook) {
      const b3 = this.getTracingPropagation();
      const httpHeaders = b3 && {b3};
-      this.cs.requestor.request('verb:hook', this.transcriptionHook,
-        Object.assign({speech: evt}, this.cs.callInfo), httpHeaders)
-        .catch((err) => this.logger.info(err, 'TranscribeTask:_onTranscription error'));
+      try {
+        const json = await this.cs.requestor.request('verb:hook', this.transcriptionHook, {
+          ...this.cs.callInfo,
+          ...httpHeaders,
+          speech: evt
+        });
+        this.logger.info({json}, 'sent transcriptionHook');
+        if (json && Array.isArray(json) && !this.parentTask) {
+          const makeTask = require('./make_task');
+          const tasks = normalizeJambones(this.logger, json).map((tdata) => makeTask(this.logger, tdata));
+          if (tasks && tasks.length > 0) {
+            this.logger.info({tasks: tasks}, `${this.name} replacing application with ${tasks.length} tasks`);
+            this.cs.replaceApplication(tasks);
+          }
+        }
+      } catch (err) {
+        this.logger.info(err, 'TranscribeTask:_onTranscription error');
+      }
    }
    if (this.parentTask) {
      this.parentTask.emit('transcription', evt);