mirror of
https://github.com/jambonz/jambonz-feature-server.git
synced 2025-12-20 16:50:39 +00:00
Feature/nuance stt (#185)
* initial changes to gather to support nuance stt * updateSpeechCredentialLastUsed could be called without a speech_credential_sid if credentials are passed in the flow * fix bugname * typo * added handlers for nuance * logging * major refactor of parsing transcriptions * initial support for nuance in transcribe verb * updates from testing * cleanup some tests * update action * typo * gather: start nuance timers after say/play completes * update drachtio-fsrmf * refactor some code * typo * log nuance error detail * timeout handling * typo * handle nuance 413 response when recognition times out * typo in specs.json * add support for nuance resources * fixes and tests for transcribe * remove logging from test * initial support for kryptonEndpoint * try getting access token even when using krypton * typo in kryptonEndpoint property * add support for Nuance tts * parse nuance voice and model for tts * use nuance credentials from db * update to db-helpers@0.7.0 with caching option * add support for azure audio logging in gather/transcribe * sync package-lock.json
This commit is contained in:
@@ -67,6 +67,13 @@
|
||||
"MaxDurationExceeded": "google_transcribe::max_duration_exceeded",
|
||||
"VadDetected": "google_transcribe::vad_detected"
|
||||
},
|
||||
"NuanceTranscriptionEvents": {
|
||||
"Transcription": "nuance_transcribe::transcription",
|
||||
"StartOfSpeech": "nuance_transcribe::start_of_speech",
|
||||
"TranscriptionComplete": "nuance_transcribe::end_of_transcription",
|
||||
"Error": "nuance_transcribe::error",
|
||||
"VadDetected": "nuance_transcribe::vad_detected"
|
||||
},
|
||||
"AwsTranscriptionEvents": {
|
||||
"Transcription": "aws_transcribe::transcription",
|
||||
"EndOfTranscript": "aws_transcribe::end_of_transcript",
|
||||
|
||||
@@ -44,7 +44,13 @@ const speechMapper = (cred) => {
|
||||
const o = JSON.parse(decrypt(credential));
|
||||
obj.api_key = o.api_key;
|
||||
}
|
||||
else if ('nuance' === obj.vendor) {
|
||||
const o = JSON.parse(decrypt(credential));
|
||||
obj.client_id = o.client_id;
|
||||
obj.secret = o.secret;
|
||||
}
|
||||
} catch (err) {
|
||||
console.log(err);
|
||||
}
|
||||
return obj;
|
||||
};
|
||||
@@ -65,7 +71,8 @@ module.exports = (logger, srf) => {
|
||||
const haveAws = speech.find((s) => s.vendor === 'aws');
|
||||
const haveMicrosoft = speech.find((s) => s.vendor === 'microsoft');
|
||||
const haveWellsaid = speech.find((s) => s.vendor === 'wellsaid');
|
||||
if (!haveGoogle || !haveAws || !haveMicrosoft) {
|
||||
const haveNuance = speech.find((s) => s.vendor === 'nuance');
|
||||
if (!haveGoogle || !haveAws || !haveMicrosoft || !haveWellsaid || !haveNuance) {
|
||||
const [r3] = await pp.query(sqlSpeechCredentialsForSP, account_sid);
|
||||
if (r3.length) {
|
||||
if (!haveGoogle) {
|
||||
@@ -84,6 +91,10 @@ module.exports = (logger, srf) => {
|
||||
const wellsaid = r3.find((s) => s.vendor === 'wellsaid');
|
||||
if (wellsaid) speech.push(speechMapper(wellsaid));
|
||||
}
|
||||
if (!haveNuance) {
|
||||
const nuance = r3.find((s) => s.vendor === 'nuance');
|
||||
if (nuance) speech.push(speechMapper(nuance));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -94,6 +105,7 @@ module.exports = (logger, srf) => {
|
||||
};
|
||||
|
||||
const updateSpeechCredentialLastUsed = async(speech_credential_sid) => {
|
||||
if (!speech_credential_sid) return;
|
||||
const pp = pool.promise();
|
||||
const sql = 'UPDATE speech_credentials SET last_used = NOW() WHERE speech_credential_sid = ?';
|
||||
try {
|
||||
|
||||
@@ -152,7 +152,8 @@ function installSrfLocals(srf, logger) {
|
||||
popFront,
|
||||
removeFromList,
|
||||
lengthOfList,
|
||||
getListPosition
|
||||
getListPosition,
|
||||
getNuanceAccessToken
|
||||
} = require('@jambonz/realtimedb-helpers')({
|
||||
host: process.env.JAMBONES_REDIS_HOST,
|
||||
port: process.env.JAMBONES_REDIS_PORT || 6379
|
||||
@@ -204,7 +205,8 @@ function installSrfLocals(srf, logger) {
|
||||
popFront,
|
||||
removeFromList,
|
||||
lengthOfList,
|
||||
getListPosition
|
||||
getListPosition,
|
||||
getNuanceAccessToken
|
||||
},
|
||||
parentLogger: logger,
|
||||
getSBC,
|
||||
|
||||
@@ -1,9 +1,32 @@
|
||||
const {
|
||||
TaskName,
|
||||
AzureTranscriptionEvents,
|
||||
GoogleTranscriptionEvents,
|
||||
AwsTranscriptionEvents,
|
||||
NuanceTranscriptionEvents
|
||||
} = require('./constants');
|
||||
|
||||
module.exports = (logger) => {
|
||||
const normalizeTranscription = (evt, vendor, channel) => {
|
||||
if ('aws' === vendor && Array.isArray(evt) && evt.length > 0) evt = evt[0];
|
||||
if ('microsoft' === vendor) {
|
||||
const normalizeTranscription = (evt, vendor, channel, language) => {
|
||||
let newEvent = JSON.parse(JSON.stringify(evt));
|
||||
|
||||
/* add in channel_tag and provide the full vendor-specific event */
|
||||
newEvent = {
|
||||
...(vendor === 'aws' ? newEvent[0] : newEvent),
|
||||
language_code: language,
|
||||
channel_tag: channel
|
||||
};
|
||||
|
||||
|
||||
if ('aws' === vendor && Array.isArray(evt) && evt.length > 0) {
|
||||
newEvent = {
|
||||
...newEvent,
|
||||
vendor: {event: evt, name: vendor}
|
||||
};
|
||||
}
|
||||
else if ('microsoft' === vendor) {
|
||||
const nbest = evt.NBest;
|
||||
const language_code = evt.PrimaryLanguage?.Language || this.language;
|
||||
const language_code = evt.PrimaryLanguage?.Language || language;
|
||||
const alternatives = nbest ? nbest.map((n) => {
|
||||
return {
|
||||
confidence: n.Confidence,
|
||||
@@ -16,18 +39,194 @@ module.exports = (logger) => {
|
||||
}
|
||||
];
|
||||
|
||||
const newEvent = {
|
||||
newEvent = {
|
||||
...newEvent,
|
||||
is_final: evt.RecognitionStatus === 'Success',
|
||||
channel,
|
||||
language_code,
|
||||
alternatives
|
||||
alternatives,
|
||||
vendor: {event: evt, name: vendor}
|
||||
};
|
||||
evt = newEvent;
|
||||
}
|
||||
evt.channel_tag = channel;
|
||||
//logger.debug({evt}, 'normalized transcription');
|
||||
return evt;
|
||||
return newEvent;
|
||||
};
|
||||
|
||||
return {normalizeTranscription};
|
||||
/**
 * Build the map of recognizer channel variables for the requested STT vendor.
 *
 * @param {Object} task - the current task; task.name (Gather/Transcribe) selects defaults
 * @param {Object} sttCredentials - decrypted speech credentials for the vendor (may be null)
 * @param {Object} rOpts - recognizer options supplied by the application
 * @returns {Object} channel variables to set on the endpoint before recognition starts
 */
const setChannelVarsForStt = (task, sttCredentials, rOpts = {}) => {
  let opts = {};
  const {enable, voiceMs = 0, mode = -1} = rOpts.vad || {};
  const vad = {enable, voiceMs, mode};

  /* voice activity detection works across vendors */
  opts = {
    ...opts,
    ...(vad.enable && {START_RECOGNIZING_ON_VAD: 1}),
    ...(vad.enable && vad.voiceMs && {RECOGNIZER_VAD_VOICE_MS: vad.voiceMs}),
    ...(vad.enable && typeof vad.mode === 'number' && {RECOGNIZER_VAD_MODE: vad.mode}),
  };

  if ('google' === rOpts.vendor) {
    opts = {
      ...opts,
      ...(sttCredentials &&
        {GOOGLE_APPLICATION_CREDENTIALS: JSON.stringify(sttCredentials.credentials)}),
      ...(rOpts.enhancedModel &&
        {GOOGLE_SPEECH_USE_ENHANCED: 1}),
      ...(rOpts.separateRecognitionPerChannel &&
        {GOOGLE_SPEECH_SEPARATE_RECOGNITION_PER_CHANNEL: 1}),
      ...(rOpts.profanityFilter &&
        {GOOGLE_SPEECH_PROFANITY_FILTER: 1}),
      ...(rOpts.punctuation &&
        {GOOGLE_SPEECH_ENABLE_AUTOMATIC_PUNCTUATION: 1}),
      ...(rOpts.words &&
        {GOOGLE_SPEECH_ENABLE_WORD_TIME_OFFSETS: 1}),
      /* a Gather defaults to single-utterance; a Transcribe defaults to continuous */
      ...((rOpts.singleUtterance || task.name === TaskName.Gather) &&
        {GOOGLE_SPEECH_SINGLE_UTTERANCE: 1}),
      ...(rOpts.diarization &&
        {GOOGLE_SPEECH_SPEAKER_DIARIZATION: 1}),
      ...(rOpts.diarization && rOpts.diarizationMinSpeakers > 0 &&
        {GOOGLE_SPEECH_SPEAKER_DIARIZATION_MIN_SPEAKER_COUNT: rOpts.diarizationMinSpeakers}),
      ...(rOpts.diarization && rOpts.diarizationMaxSpeakers > 0 &&
        {GOOGLE_SPEECH_SPEAKER_DIARIZATION_MAX_SPEAKER_COUNT: rOpts.diarizationMaxSpeakers}),
      /* explicit false overrides the positive settings above (later keys win in the spread) */
      ...(rOpts.enhancedModel === false &&
        {GOOGLE_SPEECH_USE_ENHANCED: 0}),
      ...(rOpts.separateRecognitionPerChannel === false &&
        {GOOGLE_SPEECH_SEPARATE_RECOGNITION_PER_CHANNEL: 0}),
      ...(rOpts.profanityFilter === false &&
        {GOOGLE_SPEECH_PROFANITY_FILTER: 0}),
      ...(rOpts.punctuation === false &&
        {GOOGLE_SPEECH_ENABLE_AUTOMATIC_PUNCTUATION: 0}),
      /* fix: was loose `== false`, inconsistent with the sibling strict comparisons */
      ...(rOpts.words === false &&
        {GOOGLE_SPEECH_ENABLE_WORD_TIME_OFFSETS: 0}),
      ...((rOpts.singleUtterance === false || task.name === TaskName.Transcribe) &&
        {GOOGLE_SPEECH_SINGLE_UTTERANCE: 0}),
      ...(rOpts.diarization === false &&
        {GOOGLE_SPEECH_SPEAKER_DIARIZATION: 0}),
      /* guard hints/altLanguages: they may be absent from rOpts */
      ...(rOpts.hints && rOpts.hints.length > 0 &&
        {GOOGLE_SPEECH_HINTS: rOpts.hints.join(',')}),
      ...(typeof rOpts.hintsBoost === 'number' &&
        {GOOGLE_SPEECH_HINTS_BOOST: rOpts.hintsBoost}),
      ...(rOpts.altLanguages && rOpts.altLanguages.length > 0 &&
        {GOOGLE_SPEECH_ALTERNATIVE_LANGUAGE_CODES: rOpts.altLanguages.join(',')}),
      ...(rOpts.interactionType &&
        {GOOGLE_SPEECH_METADATA_INTERACTION_TYPE: rOpts.interactionType}),
      ...{GOOGLE_SPEECH_MODEL: rOpts.model || (task.name === TaskName.Gather ? 'command_and_search' : 'phone_call')},
      ...(rOpts.naicsCode > 0 &&
        {GOOGLE_SPEECH_METADATA_INDUSTRY_NAICS_CODE: rOpts.naicsCode}),
    };
  }
  else if (['aws', 'polly'].includes(rOpts.vendor)) {
    opts = {
      ...opts,
      ...(rOpts.vocabularyName && {AWS_VOCABULARY_NAME: rOpts.vocabularyName}),
      ...(rOpts.vocabularyFilterName && {AWS_VOCABULARY_FILTER_NAME: rOpts.vocabularyFilterName}),
      ...(rOpts.filterMethod && {AWS_VOCABULARY_FILTER_METHOD: rOpts.filterMethod}),
      ...(sttCredentials && {
        AWS_ACCESS_KEY_ID: sttCredentials.accessKeyId,
        AWS_SECRET_ACCESS_KEY: sttCredentials.secretAccessKey,
        AWS_REGION: sttCredentials.region
      }),
    };
  }
  else if ('microsoft' === rOpts.vendor) {
    opts = {
      ...opts,
      ...(rOpts.hints && rOpts.hints.length > 0 &&
        {AZURE_SPEECH_HINTS: rOpts.hints.map((h) => h.trim()).join(',')}),
      /* FIXME(review): this guard tests altLanguages but sets the service-endpoint
         id from rOpts.sttCredentials — looks like a copy/paste error, and the
         alternative languages are never forwarded to the recognizer. Preserved
         as-is pending confirmation of the intended channel variable. */
      ...(rOpts.altLanguages && rOpts.altLanguages.length > 0 &&
        {AZURE_SERVICE_ENDPOINT_ID: rOpts.sttCredentials}),
      ...(rOpts.requestSnr && {AZURE_REQUEST_SNR: 1}),
      ...(rOpts.profanityOption && {AZURE_PROFANITY_OPTION: rOpts.profanityOption}),
      ...(rOpts.azureServiceEndpoint && {AZURE_SERVICE_ENDPOINT: rOpts.azureServiceEndpoint}),
      ...(rOpts.initialSpeechTimeoutMs > 0 &&
        {AZURE_INITIAL_SPEECH_TIMEOUT_MS: rOpts.initialSpeechTimeoutMs}),
      /* fix: removed a second, duplicate AZURE_REQUEST_SNR entry */
      ...(rOpts.audioLogging && {AZURE_AUDIO_LOGGING: 1}),
      ...{AZURE_USE_OUTPUT_FORMAT_DETAILED: 1},
      ...(sttCredentials && {
        AZURE_SUBSCRIPTION_KEY: sttCredentials.api_key,
        AZURE_REGION: sttCredentials.region,
      }),
      /* fix: guard sttCredentials before property access, as the branch above does */
      ...(sttCredentials?.use_custom_stt && sttCredentials.custom_stt_endpoint &&
        {AZURE_SERVICE_ENDPOINT_ID: sttCredentials.custom_stt_endpoint})
    };
  }
  else if ('nuance' === rOpts.vendor) {
    /**
     * Note: all nuance options are in recognizer.nuanceOptions, should migrate
     * other vendor settings to similar nested structure
     */
    const {nuanceOptions = {}} = rOpts;
    opts = {
      ...opts,
      /* fix: guard sttCredentials — other vendor branches tolerate a null credential */
      ...(sttCredentials?.access_token &&
        {NUANCE_ACCESS_TOKEN: sttCredentials.access_token}),
      ...(sttCredentials?.krypton_endpoint &&
        {NUANCE_KRYPTON_ENDPOINT: sttCredentials.krypton_endpoint}),
      ...(nuanceOptions.topic &&
        {NUANCE_TOPIC: nuanceOptions.topic}),
      ...(nuanceOptions.utteranceDetectionMode &&
        {NUANCE_UTTERANCE_DETECTION_MODE: nuanceOptions.utteranceDetectionMode}),
      ...(nuanceOptions.punctuation &&
        {NUANCE_PUNCTUATION: nuanceOptions.punctuation}),
      ...(nuanceOptions.profanityFilter &&
        {NUANCE_FILTER_PROFANITY: nuanceOptions.profanityFilter}),
      ...(nuanceOptions.includeTokenization &&
        {NUANCE_INCLUDE_TOKENIZATION: nuanceOptions.includeTokenization}),
      ...(nuanceOptions.discardSpeakerAdaptation &&
        {NUANCE_DISCARD_SPEAKER_ADAPTATION: nuanceOptions.discardSpeakerAdaptation}),
      ...(nuanceOptions.suppressCallRecording &&
        {NUANCE_SUPPRESS_CALL_RECORDING: nuanceOptions.suppressCallRecording}),
      ...(nuanceOptions.maskLoadFailures &&
        {NUANCE_MASK_LOAD_FAILURES: nuanceOptions.maskLoadFailures}),
      ...(nuanceOptions.suppressInitialCapitalization &&
        {NUANCE_SUPPRESS_INITIAL_CAPITALIZATION: nuanceOptions.suppressInitialCapitalization}),
      ...(nuanceOptions.allowZeroBaseLmWeight &&
        {NUANCE_ALLOW_ZERO_BASE_LM_WEIGHT: nuanceOptions.allowZeroBaseLmWeight}),
      ...(nuanceOptions.filterWakeupWord &&
        {NUANCE_FILTER_WAKEUP_WORD: nuanceOptions.filterWakeupWord}),
      /* fix: operator-precedence bug — `a || b ? 'partial' : 'final'` groups as
         `(a || b) ? …`, so an explicit resultType was always discarded; also
         include the key when only interim is set, so interim requests actually
         get partial results */
      ...((nuanceOptions.resultType || rOpts.interim) &&
        {NUANCE_RESULT_TYPE: nuanceOptions.resultType || (rOpts.interim ? 'partial' : 'final')}),
      ...(nuanceOptions.noInputTimeoutMs &&
        {NUANCE_NO_INPUT_TIMEOUT_MS: nuanceOptions.noInputTimeoutMs}),
      ...(nuanceOptions.recognitionTimeoutMs &&
        {NUANCE_RECOGNITION_TIMEOUT_MS: nuanceOptions.recognitionTimeoutMs}),
      ...(nuanceOptions.utteranceEndSilenceMs &&
        {NUANCE_UTTERANCE_END_SILENCE_MS: nuanceOptions.utteranceEndSilenceMs}),
      ...(nuanceOptions.maxHypotheses &&
        {NUANCE_MAX_HYPOTHESES: nuanceOptions.maxHypotheses}),
      ...(nuanceOptions.speechDomain &&
        {NUANCE_SPEECH_DOMAIN: nuanceOptions.speechDomain}),
      ...(nuanceOptions.formatting &&
        {NUANCE_FORMATTING: nuanceOptions.formatting}),
      ...(nuanceOptions.resources &&
        {NUANCE_RESOURCES: JSON.stringify(nuanceOptions.resources)}),
    };
  }
  logger.debug({opts}, 'recognizer channel vars');
  return opts;
};
|
||||
|
||||
/* Detach every vendor-specific transcription listener from the endpoint,
   in the same order they are registered: google, aws, azure, nuance. */
const removeSpeechListeners = (ep) => {
  const listenedEvents = [
    GoogleTranscriptionEvents.Transcription,
    GoogleTranscriptionEvents.EndOfUtterance,
    GoogleTranscriptionEvents.VadDetected,

    AwsTranscriptionEvents.Transcription,
    AwsTranscriptionEvents.VadDetected,

    AzureTranscriptionEvents.Transcription,
    AzureTranscriptionEvents.NoSpeechDetected,
    AzureTranscriptionEvents.VadDetected,

    NuanceTranscriptionEvents.Transcription,
    NuanceTranscriptionEvents.TranscriptionComplete,
    NuanceTranscriptionEvents.StartOfSpeech,
    NuanceTranscriptionEvents.Error,
    NuanceTranscriptionEvents.VadDetected
  ];
  for (const evt of listenedEvents) ep.removeCustomEventListener(evt);
};
|
||||
return {
|
||||
normalizeTranscription,
|
||||
setChannelVarsForStt,
|
||||
removeSpeechListeners
|
||||
};
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user