feat: rest:dial amd (#339)

Add support for sending the 'amd' property in the createCall REST API, and add support for using any of the supported speech vendors for STT --------- Co-authored-by: Dave Horton <daveh@beachdognet.com>
2025-12-20 16:50:39 +00:00 · 2023-05-17 03:20:08 +07:00
parent 6933e82d46
commit 32a84b7b19
4 changed files with 196 additions and 38 deletions
--- a/lib/utils/amd-utils.js
+++ b/lib/utils/amd-utils.js
@@ -1,9 +1,16 @@
 const Emitter = require('events');
 const {readFile} = require('fs');
 const {
+  TaskName,
  GoogleTranscriptionEvents,
  AwsTranscriptionEvents,
  AzureTranscriptionEvents,
+  NuanceTranscriptionEvents,
+  NvidiaTranscriptionEvents,
+  IbmTranscriptionEvents,
+  SonioxTranscriptionEvents,
+  DeepgramTranscriptionEvents,
+  JambonzTranscriptionEvents,
  AmdEvents,
  AvmdEvents
 } = require('./constants');
@@ -54,6 +61,11 @@ class Amd extends Emitter {
    this.thresholdWordCount = opts.thresholdWordCount || 9;
    const {normalizeTranscription} = require('./transcription-utils')(logger);
    this.normalizeTranscription = normalizeTranscription;
+    const {getNuanceAccessToken, getIbmAccessToken} = cs.srf.locals.dbHelpers;
+    this.getNuanceAccessToken = getNuanceAccessToken;
+    this.getIbmAccessToken = getIbmAccessToken;
+    const {setChannelVarsForStt} = require('./transcription-utils')(logger);
+    this.setChannelVarsForStt = setChannelVarsForStt;

    const {
      noSpeechTimeoutMs = 5000,
@@ -229,51 +241,92 @@ module.exports = (logger) => {

  const startAmd = async(cs, ep, task, opts) => {
    const amd = ep.amd = new Amd(logger, cs, opts);
-    const {vendor, language, sttCredentials} = amd;
-    const sttOpts = {};
+    const {vendor, language} = amd;
+    let sttCredentials = amd.sttCredentials;
    const hints = voicemailHints[language] || [];

+    if (vendor === 'nuance' && sttCredentials.client_id) {
+      /* get nuance access token */
+      const {getNuanceAccessToken} = amd;
+      const {client_id, secret} = sttCredentials;
+      const {access_token, servedFromCache} = await getNuanceAccessToken(client_id, secret, 'asr tts');
+      logger.debug({client_id}, `Gather:exec - got nuance access token ${servedFromCache ? 'from cache' : ''}`);
+      sttCredentials = {...sttCredentials, access_token};
+    }
+    else if (vendor == 'ibm' && sttCredentials.stt_api_key) {
+      /* get ibm access token */
+      const {getIbmAccessToken} = amd;
+      const {stt_api_key, stt_region} = sttCredentials;
+      const {access_token, servedFromCache} = await getIbmAccessToken(stt_api_key);
+      logger.debug({stt_api_key}, `Gather:exec - got ibm access token ${servedFromCache ? 'from cache' : ''}`);
+      sttCredentials = {...sttCredentials, access_token, stt_region};
+    }
+
    /* set stt options */
    logger.info(`starting amd for vendor ${vendor} and language ${language}`);
-    if ('google' === vendor) {
-      sttOpts.GOOGLE_APPLICATION_CREDENTIALS = JSON.stringify(sttCredentials.credentials);
-      sttOpts.GOOGLE_SPEECH_USE_ENHANCED = true;
-      sttOpts.GOOGLE_SPEECH_HINTS = hints.join(',');
-      if (opts.recognizer?.altLanguages) {
-        sttOpts.GOOGLE_SPEECH_ALTERNATIVE_LANGUAGE_CODES = opts.recognizer.altLanguages.join(',');
-      }
-      ep.addCustomEventListener(GoogleTranscriptionEvents.Transcription, onTranscription.bind(null, cs, ep, task));
-      ep.addCustomEventListener(GoogleTranscriptionEvents.EndOfUtterance, onEndOfUtterance.bind(null, cs, ep, task));
-    }
-    else if (['aws', 'polly'].includes(vendor)) {
-      Object.assign(sttOpts, {
-        AWS_ACCESS_KEY_ID: sttCredentials.accessKeyId,
-        AWS_SECRET_ACCESS_KEY: sttCredentials.secretAccessKey,
-        AWS_REGION: sttCredentials.region
-      });
-      ep.addCustomEventListener(AwsTranscriptionEvents.Transcription, onTranscription.bind(null, cs, ep, task));
-    }
-    else if ('microsoft' === vendor) {
-      Object.assign(sttOpts, {
-        'AZURE_SUBSCRIPTION_KEY': sttCredentials.api_key,
-        'AZURE_REGION': sttCredentials.region
-      });
-      sttOpts.AZURE_SPEECH_HINTS = hints.join(',');
-      if (opts.recognizer?.altLanguages) {
-        sttOpts.AZURE_SPEECH_ALTERNATIVE_LANGUAGE_CODES = opts.recognizer.altLanguages.join(',');
-      }
-      sttOpts.AZURE_INITIAL_SPEECH_TIMEOUT_MS = opts.resolveTimeoutMs || 20000;
+    const sttOpts = amd.setChannelVarsForStt({name: TaskName.Gather}, sttCredentials, {
+      vendor,
+      hints,
+      enhancedModel: true,
+      altLanguages: opts.recognizer?.altLanguages || [],
+      initialSpeechTimeoutMs: opts.resolveTimeoutMs,
+    });

-      ep.addCustomEventListener(AzureTranscriptionEvents.Transcription, onTranscription.bind(null, cs, ep, task));
-      ep.addCustomEventListener(AzureTranscriptionEvents.NoSpeechDetected, onNoSpeechDetected.bind(null, cs, ep, task));
-    }
    await ep.set(sttOpts).catch((err) => logger.info(err, 'Error setting channel variables'));

+    amd.transcriptionHandler = onTranscription.bind(null, cs, ep, task);
+    amd.EndOfUtteranceHandler = onEndOfUtterance.bind(null, cs, ep, task);
+    amd.noSpeechHandler = onNoSpeechDetected.bind(null, cs, ep, task);
+
+    switch (vendor) {
+      case 'google':
+        ep.addCustomEventListener(GoogleTranscriptionEvents.Transcription, amd.transcriptionHandler);
+        ep.addCustomEventListener(GoogleTranscriptionEvents.EndOfUtterance, amd.EndOfUtteranceHandler);
+        break;
+
+      case 'aws':
+      case 'polly':
+        ep.addCustomEventListener(AwsTranscriptionEvents.Transcription, amd.transcriptionHandler);
+        break;
+      case 'microsoft':
+        ep.addCustomEventListener(AzureTranscriptionEvents.Transcription, amd.transcriptionHandler);
+        ep.addCustomEventListener(AzureTranscriptionEvents.NoSpeechDetected, amd.noSpeechHandler);
+        break;
+      case 'nuance':
+        ep.addCustomEventListener(NuanceTranscriptionEvents.Transcription, amd.transcriptionHandler);
+        break;
+
+      case 'deepgram':
+        ep.addCustomEventListener(DeepgramTranscriptionEvents.Transcription, amd.transcriptionHandler);
+        break;
+
+      case 'soniox':
+        amd.bugname = 'soniox_amd_transcribe';
+        ep.addCustomEventListener(SonioxTranscriptionEvents.Transcription, amd.transcriptionHandler);
+        break;
+
+      case 'ibm':
+        ep.addCustomEventListener(IbmTranscriptionEvents.Transcription, amd.transcriptionHandler);
+        break;
+
+      case 'nvidia':
+        ep.addCustomEventListener(NvidiaTranscriptionEvents.Transcription, amd.transcriptionHandler);
+        break;
+
+      default:
+        if (vendor.startsWith('custom:')) {
+          ep.addCustomEventListener(JambonzTranscriptionEvents.Transcription, amd.transcriptionHandler);
+          break;
+        }
+        else {
+          throw new Error(`Invalid vendor ${this.vendor}`);
+        }
+    }
    amd
      .on(AmdEvents.NoSpeechDetected, (evt) => {
        task.emit('amd', {type: AmdEvents.NoSpeechDetected, ...evt});
        try {
-          ep.connected && ep.stopTranscription({vendor, bugname});
+          stopAmd(ep, task);
        } catch (err) {
          logger.info({err}, 'Error stopping transcription');
        }
@@ -281,7 +334,7 @@ module.exports = (logger) => {
      .on(AmdEvents.HumanDetected, (evt) => {
        task.emit('amd', {type: AmdEvents.HumanDetected, ...evt});
        try {
-          ep.connected && ep.stopTranscription({vendor, bugname});
+          stopAmd(ep, task);
        } catch (err) {
          logger.info({err}, 'Error stopping transcription');
        }
@@ -292,7 +345,7 @@ module.exports = (logger) => {
      .on(AmdEvents.DecisionTimeout, (evt) => {
        task.emit('amd', {type: AmdEvents.DecisionTimeout, ...evt});
        try {
-          ep.connected && ep.stopTranscription({vendor, bugname});
+          stopAmd(ep, task);
        } catch (err) {
          logger.info({err}, 'Error stopping transcription');
        }
@@ -300,7 +353,7 @@ module.exports = (logger) => {
      .on(AmdEvents.ToneTimeout, (evt) => {
        //task.emit('amd', {type: AmdEvents.ToneTimeout, ...evt});
        try {
-          ep.connected && ep.execute('avmd_stop').catch((err) => logger.info(err, 'Error stopping avmd'));
+          stopAmd(ep, task);
        } catch (err) {
          logger.info({err}, 'Error stopping avmd');
        }
@@ -308,7 +361,7 @@ module.exports = (logger) => {
      .on(AmdEvents.MachineStoppedSpeaking, () => {
        task.emit('amd', {type: AmdEvents.MachineStoppedSpeaking});
        try {
-          ep.connected && ep.stopTranscription({vendor, bugname});
+          stopAmd(ep, task);
        } catch (err) {
          logger.info({err}, 'Error stopping transcription');
        }
@@ -327,6 +380,19 @@ module.exports = (logger) => {
    if (ep.amd) {
      vendor = ep.amd.vendor;
      ep.amd.stopAllTimers();
+
+      ep.removeListener(GoogleTranscriptionEvents.Transcription, ep.amd.transcriptionHandler);
+      ep.removeListener(GoogleTranscriptionEvents.EndOfUtterance, ep.amd.EndOfUtteranceHandler);
+      ep.removeListener(AwsTranscriptionEvents.Transcription, ep.amd.transcriptionHandler);
+      ep.removeListener(AzureTranscriptionEvents.Transcription, ep.amd.transcriptionHandler);
+      ep.removeListener(AzureTranscriptionEvents.NoSpeechDetected, ep.amd.noSpeechHandler);
+      ep.removeListener(NuanceTranscriptionEvents.Transcription, ep.amd.transcriptionHandler);
+      ep.removeListener(DeepgramTranscriptionEvents.Transcription, ep.amd.transcriptionHandler);
+      ep.removeListener(SonioxTranscriptionEvents.Transcription, ep.amd.transcriptionHandler);
+      ep.removeListener(IbmTranscriptionEvents.Transcription, ep.amd.transcriptionHandler);
+      ep.removeListener(NvidiaTranscriptionEvents.Transcription, ep.amd.transcriptionHandler);
+      ep.removeListener(JambonzTranscriptionEvents.Transcription, ep.amd.transcriptionHandler);
+
      ep.amd = null;
    }