Feature/deepgram stt (#190)

* initial changes to support deepgram stt
* fixes for normalizing vendor-specific transcriptions
* update to latest drachtio-fsmrf with support for deepgram stt
* deepgram parsing error
* hints support for deepgram
* handling deepgram errors
* ignore late arriving transcripts for deepgram
* handling of empty transcripts
* transcribe changes
* allow deepgram stt credentials to be provided at run time
* bind channel in transcription handler
* fixes for transcribe when handling empty transcripts
* more empty transcript fixes
* update tests to latest modules
* add test cases for deepgram speech recognition
2025-12-20 16:50:39 +00:00 · 2022-11-12 19:48:59 -05:00
parent f511e6ab6b
commit 8686348454
12 changed files with 2148 additions and 152 deletions
--- a/lib/tasks/transcribe.js
+++ b/lib/tasks/transcribe.js
@@ -5,7 +5,8 @@ const {
  GoogleTranscriptionEvents,
  AzureTranscriptionEvents,
  AwsTranscriptionEvents,
-  NuanceTranscriptionEvents
+  NuanceTranscriptionEvents,
+  DeepgramTranscriptionEvents
 } = require('../utils/constants');
 const normalizeJambones = require('../utils/normalize-jambones');

@@ -15,9 +16,14 @@ class TaskTranscribe extends Task {
    this.preconditions = TaskPreconditions.Endpoint;
    this.parentTask = parentTask;

-    const {setChannelVarsForStt, normalizeTranscription} = require('../utils/transcription-utils')(logger);
+    const {
+      setChannelVarsForStt,
+      normalizeTranscription,
+      removeSpeechListeners
+    } = require('../utils/transcription-utils')(logger);
    this.setChannelVarsForStt = setChannelVarsForStt;
    this.normalizeTranscription = normalizeTranscription;
+    this.removeSpeechListeners = removeSpeechListeners;

    this.transcriptionHook = this.data.transcriptionHook;
    this.earlyMedia = this.data.earlyMedia === true || (parentTask && parentTask.earlyMedia);
@@ -28,12 +34,17 @@ class TaskTranscribe extends Task {
    this.interim = !!recognizer.interim;
    this.separateRecognitionPerChannel = recognizer.separateRecognitionPerChannel;

+    /* let credentials be supplied in the recognizer object at runtime */
    if (recognizer.vendor === 'nuance') {
      const {clientId, secret} = recognizer.nuanceOptions;
      if (clientId && secret) {
        this.sttCredentials = {client_id: clientId, secret};
      }
    }
+    else if (recognizer.vendor === 'deepgram') {
+      const {apiKey} = recognizer.deepgramOptions;
+      if (apiKey) this.sttCredentials = {api_key: apiKey};
+    }

    recognizer.hints = recognizer.hints || [];
    recognizer.altLanguages = recognizer.altLanguages || [];
@@ -69,7 +80,7 @@ class TaskTranscribe extends Task {
    if (!this.data.recognizer.vendor) {
      this.data.recognizer.vendor = this.vendor;
    }
-    this.sttCredentials = cs.getSpeechCredentials(this.vendor, 'stt');
+    if (!this.sttCredentials) this.sttCredentials = cs.getSpeechCredentials(this.vendor, 'stt');

    try {
      if (!this.sttCredentials) {
@@ -105,22 +116,7 @@ class TaskTranscribe extends Task {
      this.logger.info(err, 'TaskTranscribe:exec - error');
      this.parentTask && this.parentTask.emit('error', err);
    }
-    ep.removeCustomEventListener(GoogleTranscriptionEvents.Transcription);
-    ep.removeCustomEventListener(GoogleTranscriptionEvents.EndOfUtterance);
-    ep.removeCustomEventListener(GoogleTranscriptionEvents.VadDetected);
-
-    ep.removeCustomEventListener(AwsTranscriptionEvents.Transcription);
-    ep.removeCustomEventListener(AwsTranscriptionEvents.VadDetected);
-
-    ep.removeCustomEventListener(AzureTranscriptionEvents.Transcription);
-    ep.removeCustomEventListener(AzureTranscriptionEvents.NoSpeechDetected);
-    ep.removeCustomEventListener(AzureTranscriptionEvents.VadDetected);
-
-    ep.removeCustomEventListener(NuanceTranscriptionEvents.Transcription);
-    ep.removeCustomEventListener(NuanceTranscriptionEvents.TranscriptionComplete);
-    ep.removeCustomEventListener(NuanceTranscriptionEvents.StartOfSpeech);
-    ep.removeCustomEventListener(NuanceTranscriptionEvents.Error);
-    ep.removeCustomEventListener(NuanceTranscriptionEvents.VadDetected);
+    this.removeSpeechListeners(ep);
  }

  async kill(cs) {
@@ -184,6 +180,15 @@ class TaskTranscribe extends Task {
        ep.addCustomEventListener(AzureTranscriptionEvents.Error,
          this._onNuanceError.bind(this, cs, ep, channel));
        break;
+      case 'deepgram':
+        this.bugname = 'deepgram_transcribe';
+        ep.addCustomEventListener(DeepgramTranscriptionEvents.Transcription,
+          this._onTranscription.bind(this, cs, ep, channel));
+        ep.addCustomEventListener(DeepgramTranscriptionEvents.Connect,
+          this._onDeepgramConnect.bind(this, cs, ep, channel));
+        ep.addCustomEventListener(DeepgramTranscriptionEvents.ConnectFailure,
+          this._onDeepGramConnectFailure.bind(this, cs, ep, channel));
+        break;
      default:
        throw new Error(`Invalid vendor ${this.vendor}`);
    }
@@ -215,9 +220,15 @@ class TaskTranscribe extends Task {

    this.logger.debug({evt}, 'TaskTranscribe:_onTranscription');

-    if (evt.alternatives[0].transcript === '' && !cs.callGone && !this.killed) {
-      this.logger.info({evt}, 'TaskGather:_onTranscription - got empty transcript, listen again');
-      return this._transcribe(ep);
+    if (evt.alternatives[0]?.transcript === '' && !cs.callGone && !this.killed) {
+      if (['microsoft', 'deepgram'].includes(this.vendor)) {
+        this.logger.info({evt}, 'TaskTranscribe:_onTranscription - got empty transcript, continue listening');
+      }
+      else {
+        this.logger.info({evt}, 'TaskTranscribe:_onTranscription - got empty transcript, listen again');
+        this._transcribe(ep);
+      }
+      return;
    }

    if (this.transcriptionHook) {
@@ -268,6 +279,34 @@ class TaskTranscribe extends Task {
      this._timer = null;
    }
  }
+  /**
+   * Handle an error event reported by the speech recognizer.
+   *
+   * The listener is registered with `.bind(this, cs, ep, channel)`, so the
+   * bound channel is the third argument and the event payload is the fourth.
+   * A "No speech" (404) or "Too much speech" (413) error resolves the task
+   * with 'timeout'; any other error is only logged.
+   *
+   * @param {object} _cs - call session bound at registration (unused)
+   * @param {object} _ep - endpoint bound at registration (unused)
+   * @param {*} _channel - channel bound at registration (unused)
+   * @param {object} evt - error payload; `code`, `error` and `details` are read
+   */
+  _onNuanceError(_cs, _ep, _channel, evt) {
+    const {code, error, details} = evt;
+    if (code === 404 && error === 'No speech') {
+      // expected outcome when the caller says nothing, so log at debug only
+      this.logger.debug({code, error, details}, 'TaskTranscribe:_onNuanceError');
+      // NOTE(review): _resolve is not defined in the visible part of this file - confirm this class provides it
+      return this._resolve('timeout');
+    }
+    this.logger.info({code, error, details}, 'TaskTranscribe:_onNuanceError');
+    if (code === 413 && error === 'Too much speech') {
+      return this._resolve('timeout');
+    }
+  }
+  /**
+   * Listener for the Deepgram connect event; only records a debug log entry.
+   *
+   * @param {object} _cs - call session bound at registration (unused)
+   * @param {object} _ep - endpoint bound at registration (unused)
+   */
+  _onDeepgramConnect(_cs, _ep) {
+    const {logger} = this;
+    logger.debug('TaskTranscribe:_onDeepgramConnect');
+  }
+
+  /**
+   * Handle a failure to connect to the Deepgram speech recognizer.
+   *
+   * Logs the event, writes an STT_FAILURE alert for the account, then
+   * notifies the error and marks the task as done.
+   *
+   * The listener is registered with `.bind(this, cs, ep, channel)`, so the
+   * bound channel is the third argument and the event payload is the fourth.
+   *
+   * @param {object} cs - call session; `srf.locals` and `accountSid` are read
+   * @param {object} _ep - endpoint bound at registration (unused)
+   * @param {*} _channel - channel bound at registration (unused)
+   * @param {object} evt - failure payload; `reason` is read
+   */
+  _onDeepGramConnectFailure(cs, _ep, _channel, evt) {
+    const {reason} = evt;
+    const {writeAlerts, AlertType} = cs.srf.locals;
+    this.logger.info({evt}, 'TaskTranscribe:_onDeepgramConnectFailure');
+    // alerting is best-effort: a failure to write the alert is logged, not rethrown
+    writeAlerts({
+      account_sid: cs.accountSid,
+      alert_type: AlertType.STT_FAILURE,
+      message: `Failed connecting to Deepgram speech recognizer: ${reason}`,
+      vendor: 'deepgram',
+    }).catch((err) => this.logger.info({err}, 'Error generating alert for deepgram connection failure'));
+    this.notifyError(`Failed connecting to speech vendor deepgram: ${reason}`);
+    this.notifyTaskDone();
+  }
 }

 module.exports = TaskTranscribe;