deepgram: rework continuous asr, and resolve on speech_final not is_final (#501)

* deepgram: rework continuous asr, and resolve on speech_final not is_final (wip)

* wip

* deepgram: empty final transcript should trigger resolve with speech if we have buffered transcripts

* wip

* fixes for deepgram compiling multiple transcripts

* test deepgram utteranceEndMs

* more handling of utteranceEndMs

* wip

* better handling of digit strings collected over multiple deepgram responses

* wip

* add support for deepgramOptions.shortUtterance, which triggers off of is_final instead of speech_final (see the example after this list)

* apply deepgram fixes to transcribe

* cleanup continuous asr

* more continuous asr fixes for deepgram

* update to verb-specifications for handling SttTask properties

* set log level for tests back to error
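
For context, the recognizer options exercised above ride on the gather verb. A minimal sketch of a webhook response using them, assuming a hypothetical app (property names follow this diff; the actionHook path and values are invented, and the options are shown together only for brevity — shortUtterance targets quick single responses, while asrTimeout/utteranceEndMs target continuous ASR):

    // hypothetical jambonz gather verb exercising the options this commit touches
    const gather = {
      verb: 'gather',
      input: ['speech', 'digits'],
      actionHook: '/transcript',        // invented path
      recognizer: {
        vendor: 'deepgram',
        language: 'en-US',
        asrTimeout: 2,                  // seconds; > 0 enables continuous ASR
        asrDtmfTerminationDigit: '#',   // dtmf key that ends continuous ASR early
        deepgramOptions: {
          shortUtterance: true          // resolve on is_final rather than speech_final
          // utteranceEndMs: 1500       // alternative: have deepgram signal UtteranceEnd
        }
      }
    };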
Author: Dave Horton
Date: 2023-10-30 13:57:25 -04:00
Committed by: GitHub
Parent: 67f8f7181a
Commit: f43a5c1491
4 changed files with 179 additions and 60 deletions


@@ -20,16 +20,6 @@ const makeTask = require('./make_task');
const assert = require('assert');
const SttTask = require('./stt-task');
const compileTranscripts = (logger, evt, arr) => {
if (!Array.isArray(arr) || arr.length === 0) return;
let t = '';
for (const a of arr) {
t += ` ${a.alternatives[0].transcript}`;
}
t += ` ${evt.alternatives[0].transcript}`;
evt.alternatives[0].transcript = t.trim();
};
class TaskGather extends SttTask {
constructor(logger, opts, parentTask) {
super(logger, opts, parentTask);
@@ -51,8 +41,10 @@ class TaskGather extends SttTask {
/* continuous ASR (i.e. compile transcripts until a special timeout or dtmf key) */
this.asrTimeout = typeof this.data.recognizer.asrTimeout === 'number' ?
this.data.recognizer.asrTimeout * 1000 : 0;
if (this.asrTimeout > 0) this.asrDtmfTerminationDigit = this.data.recognizer.asrDtmfTerminationDigit;
this.isContinuousAsr = this.asrTimeout > 0;
if (this.asrTimeout > 0) {
this.isContinuousAsr = true;
this.asrDtmfTerminationDigit = this.data.recognizer.asrDtmfTerminationDigit;
}
if (Array.isArray(this.data.recognizer.hints) &&
0 == this.data.recognizer.hints.length && JAMBONES_GATHER_CLEAR_GLOBAL_HINTS_ON_EMPTY_HINTS) {
@@ -351,6 +343,13 @@ class TaskGather extends SttTask {
async _setSpeechHandlers(cs, ep) {
if (this._speechHandlersSet) return;
this._speechHandlersSet = true;
/* some special deepgram logic */
if (this.vendor === 'deepgram') {
if (this.isContinuousAsr) this._doContinuousAsrWithDeepgram(this.asrTimeout);
if (this.data.recognizer?.deepgramOptions?.shortUtterance) this.shortUtterance = true;
}
const opts = this.setChannelVarsForStt(this, this.sttCredentials, this.data.recognizer);
switch (this.vendor) {
case 'google':
@@ -396,6 +395,9 @@ class TaskGather extends SttTask {
ep.addCustomEventListener(DeepgramTranscriptionEvents.Connect, this._onDeepgramConnect.bind(this, cs, ep));
ep.addCustomEventListener(DeepgramTranscriptionEvents.ConnectFailure,
this._onDeepGramConnectFailure.bind(this, cs, ep));
/* if app sets deepgramOptions.utteranceEndMs they essentially want continuous asr */
if (opts.DEEPGRAM_SPEECH_UTTERANCE_END_MS) this.isContinuousAsr = true;
break;
case 'soniox':
@@ -487,6 +489,12 @@ class TaskGather extends SttTask {
interim: this.interim,
bugname: this.bugname
}, 'Gather:_startTranscribing');
/**
* Note: we don't need to ask deepgram for interim results, because they
* already send us words as they are finalized (is_final=true) even before
* the utterance is finalized (speech_final=true)
*/
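/* For illustration, deepgram's streaming results look roughly like this
 * (shape per their streaming API, transcripts invented):
 *   {"type":"Results","is_final":true,"speech_final":false,"channel":{"alternatives":[{"transcript":"my number is"}]}}
 *   {"type":"Results","is_final":true,"speech_final":true,"channel":{"alternatives":[{"transcript":"555 1212"}]}}
 * the gather task buffers each is_final chunk and resolves once it sees
 * speech_final=true (or a later UtteranceEnd event when utteranceEndMs is set).
 */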
ep.startTranscription({
vendor: this.vendor,
locale: this.language,
@@ -522,11 +530,13 @@ class TaskGather extends SttTask {
}
_startAsrTimer() {
if (this.vendor === 'deepgram') return; // deepgram endpoints the utterance itself (utteranceEndMs / speech_final), so no local asr timer is needed
assert(this.isContinuousAsr);
this._clearAsrTimer();
this._asrTimer = setTimeout(() => {
this.logger.debug('_startAsrTimer - asr timer went off');
this._resolve(this._bufferedTranscripts.length > 0 ? 'speech' : 'timeout');
const evt = this.consolidateTranscripts(this._bufferedTranscripts, 1, this.language);
this._resolve(this._bufferedTranscripts.length > 0 ? 'speech' : 'timeout', evt);
}, this.asrTimeout);
this.logger.debug(`_startAsrTimer: set for ${this.asrTimeout}ms`);
}
@@ -556,7 +566,8 @@ class TaskGather extends SttTask {
this._clearFinalAsrTimer();
this._finalAsrTimer = setTimeout(() => {
this.logger.debug('_startFinalAsrTimer - final asr timer went off');
this._resolve(this._bufferedTranscripts.length > 0 ? 'speech' : 'timeout');
const evt = this.consolidateTranscripts(this._bufferedTranscripts, 1, this.language);
this._resolve(this._bufferedTranscripts.length > 0 ? 'speech' : 'timeout', evt);
}, 1000);
this.logger.debug('_startFinalAsrTimer: set for 1 second');
}
@@ -595,11 +606,23 @@ class TaskGather extends SttTask {
this.logger.debug({evt, bugname, finished}, `Gather:_onTranscription for vendor ${this.vendor}`);
if (bugname && this.bugname !== bugname) return;
if (this.vendor === 'ibm') {
if (evt?.state === 'listening') return;
if (this.vendor === 'ibm' && evt?.state === 'listening') return;
if (this.vendor === 'deepgram' && evt.type === 'UtteranceEnd') {
/* we will only get this when we have set utterance_end_ms */
if (this._bufferedTranscripts.length === 0) {
this.logger.debug('Gather:_onTranscription - got UtteranceEnd event from deepgram but no buffered transcripts');
}
else {
this.logger.debug('Gather:_onTranscription - got UtteranceEnd event from deepgram, return buffered transcript');
evt = this.consolidateTranscripts(this._bufferedTranscripts, 1, this.language);
this._bufferedTranscripts = [];
this._resolve('speech', evt);
}
return;
}
evt = this.normalizeTranscription(evt, this.vendor, 1, this.language);
evt = this.normalizeTranscription(evt, this.vendor, 1, this.language, this.shortUtterance);
if (evt.alternatives.length === 0) {
this.logger.info({evt}, 'TaskGather:_onTranscription - got empty transcript, continue listening');
return;
@@ -621,15 +644,27 @@ class TaskGather extends SttTask {
const bufferedWords = this._sonioxTranscripts.length +
this._bufferedTranscripts.reduce((count, e) => count + e.alternatives[0]?.transcript.split(' ').length, 0);
let emptyTranscript = false;
if (evt.is_final) {
if (evt.alternatives[0].transcript === '' && !this.callSession.callGone && !this.killed) {
emptyTranscript = true;
if (finished === 'true' && ['microsoft', 'deepgram'].includes(this.vendor)) {
this.logger.debug({evt}, 'TaskGather:_onTranscription - got empty transcript from old gather, disregarding');
return;
}
else {
else if (this.vendor !== 'deepgram') {
this.logger.info({evt}, 'TaskGather:_onTranscription - got empty transcript, continue listening');
return;
}
else if (this.isContinuousAsr) {
this.logger.info({evt},
'TaskGather:_onTranscription - got empty deepgram transcript during continuous asr, continue listening');
return;
}
else if (this.vendor === 'deepgram' && this._bufferedTranscripts.length > 0) {
this.logger.info({evt},
'TaskGather:_onTranscription - got empty transcript from deepgram, return the buffered transcripts');
}
return;
}
if (this.isContinuousAsr) {
@@ -641,14 +676,14 @@ class TaskGather extends SttTask {
this.logger.debug('TaskGather:_onTranscription - removing trailing punctuation');
evt.alternatives[0].transcript = t.slice(0, -1);
}
else this.logger.debug({t}, 'TaskGather:_onTranscription - no trailing punctuation');
}
this.logger.info({evt}, 'TaskGather:_onTranscription - got transcript during continuous asr');
this._bufferedTranscripts.push(evt);
this._clearTimer();
if (this._finalAsrTimer) {
this._clearFinalAsrTimer();
return this._resolve(this._bufferedTranscripts.length > 0 ? 'speech' : 'timeout');
const evt = this.consolidateTranscripts(this._bufferedTranscripts, 1, this.language);
return this._resolve(this._bufferedTranscripts.length > 0 ? 'speech' : 'timeout', evt);
}
this._startAsrTimer();
@@ -670,16 +705,25 @@ class TaskGather extends SttTask {
evt = this.compileSonioxTranscripts(this._sonioxTranscripts, 1, this.language);
this._sonioxTranscripts = [];
}
else if (this.vendor === 'deepgram') {
/* compile transcripts into one */
if (!emptyTranscript) this._bufferedTranscripts.push(evt);
if (this.data.recognizer?.deepgramOptions?.utteranceEndMs) {
this.logger.debug('TaskGather:_onTranscription - got speech_final waiting for UtteranceEnd event');
return;
}
this.logger.debug({evt}, 'TaskGather:_onTranscription - compiling deepgram transcripts');
evt = this.consolidateTranscripts(this._bufferedTranscripts, 1, this.language);
this._bufferedTranscripts = [];
this.logger.debug({evt}, 'TaskGather:_onTranscription - compiled deepgram transcripts');
}
/* here is where we return a final transcript */
this._resolve('speech', evt);
}
}
}
else {
/* google has a measure of stability:
https://cloud.google.com/speech-to-text/docs/basics#streaming_responses
others do not.
*/
//const isStableEnough = typeof evt.stability === 'undefined' || evt.stability > GATHER_STABILITY_THRESHOLD;
this._clearTimer();
this._startTimer();
if (this.bargein && (words + bufferedWords) >= this.minBargeinWordCount) {
@@ -705,6 +749,14 @@ class TaskGather extends SttTask {
this._sonioxTranscripts.push(evt.vendor.finalWords);
}
}
/* deepgram can send a non-final transcript but with words that are final, so we need to buffer */
if (this.vendor === 'deepgram') {
const originalEvent = evt.vendor.evt;
if (originalEvent.is_final && evt.alternatives[0].transcript !== '') {
this.logger.debug({evt}, 'Gather:_onTranscription - buffering a completed (partial) deepgram transcript');
this._bufferedTranscripts.push(evt);
}
}
}
}
_onEndOfUtterance(cs, ep) {
@@ -719,7 +771,7 @@ class TaskGather extends SttTask {
* getting a transcription. This can happen if someone coughs or mumbles.
* For that reason don't ask for a single utterance and we'll terminate the transcribe operation
* once we get a final transcript.
* However, if the usr has specified a singleUtterance, then we need to restart here
* However, if the user has specified a singleUtterance, then we need to restart here
* since we don't have a final transcript yet.
*/
if (!this.resolved && !this.killed && !this._bufferedTranscripts.length && this.wantsSingleUtterance) {
@@ -858,18 +910,6 @@ class TaskGather extends SttTask {
this._clearTimer();
this._clearFastRecognitionTimer();
if (this.isContinuousAsr && reason.startsWith('speech')) {
evt = {
is_final: true,
transcripts: this._bufferedTranscripts
};
this.logger.debug({evt}, 'TaskGather:resolve continuous asr');
}
else if (!this.isContinuousAsr && reason.startsWith('speech') && this._bufferedTranscripts.length) {
compileTranscripts(this.logger, evt, this._bufferedTranscripts);
this.logger.debug({evt}, 'TaskGather:resolve buffered results');
}
this.span.setAttributes({
channel: 1,
'stt.resolve': reason,