deepgram: rework continuous asr, and resolve on speech_final not is_final (#501)

* deepgram: rework continuous asr, and resolve on speech_final not is_final (wip)

* wip

* deepgram: empty final transcript should trigger resolve with speech if we have buffered transcripts

* wip

* fixes for deepgram compiling multiple transcripts

* test deepgram utteranceEndMs

* more handling of utteranceEndMs

* wip

* better handling of digit strings collected over multiple deepgram responses

* wip

* add support for deepgramOptions.shortUtterance which triggers off of is_final instead of speech_final

* apply deepgram fixes to transcribe

* cleanup continuous asr

* more continuous asr fixes for deepgram

* update to verb-specifications for handling SttTask properties

* set log level for tests back to error
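
For context on the headline change: Deepgram streams two different finality flags, and this commit resolves on speech_final rather than is_final, unless the app opts into deepgramOptions.shortUtterance. A minimal sketch of that selection, mirroring the normalizeDeepgram change further down (the sample events are illustrative, trimmed to the two flags):

// is_final=true marks words that will no longer change; speech_final=true
// marks the end of a detected utterance, so it arrives later (or not at all
// if the caller keeps talking past the endpointing window).
const isFinalEvent = (evt, shortUtterance) =>
  shortUtterance ? evt.is_final : evt.speech_final;

const finalizedWords = {is_final: true, speech_final: false};
const endOfUtterance = {is_final: true, speech_final: true};

console.log(isFinalEvent(finalizedWords, false)); // false: keep buffering
console.log(isFinalEvent(endOfUtterance, false)); // true: resolve with speech
console.log(isFinalEvent(finalizedWords, true));  // true: shortUtterance resolves early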
Dave Horton
2023-10-30 13:57:25 -04:00
committed by GitHub
parent 67f8f7181a
commit f43a5c1491
4 changed files with 179 additions and 60 deletions

lib/tasks/gather.js

@@ -20,16 +20,6 @@ const makeTask = require('./make_task');
const assert = require('assert');
const SttTask = require('./stt-task');
const compileTranscripts = (logger, evt, arr) => {
if (!Array.isArray(arr) || arr.length === 0) return;
let t = '';
for (const a of arr) {
t += ` ${a.alternatives[0].transcript}`;
}
t += ` ${evt.alternatives[0].transcript}`;
evt.alternatives[0].transcript = t.trim();
};
class TaskGather extends SttTask {
constructor(logger, opts, parentTask) {
super(logger, opts, parentTask);
@@ -51,8 +41,10 @@ class TaskGather extends SttTask {
/* continuous ASR (i.e. compile transcripts until a special timeout or dtmf key) */
this.asrTimeout = typeof this.data.recognizer.asrTimeout === 'number' ?
this.data.recognizer.asrTimeout * 1000 : 0;
if (this.asrTimeout > 0) this.asrDtmfTerminationDigit = this.data.recognizer.asrDtmfTerminationDigit;
this.isContinuousAsr = this.asrTimeout > 0;
if (this.asrTimeout > 0) {
this.isContinuousAsr = true;
this.asrDtmfTerminationDigit = this.data.recognizer.asrDtmfTerminationDigit;
}
if (Array.isArray(this.data.recognizer.hints) &&
0 == this.data.recognizer.hints.length && JAMBONES_GATHER_CLEAR_GLOBAL_HINTS_ON_EMPTY_HINTS) {
@@ -351,6 +343,13 @@ class TaskGather extends SttTask {
async _setSpeechHandlers(cs, ep) {
if (this._speechHandlersSet) return;
this._speechHandlersSet = true;
/* some special deepgram logic */
if (this.vendor === 'deepgram') {
if (this.isContinuousAsr) this._doContinuousAsrWithDeepgram(this.asrTimeout);
if (this.data.recognizer?.deepgramOptions?.shortUtterance) this.shortUtterance = true;
}
const opts = this.setChannelVarsForStt(this, this.sttCredentials, this.data.recognizer);
switch (this.vendor) {
case 'google':
@@ -396,6 +395,9 @@ class TaskGather extends SttTask {
ep.addCustomEventListener(DeepgramTranscriptionEvents.Connect, this._onDeepgramConnect.bind(this, cs, ep));
ep.addCustomEventListener(DeepgramTranscriptionEvents.ConnectFailure,
this._onDeepGramConnectFailure.bind(this, cs, ep));
/* if app sets deepgramOptions.utteranceEndMs they essentially want continuous asr */
if (opts.DEEPGRAM_SPEECH_UTTERANCE_END_MS) this.isContinuousAsr = true;
break;
case 'soniox':
@@ -487,6 +489,12 @@ class TaskGather extends SttTask {
interim: this.interim,
bugname: this.bugname
}, 'Gather:_startTranscribing');
/**
* Note: we don't need to ask deepgram for interim results, because they
* already send us words as they are finalized (is_final=true) even before
* the utterance is finalized (speech_final=true)
*/
ep.startTranscription({
vendor: this.vendor,
locale: this.language,
@@ -522,11 +530,13 @@ class TaskGather extends SttTask {
}
_startAsrTimer() {
if (this.vendor === 'deepgram') return; // no need
assert(this.isContinuousAsr);
this._clearAsrTimer();
this._asrTimer = setTimeout(() => {
this.logger.debug('_startAsrTimer - asr timer went off');
this._resolve(this._bufferedTranscripts.length > 0 ? 'speech' : 'timeout');
const evt = this.consolidateTranscripts(this._bufferedTranscripts, 1, this.language);
this._resolve(this._bufferedTranscripts.length > 0 ? 'speech' : 'timeout', evt);
}, this.asrTimeout);
this.logger.debug(`_startAsrTimer: set for ${this.asrTimeout}ms`);
}
@@ -556,7 +566,8 @@ class TaskGather extends SttTask {
this._clearFinalAsrTimer();
this._finalAsrTimer = setTimeout(() => {
this.logger.debug('_startFinalAsrTimer - final asr timer went off');
this._resolve(this._bufferedTranscripts.length > 0 ? 'speech' : 'timeout');
const evt = this.consolidateTranscripts(this._bufferedTranscripts, 1, this.language);
this._resolve(this._bufferedTranscripts.length > 0 ? 'speech' : 'timeout', evt);
}, 1000);
this.logger.debug('_startFinalAsrTimer: set for 1 second');
}
@@ -595,11 +606,23 @@ class TaskGather extends SttTask {
this.logger.debug({evt, bugname, finished}, `Gather:_onTranscription for vendor ${this.vendor}`);
if (bugname && this.bugname !== bugname) return;
if (this.vendor === 'ibm') {
if (evt?.state === 'listening') return;
if (this.vendor === 'ibm' && evt?.state === 'listening') return;
if (this.vendor === 'deepgram' && evt.type === 'UtteranceEnd') {
/* we will only get this when we have set utterance_end_ms */
if (this._bufferedTranscripts.length === 0) {
this.logger.debug('Gather:_onTranscription - got UtteranceEnd event from deepgram but no buffered transcripts');
}
else {
this.logger.debug('Gather:_onTranscription - got UtteranceEnd event from deepgram, return buffered transcript');
evt = this.consolidateTranscripts(this._bufferedTranscripts, 1, this.language);
this._bufferedTranscripts = [];
this._resolve('speech', evt);
}
return;
}
evt = this.normalizeTranscription(evt, this.vendor, 1, this.language);
evt = this.normalizeTranscription(evt, this.vendor, 1, this.language, this.shortUtterance);
if (evt.alternatives.length === 0) {
this.logger.info({evt}, 'TaskGather:_onTranscription - got empty transcript, continue listening');
return;
@@ -621,15 +644,27 @@ class TaskGather extends SttTask {
const bufferedWords = this._sonioxTranscripts.length +
this._bufferedTranscripts.reduce((count, e) => count + e.alternatives[0]?.transcript.split(' ').length, 0);
let emptyTranscript = false;
if (evt.is_final) {
if (evt.alternatives[0].transcript === '' && !this.callSession.callGone && !this.killed) {
emptyTranscript = true;
if (finished === 'true' && ['microsoft', 'deepgram'].includes(this.vendor)) {
this.logger.debug({evt}, 'TaskGather:_onTranscription - got empty transcript from old gather, disregarding');
return;
}
else {
else if (this.vendor !== 'deepgram') {
this.logger.info({evt}, 'TaskGather:_onTranscription - got empty transcript, continue listening');
return;
}
else if (this.isContinuousAsr) {
this.logger.info({evt},
'TaskGather:_onTranscription - got empty deepgram transcript during continuous asr, continue listening');
return;
}
else if (this.vendor === 'deepgram' && this._bufferedTranscripts.length > 0) {
this.logger.info({evt},
'TaskGather:_onTranscription - got empty transcript from deepgram, return the buffered transcripts');
}
return;
}
if (this.isContinuousAsr) {
@@ -641,14 +676,14 @@ class TaskGather extends SttTask {
this.logger.debug('TaskGather:_onTranscription - removing trailing punctuation');
evt.alternatives[0].transcript = t.slice(0, -1);
}
else this.logger.debug({t}, 'TaskGather:_onTranscription - no trailing punctuation');
}
this.logger.info({evt}, 'TaskGather:_onTranscription - got transcript during continuous asr');
this._bufferedTranscripts.push(evt);
this._clearTimer();
if (this._finalAsrTimer) {
this._clearFinalAsrTimer();
return this._resolve(this._bufferedTranscripts.length > 0 ? 'speech' : 'timeout');
const evt = this.consolidateTranscripts(this._bufferedTranscripts, 1, this.language);
return this._resolve(this._bufferedTranscripts.length > 0 ? 'speech' : 'timeout', evt);
}
this._startAsrTimer();
@@ -670,16 +705,25 @@ class TaskGather extends SttTask {
evt = this.compileSonioxTranscripts(this._sonioxTranscripts, 1, this.language);
this._sonioxTranscripts = [];
}
else if (this.vendor === 'deepgram') {
/* compile transcripts into one */
if (!emptyTranscript) this._bufferedTranscripts.push(evt);
if (this.data.recognizer?.deepgramOptions?.utteranceEndMs) {
this.logger.debug('TaskGather:_onTranscription - got speech_final waiting for UtteranceEnd event');
return;
}
this.logger.debug({evt}, 'TaskGather:_onTranscription - compiling deepgram transcripts');
evt = this.consolidateTranscripts(this._bufferedTranscripts, 1, this.language);
this._bufferedTranscripts = [];
this.logger.debug({evt}, 'TaskGather:_onTranscription - compiled deepgram transcripts');
}
/* here is where we return a final transcript */
this._resolve('speech', evt);
}
}
}
else {
/* google has a measure of stability:
https://cloud.google.com/speech-to-text/docs/basics#streaming_responses
others do not.
*/
//const isStableEnough = typeof evt.stability === 'undefined' || evt.stability > GATHER_STABILITY_THRESHOLD;
this._clearTimer();
this._startTimer();
if (this.bargein && (words + bufferedWords) >= this.minBargeinWordCount) {
@@ -705,6 +749,14 @@ class TaskGather extends SttTask {
this._sonioxTranscripts.push(evt.vendor.finalWords);
}
}
/* deepgram can send a non-final transcript but with words that are final, so we need to buffer */
if (this.vendor === 'deepgram') {
const originalEvent = evt.vendor.evt;
if (originalEvent.is_final && evt.alternatives[0].transcript !== '') {
this.logger.debug({evt}, 'Gather:_onTranscription - buffering a completed (partial) deepgram transcript');
this._bufferedTranscripts.push(evt);
}
}
}
}
_onEndOfUtterance(cs, ep) {
@@ -719,7 +771,7 @@ class TaskGather extends SttTask {
* getting a transcription. This can happen if someone coughs or mumbles.
* For that reason don't ask for a single utterance and we'll terminate the transcribe operation
* once we get a final transcript.
* However, if the usr has specified a singleUtterance, then we need to restart here
* However, if the user has specified a singleUtterance, then we need to restart here
* since we don't have a final transcript yet.
*/
if (!this.resolved && !this.killed && !this._bufferedTranscripts.length && this.wantsSingleUtterance) {
@@ -858,18 +910,6 @@ class TaskGather extends SttTask {
this._clearTimer();
this._clearFastRecognitionTimer();
if (this.isContinuousAsr && reason.startsWith('speech')) {
evt = {
is_final: true,
transcripts: this._bufferedTranscripts
};
this.logger.debug({evt}, 'TaskGather:resolve continuous asr');
}
else if (!this.isContinuousAsr && reason.startsWith('speech') && this._bufferedTranscripts.length) {
compileTranscripts(this.logger, evt, this._bufferedTranscripts);
this.logger.debug({evt}, 'TaskGather:resolve buffered results');
}
this.span.setAttributes({
channel: 1,
'stt.resolve': reason,

lib/tasks/stt-task.js

@@ -16,12 +16,14 @@ class SttTask extends Task {
normalizeTranscription,
removeSpeechListeners,
setSpeechCredentialsAtRuntime,
compileSonioxTranscripts
compileSonioxTranscripts,
consolidateTranscripts
} = require('../utils/transcription-utils')(logger);
this.setChannelVarsForStt = setChannelVarsForStt;
this.normalizeTranscription = normalizeTranscription;
this.removeSpeechListeners = removeSpeechListeners;
this.compileSonioxTranscripts = compileSonioxTranscripts;
this.consolidateTranscripts = consolidateTranscripts;
this.isHandledByPrimaryProvider = true;
if (this.data.recognizer) {
@@ -138,6 +140,14 @@ class SttTask extends Task {
addKey(key, evt.compiled_context, 3600 * 12)
.catch((err) => this.logger.info({err}, `Error caching cobalt context for ${key}`));
}
_doContinuousAsrWithDeepgram(asrTimeout) {
/* deepgram has an utterance_end_ms property that simplifies things */
assert(this.vendor === 'deepgram');
this.logger.debug(`_doContinuousAsrWithDeepgram - setting utterance_end_ms to ${asrTimeout}`);
const dgOptions = this.data.recognizer.deepgramOptions = this.data.recognizer.deepgramOptions || {};
dgOptions.utteranceEndMs = dgOptions.utteranceEndMs || asrTimeout;
}
}
module.exports = SttTask;
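/* Usage sketch for _doContinuousAsrWithDeepgram above: an application never
 * sets utterance_end_ms directly; requesting continuous ASR is enough, and
 * the helper fills in deepgramOptions.utteranceEndMs from the (already
 * millisecond-scaled) asrTimeout unless the app supplied its own value.
 * The recognizer shape below is a hypothetical example, not a default. */
const recognizer = {
  vendor: 'deepgram',
  language: 'en-US',
  asrTimeout: 2,             // seconds; becomes this.asrTimeout = 2000 ms
  deepgramOptions: {
    // utteranceEndMs: 1500  // if set, it wins over asrTimeout * 1000
  }
};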

lib/tasks/transcribe.js

@@ -34,7 +34,9 @@ class TaskTranscribe extends SttTask {
// Continuous asr timeout
this.asrTimeout = typeof this.data.recognizer.asrTimeout === 'number' ? this.data.recognizer.asrTimeout * 1000 : 0;
this.isContinuousAsr = this.asrTimeout > 0;
if (this.asrTimeout > 0) {
this.isContinuousAsr = true;
}
/* buffer speech for continuous asr */
this._bufferedTranscripts = [];
}
@@ -177,6 +179,12 @@ class TaskTranscribe extends SttTask {
async _setSpeechHandlers(cs, ep, channel) {
if (this[`_speechHandlersSet_${channel}`]) return;
this[`_speechHandlersSet_${channel}`] = true;
/* some special deepgram logic */
if (this.vendor === 'deepgram') {
if (this.isContinuousAsr) this._doContinuousAsrWithDeepgram(this.asrTimeout);
}
const opts = this.setChannelVarsForStt(this, this.sttCredentials, this.data.recognizer);
switch (this.vendor) {
case 'google':
@@ -223,6 +231,10 @@ class TaskTranscribe extends SttTask {
this._onDeepgramConnect.bind(this, cs, ep, channel));
ep.addCustomEventListener(DeepgramTranscriptionEvents.ConnectFailure,
this._onDeepGramConnectFailure.bind(this, cs, ep, channel));
/* if app sets deepgramOptions.utteranceEndMs they essentially want continuous asr */
if (opts.DEEPGRAM_SPEECH_UTTERANCE_END_MS) this.isContinuousAsr = true;
break;
case 'soniox':
this.bugname = 'soniox_transcribe';
@@ -329,8 +341,20 @@ class TaskTranscribe extends SttTask {
const bugname = fsEvent.getHeader('media-bugname');
if (bugname && this.bugname !== bugname) return;
if (this.vendor === 'ibm') {
if (evt?.state === 'listening') return;
if (this.vendor === 'ibm' && evt?.state === 'listening') return;
if (this.vendor === 'deepgram' && evt.type === 'UtteranceEnd') {
/* we will only get this when we have set utterance_end_ms */
if (this._bufferedTranscripts.length === 0) {
this.logger.debug('TaskTranscribe:_onTranscription - got UtteranceEnd event from deepgram but no buffered transcripts');
}
else {
this.logger.debug('TaskTranscribe:_onTranscription - got UtteranceEnd event from deepgram, return buffered transcript');
evt = this.consolidateTranscripts(this._bufferedTranscripts, 1, this.language);
this._bufferedTranscripts = [];
this._resolve('speech', evt);
}
return;
}
this.logger.debug({evt}, 'TaskTranscribe:_onTranscription - before normalization');
@@ -369,17 +393,6 @@ class TaskTranscribe extends SttTask {
}
}
_compileTranscripts() {
assert(this._bufferedTranscripts.length);
const evt = this._bufferedTranscripts[0];
let t = '';
for (const a of this._bufferedTranscripts) {
t += ` ${a.alternatives[0].transcript}`;
}
evt.alternatives[0].transcript = t.trim();
return evt;
}
async _resolve(channel, evt) {
/* we've got a transcript, so end the otel child span for this channel */
if (this.childSpan[channel - 1] && this.childSpan[channel - 1].span) {
@@ -577,11 +590,12 @@ class TaskTranscribe extends SttTask {
}
_startAsrTimer(channel) {
if (this.vendor === 'deepgram') return; // no need
assert(this.isContinuousAsr);
this._clearAsrTimer(channel);
this._asrTimer = setTimeout(() => {
this.logger.debug(`TaskTranscribe:_startAsrTimer - asr timer went off for channel: ${channel}`);
const evt = this._compileTranscripts();
const evt = this.consolidateTranscripts(this._bufferedTranscripts, channel, this.language);
this._bufferedTranscripts = [];
this._resolve(channel, evt);
}, this.asrTimeout);

lib/utils/transcription-utils.js

@@ -52,6 +52,7 @@ const stickyVars = {
'DEEPGRAM_SPEECH_SEARCH',
'DEEPGRAM_SPEECH_REPLACE',
'DEEPGRAM_SPEECH_ENDPOINTING',
'DEEPGRAM_SPEECH_UTTERANCE_END_MS',
'DEEPGRAM_SPEECH_VAD_TURNOFF',
'DEEPGRAM_SPEECH_TAG'
],
@@ -106,6 +107,53 @@ const stickyVars = {
]
};
const consolidateTranscripts = (bufferedTranscripts, channel, language) => {
if (bufferedTranscripts.length === 1) return bufferedTranscripts[0];
let totalConfidence = 0;
const finalTranscript = bufferedTranscripts.reduce((acc, evt) => {
totalConfidence += evt.alternatives[0].confidence;
let newTranscript = evt.alternatives[0].transcript;
// If new transcript consists only of digits, spaces, and a trailing comma or period
if (newTranscript.match(/^[\d\s]+[,.]?$/)) {
newTranscript = newTranscript.replace(/\s/g, ''); // Remove all spaces
if (newTranscript.endsWith(',')) {
newTranscript = newTranscript.slice(0, -1); // Remove the trailing comma
} else if (newTranscript.endsWith('.')) {
newTranscript = newTranscript.slice(0, -1); // Remove the trailing period
}
}
const lastChar = acc.alternatives[0].transcript.slice(-1);
const firstChar = newTranscript.charAt(0);
if (lastChar.match(/\d/) && firstChar.match(/\d/)) {
acc.alternatives[0].transcript += newTranscript;
} else {
acc.alternatives[0].transcript += ` ${newTranscript}`;
}
return acc;
}, {
language_code: language,
channel_tag: channel,
is_final: true,
alternatives: [{
transcript: ''
}]
});
finalTranscript.alternatives[0].confidence = bufferedTranscripts.length === 1 ?
bufferedTranscripts[0].alternatives[0].confidence :
totalConfidence / bufferedTranscripts.length;
finalTranscript.alternatives[0].transcript = finalTranscript.alternatives[0].transcript.trim();
finalTranscript.vendor = {
name: 'deepgram',
evt: bufferedTranscripts
};
return finalTranscript;
};
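/* A worked example of the digit handling above, for the "digit strings
 * collected over multiple deepgram responses" fix in the commit message.
 * The events are trimmed to the fields consolidateTranscripts reads. */
const buffered = [
  {alternatives: [{transcript: '1 2 3,', confidence: 0.98}]},
  {alternatives: [{transcript: '4 5 6.', confidence: 0.96}]}
];
const consolidated = consolidateTranscripts(buffered, 1, 'en-US');
// Each chunk matches /^[\d\s]+[,.]?$/, so spaces and the trailing comma or
// period are stripped; digit-adjacent chunks are then joined without a space.
// consolidated.alternatives[0].transcript === '123456'
// consolidated.alternatives[0].confidence === 0.97 (average of the chunks)
// consolidated.is_final === true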
const compileSonioxTranscripts = (finalWordChunks, channel, language) => {
const words = finalWordChunks.flat();
const transcript = words.reduce((acc, word) => {
@@ -163,7 +211,7 @@ const normalizeSoniox = (evt, channel, language) => {
};
};
const normalizeDeepgram = (evt, channel, language) => {
const normalizeDeepgram = (evt, channel, language, shortUtterance) => {
const copy = JSON.parse(JSON.stringify(evt));
const alternatives = (evt.channel?.alternatives || [])
.map((alt) => ({
@@ -171,10 +219,14 @@ const normalizeDeepgram = (evt, channel, language) => {
transcript: alt.transcript,
}));
/**
* note difference between is_final and speech_final in Deepgram:
* https://developers.deepgram.com/docs/understand-endpointing-interim-results
*/
return {
language_code: language,
channel_tag: channel,
is_final: evt.is_final,
is_final: shortUtterance ? evt.is_final : evt.speech_final,
alternatives: [alternatives[0]],
vendor: {
name: 'deepgram',
@@ -325,12 +377,12 @@ const normalizeAws = (evt, channel, language) => {
module.exports = (logger) => {
const normalizeTranscription = (evt, vendor, channel, language) => {
const normalizeTranscription = (evt, vendor, channel, language, shortUtterance) => {
//logger.debug({ evt, vendor, channel, language }, 'normalizeTranscription');
switch (vendor) {
case 'deepgram':
return normalizeDeepgram(evt, channel, language);
return normalizeDeepgram(evt, channel, language, shortUtterance);
case 'microsoft':
return normalizeMicrosoft(evt, channel, language);
case 'google':
@@ -536,6 +588,8 @@ module.exports = (logger) => {
{DEEPGRAM_SPEECH_KEYWORDS: deepgramOptions.keywords.join(',')},
...('endpointing' in deepgramOptions) &&
{DEEPGRAM_SPEECH_ENDPOINTING: deepgramOptions.endpointing},
...(deepgramOptions.utteranceEndMs) &&
{DEEPGRAM_SPEECH_UTTERANCE_END_MS: deepgramOptions.utteranceEndMs},
...(deepgramOptions.vadTurnoff) &&
{DEEPGRAM_SPEECH_VAD_TURNOFF: deepgramOptions.vadTurnoff},
...(deepgramOptions.tag) &&
@@ -743,6 +797,7 @@ module.exports = (logger) => {
setChannelVarsForStt,
removeSpeechListeners,
setSpeechCredentialsAtRuntime,
compileSonioxTranscripts
compileSonioxTranscripts,
consolidateTranscripts
};
};
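
Putting the pieces together: under continuous ASR, the gather-side Deepgram flow in this commit behaves roughly as sketched below. This is a condensed, illustrative reconstruction of the handlers in the diff above, not the literal task code; state and resolve stand in for the task's _bufferedTranscripts array and _resolve method, and normalizeDeepgram/consolidateTranscripts are the helpers from transcription-utils.

// Condensed event flow for Deepgram continuous ASR (illustrative wiring).
function onDeepgramEvent(evt, state) {
  if (evt.type === 'UtteranceEnd') {
    // Only sent when utterance_end_ms is set; flush the buffer if non-empty.
    if (state.buffered.length === 0) return;
    const final = consolidateTranscripts(state.buffered, 1, state.language);
    state.buffered = [];
    return state.resolve('speech', final);
  }
  const norm = normalizeDeepgram(evt, 1, state.language, state.shortUtterance);
  if (!norm.alternatives[0]) return; // empty transcript event: keep listening
  if (!norm.is_final) {
    // Deepgram finalizes words (is_final) before the utterance (speech_final),
    // so completed partial transcripts are buffered even on interim events.
    if (evt.is_final && norm.alternatives[0].transcript !== '') {
      state.buffered.push(norm);
    }
    return;
  }
  if (norm.alternatives[0].transcript !== '') state.buffered.push(norm);
  if (state.utteranceEndMs) return; // defer resolution to the UtteranceEnd event
  const final = consolidateTranscripts(state.buffered, 1, state.language);
  state.buffered = [];
  state.resolve('speech', final);
}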