deepgram: rework continuous asr, and resolve on speech_final not is_f… (#501)

* deepgram: rework continuous asr, and resolve on speech_final not is_final (wip) * wip * deepgram: empty final transcript should trigger resolve with speech if we have buffered transcripts * wip * fixes for deepgram compiling multiple transcripts * test deepgram utteranceEndMs * more handling of utteranceEndMs * wip * better handling of digit strings collected over multiple deepgram responses * wip * add support for deepgramOptions.shortUtterance which triggers off of is_final instead of speech_final * apply deepgram fixes to transcribe * cleanup continuous asr * more continuous asr fixes for deepgram * update to verb-specifications for handling SttTask properties * set log level for tests back to error
2026-02-12 09:19:34 +00:00 · 2023-10-30 13:57:25 -04:00
parent 67f8f7181a
commit f43a5c1491
4 changed files with 179 additions and 60 deletions
--- a/lib/tasks/transcribe.js
+++ b/lib/tasks/transcribe.js
@@ -34,7 +34,9 @@ class TaskTranscribe extends SttTask {

    // Continuos asr timeout
    this.asrTimeout = typeof this.data.recognizer.asrTimeout === 'number' ? this.data.recognizer.asrTimeout * 1000 : 0;
-    this.isContinuousAsr = this.asrTimeout > 0;
+    if (this.asrTimeout > 0) {
+      this.isContinuousAsr = true;
+    }
    /* buffer speech for continuous asr */
    this._bufferedTranscripts = [];
  }
@@ -177,6 +179,12 @@ class TaskTranscribe extends SttTask {
  async _setSpeechHandlers(cs, ep, channel) {
    if (this[`_speechHandlersSet_${channel}`]) return;
    this[`_speechHandlersSet_${channel}`] = true;
+
+    /* some special deepgram logic */
+    if (this.vendor === 'deepgram') {
+      if (this.isContinuousAsr) this._doContinuousAsrWithDeepgram(this.asrTimeout);
+    }
+
    const opts = this.setChannelVarsForStt(this, this.sttCredentials, this.data.recognizer);
    switch (this.vendor) {
      case 'google':
@@ -223,6 +231,10 @@ class TaskTranscribe extends SttTask {
          this._onDeepgramConnect.bind(this, cs, ep, channel));
        ep.addCustomEventListener(DeepgramTranscriptionEvents.ConnectFailure,
          this._onDeepGramConnectFailure.bind(this, cs, ep, channel));
+
+        /* if app sets deepgramOptions.utteranceEndMs they essentially want continuous asr */
+        if (opts.DEEPGRAM_SPEECH_UTTERANCE_END_MS) this.isContinuousAsr = true;
+
        break;
      case 'soniox':
        this.bugname = 'soniox_transcribe';
@@ -329,8 +341,20 @@ class TaskTranscribe extends SttTask {
    const bugname = fsEvent.getHeader('media-bugname');
    if (bugname && this.bugname !== bugname) return;

-    if (this.vendor === 'ibm') {
-      if (evt?.state === 'listening') return;
+    if (this.vendor === 'ibm' && evt?.state === 'listening') return;
+
+    if (this.vendor === 'deepgram' && evt.type === 'UtteranceEnd') {
+      /* we will only get this when we have set utterance_end_ms */
+      if (this._bufferedTranscripts.length === 0) {
+        this.logger.debug('Gather:_onTranscription - got UtteranceEnd event from deepgram but no buffered transcripts');
+      }
+      else {
+        this.logger.debug('Gather:_onTranscription - got UtteranceEnd event from deepgram, return buffered transcript');
+        evt = this.consolidateTranscripts(this._bufferedTranscripts, 1, this.language);
+        this._bufferedTranscripts = [];
+        this._resolve('speech', evt);
+      }
+      return;
    }
    this.logger.debug({evt}, 'TaskTranscribe:_onTranscription - before normalization');

@@ -369,17 +393,6 @@ class TaskTranscribe extends SttTask {
    }
  }

-  _compileTranscripts() {
-    assert(this._bufferedTranscripts.length);
-    const evt = this._bufferedTranscripts[0];
-    let t = '';
-    for (const a of this._bufferedTranscripts) {
-      t += ` ${a.alternatives[0].transcript}`;
-    }
-    evt.alternatives[0].transcript = t.trim();
-    return evt;
-  }
-
  async _resolve(channel, evt) {
    /* we've got a transcript, so end the otel child span for this channel */
    if (this.childSpan[channel - 1] && this.childSpan[channel - 1].span) {
@@ -577,11 +590,12 @@ class TaskTranscribe extends SttTask {
  }

  _startAsrTimer(channel) {
+    if (this.vendor === 'deepgram') return; // no need
    assert(this.isContinuousAsr);
    this._clearAsrTimer(channel);
    this._asrTimer = setTimeout(() => {
      this.logger.debug(`TaskTranscribe:_startAsrTimer - asr timer went off for channel: ${channel}`);
-      const evt = this._compileTranscripts();
+      const evt = this.consolidateTranscripts(this._bufferedTranscripts, channel, this.language);
      this._bufferedTranscripts = [];
      this._resolve(channel, evt);
    }, this.asrTimeout);