Feat/ambient sounds (#678)

* initial support for coaching mode in conference * wip * wip * add support for answer verb * wip * wip * wip * wip * wip * updates to rename option to dub * wip * wip * wip * update verb-specs * wip * wip * wip * wip * wip * wip * wip * wip * add option to boost audio signal in main channel * wip * wip * wip * wip * wip * wip * for now, bypass use of streaming apis when generating tts audio for dub tracks * add nested dub to dial * wip * add support for filler noise * kill filler noise when gather killed * wip * wip * while using sayOnTrack, we have to enclose the say command in double quotes * disableTtsStreaming = false * allow transcribe of b leg only on dial verb * dub.say can either be text or object like say verb with text and synthesizer * remove loop for sayOnTrack * update speech-utils * fixes for testing transcribe verb and support for dub and boostAudioSignal in lcc commands * add dial.boostAudioSignal * fix bug where session-level recognizer settings incorrectly overwrite verb-level settings * update verb specs * update dial to support array of dub verbs * fix bug setting gain * lint * wip * update speech-utils * use new endpoint methods for mod_dub --------- Co-authored-by: Dave Horton <daveh@beachdognet.com>
2026-02-12 09:19:34 +00:00 · 2024-03-24 03:23:57 +07:00
parent ec58232b61
commit 5b1d8a8ff3
18 changed files with 915 additions and 278 deletions
--- a/lib/tasks/transcribe.js
+++ b/lib/tasks/transcribe.js
@@ -32,8 +32,22 @@ class TaskTranscribe extends SttTask {
    }

    /* for nested transcribe in dial, unless the app explicitly says so we want to transcribe both legs */
-    if (this.parentTask?.name === TaskName.Dial && this.separateRecognitionPerChannel !== false) {
-      this.separateRecognitionPerChannel = true;
+    if (this.parentTask?.name === TaskName.Dial) {
+      if (this.data.channel === 1 || this.data.channel === 2) {
+        /* transcribe only the channel specified */
+        this.separateRecognitionPerChannel = false;
+        this.channel = this.data.channel;
+        logger.debug(`TaskTranscribe: transcribing only channel ${this.channel} in the Dial verb`);
+      }
+      else if (this.separateRecognitionPerChannel !== false) {
+        this.separateRecognitionPerChannel = true;
+      }
+      else {
+        this.channel = 1;
+      }
+    }
+    else {
+      this.channel = 1;
    }

    this.childSpan = [null, null];
@@ -51,6 +65,14 @@ class TaskTranscribe extends SttTask {

  get name() { return TaskName.Transcribe; }

+  get transcribing1() { // should channel 1 (ep) be transcribed?
+    return this.channel === 1 ? true : this.separateRecognitionPerChannel;
+  }
+
+  get transcribing2() { // true only when ep2 exists, so callers can safely dereference this.ep2
+    return Boolean(this.ep2 && (this.channel === 2 || this.separateRecognitionPerChannel));
+  }
+
  async exec(cs, {ep, ep2}) {
    await super.exec(cs, {ep, ep2});

@@ -73,8 +95,10 @@ class TaskTranscribe extends SttTask {
    }

    try {
-      await this._startTranscribing(cs, ep, 1);
-      if (this.separateRecognitionPerChannel && ep2) {
+      if (this.transcribing1) {
+        await this._startTranscribing(cs, ep, 1);
+      }
+      if (this.transcribing2) {
        await this._startTranscribing(cs, ep2, 2);
      }

@@ -91,7 +115,7 @@ class TaskTranscribe extends SttTask {

  async _stopTranscription() {
    let stopTranscription = false;
-    if (this.ep?.connected) {
+    if (this.transcribing1 && this.ep?.connected) {
      stopTranscription = true;
      this.ep.stopTranscription({
        vendor: this.vendor,
@@ -99,7 +123,7 @@ class TaskTranscribe extends SttTask {
      })
        .catch((err) => this.logger.info(err, 'Error TaskTranscribe:kill'));
    }
-    if (this.separateRecognitionPerChannel && this.ep2 && this.ep2.connected) {
+    if (this.transcribing2 && this.ep2.connected) {
      stopTranscription = true;
      this.ep2.stopTranscription({vendor: this.vendor, bugname: this.bugname})
        .catch((err) => this.logger.info(err, 'Error TaskTranscribe:kill'));
@@ -128,10 +152,8 @@ class TaskTranscribe extends SttTask {
          break;
        case TranscribeStatus.Resume:
          this.paused = false;
-          await this._startTranscribing(this.cs, this.ep, 1);
-          if (this.separateRecognitionPerChannel && this.ep2) {
-            await this._startTranscribing(this.cs, this.ep2, 2);
-          }
+          if (this.transcribing1) await this._startTranscribing(this.cs, this.ep, 1);
+          if (this.transcribing2) await this._startTranscribing(this.cs, this.ep2, 2);
          break;
      }
    }
@@ -294,7 +316,7 @@ class TaskTranscribe extends SttTask {
      vendor: this.vendor,
      interim: this.interim ? true : false,
      locale: this.language,
-      channels: /*this.separateRecognitionPerChannel ? 2 : */ 1,
+      channels: 1,
      bugname: this.bugname,
      hostport: this.hostport
    });
@@ -303,12 +325,12 @@ class TaskTranscribe extends SttTask {
  async _onTranscription(cs, ep, channel, evt, fsEvent) {
    // make sure this is not a transcript from answering machine detection
    const bugname = fsEvent.getHeader('media-bugname');
+    const finished = fsEvent.getHeader('transcription-session-finished');
    if (bugname && this.bugname !== bugname) return;
    if (this.paused) {
      this.logger.debug({evt}, 'TaskTranscribe:_onTranscription - paused, ignoring transcript');
    }

-
    if (this.vendor === 'ibm' && evt?.state === 'listening') return;

    if (this.vendor === 'deepgram' && evt.type === 'UtteranceEnd') {
@@ -319,8 +341,9 @@ class TaskTranscribe extends SttTask {
      else {
        this.logger.debug('Gather:_onTranscription - got UtteranceEnd event from deepgram, return buffered transcript');
        evt = this.consolidateTranscripts(this._bufferedTranscripts, 1, this.language, this.vendor);
+        evt.is_final = true;
        this._bufferedTranscripts = [];
-        this._resolve('speech', evt);
+        this._resolve(channel, evt);
      }
      return;
    }
@@ -334,31 +357,87 @@ class TaskTranscribe extends SttTask {
      return;
    }

-    if (evt.alternatives[0]?.transcript === '' && !cs.callGone && !this.killed) {
-      if (['microsoft', 'deepgram'].includes(this.vendor)) {
-        this.logger.info({evt}, 'TaskTranscribe:_onTranscription - got empty transcript, continue listening');
+    let emptyTranscript = false;
+    if (evt.is_final) {
+      if (evt.alternatives[0].transcript === '' && !cs.callGone && !this.killed) {
+        emptyTranscript = true;
+        if (finished === 'true' &&
+          ['microsoft', 'deepgram'].includes(this.vendor) &&
+          this._bufferedTranscripts.length === 0) {
+          this.logger.debug({evt}, 'TaskGather:_onTranscription - got empty transcript from old gather, disregarding');
+          return;
+        }
+        else if (this.vendor !== 'deepgram') {
+          this.logger.info({evt}, 'TaskGather:_onTranscription - got empty transcript, continue listening');
+          return;
+        }
+        else if (this.isContinuousAsr) {
+          this.logger.info({evt},
+            'TaskGather:_onTranscription - got empty deepgram transcript during continous asr, continue listening');
+          return;
+        }
+        else if (this.vendor === 'deepgram' && this._bufferedTranscripts.length > 0) {
+          this.logger.info({evt},
+            'TaskGather:_onTranscription - got empty transcript from deepgram, return the buffered transcripts');
+        }
+      }
+      if (this.isContinuousAsr) {
+        /* append the transcript and start listening again for asrTimeout */
+        const t = evt.alternatives[0].transcript;
+        if (t) {
+          /* remove trailing punctuation */
+          if (/[,;:\.!\?]$/.test(t)) {
+            this.logger.debug('TaskGather:_onTranscription - removing trailing punctuation');
+            evt.alternatives[0].transcript = t.slice(0, -1);
+          }
+        }
+        this.logger.info({evt}, 'TaskGather:_onTranscription - got transcript during continous asr');
+        this._bufferedTranscripts.push(evt);
+        this._startAsrTimer(channel);
+
+        /* some STT engines will keep listening after a final response, so no need to restart */
+        if (!['soniox', 'aws', 'microsoft', 'deepgram'].includes(this.vendor)) this._startTranscribing(cs, ep, channel);
      }
      else {
-        this.logger.info({evt}, 'TaskTranscribe:_onTranscription - got empty transcript, listen again');
-        this._transcribe(ep);
-      }
-      return;
-    }
+        if (this.vendor === 'soniox') {
+          /* compile transcripts into one */
+          this._sonioxTranscripts.push(evt.vendor.finalWords);
+          evt = this.compileSonioxTranscripts(this._sonioxTranscripts, 1, this.language);
+          this._sonioxTranscripts = [];
+        }
+        else if (this.vendor === 'deepgram') {
+          /* compile transcripts into one */
+          if (!emptyTranscript) this._bufferedTranscripts.push(evt);

-    if (this.vendor === 'soniox') {
-      /* compile transcripts into one */
-      this._sonioxTranscripts.push(evt.vendor.finalWords);
-      if (evt.is_final) {
-        evt = this.compileSonioxTranscripts(this._sonioxTranscripts, 1, this.language);
-        this._sonioxTranscripts = [];
+          /* deepgram can send an empty and final transcript; only if we have any buffered should we resolve */
+          if (this._bufferedTranscripts.length === 0) return;
+          evt = this.consolidateTranscripts(this._bufferedTranscripts, channel, this.language);
+          this._bufferedTranscripts = [];
+        }
+
+        /* here is where we return a final transcript */
+        this.logger.debug({evt}, 'TaskTranscribe:_onTranscription - sending final transcript');
+        this._resolve(channel, evt);
+        /* some STT engines will keep listening after a final response, so no need to restart */
+        if (!['soniox', 'aws', 'microsoft', 'deepgram'].includes(this.vendor)) this._startTranscribing(cs, ep, channel);
      }
    }
+    else {
+      /* interim transcript */

-    if (this.isContinuousAsr && evt.is_final) {
-      this._bufferedTranscripts.push(evt);
-      this._startAsrTimer(channel);
-    } else {
-      await this._resolve(channel, evt);
+      /* deepgram can send a non-final transcript but with words that are final, so we need to buffer */
+      if (this.vendor === 'deepgram') {
+        const originalEvent = evt.vendor.evt;
+        if (originalEvent.is_final && evt.alternatives[0].transcript !== '') {
+          this.logger.debug({evt}, 'Gather:_onTranscription - buffering a completed (partial) deepgram transcript');
+          this._bufferedTranscripts.push(evt);
+        }
+      }
+
+      if (this.interim) {
+        this.logger.debug({evt}, 'TaskTranscribe:_onTranscription - sending interim transcript');
+        this._resolve(channel, evt);
+      }
    }
  }

@@ -479,6 +558,7 @@ class TaskTranscribe extends SttTask {

      if (this.vendor === 'nuance') {
        const {code, error} = evt;
+        //TODO: fix below, currently _resolve does not send timeout events
        if (code === 404 && error === 'No speech') return this._resolve('timeout');
        if (code === 413 && error === 'Too much speech') return this._resolve('timeout');
      }