From 8999c85a71320b76c1f824d686bc76ff97f288c7 Mon Sep 17 00:00:00 2001
From: Dave Horton <daveh@beachdognet.com>
Date: Wed, 3 Apr 2024 14:30:49 -0400
Subject: [PATCH] Fixes/ws testing dh (#704)

* fixes from testing with translator app

* more updates

* linting

* update gh actions to node 20

* add support for google v2 preconfigured recognizer

* add support for google voice activity events

* update to speech-utils@0.0.45

* update speech-utils to support caching azure tts

* transcribe must buffer transcripts for channel 1 and 2 separately

* further fix for accumulating transcripts

* linting

* deepgram sends transcripts with empty alternatives array

* fix deepgram returning an empty array
---
 .github/workflows/build.yml      |  2 +-
 lib/session/call-session.js      | 57 ++++++++++++++++++++++++++------
 lib/tasks/dub.js                 |  2 ++
 lib/tasks/enqueue.js             |  1 +
 lib/tasks/transcribe.js          | 38 +++++++++++----------
 lib/utils/constants.json         |  1 +
 lib/utils/place-outdial.js       |  8 +++++
 lib/utils/transcription-utils.js |  8 +++--
 lib/utils/ws-requestor.js        |  6 ++--
 package-lock.json                | 28 ++++++++--------
 package.json                     |  4 +--
 11 files changed, 107 insertions(+), 48 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index bf152017..e5849a32 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -9,7 +9,7 @@ jobs:
       - uses: actions/checkout@v3
       - uses: actions/setup-node@v3
         with:
-          node-version: 18
+          node-version: 20
       - run: npm ci
       - run: npm run jslint
       - run: docker pull drachtio/sipp
diff --git a/lib/session/call-session.js b/lib/session/call-session.js
index 6bc992e9..766dd2a4 100644
--- a/lib/session/call-session.js
+++ b/lib/session/call-session.js
@@ -112,13 +112,17 @@ class CallSession extends Emitter {
       this.requestor.removeAllListeners();
       this.application.requestor = newRequestor;
       this.requestor.on('command', this._onCommand.bind(this));
+      this.logger.debug(`CallSession: ${this.callSid} listener count ${this.requestor.listenerCount('command')}`);
       this.requestor.on('connection-dropped', this._onWsConnectionDropped.bind(this));
       this.requestor.on('handover', handover.bind(this));
     };
 
-    this.requestor.on('command', this._onCommand.bind(this));
-    this.requestor.on('connection-dropped', this._onWsConnectionDropped.bind(this));
-    this.requestor.on('handover', handover.bind(this));
+    if (!this.isConfirmCallSession) {
+      this.requestor.on('command', this._onCommand.bind(this));
+      this.logger.debug(`CallSession: ${this.callSid} listener count ${this.requestor.listenerCount('command')}`);
+      this.requestor.on('connection-dropped', this._onWsConnectionDropped.bind(this));
+      this.requestor.on('handover', handover.bind(this));
+    }
   }
 
   /**
@@ -1367,6 +1371,30 @@ Duration=${duration} `
     task.whisper(tasks, callSid).catch((err) => this.logger.error(err, 'CallSession:_lccWhisper'));
   }
 
+  async _lccConfig(opts) { // handles the 'config' command: wraps opts in a 'config' verb and executes it
+    this.logger.debug({opts}, 'CallSession:_lccConfig');
+    const t = normalizeJambones(this.logger, [
+      {
+        verb: 'config',
+        ...opts // spread last, so a 'verb' key inside opts would override 'config'
+      }
+    ])
+      .map((tdata) => makeTask(this.logger, tdata));
+    /* one verb object was passed in; the first resulting task is the one executed */
+    const task = t[0];
+    /* a child span of rootSpan, named from task.summary, is attached to the task before exec */
+    const {span, ctx} = this.rootSpan.startChildSpan(`verb:${task.summary}`);
+    span.setAttributes({'verb.summary': task.summary});
+    task.span = span;
+    task.ctx = ctx;
+    try {
+      await task.exec(this, {ep: this.ep});
+    } catch (err) {
+      this.logger.error(err, 'CallSession:_lccConfig');
+    } // exec errors are logged and not rethrown, so the span is still ended below
+    task.span.end();
+  }
+
   async _lccDub(opts, callSid) {
     this.logger.debug({opts}, `CallSession:_lccDub on call_sid ${callSid}`);
     const t = normalizeJambones(this.logger, [
@@ -1377,23 +1405,24 @@ Duration=${duration} `
     ])
       .map((tdata) => makeTask(this.logger, tdata));
 
-    const dubTask = t[0];
+    const task = t[0];
     const ep = this.currentTask?.name === TaskName.Dial && callSid === this.currentTask?.callSid ?
       this.currentTask.ep :
       this.ep;
 
-    const {span, ctx} = this.rootSpan.startChildSpan(`verb:${dubTask.summary}`);
-    span.setAttributes({'verb.summary': dubTask.summary});
-    dubTask.span = span;
-    dubTask.ctx = ctx;
+    const {span, ctx} = this.rootSpan.startChildSpan(`verb:${task.summary}`);
+    span.setAttributes({'verb.summary': task.summary});
+    task.span = span;
+    task.ctx = ctx;
     try {
-      await dubTask.exec(this, {ep});
+      await task.exec(this, {ep});
     } catch (err) {
       this.logger.error(err, 'CallSession:_lccDub');
     }
-    dubTask.span.end();
+    task.span.end();
   }
 
+
   async _lccBoostAudioSignal(opts, callSid) {
     const ep = this.currentTask?.name === TaskName.Dial && callSid === this.currentTask?.callSid ?
       this.currentTask.ep :
@@ -1664,6 +1693,10 @@ Duration=${duration} `
         this._lccCallStatus(data);
         break;
 
+      case 'config':
+        this._lccConfig(data).catch((err) => this.logger.error(err, 'CallSession:_lccConfig'));
+        break;
+
       case 'dial':
         this._lccCallDial(data);
         break;
@@ -1978,6 +2011,10 @@ Duration=${duration} `
       }
       this.logger.debug(`CallSession:propagateAnswer - answered callSid ${this.callSid}`);
     }
+    else {
+      this.logger.debug('CallSession:propagateAnswer - call already answered - re-anchor media with a reinvite');
+      await this.dlg.modify(this.ep.local.sdp);
+    }
   }
 
   async _onRequestWithinDialog(req, res) {
diff --git a/lib/tasks/dub.js b/lib/tasks/dub.js
index 34ff86fd..2daaf56f 100644
--- a/lib/tasks/dub.js
+++ b/lib/tasks/dub.js
@@ -125,10 +125,12 @@ class TaskDub extends TtsTask {
     const path = filepath[0];
     if (!path.startsWith('say:{')) {
       /* we have a local file of mp3 or r8 of synthesized speech audio to play */
+      this.logger.info(`playing synthesized speech from file on track ${this.track}: ${path}`);
       this.play = path;
       await this._playOnTrack(cs, ep);
     }
     else {
+      this.logger.info(`doing actual text to speech file on track ${this.track}: ${path}`);
       await ep.dub({
         action: 'sayOnTrack',
         track: this.track,
diff --git a/lib/tasks/enqueue.js b/lib/tasks/enqueue.js
index 4133f8df..9544ba31 100644
--- a/lib/tasks/enqueue.js
+++ b/lib/tasks/enqueue.js
@@ -338,6 +338,7 @@ class TaskEnqueue extends Task {
       this.logger.error({err}, `TaskEnqueue:_playHook error retrieving list info for queue ${this.queueName}`);
     }
     const json = await cs.application.requestor.request('verb:hook', hook, params, httpHeaders);
+    this.logger.debug({json}, 'TaskEnqueue:_playHook: received response from waitHook');
     const tasks = normalizeJambones(this.logger, json).map((tdata) => makeTask(this.logger, tdata));
 
     const allowedTasks = tasks.filter((t) => allowed.includes(t.name));
diff --git a/lib/tasks/transcribe.js b/lib/tasks/transcribe.js
index 1d78d5a5..95b57428 100644
--- a/lib/tasks/transcribe.js
+++ b/lib/tasks/transcribe.js
@@ -58,7 +58,7 @@ class TaskTranscribe extends SttTask {
       this.isContinuousAsr = true;
     }
     /* buffer speech for continuous asr */
-    this._bufferedTranscripts = [];
+    this._bufferedTranscripts = [ [], [] ];  // for channel 1 and 2
     this.bugname_prefix = 'transcribe_';
     this.paused = false;
   }
@@ -326,6 +326,7 @@ class TaskTranscribe extends SttTask {
     // make sure this is not a transcript from answering machine detection
     const bugname = fsEvent.getHeader('media-bugname');
     const finished = fsEvent.getHeader('transcription-session-finished');
+    const bufferedTranscripts = this._bufferedTranscripts[channel - 1];
     if (bugname && this.bugname !== bugname) return;
     if (this.paused) {
       this.logger.debug({evt}, 'TaskTranscribe:_onTranscription - paused, ignoring transcript');
@@ -335,14 +336,14 @@ class TaskTranscribe extends SttTask {
 
     if (this.vendor === 'deepgram' && evt.type === 'UtteranceEnd') {
       /* we will only get this when we have set utterance_end_ms */
-      if (this._bufferedTranscripts.length === 0) {
+      if (bufferedTranscripts.length === 0) {
         this.logger.debug('Gather:_onTranscription - got UtteranceEnd event from deepgram but no buffered transcripts');
       }
       else {
         this.logger.debug('Gather:_onTranscription - got UtteranceEnd event from deepgram, return buffered transcript');
-        evt = this.consolidateTranscripts(this._bufferedTranscripts, 1, this.language, this.vendor);
+        evt = this.consolidateTranscripts(bufferedTranscripts, channel, this.language, this.vendor);
         evt.is_final = true;
-        this._bufferedTranscripts = [];
+        this._bufferedTranscripts[channel - 1] = [];
         this._resolve(channel, evt);
       }
       return;
@@ -359,11 +360,11 @@ class TaskTranscribe extends SttTask {
 
     let emptyTranscript = false;
     if (evt.is_final) {
-      if (evt.alternatives[0].transcript === '' && !cs.callGone && !this.killed) {
+      if (evt.alternatives.length === 0 || evt.alternatives[0].transcript === '' && !cs.callGone && !this.killed) {
         emptyTranscript = true;
         if (finished === 'true' &&
           ['microsoft', 'deepgram'].includes(this.vendor) &&
-          this._bufferedTranscripts.length === 0) {
+          bufferedTranscripts.length === 0) {
           this.logger.debug({evt}, 'TaskGather:_onTranscription - got empty transcript from old gather, disregarding');
           return;
         }
@@ -376,7 +377,7 @@ class TaskTranscribe extends SttTask {
             'TaskGather:_onTranscription - got empty deepgram transcript during continous asr, continue listening');
           return;
         }
-        else if (this.vendor === 'deepgram' && this._bufferedTranscripts.length > 0) {
+        else if (this.vendor === 'deepgram' && bufferedTranscripts.length > 0) {
           this.logger.info({evt},
             'TaskGather:_onTranscription - got empty transcript from deepgram, return the buffered transcripts');
         }
@@ -392,11 +393,12 @@ class TaskTranscribe extends SttTask {
           }
         }
         this.logger.info({evt}, 'TaskGather:_onTranscription - got transcript during continous asr');
-        this._bufferedTranscripts.push(evt);
+        bufferedTranscripts.push(evt);
         this._startAsrTimer(channel);
 
         /* some STT engines will keep listening after a final response, so no need to restart */
-        if (!['soniox', 'aws', 'microsoft', 'deepgram'].includes(this.vendor)) this._startTranscribing(cs, ep, channel);
+        if (!['soniox', 'aws', 'microsoft', 'deepgram', 'google']
+          .includes(this.vendor)) this._startTranscribing(cs, ep, channel);
       }
       else {
         if (this.vendor === 'soniox') {
@@ -407,19 +409,20 @@ class TaskTranscribe extends SttTask {
         }
         else if (this.vendor === 'deepgram') {
           /* compile transcripts into one */
-          if (!emptyTranscript) this._bufferedTranscripts.push(evt);
+          if (!emptyTranscript) bufferedTranscripts.push(evt);
 
           /* deepgram can send an empty and final transcript; only if we have any buffered should we resolve */
-          if (this._bufferedTranscripts.length === 0) return;
-          evt = this.consolidateTranscripts(this._bufferedTranscripts, channel, this.language);
-          this._bufferedTranscripts = [];
+          if (bufferedTranscripts.length === 0) return;
+          evt = this.consolidateTranscripts(bufferedTranscripts, channel, this.language);
+          this._bufferedTranscripts[channel - 1] = [];
         }
 
         /* here is where we return a final transcript */
         this.logger.debug({evt}, 'TaskTranscribe:_onTranscription - sending final transcript');
         this._resolve(channel, evt);
         /* some STT engines will keep listening after a final response, so no need to restart */
-        if (!['soniox', 'aws', 'microsoft', 'deepgram'].includes(this.vendor)) this._startTranscribing(cs, ep, channel);
+        if (!['soniox', 'aws', 'microsoft', 'deepgram', 'google']
+          .includes(this.vendor)) this._startTranscribing(cs, ep, channel);
       }
     }
     else {
@@ -430,7 +433,7 @@ class TaskTranscribe extends SttTask {
         const originalEvent = evt.vendor.evt;
         if (originalEvent.is_final && evt.alternatives[0].transcript !== '') {
           this.logger.debug({evt}, 'Gather:_onTranscription - buffering a completed (partial) deepgram transcript');
-          this._bufferedTranscripts.push(evt);
+          bufferedTranscripts.push(evt);
         }
       }
 
@@ -591,8 +594,9 @@ class TaskTranscribe extends SttTask {
     this._clearAsrTimer(channel);
     this._asrTimer = setTimeout(() => {
       this.logger.debug(`TaskTranscribe:_startAsrTimer - asr timer went off for channel: ${channel}`);
-      const evt = this.consolidateTranscripts(this._bufferedTranscripts, channel, this.language, this.vendor);
-      this._bufferedTranscripts = [];
+      const evt = this.consolidateTranscripts(
+        this._bufferedTranscripts[channel - 1], channel, this.language, this.vendor);
+      this._bufferedTranscripts[channel - 1] = [];
       this._resolve(channel, evt);
     }, this.asrTimeout);
     this.logger.debug(`TaskTranscribe:_startAsrTimer: set for ${this.asrTimeout}ms for channel ${channel}`);
diff --git a/lib/utils/constants.json b/lib/utils/constants.json
index 26afb68d..76d6ed28 100644
--- a/lib/utils/constants.json
+++ b/lib/utils/constants.json
@@ -171,6 +171,7 @@
     "session:new",
     "session:reconnect",
     "session:redirect",
+    "session:adulting",
     "call:status",
     "queue:status",
     "dial:confirm",
diff --git a/lib/utils/place-outdial.js b/lib/utils/place-outdial.js
index 377980ac..030b789f 100644
--- a/lib/utils/place-outdial.js
+++ b/lib/utils/place-outdial.js
@@ -413,6 +413,7 @@ class SingleDialer extends Emitter {
     const app = {...application};
     if ('WS' === app.call_hook?.method ||
       app.call_hook?.url.startsWith('ws://') || app.call_hook?.url.startsWith('wss://')) {
+      if (app.call_hook?.url) app.call_hook = {...app.call_hook, url: `${app.call_hook.url}/adulting`};
       const requestor = new WsRequestor(logger, this.accountInfo.account.account_sid,
         app.call_hook, this.accountInfo.account.webhook_secret);
       app.requestor = requestor;
@@ -438,6 +439,13 @@ class SingleDialer extends Emitter {
       tasks,
       rootSpan
     });
+    app.requestor.request('session:adulting', '/adulting', {
+      ...cs.callInfo.toJSON(),
+      parentCallInfo: this.parentCallInfo
+    }).catch((err) => {
+      newLogger.error({err}, 'doAdulting: error sending adulting request');
+    });
+
     cs.req = this.req;
     cs.exec().catch((err) => newLogger.error({err}, 'doAdulting: error executing session'));
     return cs;
diff --git a/lib/utils/transcription-utils.js b/lib/utils/transcription-utils.js
index 25e2a4c4..106bb9dc 100644
--- a/lib/utils/transcription-utils.js
+++ b/lib/utils/transcription-utils.js
@@ -270,7 +270,7 @@ const normalizeDeepgram = (evt, channel, language, shortUtterance) => {
     language_code: language,
     channel_tag: channel,
     is_final: shortUtterance ? evt.is_final : evt.speech_final,
-    alternatives: [alternatives[0]],
+    alternatives: alternatives.length ? [alternatives[0]] : [],
     vendor: {
       name: 'deepgram',
       evt: copy
@@ -537,7 +537,11 @@ module.exports = (logger) => {
           }),
           ...(rOpts.googleOptions?.enableVoiceActivityEvents && {
             GOOGLE_SPEECH_ENABLE_VOICE_ACTIVITY_EVENTS: rOpts.googleOptions.enableVoiceActivityEvents
-          })
+          }),
+          ...(rOpts.googleOptions?.recognizerId) && {GOOGLE_SPEECH_RECOGNIZER_ID: rOpts.googleOptions.recognizerId},
+          ...(rOpts.googleOptions?.enableVoiceActivityEvents && {
+            GOOGLE_SPEECH_ENABLE_VOICE_ACTIVITY_EVENTS: rOpts.googleOptions.enableVoiceActivityEvents
+          }),
         }),
       };
     }
diff --git a/lib/utils/ws-requestor.js b/lib/utils/ws-requestor.js
index d1841840..3d7e3b69 100644
--- a/lib/utils/ws-requestor.js
+++ b/lib/utils/ws-requestor.js
@@ -119,7 +119,7 @@ class WsRequestor extends BaseRequestor {
       type,
       msgid,
       call_sid: this.call_sid,
-      hook: type === 'verb:hook' ? url : undefined,
+      hook: ['verb:hook', 'session:redirect'].includes(type) ? url : undefined,
       data: {...payload},
       ...b3
     };
@@ -346,7 +346,9 @@ class WsRequestor extends BaseRequestor {
     /* messages must be JSON format */
     try {
       const obj = JSON.parse(content);
-      const {type, msgid, command, call_sid = this.call_sid, queueCommand = false, data} = obj;
+      //const {type, msgid, command, call_sid = this.call_sid, queueCommand = false, data} = obj;
+      const {type, msgid, command, queueCommand = false, data} = obj;
+      const call_sid = obj.callSid || this.call_sid;
 
       //this.logger.debug({obj}, 'WsRequestor:request websocket: received');
       assert.ok(type, 'type property not supplied');
diff --git a/package-lock.json b/package-lock.json
index d1c31f30..66997cc5 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -15,10 +15,10 @@
         "@jambonz/http-health-check": "^0.0.1",
         "@jambonz/mw-registrar": "^0.2.4",
         "@jambonz/realtimedb-helpers": "^0.8.7",
-        "@jambonz/speech-utils": "^0.0.44",
+        "@jambonz/speech-utils": "^0.0.47",
         "@jambonz/stats-collector": "^0.1.9",
         "@jambonz/time-series": "^0.2.8",
-        "@jambonz/verb-specifications": "^0.0.67",
+        "@jambonz/verb-specifications": "^0.0.69",
         "@opentelemetry/api": "^1.4.0",
         "@opentelemetry/exporter-jaeger": "^1.9.0",
         "@opentelemetry/exporter-trace-otlp-http": "^0.35.0",
@@ -3468,9 +3468,9 @@
       }
     },
     "node_modules/@jambonz/speech-utils": {
-      "version": "0.0.44",
-      "resolved": "https://registry.npmjs.org/@jambonz/speech-utils/-/speech-utils-0.0.44.tgz",
-      "integrity": "sha512-47EtN/cu2R86STPLE5bUcPBKXZFlER0BeJweRPjac6jfxd5MmJpjezgec3ZKr5MkvmrYFhY4CTA8qcbTc5mycQ==",
+      "version": "0.0.47",
+      "resolved": "https://registry.npmjs.org/@jambonz/speech-utils/-/speech-utils-0.0.47.tgz",
+      "integrity": "sha512-aEMIjEq3yRT/VQAmH60MAD7nIFPKeQ926GlgADSAlx4kiB0cc371qHh3hxmF9roMJHf26e5DHWJQFSIFJad3yg==",
       "dependencies": {
         "@aws-sdk/client-polly": "^3.496.0",
         "@aws-sdk/client-sts": "^3.496.0",
@@ -3514,9 +3514,9 @@
       }
     },
     "node_modules/@jambonz/verb-specifications": {
-      "version": "0.0.67",
-      "resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.67.tgz",
-      "integrity": "sha512-xzojbx92BUosrdaCqECZEU/mCW8ImM78VI3VWy5aEoK+1A5ZDoRiUsLGyxoKvBCEVBGwHF3Q2tQMgnWUUbqtKA==",
+      "version": "0.0.69",
+      "resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.69.tgz",
+      "integrity": "sha512-DWnz7XRkCzpzyCVJH7NtScv+wSlUC414/EO8j/gPZs3RT4WBW1OBXwXpfjURHcSrDG7lycz+tfA+2WoUdW/W+g==",
       "dependencies": {
         "debug": "^4.3.4",
         "pino": "^8.8.0"
@@ -14138,9 +14138,9 @@
       }
     },
     "@jambonz/speech-utils": {
-      "version": "0.0.44",
-      "resolved": "https://registry.npmjs.org/@jambonz/speech-utils/-/speech-utils-0.0.44.tgz",
-      "integrity": "sha512-47EtN/cu2R86STPLE5bUcPBKXZFlER0BeJweRPjac6jfxd5MmJpjezgec3ZKr5MkvmrYFhY4CTA8qcbTc5mycQ==",
+      "version": "0.0.47",
+      "resolved": "https://registry.npmjs.org/@jambonz/speech-utils/-/speech-utils-0.0.47.tgz",
+      "integrity": "sha512-aEMIjEq3yRT/VQAmH60MAD7nIFPKeQ926GlgADSAlx4kiB0cc371qHh3hxmF9roMJHf26e5DHWJQFSIFJad3yg==",
       "requires": {
         "@aws-sdk/client-polly": "^3.496.0",
         "@aws-sdk/client-sts": "^3.496.0",
@@ -14183,9 +14183,9 @@
       }
     },
     "@jambonz/verb-specifications": {
-      "version": "0.0.67",
-      "resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.67.tgz",
-      "integrity": "sha512-xzojbx92BUosrdaCqECZEU/mCW8ImM78VI3VWy5aEoK+1A5ZDoRiUsLGyxoKvBCEVBGwHF3Q2tQMgnWUUbqtKA==",
+      "version": "0.0.69",
+      "resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.69.tgz",
+      "integrity": "sha512-DWnz7XRkCzpzyCVJH7NtScv+wSlUC414/EO8j/gPZs3RT4WBW1OBXwXpfjURHcSrDG7lycz+tfA+2WoUdW/W+g==",
       "requires": {
         "debug": "^4.3.4",
         "pino": "^8.8.0"
diff --git a/package.json b/package.json
index e96763ca..cf1440a3 100644
--- a/package.json
+++ b/package.json
@@ -31,10 +31,10 @@
     "@jambonz/http-health-check": "^0.0.1",
     "@jambonz/mw-registrar": "^0.2.4",
     "@jambonz/realtimedb-helpers": "^0.8.7",
-    "@jambonz/speech-utils": "^0.0.44",
+    "@jambonz/speech-utils": "^0.0.47",
     "@jambonz/stats-collector": "^0.1.9",
     "@jambonz/time-series": "^0.2.8",
-    "@jambonz/verb-specifications": "^0.0.67",
+    "@jambonz/verb-specifications": "^0.0.69",
     "@opentelemetry/api": "^1.4.0",
     "@opentelemetry/exporter-jaeger": "^1.9.0",
     "@opentelemetry/exporter-trace-otlp-http": "^0.35.0",