From 8999c85a71320b76c1f824d686bc76ff97f288c7 Mon Sep 17 00:00:00 2001 From: Dave Horton Date: Wed, 3 Apr 2024 14:30:49 -0400 Subject: [PATCH] Fixes/ws testing dh (#704) * fixes from testing with translator app * more updates * linting * update gh actions to node 20 * add support for google v2 preconfigured recognizer * add support for google voice activity events * update to speech-utils@0.0.45 * update speech-utils to support caching azure tts * transcribe must buffer transcripts for channel 1 and 2 separately * further fix for accumulating transcripts * linting * deepgram sends transcripts with empty alternatives array * fix deepgram returning an empty array --- .github/workflows/build.yml | 2 +- lib/session/call-session.js | 57 ++++++++++++++++++++++++++------ lib/tasks/dub.js | 2 ++ lib/tasks/enqueue.js | 1 + lib/tasks/transcribe.js | 38 +++++++++++---------- lib/utils/constants.json | 1 + lib/utils/place-outdial.js | 8 +++++ lib/utils/transcription-utils.js | 8 +++-- lib/utils/ws-requestor.js | 6 ++-- package-lock.json | 28 ++++++++-------- package.json | 4 +-- 11 files changed, 107 insertions(+), 48 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index bf152017..e5849a32 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -9,7 +9,7 @@ jobs: - uses: actions/checkout@v3 - uses: actions/setup-node@v3 with: - node-version: 18 + node-version: 20 - run: npm ci - run: npm run jslint - run: docker pull drachtio/sipp diff --git a/lib/session/call-session.js b/lib/session/call-session.js index 6bc992e9..766dd2a4 100644 --- a/lib/session/call-session.js +++ b/lib/session/call-session.js @@ -112,13 +112,17 @@ class CallSession extends Emitter { this.requestor.removeAllListeners(); this.application.requestor = newRequestor; this.requestor.on('command', this._onCommand.bind(this)); + this.logger.debug(`CallSession: ${this.callSid} listener count ${this.requestor.listenerCount('command')}`); this.requestor.on('connection-dropped', this._onWsConnectionDropped.bind(this)); this.requestor.on('handover', handover.bind(this)); }; - this.requestor.on('command', this._onCommand.bind(this)); - this.requestor.on('connection-dropped', this._onWsConnectionDropped.bind(this)); - this.requestor.on('handover', handover.bind(this)); + if (!this.isConfirmCallSession) { + this.requestor.on('command', this._onCommand.bind(this)); + this.logger.debug(`CallSession: ${this.callSid} listener count ${this.requestor.listenerCount('command')}`); + this.requestor.on('connection-dropped', this._onWsConnectionDropped.bind(this)); + this.requestor.on('handover', handover.bind(this)); + } } /** @@ -1367,6 +1371,30 @@ Duration=${duration} ` task.whisper(tasks, callSid).catch((err) => this.logger.error(err, 'CallSession:_lccWhisper')); } + async _lccConfig(opts) { + this.logger.debug({opts}, 'CallSession:_lccConfig'); + const t = normalizeJambones(this.logger, [ + { + verb: 'config', + ...opts + } + ]) + .map((tdata) => makeTask(this.logger, tdata)); + + const task = t[0]; + + const {span, ctx} = this.rootSpan.startChildSpan(`verb:${task.summary}`); + span.setAttributes({'verb.summary': task.summary}); + task.span = span; + task.ctx = ctx; + try { + await task.exec(this, {ep: this.ep}); + } catch (err) { + this.logger.error(err, 'CallSession:_lccConfig'); + } + task.span.end(); + } + async _lccDub(opts, callSid) { this.logger.debug({opts}, `CallSession:_lccDub on call_sid ${callSid}`); const t = normalizeJambones(this.logger, [ @@ -1377,23 +1405,24 @@ Duration=${duration} ` ]) .map((tdata) => makeTask(this.logger, tdata)); - const dubTask = t[0]; + const task = t[0]; const ep = this.currentTask?.name === TaskName.Dial && callSid === this.currentTask?.callSid ? this.currentTask.ep : this.ep; - const {span, ctx} = this.rootSpan.startChildSpan(`verb:${dubTask.summary}`); - span.setAttributes({'verb.summary': dubTask.summary}); - dubTask.span = span; - dubTask.ctx = ctx; + const {span, ctx} = this.rootSpan.startChildSpan(`verb:${task.summary}`); + span.setAttributes({'verb.summary': task.summary}); + task.span = span; + task.ctx = ctx; try { - await dubTask.exec(this, {ep}); + await task.exec(this, {ep}); } catch (err) { this.logger.error(err, 'CallSession:_lccDub'); } - dubTask.span.end(); + task.span.end(); } + async _lccBoostAudioSignal(opts, callSid) { const ep = this.currentTask?.name === TaskName.Dial && callSid === this.currentTask?.callSid ? this.currentTask.ep : @@ -1664,6 +1693,10 @@ Duration=${duration} ` this._lccCallStatus(data); break; + case 'config': + this._lccConfig(data, call_sid); + break; + case 'dial': this._lccCallDial(data); break; @@ -1978,6 +2011,10 @@ Duration=${duration} ` } this.logger.debug(`CallSession:propagateAnswer - answered callSid ${this.callSid}`); } + else { + this.logger.debug('CallSession:propagateAnswer - call already answered - re-anchor media with a reinvite'); + await this.dlg.modify(this.ep.local.sdp); + } } async _onRequestWithinDialog(req, res) { diff --git a/lib/tasks/dub.js b/lib/tasks/dub.js index 34ff86fd..2daaf56f 100644 --- a/lib/tasks/dub.js +++ b/lib/tasks/dub.js @@ -125,10 +125,12 @@ class TaskDub extends TtsTask { const path = filepath[0]; if (!path.startsWith('say:{')) { /* we have a local file of mp3 or r8 of synthesized speech audio to play */ + this.logger.info(`playing synthesized speech from file on track ${this.track}: ${path}`); this.play = path; await this._playOnTrack(cs, ep); } else { + this.logger.info(`doing actual text to speech file on track ${this.track}: ${path}`); await ep.dub({ action: 'sayOnTrack', track: this.track, diff --git a/lib/tasks/enqueue.js b/lib/tasks/enqueue.js index 4133f8df..9544ba31 100644 --- a/lib/tasks/enqueue.js +++ b/lib/tasks/enqueue.js @@ -338,6 +338,7 @@ class TaskEnqueue extends Task { this.logger.error({err}, `TaskEnqueue:_playHook error retrieving list info for queue ${this.queueName}`); } const json = await cs.application.requestor.request('verb:hook', hook, params, httpHeaders); + this.logger.debug({json}, 'TaskEnqueue:_playHook: received response from waitHook'); const tasks = normalizeJambones(this.logger, json).map((tdata) => makeTask(this.logger, tdata)); const allowedTasks = tasks.filter((t) => allowed.includes(t.name)); diff --git a/lib/tasks/transcribe.js b/lib/tasks/transcribe.js index 1d78d5a5..95b57428 100644 --- a/lib/tasks/transcribe.js +++ b/lib/tasks/transcribe.js @@ -58,7 +58,7 @@ class TaskTranscribe extends SttTask { this.isContinuousAsr = true; } /* buffer speech for continuous asr */ - this._bufferedTranscripts = []; + this._bufferedTranscripts = [ [], [] ]; // for channel 1 and 2 this.bugname_prefix = 'transcribe_'; this.paused = false; } @@ -326,6 +326,7 @@ class TaskTranscribe extends SttTask { // make sure this is not a transcript from answering machine detection const bugname = fsEvent.getHeader('media-bugname'); const finished = fsEvent.getHeader('transcription-session-finished'); + const bufferedTranscripts = this._bufferedTranscripts[channel - 1]; if (bugname && this.bugname !== bugname) return; if (this.paused) { this.logger.debug({evt}, 'TaskTranscribe:_onTranscription - paused, ignoring transcript'); @@ -335,14 +336,14 @@ class TaskTranscribe extends SttTask { if (this.vendor === 'deepgram' && evt.type === 'UtteranceEnd') { /* we will only get this when we have set utterance_end_ms */ - if (this._bufferedTranscripts.length === 0) { + if (bufferedTranscripts.length === 0) { this.logger.debug('Gather:_onTranscription - got UtteranceEnd event from deepgram but no buffered transcripts'); } else { this.logger.debug('Gather:_onTranscription - got UtteranceEnd event from deepgram, return buffered transcript'); - evt = this.consolidateTranscripts(this._bufferedTranscripts, 1, this.language, this.vendor); + evt = this.consolidateTranscripts(bufferedTranscripts, channel, this.language, this.vendor); evt.is_final = true; - this._bufferedTranscripts = []; + this._bufferedTranscripts[channel - 1] = []; this._resolve(channel, evt); } return; @@ -359,11 +360,11 @@ class TaskTranscribe extends SttTask { let emptyTranscript = false; if (evt.is_final) { - if (evt.alternatives[0].transcript === '' && !cs.callGone && !this.killed) { + if (evt.alternatives.length === 0 || evt.alternatives[0].transcript === '' && !cs.callGone && !this.killed) { emptyTranscript = true; if (finished === 'true' && ['microsoft', 'deepgram'].includes(this.vendor) && - this._bufferedTranscripts.length === 0) { + bufferedTranscripts.length === 0) { this.logger.debug({evt}, 'TaskGather:_onTranscription - got empty transcript from old gather, disregarding'); return; } @@ -376,7 +377,7 @@ class TaskTranscribe extends SttTask { 'TaskGather:_onTranscription - got empty deepgram transcript during continous asr, continue listening'); return; } - else if (this.vendor === 'deepgram' && this._bufferedTranscripts.length > 0) { + else if (this.vendor === 'deepgram' && bufferedTranscripts.length > 0) { this.logger.info({evt}, 'TaskGather:_onTranscription - got empty transcript from deepgram, return the buffered transcripts'); } @@ -392,11 +393,12 @@ class TaskTranscribe extends SttTask { } } this.logger.info({evt}, 'TaskGather:_onTranscription - got transcript during continous asr'); - this._bufferedTranscripts.push(evt); + bufferedTranscripts.push(evt); this._startAsrTimer(channel); /* some STT engines will keep listening after a final response, so no need to restart */ - if (!['soniox', 'aws', 'microsoft', 'deepgram'].includes(this.vendor)) this._startTranscribing(cs, ep, channel); + if (!['soniox', 'aws', 'microsoft', 'deepgram', 'google'] + .includes(this.vendor)) this._startTranscribing(cs, ep, channel); } else { if (this.vendor === 'soniox') { @@ -407,19 +409,20 @@ class TaskTranscribe extends SttTask { } else if (this.vendor === 'deepgram') { /* compile transcripts into one */ - if (!emptyTranscript) this._bufferedTranscripts.push(evt); + if (!emptyTranscript) bufferedTranscripts.push(evt); /* deepgram can send an empty and final transcript; only if we have any buffered should we resolve */ - if (this._bufferedTranscripts.length === 0) return; - evt = this.consolidateTranscripts(this._bufferedTranscripts, channel, this.language); - this._bufferedTranscripts = []; + if (bufferedTranscripts.length === 0) return; + evt = this.consolidateTranscripts(bufferedTranscripts, channel, this.language); + this._bufferedTranscripts[channel - 1] = []; } /* here is where we return a final transcript */ this.logger.debug({evt}, 'TaskTranscribe:_onTranscription - sending final transcript'); this._resolve(channel, evt); /* some STT engines will keep listening after a final response, so no need to restart */ - if (!['soniox', 'aws', 'microsoft', 'deepgram'].includes(this.vendor)) this._startTranscribing(cs, ep, channel); + if (!['soniox', 'aws', 'microsoft', 'deepgram', 'google'] + .includes(this.vendor)) this._startTranscribing(cs, ep, channel); } } else { @@ -430,7 +433,7 @@ class TaskTranscribe extends SttTask { const originalEvent = evt.vendor.evt; if (originalEvent.is_final && evt.alternatives[0].transcript !== '') { this.logger.debug({evt}, 'Gather:_onTranscription - buffering a completed (partial) deepgram transcript'); - this._bufferedTranscripts.push(evt); + bufferedTranscripts.push(evt); } } @@ -591,8 +594,9 @@ class TaskTranscribe extends SttTask { this._clearAsrTimer(channel); this._asrTimer = setTimeout(() => { this.logger.debug(`TaskTranscribe:_startAsrTimer - asr timer went off for channel: ${channel}`); - const evt = this.consolidateTranscripts(this._bufferedTranscripts, channel, this.language, this.vendor); - this._bufferedTranscripts = []; + const evt = this.consolidateTranscripts( + this._bufferedTranscripts[channel - 1], channel, this.language, this.vendor); + this._bufferedTranscripts[channel - 1] = []; this._resolve(channel, evt); }, this.asrTimeout); this.logger.debug(`TaskTranscribe:_startAsrTimer: set for ${this.asrTimeout}ms for channel ${channel}`); diff --git a/lib/utils/constants.json b/lib/utils/constants.json index 26afb68d..76d6ed28 100644 --- a/lib/utils/constants.json +++ b/lib/utils/constants.json @@ -171,6 +171,7 @@ "session:new", "session:reconnect", "session:redirect", + "session:adulting", "call:status", "queue:status", "dial:confirm", diff --git a/lib/utils/place-outdial.js b/lib/utils/place-outdial.js index 377980ac..030b789f 100644 --- a/lib/utils/place-outdial.js +++ b/lib/utils/place-outdial.js @@ -413,6 +413,7 @@ class SingleDialer extends Emitter { const app = {...application}; if ('WS' === app.call_hook?.method || app.call_hook?.url.startsWith('ws://') || app.call_hook?.url.startsWith('wss://')) { + if (app.call_hook?.url) app.call_hook.url += '/adulting'; const requestor = new WsRequestor(logger, this.accountInfo.account.account_sid, app.call_hook, this.accountInfo.account.webhook_secret); app.requestor = requestor; @@ -438,6 +439,13 @@ class SingleDialer extends Emitter { tasks, rootSpan }); + app.requestor.request('session:adulting', '/adulting', { + ...cs.callInfo.toJSON(), + parentCallInfo: this.parentCallInfo + }).catch((err) => { + newLogger.error({err}, 'doAdulting: error sending adulting request'); + }); + cs.req = this.req; cs.exec().catch((err) => newLogger.error({err}, 'doAdulting: error executing session')); return cs; diff --git a/lib/utils/transcription-utils.js b/lib/utils/transcription-utils.js index 25e2a4c4..106bb9dc 100644 --- a/lib/utils/transcription-utils.js +++ b/lib/utils/transcription-utils.js @@ -270,7 +270,7 @@ const normalizeDeepgram = (evt, channel, language, shortUtterance) => { language_code: language, channel_tag: channel, is_final: shortUtterance ? evt.is_final : evt.speech_final, - alternatives: [alternatives[0]], + alternatives: alternatives.length ? [alternatives[0]] : [], vendor: { name: 'deepgram', evt: copy @@ -537,7 +537,11 @@ module.exports = (logger) => { }), ...(rOpts.googleOptions?.enableVoiceActivityEvents && { GOOGLE_SPEECH_ENABLE_VOICE_ACTIVITY_EVENTS: rOpts.googleOptions.enableVoiceActivityEvents - }) + }), + ...(rOpts.sgoogleOptions?.recognizerId) && {GOOGLE_SPEECH_RECOGNIZER_ID: rOpts.googleOptions.recognizerId}, + ...(rOpts.googleOptions?.enableVoiceActivityEvents && { + GOOGLE_SPEECH_ENABLE_VOICE_ACTIVITY_EVENTS: rOpts.googleOptions.enableVoiceActivityEvents + }), }), }; } diff --git a/lib/utils/ws-requestor.js b/lib/utils/ws-requestor.js index d1841840..3d7e3b69 100644 --- a/lib/utils/ws-requestor.js +++ b/lib/utils/ws-requestor.js @@ -119,7 +119,7 @@ class WsRequestor extends BaseRequestor { type, msgid, call_sid: this.call_sid, - hook: type === 'verb:hook' ? url : undefined, + hook: ['verb:hook', 'session:redirect'].includes(type) ? url : undefined, data: {...payload}, ...b3 }; @@ -346,7 +346,9 @@ class WsRequestor extends BaseRequestor { /* messages must be JSON format */ try { const obj = JSON.parse(content); - const {type, msgid, command, call_sid = this.call_sid, queueCommand = false, data} = obj; + //const {type, msgid, command, call_sid = this.call_sid, queueCommand = false, data} = obj; + const {type, msgid, command, queueCommand = false, data} = obj; + const call_sid = obj.callSid || this.call_sid; //this.logger.debug({obj}, 'WsRequestor:request websocket: received'); assert.ok(type, 'type property not supplied'); diff --git a/package-lock.json b/package-lock.json index d1c31f30..66997cc5 100644 --- a/package-lock.json +++ b/package-lock.json @@ -15,10 +15,10 @@ "@jambonz/http-health-check": "^0.0.1", "@jambonz/mw-registrar": "^0.2.4", "@jambonz/realtimedb-helpers": "^0.8.7", - "@jambonz/speech-utils": "^0.0.44", + "@jambonz/speech-utils": "^0.0.47", "@jambonz/stats-collector": "^0.1.9", "@jambonz/time-series": "^0.2.8", - "@jambonz/verb-specifications": "^0.0.67", + "@jambonz/verb-specifications": "^0.0.69", "@opentelemetry/api": "^1.4.0", "@opentelemetry/exporter-jaeger": "^1.9.0", "@opentelemetry/exporter-trace-otlp-http": "^0.35.0", @@ -3468,9 +3468,9 @@ } }, "node_modules/@jambonz/speech-utils": { - "version": "0.0.44", - "resolved": "https://registry.npmjs.org/@jambonz/speech-utils/-/speech-utils-0.0.44.tgz", - "integrity": "sha512-47EtN/cu2R86STPLE5bUcPBKXZFlER0BeJweRPjac6jfxd5MmJpjezgec3ZKr5MkvmrYFhY4CTA8qcbTc5mycQ==", + "version": "0.0.47", + "resolved": "https://registry.npmjs.org/@jambonz/speech-utils/-/speech-utils-0.0.47.tgz", + "integrity": "sha512-aEMIjEq3yRT/VQAmH60MAD7nIFPKeQ926GlgADSAlx4kiB0cc371qHh3hxmF9roMJHf26e5DHWJQFSIFJad3yg==", "dependencies": { "@aws-sdk/client-polly": "^3.496.0", "@aws-sdk/client-sts": "^3.496.0", @@ -3514,9 +3514,9 @@ } }, "node_modules/@jambonz/verb-specifications": { - "version": "0.0.67", - "resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.67.tgz", - "integrity": "sha512-xzojbx92BUosrdaCqECZEU/mCW8ImM78VI3VWy5aEoK+1A5ZDoRiUsLGyxoKvBCEVBGwHF3Q2tQMgnWUUbqtKA==", + "version": "0.0.69", + "resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.69.tgz", + "integrity": "sha512-DWnz7XRkCzpzyCVJH7NtScv+wSlUC414/EO8j/gPZs3RT4WBW1OBXwXpfjURHcSrDG7lycz+tfA+2WoUdW/W+g==", "dependencies": { "debug": "^4.3.4", "pino": "^8.8.0" @@ -14138,9 +14138,9 @@ } }, "@jambonz/speech-utils": { - "version": "0.0.44", - "resolved": "https://registry.npmjs.org/@jambonz/speech-utils/-/speech-utils-0.0.44.tgz", - "integrity": "sha512-47EtN/cu2R86STPLE5bUcPBKXZFlER0BeJweRPjac6jfxd5MmJpjezgec3ZKr5MkvmrYFhY4CTA8qcbTc5mycQ==", + "version": "0.0.47", + "resolved": "https://registry.npmjs.org/@jambonz/speech-utils/-/speech-utils-0.0.47.tgz", + "integrity": "sha512-aEMIjEq3yRT/VQAmH60MAD7nIFPKeQ926GlgADSAlx4kiB0cc371qHh3hxmF9roMJHf26e5DHWJQFSIFJad3yg==", "requires": { "@aws-sdk/client-polly": "^3.496.0", "@aws-sdk/client-sts": "^3.496.0", @@ -14183,9 +14183,9 @@ } }, "@jambonz/verb-specifications": { - "version": "0.0.67", - "resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.67.tgz", - "integrity": "sha512-xzojbx92BUosrdaCqECZEU/mCW8ImM78VI3VWy5aEoK+1A5ZDoRiUsLGyxoKvBCEVBGwHF3Q2tQMgnWUUbqtKA==", + "version": "0.0.69", + "resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.69.tgz", + "integrity": "sha512-DWnz7XRkCzpzyCVJH7NtScv+wSlUC414/EO8j/gPZs3RT4WBW1OBXwXpfjURHcSrDG7lycz+tfA+2WoUdW/W+g==", "requires": { "debug": "^4.3.4", "pino": "^8.8.0" diff --git a/package.json b/package.json index e96763ca..cf1440a3 100644 --- a/package.json +++ b/package.json @@ -31,10 +31,10 @@ "@jambonz/http-health-check": "^0.0.1", "@jambonz/mw-registrar": "^0.2.4", "@jambonz/realtimedb-helpers": "^0.8.7", - "@jambonz/speech-utils": "^0.0.44", + "@jambonz/speech-utils": "^0.0.47", "@jambonz/stats-collector": "^0.1.9", "@jambonz/time-series": "^0.2.8", - "@jambonz/verb-specifications": "^0.0.67", + "@jambonz/verb-specifications": "^0.0.69", "@opentelemetry/api": "^1.4.0", "@opentelemetry/exporter-jaeger": "^1.9.0", "@opentelemetry/exporter-trace-otlp-http": "^0.35.0",