diff --git a/lib/session/call-session.js b/lib/session/call-session.js
index 85955c3e..8b0c75fa 100644
--- a/lib/session/call-session.js
+++ b/lib/session/call-session.js
@@ -338,6 +338,17 @@ class CallSession extends Emitter {
     this.application.fallback_speech_recognizer_language = language;
   }
 
+  /**
+   * voice activity detection (VAD) settings
+   */
+  get vad() {
+    return this._vad;
+  }
+
+  set vad(v) {
+    this._vad = v;
+  }
+
   /**
    * indicates whether the call currently in progress
    */
diff --git a/lib/tasks/config.js b/lib/tasks/config.js
index 75343139..526c1c85 100644
--- a/lib/tasks/config.js
+++ b/lib/tasks/config.js
@@ -15,7 +15,8 @@ class TaskConfig extends Task {
       'transcribe',
       'fillerNoise',
       'actionHookDelayAction',
-      'boostAudioSignal'
+      'boostAudioSignal',
+      'vad'
     ].forEach((k) => this[k] = this.data[k] || {});
 
     if ('notifyEvents' in this.data) {
@@ -70,6 +71,7 @@
   get hasListen() { return Object.keys(this.listen).length; }
   get hasTranscribe() { return Object.keys(this.transcribe).length; }
   get hasDub() { return Object.keys(this.dub).length; }
+  get hasVad() { return Object.keys(this.vad).length; }
   get hasFillerNoise() { return Object.keys(this.fillerNoise).length; }
 
   get summary() {
@@ -287,6 +289,16 @@ class TaskConfig extends Task {
        cs.enableFillerNoise(opts);
      }
    }
+
+    if (this.hasVad) {
+      cs.vad = {
+        enable: this.vad.enable || false,
+        voiceMs: this.vad.voiceMs || 250,
+        silenceMs: this.vad.silenceMs || 150,
+        strategy: this.vad.strategy || 'one-shot',
+        mode: this.vad.mode || 2
+      };
+    }
   }
 
   async kill(cs) {
diff --git a/lib/tasks/gather.js b/lib/tasks/gather.js
index 3ae23326..f82df34d 100644
--- a/lib/tasks/gather.js
+++ b/lib/tasks/gather.js
@@ -10,7 +10,8 @@ const {
   IbmTranscriptionEvents,
   NvidiaTranscriptionEvents,
   JambonzTranscriptionEvents,
-  AssemblyAiTranscriptionEvents
+  AssemblyAiTranscriptionEvents,
+  VadDetection
 } = require('../utils/constants.json');
 const {
   JAMBONES_GATHER_EARLY_HINTS_MATCH,
@@ -27,7 +28,7 @@ class TaskGather extends SttTask {
     [
       'finishOnKey', 'input', 'numDigits', 'minDigits', 'maxDigits',
       'interDigitTimeout', 'partialResultHook', 'bargein', 'dtmfBargein',
-      'speechTimeout', 'timeout', 'say', 'play', 'actionHookDelayAction', 'fillerNoise'
+      'speechTimeout', 'timeout', 'say', 'play', 'actionHookDelayAction', 'fillerNoise', 'vad'
     ].forEach((k) => this[k] = this.data[k]);
 
     // gather default input is digits
@@ -41,7 +42,8 @@
     this.timeout = this.timeout === 0 ? 0 : (this.timeout || 15) * 1000;
     this.interim = !!this.partialResultHook || this.bargein || (this.timeout > 0);
     this.listenDuringPrompt = this.data.listenDuringPrompt === false ? false : true;
-    this.minBargeinWordCount = this.data.minBargeinWordCount || 1;
+    this.minBargeinWordCount = this.data.minBargeinWordCount !== undefined ? this.data.minBargeinWordCount : 1;
+    this._vadEnabled = this.minBargeinWordCount === 0;
     if (this.data.recognizer) {
       /* continuous ASR (i.e. compile transcripts until a special timeout or dtmf key) */
       this.asrTimeout = typeof this.data.recognizer.asrTimeout === 'number' ?
@@ -128,6 +130,11 @@ class TaskGather extends SttTask {
       ...(this.fillerNoise || {})
     };
 
+    this.vad = {
+      ...(cs.vad || {}),
+      ...(this.vad || {})
+    };
+
     if (cs.hasGlobalSttHints && !this.maskGlobalSttHints) {
       const {hints, hintsBoost} = cs.globalSttHints;
       const setOfHints = new Set((this.data.recognizer.hints || [])
@@ -178,6 +185,8 @@
       retries: this._hookDelayRetries
     };
 
+    this._startVad();
+
     const startListening = async(cs, ep) => {
       this._startTimer();
       if (this.isContinuousAsr && 0 === this.timeout) this._startAsrTimer();
@@ -201,6 +210,7 @@
       const {span, ctx} = this.startChildSpan(`nested:${this.sayTask.summary}`);
       const process = () => {
         this.logger.debug('Gather: nested say task completed');
+        this._stopVad();
         if (!this.killed) {
           startListening(cs, ep);
           if (this.input.includes('speech') && this.vendor === 'nuance' && this.listenDuringPrompt) {
@@ -227,6 +237,7 @@
       const {span, ctx} = this.startChildSpan(`nested:${this.playTask.summary}`);
       const process = () => {
         this.logger.debug('Gather: nested play task completed');
+        this._stopVad();
         if (!this.killed) {
           startListening(cs, ep);
           if (this.input.includes('speech') && this.vendor === 'nuance' && this.listenDuringPrompt) {
@@ -291,6 +302,7 @@
     this._clearAsrTimer();
     this.playTask?.span.end();
     this.sayTask?.span.end();
+    this._stopVad();
     this._resolve('killed');
   }
 
@@ -368,15 +380,12 @@
           ep, GoogleTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
         this.addCustomEventListener(
           ep, GoogleTranscriptionEvents.EndOfUtterance, this._onEndOfUtterance.bind(this, cs, ep));
-        this.addCustomEventListener(
-          ep, GoogleTranscriptionEvents.VadDetected, this._onVadDetected.bind(this, cs, ep));
         break;
       case 'aws':
       case 'polly':
         this.bugname = `${this.bugname_prefix}aws_transcribe`;
         this.addCustomEventListener(ep, AwsTranscriptionEvents.Transcription,
           this._onTranscription.bind(this, cs, ep));
-        this.addCustomEventListener(ep, AwsTranscriptionEvents.VadDetected, this._onVadDetected.bind(this, cs, ep));
         break;
       case 'microsoft':
         this.bugname = `${this.bugname_prefix}azure_transcribe`;
@@ -384,7 +393,6 @@ class TaskGather extends SttTask {
           ep, AzureTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
         //this.addCustomEventListener(ep, AzureTranscriptionEvents.NoSpeechDetected,
         //this._onNoSpeechDetected.bind(this, cs, ep));
-        this.addCustomEventListener(ep, AzureTranscriptionEvents.VadDetected, this._onVadDetected.bind(this, cs, ep));
         break;
       case 'nuance':
         this.bugname = `${this.bugname_prefix}nuance_transcribe`;
@@ -394,8 +402,6 @@
           this._onStartOfSpeech.bind(this, cs, ep));
         this.addCustomEventListener(ep, NuanceTranscriptionEvents.TranscriptionComplete,
           this._onTranscriptionComplete.bind(this, cs, ep));
-        this.addCustomEventListener(ep, NuanceTranscriptionEvents.VadDetected,
-          this._onVadDetected.bind(this, cs, ep));
 
         /* stall timers until prompt finishes playing */
         if ((this.sayTask || this.playTask) && this.listenDuringPrompt) {
@@ -465,8 +471,6 @@
           this._onStartOfSpeech.bind(this, cs, ep));
         this.addCustomEventListener(ep, NvidiaTranscriptionEvents.TranscriptionComplete,
           this._onTranscriptionComplete.bind(this, cs, ep));
-        this.addCustomEventListener(ep, NvidiaTranscriptionEvents.VadDetected,
-          this._onVadDetected.bind(this, cs, ep));
 
         /* I think nvidia has this (??)
        - stall timers until prompt finishes playing */
        if ((this.sayTask || this.playTask) && this.listenDuringPrompt) {
@@ -704,6 +708,25 @@ class TaskGather extends SttTask {
     this._finalAsrTimer = null;
   }
+
+  _startVad() {
+    if (!this._vadStarted && this._vadEnabled) {
+      this.logger.debug('_startVad');
+      this.addCustomEventListener(this.ep, VadDetection.Detection, this._onVadDetected.bind(this, this.cs, this.ep));
+      this.ep?.startVadDetection(this.vad);
+      this._vadStarted = true;
+    }
+  }
+
+  _stopVad() {
+    if (this._vadStarted) {
+      this.logger.debug('_stopVad');
+      this.ep?.stopVadDetection(this.vad);
+      this.ep?.removeCustomEventListener(VadDetection.Detection, this._onVadDetected);
+      this._vadStarted = false;
+    }
+  }
+
 
   _startFillerNoise() {
     this.logger.debug('Gather:_startFillerNoise - playing filler noise');
     this.ep?.play(this.fillerNoise.url);
@@ -1039,6 +1062,10 @@
       this._killAudio(cs);
       this.emit('vad');
     }
+    if (this.vad?.strategy === 'one-shot') {
+      this.ep?.removeCustomEventListener(VadDetection.Detection, this._onVadDetected);
+      this._vadStarted = false;
+    }
   }
 
   _onNoSpeechDetected(cs, ep, evt, fsEvent) {
diff --git a/lib/utils/constants.json b/lib/utils/constants.json
index 76d6ed28..972e9cc4 100644
--- a/lib/utils/constants.json
+++ b/lib/utils/constants.json
@@ -134,6 +134,9 @@
     "ConnectFailure": "assemblyai_transcribe::connect_failed",
     "Connect": "assemblyai_transcribe::connect"
   },
+  "VadDetection": {
+    "Detection": "vad_detect:detection"
+  },
   "ListenEvents": {
     "Connect": "mod_audio_fork::connect",
     "ConnectFailure": "mod_audio_fork::connect_failed",
diff --git a/lib/utils/transcription-utils.js b/lib/utils/transcription-utils.js
index 8ad150d7..baff1197 100644
--- a/lib/utils/transcription-utils.js
+++ b/lib/utils/transcription-utils.js
@@ -474,18 +474,8 @@ module.exports = (logger) => {
   const setChannelVarsForStt = (task, sttCredentials, language, rOpts = {}) => {
     let opts = {};
-    const {enable, voiceMs = 0, mode = -1} = rOpts.vad || {};
-    const vad = {enable, voiceMs, mode};
     const vendor = rOpts.vendor;
 
-    /* voice activity detection works across vendors */
-    opts = {
-      ...opts,
-      ...(vad.enable && {START_RECOGNIZING_ON_VAD: 1}),
-      ...(vad.enable && vad.voiceMs && {RECOGNIZER_VAD_VOICE_MS: vad.voiceMs}),
-      ...(vad.enable && typeof vad.mode === 'number' && {RECOGNIZER_VAD_MODE: vad.mode}),
-    };
-
     if ('google' === vendor) {
       const useV2 = rOpts.googleOptions?.serviceVersion === 'v2';
       const model = task.name === TaskName.Gather ?
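For reference, a minimal sketch of how an application might exercise the new VAD path once this patch is applied. The field names (enable, voiceMs, silenceMs, strategy, mode) and the minBargeinWordCount of 0 trigger come from the diff above; the verb payload shape and the webhook path are illustrative assumptions, not part of this change.

```js
// Hypothetical jambonz application payload (illustrative only): the config verb's
// vad block is stored on the session (cs.vad) by TaskConfig, and a gather with
// minBargeinWordCount of 0 sets _vadEnabled so _startVad() attaches the
// vad_detect:detection listener and calls ep.startVadDetection(). A vad property
// on the gather verb itself would override these session-level defaults.
const app = [
  {
    verb: 'config',
    vad: {
      enable: true,            // turn endpoint-level VAD on
      voiceMs: 250,            // ms of speech before a detection fires (patch default)
      silenceMs: 150,          // ms of trailing silence (patch default)
      strategy: 'one-shot',    // detach the listener after the first detection
      mode: 2                  // detector aggressiveness
    }
  },
  {
    verb: 'gather',
    input: ['speech'],
    bargein: true,
    minBargeinWordCount: 0,    // 0 switches barge-in from word count to VAD
    say: {text: 'How can I help you today?'},
    actionHook: '/gather'      // hypothetical webhook path
  }
];
```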
diff --git a/package-lock.json b/package-lock.json
index 2f37a835..755ba806 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -18,7 +18,7 @@
         "@jambonz/speech-utils": "^0.1.3",
         "@jambonz/stats-collector": "^0.1.10",
         "@jambonz/time-series": "^0.2.8",
-        "@jambonz/verb-specifications": "^0.0.69",
+        "@jambonz/verb-specifications": "^0.0.71",
         "@opentelemetry/api": "^1.8.0",
         "@opentelemetry/exporter-jaeger": "^1.23.0",
         "@opentelemetry/exporter-trace-otlp-http": "^0.50.0",
@@ -2360,9 +2360,9 @@
       }
     },
     "node_modules/@jambonz/verb-specifications": {
-      "version": "0.0.69",
-      "resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.69.tgz",
-      "integrity": "sha512-DWnz7XRkCzpzyCVJH7NtScv+wSlUC414/EO8j/gPZs3RT4WBW1OBXwXpfjURHcSrDG7lycz+tfA+2WoUdW/W+g==",
+      "version": "0.0.71",
+      "resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.71.tgz",
+      "integrity": "sha512-e4f7zbSncuh4cVtEg0DlGBp60B6d9SMxa0sI+bgIWLq9oRfvziL2Afb0od/a8AiPgDmIxBp6a3IoXcOy9gNCxw==",
       "dependencies": {
         "debug": "^4.3.4",
         "pino": "^8.8.0"
@@ -11992,9 +11992,9 @@
      }
    },
    "@jambonz/verb-specifications": {
-      "version": "0.0.69",
-      "resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.69.tgz",
-      "integrity": "sha512-DWnz7XRkCzpzyCVJH7NtScv+wSlUC414/EO8j/gPZs3RT4WBW1OBXwXpfjURHcSrDG7lycz+tfA+2WoUdW/W+g==",
+      "version": "0.0.71",
+      "resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.71.tgz",
+      "integrity": "sha512-e4f7zbSncuh4cVtEg0DlGBp60B6d9SMxa0sI+bgIWLq9oRfvziL2Afb0od/a8AiPgDmIxBp6a3IoXcOy9gNCxw==",
      "requires": {
        "debug": "^4.3.4",
        "pino": "^8.8.0"
diff --git a/package.json b/package.json
index 734d73af..64b346e2 100644
--- a/package.json
+++ b/package.json
@@ -34,7 +34,7 @@
     "@jambonz/speech-utils": "^0.1.3",
     "@jambonz/stats-collector": "^0.1.10",
     "@jambonz/time-series": "^0.2.8",
-    "@jambonz/verb-specifications": "^0.0.69",
+    "@jambonz/verb-specifications": "^0.0.71",
     "@opentelemetry/api": "^1.8.0",
     "@opentelemetry/exporter-jaeger": "^1.23.0",
     "@opentelemetry/exporter-trace-otlp-http": "^0.50.0",
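The transcription-utils.js hunk above drops the old cross-vendor channel variables (START_RECOGNIZING_ON_VAD, RECOGNIZER_VAD_VOICE_MS, RECOGNIZER_VAD_MODE) in favor of the endpoint-level calls added to gather.js, and the @jambonz/verb-specifications bump to 0.0.71 presumably carries schema validation for the new vad properties. Below is a condensed, illustrative sketch of the lifecycle gather now drives; only startVadDetection/stopVadDetection, the vad_detect:detection event name, and the one-shot check mirror the patch, while the wrapper class and callback shape are assumptions for readability.

```js
// Illustrative sketch only: mirrors the _startVad/_stopVad/_onVadDetected flow added
// to TaskGather in this patch. The class, constructor shape, and onDetected callback
// are hypothetical; the endpoint calls and event name come from the diff above.
class VadLifecycleSketch {
  constructor(logger, ep, vadOpts, onDetected) {
    this.logger = logger;
    this.ep = ep;              // freeswitch endpoint for the call leg
    this.vad = vadOpts;        // {enable, voiceMs, silenceMs, strategy, mode}
    this.onDetected = onDetected;
    this._started = false;
  }

  start() {                    // analogous to gather's _startVad()
    if (this._started || !this.vad.enable) return;
    this._handler = (evt) => {
      this.onDetected(evt);    // gather kills prompt audio and emits 'vad' here
      // 'one-shot' detaches after the first detection; other strategies keep listening
      if (this.vad.strategy === 'one-shot') this.stop();
    };
    this.ep.addCustomEventListener('vad_detect:detection', this._handler);
    this.ep.startVadDetection(this.vad);
    this._started = true;
  }

  stop() {                     // analogous to gather's _stopVad()
    if (!this._started) return;
    this.ep.stopVadDetection(this.vad);
    this.ep.removeCustomEventListener('vad_detect:detection', this._handler);
    this._started = false;
  }
}
```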