support mod_vad_detect (#762)

* support mod_vad_detect

* wip

* update verb spec and drachtio-fsmrf

* Update example-voicemail-greetings.json (#761)

Update English voicemail greetings

* wip

* stop VAD when playback completes

---------

Co-authored-by: Vinod Dharashive <vdharashive@gmail.com>
Author: Hoan Luu Huu
Date: 2024-05-29 18:31:59 +07:00
Committed by: GitHub
Parent: 24b6d2464b
Commit: 498dd64025
7 changed files with 73 additions and 30 deletions
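
For context, this is the kind of application payload the feature enables: a gather that barges in on voice activity rather than waiting for a minimum number of transcribed words. A hedged sketch only; the property names (vad, minBargeinWordCount) come from the verb changes in this diff, and the exact shape should not be read as the published verb spec.

// Illustrative webhook response enabling VAD-based barge-in on a gather.
const app = [
  {
    verb: 'gather',
    input: ['speech'],
    actionHook: '/gather-result',
    bargein: true,
    minBargeinWordCount: 0,      // 0 switches barge-in to VAD detection (see the TaskGather changes below)
    vad: {
      enable: true,
      voiceMs: 250,              // defaults applied in the TaskConfig changes below
      silenceMs: 150,
      strategy: 'one-shot',
      mode: 2
    },
    say: {text: 'How can I help you today?'}
  }
];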


@@ -338,6 +338,17 @@ class CallSession extends Emitter {
this.application.fallback_speech_recognizer_language = language;
}
/**
* Vad
*/
get vad() {
return this._vad;
}
set vad(v) {
this._vad = v;
}
/**
* indicates whether the call is currently in progress
*/


@@ -15,7 +15,8 @@ class TaskConfig extends Task {
'transcribe',
'fillerNoise',
'actionHookDelayAction',
'boostAudioSignal'
'boostAudioSignal',
'vad'
].forEach((k) => this[k] = this.data[k] || {});
if ('notifyEvents' in this.data) {
@@ -70,6 +71,7 @@ class TaskConfig extends Task {
get hasListen() { return Object.keys(this.listen).length; }
get hasTranscribe() { return Object.keys(this.transcribe).length; }
get hasDub() { return Object.keys(this.dub).length; }
get hasVad() { return Object.keys(this.vad).length; }
get hasFillerNoise() { return Object.keys(this.fillerNoise).length; }
get summary() {
@@ -287,6 +289,16 @@ class TaskConfig extends Task {
cs.enableFillerNoise(opts);
}
}
if (this.hasVad) {
cs.vad = {
enable: this.vad.enable || false,
voiceMs: this.vad.voiceMs || 250,
silenceMs: this.vad.silenceMs || 150,
strategy: this.vad.strategy || 'one-shot',
mode: this.vad.mode || 2
};
}
}
async kill(cs) {

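The block above copies verb-level vad settings onto the call session with defaults of voiceMs 250, silenceMs 150, a one-shot strategy, and mode 2, so a config verb along the following lines would set session-wide VAD behaviour that a later gather inherits through cs.vad. A sketch under the assumption that the verb accepts exactly the properties read above; it is not the authoritative verb spec.

// Illustrative config verb; omitted vad properties fall back to the defaults above.
const configVerb = {
  verb: 'config',
  vad: {
    enable: true,
    voiceMs: 250,
    silenceMs: 150,
    strategy: 'one-shot',
    mode: 2
  }
};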

@@ -10,7 +10,8 @@ const {
IbmTranscriptionEvents,
NvidiaTranscriptionEvents,
JambonzTranscriptionEvents,
AssemblyAiTranscriptionEvents
AssemblyAiTranscriptionEvents,
VadDetection
} = require('../utils/constants.json');
const {
JAMBONES_GATHER_EARLY_HINTS_MATCH,
@@ -27,7 +28,7 @@ class TaskGather extends SttTask {
[
'finishOnKey', 'input', 'numDigits', 'minDigits', 'maxDigits',
'interDigitTimeout', 'partialResultHook', 'bargein', 'dtmfBargein',
'speechTimeout', 'timeout', 'say', 'play', 'actionHookDelayAction', 'fillerNoise'
'speechTimeout', 'timeout', 'say', 'play', 'actionHookDelayAction', 'fillerNoise', 'vad'
].forEach((k) => this[k] = this.data[k]);
// gather default input is digits
@@ -41,7 +42,8 @@ class TaskGather extends SttTask {
this.timeout = this.timeout === 0 ? 0 : (this.timeout || 15) * 1000;
this.interim = !!this.partialResultHook || this.bargein || (this.timeout > 0);
this.listenDuringPrompt = this.data.listenDuringPrompt === false ? false : true;
this.minBargeinWordCount = this.data.minBargeinWordCount || 1;
this.minBargeinWordCount = this.data.minBargeinWordCount !== undefined ? this.data.minBargeinWordCount : 1;
this._vadEnabled = this.minBargeinWordCount === 0;
if (this.data.recognizer) {
/* continuous ASR (i.e. compile transcripts until a special timeout or dtmf key) */
this.asrTimeout = typeof this.data.recognizer.asrTimeout === 'number' ?
@@ -128,6 +130,11 @@ class TaskGather extends SttTask {
...(this.fillerNoise || {})
};
this.vad = {
...(cs.vad || {}),
...(this.vad || {})
};
if (cs.hasGlobalSttHints && !this.maskGlobalSttHints) {
const {hints, hintsBoost} = cs.globalSttHints;
const setOfHints = new Set((this.data.recognizer.hints || [])
@@ -178,6 +185,8 @@ class TaskGather extends SttTask {
retries: this._hookDelayRetries
};
this._startVad();
const startListening = async(cs, ep) => {
this._startTimer();
if (this.isContinuousAsr && 0 === this.timeout) this._startAsrTimer();
@@ -201,6 +210,7 @@ class TaskGather extends SttTask {
const {span, ctx} = this.startChildSpan(`nested:${this.sayTask.summary}`);
const process = () => {
this.logger.debug('Gather: nested say task completed');
this._stopVad();
if (!this.killed) {
startListening(cs, ep);
if (this.input.includes('speech') && this.vendor === 'nuance' && this.listenDuringPrompt) {
@@ -227,6 +237,7 @@ class TaskGather extends SttTask {
const {span, ctx} = this.startChildSpan(`nested:${this.playTask.summary}`);
const process = () => {
this.logger.debug('Gather: nested play task completed');
this._stopVad();
if (!this.killed) {
startListening(cs, ep);
if (this.input.includes('speech') && this.vendor === 'nuance' && this.listenDuringPrompt) {
@@ -291,6 +302,7 @@ class TaskGather extends SttTask {
this._clearAsrTimer();
this.playTask?.span.end();
this.sayTask?.span.end();
this._stopVad();
this._resolve('killed');
}
@@ -368,15 +380,12 @@ class TaskGather extends SttTask {
ep, GoogleTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
this.addCustomEventListener(
ep, GoogleTranscriptionEvents.EndOfUtterance, this._onEndOfUtterance.bind(this, cs, ep));
this.addCustomEventListener(
ep, GoogleTranscriptionEvents.VadDetected, this._onVadDetected.bind(this, cs, ep));
break;
case 'aws':
case 'polly':
this.bugname = `${this.bugname_prefix}aws_transcribe`;
this.addCustomEventListener(ep, AwsTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
this.addCustomEventListener(ep, AwsTranscriptionEvents.VadDetected, this._onVadDetected.bind(this, cs, ep));
break;
case 'microsoft':
this.bugname = `${this.bugname_prefix}azure_transcribe`;
@@ -384,7 +393,6 @@ class TaskGather extends SttTask {
ep, AzureTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
//this.addCustomEventListener(ep, AzureTranscriptionEvents.NoSpeechDetected,
//this._onNoSpeechDetected.bind(this, cs, ep));
this.addCustomEventListener(ep, AzureTranscriptionEvents.VadDetected, this._onVadDetected.bind(this, cs, ep));
break;
case 'nuance':
this.bugname = `${this.bugname_prefix}nuance_transcribe`;
@@ -394,8 +402,6 @@ class TaskGather extends SttTask {
this._onStartOfSpeech.bind(this, cs, ep));
this.addCustomEventListener(ep, NuanceTranscriptionEvents.TranscriptionComplete,
this._onTranscriptionComplete.bind(this, cs, ep));
this.addCustomEventListener(ep, NuanceTranscriptionEvents.VadDetected,
this._onVadDetected.bind(this, cs, ep));
/* stall timers until prompt finishes playing */
if ((this.sayTask || this.playTask) && this.listenDuringPrompt) {
@@ -465,8 +471,6 @@ class TaskGather extends SttTask {
this._onStartOfSpeech.bind(this, cs, ep));
this.addCustomEventListener(ep, NvidiaTranscriptionEvents.TranscriptionComplete,
this._onTranscriptionComplete.bind(this, cs, ep));
this.addCustomEventListener(ep, NvidiaTranscriptionEvents.VadDetected,
this._onVadDetected.bind(this, cs, ep));
/* I think nvidia has this (??) - stall timers until prompt finishes playing */
if ((this.sayTask || this.playTask) && this.listenDuringPrompt) {
@@ -704,6 +708,25 @@ class TaskGather extends SttTask {
this._finalAsrTimer = null;
}
_startVad() {
if (!this._vadStarted && this._vadEnabled) {
this.logger.debug('_startVad');
this.addCustomEventListener(this.ep, VadDetection.Detection, this._onVadDetected.bind(this, this.cs, this.ep));
this.ep?.startVadDetection(this.vad);
this._vadStarted = true;
}
}
_stopVad() {
if (this._vadStarted) {
this.logger.debug('_stopVad');
this.ep?.stopVadDetection(this.vad);
this.ep?.removeCustomEventListener(VadDetection.Detection, this._onVadDetected);
this._vadStarted = false;
}
}
_startFillerNoise() {
this.logger.debug('Gather:_startFillerNoise - playing filler noise');
this.ep?.play(this.fillerNoise.url);
@@ -1039,6 +1062,10 @@ class TaskGather extends SttTask {
this._killAudio(cs);
this.emit('vad');
}
if (this.vad?.strategy === 'one-shot') {
this.ep?.removeCustomEventListener(VadDetection.Detection, this._onVadDetected);
this._vadStarted = false;
}
}
_onNoSpeechDetected(cs, ep, evt, fsEvent) {

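Pulling the TaskGather changes together, the VAD lifecycle is: register a listener for the vendor-neutral detection event, start detection on the endpoint while a prompt plays, and tear it down when the prompt finishes, the task is killed, or (with the one-shot strategy) the first detection arrives. A condensed sketch of that flow; startVadDetection/stopVadDetection and the custom-event helpers are used exactly as in the diff, while armVad, onSpeech, and the logging are illustrative.

const VAD_DETECTION_EVENT = 'vad_detect:detection';  // value added to constants below

// Arms mod_vad_detect on a drachtio-fsmrf endpoint and returns a disarm function,
// e.g. to call from the play/say completion handlers or kill(), as above.
function armVad(ep, logger, vadOpts, onSpeech) {
  const handler = (evt) => {
    logger.debug({evt}, 'mod_vad_detect reported speech');
    if (vadOpts.strategy === 'one-shot') {
      // one-shot: stop listening after the first detection, as _onVadDetected does
      ep.removeCustomEventListener(VAD_DETECTION_EVENT, handler);
    }
    onSpeech(evt);
  };
  ep.addCustomEventListener(VAD_DETECTION_EVENT, handler);
  ep.startVadDetection(vadOpts);  // {enable, voiceMs, silenceMs, strategy, mode}
  return () => {
    ep.stopVadDetection(vadOpts);
    ep.removeCustomEventListener(VAD_DETECTION_EVENT, handler);
  };
}

Holding on to the same handler reference for both registration and removal keeps removeCustomEventListener pointed at the listener that was actually added, which is worth preserving when adapting the pattern.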

@@ -134,6 +134,9 @@
"ConnectFailure": "assemblyai_transcribe::connect_failed",
"Connect": "assemblyai_transcribe::connect"
},
"VadDetection": {
"Detection": "vad_detect:detection"
},
"ListenEvents": {
"Connect": "mod_audio_fork::connect",
"ConnectFailure": "mod_audio_fork::connect_failed",

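The new VadDetection constant gives tasks a single, vendor-neutral event name to subscribe to, which is presumably why the per-vendor VadDetected listeners above and the recognizer VAD channel variables below are removed. A trivial check of the mapping; the require path is an assumption for a file living under lib/tasks.

const {VadDetection} = require('../utils/constants.json');

// 'vad_detect:detection' is the custom event emitted by mod_vad_detect and
// subscribed to in _startVad() above.
console.assert(VadDetection.Detection === 'vad_detect:detection');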

@@ -474,18 +474,8 @@ module.exports = (logger) => {
const setChannelVarsForStt = (task, sttCredentials, language, rOpts = {}) => {
let opts = {};
const {enable, voiceMs = 0, mode = -1} = rOpts.vad || {};
const vad = {enable, voiceMs, mode};
const vendor = rOpts.vendor;
/* voice activity detection works across vendors */
opts = {
...opts,
...(vad.enable && {START_RECOGNIZING_ON_VAD: 1}),
...(vad.enable && vad.voiceMs && {RECOGNIZER_VAD_VOICE_MS: vad.voiceMs}),
...(vad.enable && typeof vad.mode === 'number' && {RECOGNIZER_VAD_MODE: vad.mode}),
};
if ('google' === vendor) {
const useV2 = rOpts.googleOptions?.serviceVersion === 'v2';
const model = task.name === TaskName.Gather ?