add support for vad to gather and transcribe (#67)

2026-02-13 01:39:26 +00:00 · 2022-02-10 08:45:16 -05:00
parent bac1b7f2c6
commit 30ed5b6a02
3 changed files with 31 additions and 0 deletions
--- a/lib/tasks/gather.js
+++ b/lib/tasks/gather.js
@@ -30,6 +30,10 @@ class TaskGather extends Task {
      this.hints = recognizer.hints || [];
      this.altLanguages = recognizer.altLanguages || [];

+      /* vad: if provided, we don't connect to the recognizer until voice activity is detected */
+      const {enable, voiceMs = 0, mode = -1} = recognizer.vad || {};
+      this.vad = {enable, voiceMs, mode};
+
      /* aws options */
      this.vocabularyName = recognizer.vocabularyName;
      this.vocabularyFilterName = recognizer.vocabularyFilterName;
@@ -137,6 +141,12 @@ class TaskGather extends Task {
  async _initSpeech(cs, ep) {
    const opts = {};

+    if (this.vad.enable) {
+      opts.START_RECOGNIZING_ON_VAD = 1;
+      if (this.vad.voiceMs) opts.RECOGNIZER_VAD_VOICE_MS = this.vad.voiceMs;
+      if (this.vad.mode >= 0 && this.vad.mode <= 3) opts.RECOGNIZER_VAD_MODE = this.vad.mode;
+    }
+
    if ('google' === this.vendor) {
      if (this.sttCredentials) opts.GOOGLE_APPLICATION_CREDENTIALS = JSON.stringify(this.sttCredentials.credentials);
      Object.assign(opts, {
--- a/lib/tasks/specs.json
+++ b/lib/tasks/specs.json
@@ -389,6 +389,7 @@
        "enum": ["google", "aws", "microsoft", "default"]
      },
      "language": "string",
+      "vad": "#vad",
      "hints": "array",
      "altLanguages": "array",
      "profanityFilter": "boolean",
@@ -457,5 +458,15 @@
    "required": [
      "name"
    ]
+  },
+  "vad": {
+    "properties": {
+      "enable": "boolean",
+      "voiceMs": "number",
+      "mode": "number"  
+    },
+    "required": [
+      "enable"
+    ]
  }
 }
--- a/lib/tasks/transcribe.js
+++ b/lib/tasks/transcribe.js
@@ -22,6 +22,10 @@ class TaskTranscribe extends Task {
    this.interim = !!recognizer.interim;
    this.separateRecognitionPerChannel = recognizer.separateRecognitionPerChannel;

+    /* vad: if provided, we don't connect to the recognizer until voice activity is detected */
+    const {enable, voiceMs = 0, mode = -1} = recognizer.vad || {};
+    this.vad = {enable, voiceMs, mode};
+
    /* google-specific options */
    this.hints = recognizer.hints || [];
    this.profanityFilter = recognizer.profanityFilter;
@@ -105,6 +109,12 @@ class TaskTranscribe extends Task {
  async _startTranscribing(cs, ep) {
    const opts = {};

+    if (this.vad.enable) {
+      opts.START_RECOGNIZING_ON_VAD = 1;
+      if (this.vad.voiceMs) opts.RECOGNIZER_VAD_VOICE_MS = this.vad.voiceMs;
+      if (this.vad.mode >= 0 && this.vad.mode <= 3) opts.RECOGNIZER_VAD_MODE = this.vad.mode;
+    }
+
    ep.addCustomEventListener(GoogleTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
    ep.addCustomEventListener(GoogleTranscriptionEvents.NoAudioDetected, this._onNoAudio.bind(this, cs, ep));
    ep.addCustomEventListener(GoogleTranscriptionEvents.MaxDurationExceeded,