expose model and singleUtterance to gather/transcribe when using google

2026-02-13 17:59:42 +00:00 · 2022-05-08 12:29:55 -04:00
parent 036accab44
commit 182ad8c716
3 changed files with 39 additions and 20 deletions
--- a/lib/tasks/gather.js
+++ b/lib/tasks/gather.js
@@ -36,8 +36,18 @@ class TaskGather extends Task {
      this.language = recognizer.language;
      this.hints = recognizer.hints || [];
      this.hintsBoost = recognizer.hintsBoost;
-      this.altLanguages = recognizer.altLanguages || [];
+      this.profanityFilter = recognizer.profanityFilter;
      this.punctuation = !!recognizer.punctuation;
+      this.enhancedModel = !!recognizer.enhancedModel;
+      this.model = recognizer.model || 'command_and_search';
+      this.words = !!recognizer.words;
+      this.singleUtterance = recognizer.singleUtterance !== false; // defaults to true; an explicit false is honored
+      this.diarization = !!recognizer.diarization;
+      this.diarizationMinSpeakers = recognizer.diarizationMinSpeakers || 0;
+      this.diarizationMaxSpeakers = recognizer.diarizationMaxSpeakers || 0;
+      this.interactionType = recognizer.interactionType || 'unspecified';
+      this.naicsCode = recognizer.naicsCode || 0;
+      this.altLanguages = recognizer.altLanguages || [];

      /* vad: if provided, we dont connect to recognizer until voice activity is detected */
      const {enable, voiceMs = 0, mode = -1} = recognizer.vad || {};
@@ -232,24 +242,35 @@ class TaskGather extends Task {

    if ('google' === this.vendor) {
      if (this.sttCredentials) opts.GOOGLE_APPLICATION_CREDENTIALS = JSON.stringify(this.sttCredentials.credentials);
-      Object.assign(opts, {
-        GOOGLE_SPEECH_USE_ENHANCED: true,
-        GOOGLE_SPEECH_SINGLE_UTTERANCE: true,
-        GOOGLE_SPEECH_MODEL: 'command_and_search',
-        GOOGLE_SPEECH_ENABLE_AUTOMATIC_PUNCTUATION: !!this.punctuation
+      [
+        ['enhancedModel', 'GOOGLE_SPEECH_USE_ENHANCED'],
+        ['separateRecognitionPerChannel', 'GOOGLE_SPEECH_SEPARATE_RECOGNITION_PER_CHANNEL'],
+        ['profanityFilter', 'GOOGLE_SPEECH_PROFANITY_FILTER'],
+        ['punctuation', 'GOOGLE_SPEECH_ENABLE_AUTOMATIC_PUNCTUATION'],
+        ['words', 'GOOGLE_SPEECH_ENABLE_WORD_TIME_OFFSETS'],
+        ['singleUtterance', 'GOOGLE_SPEECH_SINGLE_UTTERANCE'],
+        ['diarization', 'GOOGLE_SPEECH_SPEAKER_DIARIZATION']
+      ].forEach((arr) => {
+        if (this[arr[0]]) opts[arr[1]] = true;
      });
-      if (this.hints && this.hints.length > 1) {
-        opts.GOOGLE_SPEECH_HINTS = this.hints.map((h) => h.trim()).join(',');
+      if (this.hints.length > 0) { // a single hint is still worth sending
+        opts.GOOGLE_SPEECH_HINTS = this.hints.join(',');
        if (typeof this.hintsBoost === 'number') {
          opts.GOOGLE_SPEECH_HINTS_BOOST = this.hintsBoost;
        }
      }
-      if (this.altLanguages && this.altLanguages.length > 0) {
-        opts.GOOGLE_SPEECH_ALTERNATIVE_LANGUAGE_CODES = this.altLanguages.join(',');
+      if (this.altLanguages.length > 0) opts.GOOGLE_SPEECH_ALTERNATIVE_LANGUAGE_CODES = this.altLanguages.join(',');
+      if ('unspecified' !== this.interactionType) {
+        opts.GOOGLE_SPEECH_METADATA_INTERACTION_TYPE = this.interactionType;
      }
-      if (this.profanityFilter === true) {
-        Object.assign(opts, {'GOOGLE_SPEECH_PROFANITY_FILTER': true});
+      opts.GOOGLE_SPEECH_MODEL = this.model;
+      if (this.diarization && this.diarizationMinSpeakers > 0) {
+        opts.GOOGLE_SPEECH_SPEAKER_DIARIZATION_MIN_SPEAKER_COUNT = this.diarizationMinSpeakers;
      }
+      if (this.diarization && this.diarizationMaxSpeakers > 0) {
+        opts.GOOGLE_SPEECH_SPEAKER_DIARIZATION_MAX_SPEAKER_COUNT = this.diarizationMaxSpeakers;
+      }
+      if (this.naicsCode > 0) opts.GOOGLE_SPEECH_METADATA_INDUSTRY_NAICS_CODE = this.naicsCode;
      ep.addCustomEventListener(GoogleTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
      ep.addCustomEventListener(GoogleTranscriptionEvents.EndOfUtterance, this._onEndOfUtterance.bind(this, cs, ep));
      ep.addCustomEventListener(GoogleTranscriptionEvents.VadDetected, this._onVadDetected.bind(this, cs, ep));
--- a/lib/tasks/specs.json
+++ b/lib/tasks/specs.json
@@ -419,6 +419,7 @@
      "separateRecognitionPerChannel": "boolean",
      "punctuation": "boolean",
      "enhancedModel": "boolean",
+      "singleUtterance": "boolean",
      "words": "boolean",
      "diarization": "boolean",
      "diarizationMinSpeakers": "number",
@@ -448,6 +449,7 @@
          "tag"
        ]
      },
+      "model": "string",
      "outputFormat": {
        "type": "string",
        "enum": [
--- a/lib/tasks/transcribe.js
+++ b/lib/tasks/transcribe.js
@@ -32,7 +32,9 @@ class TaskTranscribe extends Task {
    this.profanityFilter = recognizer.profanityFilter;
    this.punctuation = !!recognizer.punctuation;
    this.enhancedModel = !!recognizer.enhancedModel;
+    this.model = recognizer.model || 'phone_call';
    this.words = !!recognizer.words;
+    this.singleUtterance = recognizer.singleUtterance || false;
    this.diarization = !!recognizer.diarization;
    this.diarizationMinSpeakers = recognizer.diarizationMinSpeakers || 0;
    this.diarizationMaxSpeakers = recognizer.diarizationMaxSpeakers || 0;
@@ -136,6 +138,7 @@ class TaskTranscribe extends Task {
        ['profanityFilter', 'GOOGLE_SPEECH_PROFANITY_FILTER'],
        ['punctuation', 'GOOGLE_SPEECH_ENABLE_AUTOMATIC_PUNCTUATION'],
        ['words', 'GOOGLE_SPEECH_ENABLE_WORD_TIME_OFFSETS'],
+        ['singleUtterance', 'GOOGLE_SPEECH_SINGLE_UTTERANCE'],
        ['diarization', 'GOOGLE_SPEECH_PROFANITY_FILTER']
      ].forEach((arr) => {
        if (this[arr[0]]) opts[arr[1]] = true;
@@ -149,15 +152,8 @@ class TaskTranscribe extends Task {
      if (this.altLanguages.length > 1) opts.GOOGLE_SPEECH_ALTERNATIVE_LANGUAGE_CODES = this.altLanguages.join(',');
      if ('unspecified' !== this.interactionType) {
        opts.GOOGLE_SPEECH_METADATA_INTERACTION_TYPE = this.interactionType;
-
-        // additionally set model if appropriate
-        if ('phone_call' === this.interactionType) opts.GOOGLE_SPEECH_MODEL = 'phone_call';
-        else if (['voice_search', 'voice_command'].includes(this.interactionType)) {
-          opts.GOOGLE_SPEECH_MODEL = 'command_and_search';
-        }
-        else opts.GOOGLE_SPEECH_MODEL = 'phone_call';
      }
-      else opts.GOOGLE_SPEECH_MODEL = 'phone_call';
+      opts.GOOGLE_SPEECH_MODEL = this.model;
      if (this.diarization && this.diarizationMinSpeakers > 0) {
        opts.GOOGLE_SPEECH_SPEAKER_DIARIZATION_MIN_SPEAKER_COUNT = this.diarizationMinSpeakers;
      }