From 182ad8c7167c480600e47c2566d4171ded204d9a Mon Sep 17 00:00:00 2001 From: Dave Horton Date: Sun, 8 May 2022 12:29:55 -0400 Subject: [PATCH] expose model and singleUtterance to gather/transcribe when using google --- lib/tasks/gather.js | 45 ++++++++++++++++++++++++++++++----------- lib/tasks/specs.json | 2 ++ lib/tasks/transcribe.js | 12 ++++------- 3 files changed, 39 insertions(+), 20 deletions(-) diff --git a/lib/tasks/gather.js b/lib/tasks/gather.js index 0c9c633c..ab191cd0 100644 --- a/lib/tasks/gather.js +++ b/lib/tasks/gather.js @@ -36,8 +36,18 @@ class TaskGather extends Task { this.language = recognizer.language; this.hints = recognizer.hints || []; this.hintsBoost = recognizer.hintsBoost; - this.altLanguages = recognizer.altLanguages || []; + this.profanityFilter = recognizer.profanityFilter; this.punctuation = !!recognizer.punctuation; + this.enhancedModel = !!recognizer.enhancedModel; + this.model = recognizer.model || 'command_and_search'; + this.words = !!recognizer.words; + this.singleUtterance = recognizer.singleUtterance !== false; /* on unless explicitly set to false */ + this.diarization = !!recognizer.diarization; + this.diarizationMinSpeakers = recognizer.diarizationMinSpeakers || 0; + this.diarizationMaxSpeakers = recognizer.diarizationMaxSpeakers || 0; + this.interactionType = recognizer.interactionType || 'unspecified'; + this.naicsCode = recognizer.naicsCode || 0; + this.altLanguages = recognizer.altLanguages || []; /* vad: if provided, we dont connect to recognizer until voice activity is detected */ const {enable, voiceMs = 0, mode = -1} = recognizer.vad || {}; @@ -232,24 +242,35 @@ class TaskGather extends Task { if ('google' === this.vendor) { if (this.sttCredentials) opts.GOOGLE_APPLICATION_CREDENTIALS = JSON.stringify(this.sttCredentials.credentials); - Object.assign(opts, { - GOOGLE_SPEECH_USE_ENHANCED: true, - GOOGLE_SPEECH_SINGLE_UTTERANCE: true, - GOOGLE_SPEECH_MODEL: 'command_and_search', - GOOGLE_SPEECH_ENABLE_AUTOMATIC_PUNCTUATION: !!this.punctuation + [ 
['enhancedModel', 'GOOGLE_SPEECH_USE_ENHANCED'], + ['separateRecognitionPerChannel', 'GOOGLE_SPEECH_SEPARATE_RECOGNITION_PER_CHANNEL'], + ['profanityFilter', 'GOOGLE_SPEECH_PROFANITY_FILTER'], + ['punctuation', 'GOOGLE_SPEECH_ENABLE_AUTOMATIC_PUNCTUATION'], + ['words', 'GOOGLE_SPEECH_ENABLE_WORD_TIME_OFFSETS'], + ['singleUtterance', 'GOOGLE_SPEECH_SINGLE_UTTERANCE'], + ['diarization', 'GOOGLE_SPEECH_SPEAKER_DIARIZATION'] + ].forEach((arr) => { + if (this[arr[0]]) opts[arr[1]] = true; }); - if (this.hints && this.hints.length > 1) { - opts.GOOGLE_SPEECH_HINTS = this.hints.map((h) => h.trim()).join(','); + if (this.hints.length > 0) { + opts.GOOGLE_SPEECH_HINTS = this.hints.map((h) => h.trim()).join(','); if (typeof this.hintsBoost === 'number') { opts.GOOGLE_SPEECH_HINTS_BOOST = this.hintsBoost; } } - if (this.altLanguages && this.altLanguages.length > 0) { - opts.GOOGLE_SPEECH_ALTERNATIVE_LANGUAGE_CODES = this.altLanguages.join(','); + if (this.altLanguages.length > 0) opts.GOOGLE_SPEECH_ALTERNATIVE_LANGUAGE_CODES = this.altLanguages.join(','); + if ('unspecified' !== this.interactionType) { + opts.GOOGLE_SPEECH_METADATA_INTERACTION_TYPE = this.interactionType; } - if (this.profanityFilter === true) { - Object.assign(opts, {'GOOGLE_SPEECH_PROFANITY_FILTER': true}); + opts.GOOGLE_SPEECH_MODEL = this.model; + if (this.diarization && this.diarizationMinSpeakers > 0) { + opts.GOOGLE_SPEECH_SPEAKER_DIARIZATION_MIN_SPEAKER_COUNT = this.diarizationMinSpeakers; } + if (this.diarization && this.diarizationMaxSpeakers > 0) { + opts.GOOGLE_SPEECH_SPEAKER_DIARIZATION_MAX_SPEAKER_COUNT = this.diarizationMaxSpeakers; + } + if (this.naicsCode > 0) opts.GOOGLE_SPEECH_METADATA_INDUSTRY_NAICS_CODE = this.naicsCode; ep.addCustomEventListener(GoogleTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep)); ep.addCustomEventListener(GoogleTranscriptionEvents.EndOfUtterance, this._onEndOfUtterance.bind(this, cs, ep)); ep.addCustomEventListener(GoogleTranscriptionEvents.VadDetected, 
this._onVadDetected.bind(this, cs, ep)); diff --git a/lib/tasks/specs.json b/lib/tasks/specs.json index 4efc736d..574d9b64 100644 --- a/lib/tasks/specs.json +++ b/lib/tasks/specs.json @@ -419,6 +419,7 @@ "separateRecognitionPerChannel": "boolean", "punctuation": "boolean", "enhancedModel": "boolean", + "singleUtterance": "boolean", "words": "boolean", "diarization": "boolean", "diarizationMinSpeakers": "number", @@ -448,6 +449,7 @@ "tag" ] }, + "model": "string", "outputFormat": { "type": "string", "enum": [ diff --git a/lib/tasks/transcribe.js b/lib/tasks/transcribe.js index 683884f2..163c8ca5 100644 --- a/lib/tasks/transcribe.js +++ b/lib/tasks/transcribe.js @@ -32,7 +32,9 @@ class TaskTranscribe extends Task { this.profanityFilter = recognizer.profanityFilter; this.punctuation = !!recognizer.punctuation; this.enhancedModel = !!recognizer.enhancedModel; + this.model = recognizer.model || 'phone_call'; this.words = !!recognizer.words; + this.singleUtterance = recognizer.singleUtterance || false; this.diarization = !!recognizer.diarization; this.diarizationMinSpeakers = recognizer.diarizationMinSpeakers || 0; this.diarizationMaxSpeakers = recognizer.diarizationMaxSpeakers || 0; @@ -136,6 +138,7 @@ class TaskTranscribe extends Task { ['profanityFilter', 'GOOGLE_SPEECH_PROFANITY_FILTER'], ['punctuation', 'GOOGLE_SPEECH_ENABLE_AUTOMATIC_PUNCTUATION'], ['words', 'GOOGLE_SPEECH_ENABLE_WORD_TIME_OFFSETS'], + ['singleUtterance', 'GOOGLE_SPEECH_SINGLE_UTTERANCE'], ['diarization', 'GOOGLE_SPEECH_PROFANITY_FILTER'] ].forEach((arr) => { if (this[arr[0]]) opts[arr[1]] = true; @@ -149,15 +152,8 @@ class TaskTranscribe extends Task { if (this.altLanguages.length > 1) opts.GOOGLE_SPEECH_ALTERNATIVE_LANGUAGE_CODES = this.altLanguages.join(','); if ('unspecified' !== this.interactionType) { opts.GOOGLE_SPEECH_METADATA_INTERACTION_TYPE = this.interactionType; - - // additionally set model if appropriate - if ('phone_call' === this.interactionType) opts.GOOGLE_SPEECH_MODEL = 
'phone_call'; - else if (['voice_search', 'voice_command'].includes(this.interactionType)) { - opts.GOOGLE_SPEECH_MODEL = 'command_and_search'; - } - else opts.GOOGLE_SPEECH_MODEL = 'phone_call'; } - else opts.GOOGLE_SPEECH_MODEL = 'phone_call'; + opts.GOOGLE_SPEECH_MODEL = this.model; if (this.diarization && this.diarizationMinSpeakers > 0) { opts.GOOGLE_SPEECH_SPEAKER_DIARIZATION_MIN_SPEAKER_COUNT = this.diarizationMinSpeakers; }