From 63ab554908c2a1c11b8c61772d04ee1d6a23779b Mon Sep 17 00:00:00 2001 From: Dave Horton Date: Sun, 26 Mar 2023 12:20:03 -0400 Subject: [PATCH] google STT: default to command_and_search for Gather, as latest_short seems to have issues, various other fixes (#285) --- lib/tasks/gather.js | 20 ++++++++---- lib/utils/transcription-utils.js | 53 ++++++++++++-------------------- 2 files changed, 33 insertions(+), 40 deletions(-) diff --git a/lib/tasks/gather.js b/lib/tasks/gather.js index 9fc6b529..ab8aa44a 100644 --- a/lib/tasks/gather.js +++ b/lib/tasks/gather.js @@ -104,6 +104,10 @@ class TaskGather extends Task { get needsStt() { return this.input.includes('speech'); } + get wantsMultipleUtterances() { + return this.data.recognizer?.singleUtterance === false; + } + get earlyMedia() { return (this.sayTask && this.sayTask.earlyMedia) || (this.playTask && this.playTask.earlyMedia); @@ -661,14 +665,18 @@ class TaskGather extends Task { this._killAudio(cs); } - // DCH: commenting out because my experience is that the google STT engine - // will keep listening after it detects end of utterance, and will return a final transcript - // My earlier understanding that we needed to stop and restart the recognizer appears incorrect. - /* - if (!this.resolved && !this.killed && !this._bufferedTranscripts.length) { + /** + * By default, Gather asks google for a single utterance. On getting an end of utterance event, + * the mod_google_transcribe plugin will send a WritesDone to the grpc stream, which will usually + * cause google to return a final transcript. So even though we have not received a final + * transcript at this point (because otherwise resolved() would be true), we do not need to + * restart the recognizer - we should get the final transcript shortly. + * The exception is if the Gather was specifically configured to listen + * to multiple utterances, in which case we need to restart the recognizer. 
+ */ + if (!this.resolved && !this.killed && !this._bufferedTranscripts.length && this.wantsMultipleUtterances) { this._startTranscribing(ep); } - */ } _onStartOfSpeech(cs, ep) { diff --git a/lib/utils/transcription-utils.js b/lib/utils/transcription-utils.js index 10054512..e032a0be 100644 --- a/lib/utils/transcription-utils.js +++ b/lib/utils/transcription-utils.js @@ -337,53 +337,38 @@ module.exports = (logger) => { if ('google' === vendor) { opts = { ...opts, - ...(sttCredentials && - {GOOGLE_APPLICATION_CREDENTIALS: JSON.stringify(sttCredentials.credentials)}), - ...(rOpts.enhancedModel && - {GOOGLE_SPEECH_USE_ENHANCED: 1}), - ...(rOpts.separateRecognitionPerChannel && - {GOOGLE_SPEECH_SEPARATE_RECOGNITION_PER_CHANNEL: 1}), - ...(rOpts.profanityFilter && - {GOOGLE_SPEECH_PROFANITY_FILTER: 1}), - ...(rOpts.punctuation && - {GOOGLE_SPEECH_ENABLE_AUTOMATIC_PUNCTUATION: 1}), - ...(rOpts.words && - {GOOGLE_SPEECH_ENABLE_WORD_TIME_OFFSETS: 1}), - ...((rOpts.singleUtterance || task.name === TaskName.Gather) && + ...(sttCredentials && {GOOGLE_APPLICATION_CREDENTIALS: JSON.stringify(sttCredentials.credentials)}), + ...(rOpts.separateRecognitionPerChannel && {GOOGLE_SPEECH_SEPARATE_RECOGNITION_PER_CHANNEL: 1}), + ...(rOpts.separateRecognitionPerChannel === false && {GOOGLE_SPEECH_SEPARATE_RECOGNITION_PER_CHANNEL: 0}), + ...(rOpts.profanityFilter && {GOOGLE_SPEECH_PROFANITY_FILTER: 1}), + ...(rOpts.punctuation && {GOOGLE_SPEECH_ENABLE_AUTOMATIC_PUNCTUATION: 1}), + ...(rOpts.words && {GOOGLE_SPEECH_ENABLE_WORD_TIME_OFFSETS: 1}), + ...((rOpts.singleUtterance || task.name === TaskName.Gather) && {GOOGLE_SPEECH_SINGLE_UTTERANCE: 1}), - ...(rOpts.diarization && - {GOOGLE_SPEECH_SPEAKER_DIARIZATION: 1}), + ...(rOpts.singleUtterance === false && {GOOGLE_SPEECH_SINGLE_UTTERANCE: 0}), + ...(rOpts.diarization && {GOOGLE_SPEECH_SPEAKER_DIARIZATION: 1}), ...(rOpts.diarization && rOpts.diarizationMinSpeakers > 0 && {GOOGLE_SPEECH_SPEAKER_DIARIZATION_MIN_SPEAKER_COUNT: 
rOpts.diarizationMinSpeakers}), ...(rOpts.diarization && rOpts.diarizationMaxSpeakers > 0 && {GOOGLE_SPEECH_SPEAKER_DIARIZATION_MAX_SPEAKER_COUNT: rOpts.diarizationMaxSpeakers}), - ...(rOpts.enhancedModel === false && - {GOOGLE_SPEECH_USE_ENHANCED: 0}), - ...(rOpts.separateRecognitionPerChannel === false && - {GOOGLE_SPEECH_SEPARATE_RECOGNITION_PER_CHANNEL: 0}), - ...(rOpts.profanityFilter === false && - {GOOGLE_SPEECH_PROFANITY_FILTER: 0}), - ...(rOpts.punctuation === false && - {GOOGLE_SPEECH_ENABLE_AUTOMATIC_PUNCTUATION: 0}), - ...(rOpts.words == false && - {GOOGLE_SPEECH_ENABLE_WORD_TIME_OFFSETS: 0}), - ...((rOpts.singleUtterance === false || task.name === TaskName.Transcribe) && - {GOOGLE_SPEECH_SINGLE_UTTERANCE: 0}), - ...(rOpts.diarization === false && - {GOOGLE_SPEECH_SPEAKER_DIARIZATION: 0}), + ...(rOpts.enhancedModel === false && {GOOGLE_SPEECH_USE_ENHANCED: 0}), + ...(rOpts.enhancedModel !== false && {GOOGLE_SPEECH_USE_ENHANCED: 1}), + ...(rOpts.profanityFilter === false && {GOOGLE_SPEECH_PROFANITY_FILTER: 0}), + ...(rOpts.punctuation === false && {GOOGLE_SPEECH_ENABLE_AUTOMATIC_PUNCTUATION: 0}), + ...(rOpts.words == false && {GOOGLE_SPEECH_ENABLE_WORD_TIME_OFFSETS: 0}), + ...(rOpts.diarization === false && {GOOGLE_SPEECH_SPEAKER_DIARIZATION: 0}), ...(rOpts.hints.length > 0 && typeof rOpts.hints[0] === 'string' && {GOOGLE_SPEECH_HINTS: rOpts.hints.join(',')}), ...(rOpts.hints.length > 0 && typeof rOpts.hints[0] === 'object' && {GOOGLE_SPEECH_HINTS: JSON.stringify(rOpts.hints)}), - ...(typeof rOpts.hintsBoost === 'number' && - {GOOGLE_SPEECH_HINTS_BOOST: rOpts.hintsBoost}), + ...(typeof rOpts.hintsBoost === 'number' && {GOOGLE_SPEECH_HINTS_BOOST: rOpts.hintsBoost}), ...(rOpts.altLanguages.length > 0 && {GOOGLE_SPEECH_ALTERNATIVE_LANGUAGE_CODES: [...new Set(rOpts.altLanguages)].join(',')}), ...(rOpts.interactionType && {GOOGLE_SPEECH_METADATA_INTERACTION_TYPE: rOpts.interactionType}), - ...{GOOGLE_SPEECH_MODEL: rOpts.model || (task.name === TaskName.Gather 
? 'latest_short' : 'phone_call')}, - ...(rOpts.naicsCode > 0 && - {GOOGLE_SPEECH_METADATA_INDUSTRY_NAICS_CODE: rOpts.naicsCode}), + ...{GOOGLE_SPEECH_MODEL: rOpts.model || (task.name === TaskName.Gather ? 'command_and_search' : 'latest_long')}, + ...(rOpts.naicsCode > 0 && {GOOGLE_SPEECH_METADATA_INDUSTRY_NAICS_CODE: rOpts.naicsCode}), + GOOGLE_SPEECH_METADATA_RECORDING_DEVICE_TYPE: 'phone_line', }; } else if (['aws', 'polly'].includes(vendor)) {