expose model and singleUtterance to gather/transcribe when using google

This commit is contained in:
Dave Horton
2022-05-08 12:29:55 -04:00
parent 036accab44
commit 182ad8c716
3 changed files with 39 additions and 20 deletions

View File

@@ -36,8 +36,18 @@ class TaskGather extends Task {
this.language = recognizer.language; this.language = recognizer.language;
this.hints = recognizer.hints || []; this.hints = recognizer.hints || [];
this.hintsBoost = recognizer.hintsBoost; this.hintsBoost = recognizer.hintsBoost;
this.altLanguages = recognizer.altLanguages || []; this.profanityFilter = recognizer.profanityFilter;
this.punctuation = !!recognizer.punctuation; this.punctuation = !!recognizer.punctuation;
this.enhancedModel = !!recognizer.enhancedModel;
this.model = recognizer.model || 'command_and_search';
this.words = !!recognizer.words;
this.singleUtterance = recognizer.singleUtterance || true;
this.diarization = !!recognizer.diarization;
this.diarizationMinSpeakers = recognizer.diarizationMinSpeakers || 0;
this.diarizationMaxSpeakers = recognizer.diarizationMaxSpeakers || 0;
this.interactionType = recognizer.interactionType || 'unspecified';
this.naicsCode = recognizer.naicsCode || 0;
this.altLanguages = recognizer.altLanguages || [];
/* vad: if provided, we dont connect to recognizer until voice activity is detected */ /* vad: if provided, we dont connect to recognizer until voice activity is detected */
const {enable, voiceMs = 0, mode = -1} = recognizer.vad || {}; const {enable, voiceMs = 0, mode = -1} = recognizer.vad || {};
@@ -232,24 +242,35 @@ class TaskGather extends Task {
if ('google' === this.vendor) { if ('google' === this.vendor) {
if (this.sttCredentials) opts.GOOGLE_APPLICATION_CREDENTIALS = JSON.stringify(this.sttCredentials.credentials); if (this.sttCredentials) opts.GOOGLE_APPLICATION_CREDENTIALS = JSON.stringify(this.sttCredentials.credentials);
Object.assign(opts, { [
GOOGLE_SPEECH_USE_ENHANCED: true, ['enhancedModel', 'GOOGLE_SPEECH_USE_ENHANCED'],
GOOGLE_SPEECH_SINGLE_UTTERANCE: true, ['separateRecognitionPerChannel', 'GOOGLE_SPEECH_SEPARATE_RECOGNITION_PER_CHANNEL'],
GOOGLE_SPEECH_MODEL: 'command_and_search', ['profanityFilter', 'GOOGLE_SPEECH_PROFANITY_FILTER'],
GOOGLE_SPEECH_ENABLE_AUTOMATIC_PUNCTUATION: !!this.punctuation ['punctuation', 'GOOGLE_SPEECH_ENABLE_AUTOMATIC_PUNCTUATION'],
['words', 'GOOGLE_SPEECH_ENABLE_WORD_TIME_OFFSETS'],
['singleUtterance', 'GOOGLE_SPEECH_SINGLE_UTTERANCE'],
['diarization', 'GOOGLE_SPEECH_PROFANITY_FILTER']
].forEach((arr) => {
if (this[arr[0]]) opts[arr[1]] = true;
}); });
if (this.hints && this.hints.length > 1) { if (this.hints.length > 1) {
opts.GOOGLE_SPEECH_HINTS = this.hints.map((h) => h.trim()).join(','); opts.GOOGLE_SPEECH_HINTS = this.hints.join(',');
if (typeof this.hintsBoost === 'number') { if (typeof this.hintsBoost === 'number') {
opts.GOOGLE_SPEECH_HINTS_BOOST = this.hintsBoost; opts.GOOGLE_SPEECH_HINTS_BOOST = this.hintsBoost;
} }
} }
if (this.altLanguages && this.altLanguages.length > 0) { if (this.altLanguages.length > 1) opts.GOOGLE_SPEECH_ALTERNATIVE_LANGUAGE_CODES = this.altLanguages.join(',');
opts.GOOGLE_SPEECH_ALTERNATIVE_LANGUAGE_CODES = this.altLanguages.join(','); if ('unspecified' !== this.interactionType) {
opts.GOOGLE_SPEECH_METADATA_INTERACTION_TYPE = this.interactionType;
} }
if (this.profanityFilter === true) { opts.GOOGLE_SPEECH_MODEL = this.model;
Object.assign(opts, {'GOOGLE_SPEECH_PROFANITY_FILTER': true}); if (this.diarization && this.diarizationMinSpeakers > 0) {
opts.GOOGLE_SPEECH_SPEAKER_DIARIZATION_MIN_SPEAKER_COUNT = this.diarizationMinSpeakers;
} }
if (this.diarization && this.diarizationMaxSpeakers > 0) {
opts.GOOGLE_SPEECH_SPEAKER_DIARIZATION_MAX_SPEAKER_COUNT = this.diarizationMaxSpeakers;
}
if (this.naicsCode > 0) opts.GOOGLE_SPEECH_METADATA_INDUSTRY_NAICS_CODE = this.naicsCode;
ep.addCustomEventListener(GoogleTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep)); ep.addCustomEventListener(GoogleTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
ep.addCustomEventListener(GoogleTranscriptionEvents.EndOfUtterance, this._onEndOfUtterance.bind(this, cs, ep)); ep.addCustomEventListener(GoogleTranscriptionEvents.EndOfUtterance, this._onEndOfUtterance.bind(this, cs, ep));
ep.addCustomEventListener(GoogleTranscriptionEvents.VadDetected, this._onVadDetected.bind(this, cs, ep)); ep.addCustomEventListener(GoogleTranscriptionEvents.VadDetected, this._onVadDetected.bind(this, cs, ep));

View File

@@ -419,6 +419,7 @@
"separateRecognitionPerChannel": "boolean", "separateRecognitionPerChannel": "boolean",
"punctuation": "boolean", "punctuation": "boolean",
"enhancedModel": "boolean", "enhancedModel": "boolean",
"singleUtterance": "boolean",
"words": "boolean", "words": "boolean",
"diarization": "boolean", "diarization": "boolean",
"diarizationMinSpeakers": "number", "diarizationMinSpeakers": "number",
@@ -448,6 +449,7 @@
"tag" "tag"
] ]
}, },
"model": "string",
"outputFormat": { "outputFormat": {
"type": "string", "type": "string",
"enum": [ "enum": [

View File

@@ -32,7 +32,9 @@ class TaskTranscribe extends Task {
this.profanityFilter = recognizer.profanityFilter; this.profanityFilter = recognizer.profanityFilter;
this.punctuation = !!recognizer.punctuation; this.punctuation = !!recognizer.punctuation;
this.enhancedModel = !!recognizer.enhancedModel; this.enhancedModel = !!recognizer.enhancedModel;
this.model = recognizer.model || 'phone_call';
this.words = !!recognizer.words; this.words = !!recognizer.words;
this.singleUtterance = recognizer.singleUtterance || false;
this.diarization = !!recognizer.diarization; this.diarization = !!recognizer.diarization;
this.diarizationMinSpeakers = recognizer.diarizationMinSpeakers || 0; this.diarizationMinSpeakers = recognizer.diarizationMinSpeakers || 0;
this.diarizationMaxSpeakers = recognizer.diarizationMaxSpeakers || 0; this.diarizationMaxSpeakers = recognizer.diarizationMaxSpeakers || 0;
@@ -136,6 +138,7 @@ class TaskTranscribe extends Task {
['profanityFilter', 'GOOGLE_SPEECH_PROFANITY_FILTER'], ['profanityFilter', 'GOOGLE_SPEECH_PROFANITY_FILTER'],
['punctuation', 'GOOGLE_SPEECH_ENABLE_AUTOMATIC_PUNCTUATION'], ['punctuation', 'GOOGLE_SPEECH_ENABLE_AUTOMATIC_PUNCTUATION'],
['words', 'GOOGLE_SPEECH_ENABLE_WORD_TIME_OFFSETS'], ['words', 'GOOGLE_SPEECH_ENABLE_WORD_TIME_OFFSETS'],
['singleUtterance', 'GOOGLE_SPEECH_SINGLE_UTTERANCE'],
['diarization', 'GOOGLE_SPEECH_PROFANITY_FILTER'] ['diarization', 'GOOGLE_SPEECH_PROFANITY_FILTER']
].forEach((arr) => { ].forEach((arr) => {
if (this[arr[0]]) opts[arr[1]] = true; if (this[arr[0]]) opts[arr[1]] = true;
@@ -149,15 +152,8 @@ class TaskTranscribe extends Task {
if (this.altLanguages.length > 1) opts.GOOGLE_SPEECH_ALTERNATIVE_LANGUAGE_CODES = this.altLanguages.join(','); if (this.altLanguages.length > 1) opts.GOOGLE_SPEECH_ALTERNATIVE_LANGUAGE_CODES = this.altLanguages.join(',');
if ('unspecified' !== this.interactionType) { if ('unspecified' !== this.interactionType) {
opts.GOOGLE_SPEECH_METADATA_INTERACTION_TYPE = this.interactionType; opts.GOOGLE_SPEECH_METADATA_INTERACTION_TYPE = this.interactionType;
// additionally set model if appropriate
if ('phone_call' === this.interactionType) opts.GOOGLE_SPEECH_MODEL = 'phone_call';
else if (['voice_search', 'voice_command'].includes(this.interactionType)) {
opts.GOOGLE_SPEECH_MODEL = 'command_and_search';
}
else opts.GOOGLE_SPEECH_MODEL = 'phone_call';
} }
else opts.GOOGLE_SPEECH_MODEL = 'phone_call'; opts.GOOGLE_SPEECH_MODEL = this.model;
if (this.diarization && this.diarizationMinSpeakers > 0) { if (this.diarization && this.diarizationMinSpeakers > 0) {
opts.GOOGLE_SPEECH_SPEAKER_DIARIZATION_MIN_SPEAKER_COUNT = this.diarizationMinSpeakers; opts.GOOGLE_SPEECH_SPEAKER_DIARIZATION_MIN_SPEAKER_COUNT = this.diarizationMinSpeakers;
} }