diff --git a/lib/tasks/gather.js b/lib/tasks/gather.js
index 3a34b9b2..ae063f9a 100644
--- a/lib/tasks/gather.js
+++ b/lib/tasks/gather.js
@@ -37,6 +37,7 @@ class TaskGather extends Task {
     this.hints = recognizer.hints || [];
     this.hintsBoost = recognizer.hintsBoost;
     this.altLanguages = recognizer.altLanguages || [];
+    this.punctuation = !!recognizer.punctuation;
 
     /* vad: if provided, we dont connect to recognizer until voice activity is detected */
     const {enable, voiceMs = 0, mode = -1} = recognizer.vad || {};
@@ -156,9 +157,12 @@ class TaskGather extends Task {
     }
     ep.removeCustomEventListener(GoogleTranscriptionEvents.Transcription);
     ep.removeCustomEventListener(GoogleTranscriptionEvents.EndOfUtterance);
+    ep.removeCustomEventListener(GoogleTranscriptionEvents.VadDetected);
     ep.removeCustomEventListener(AwsTranscriptionEvents.Transcription);
+    ep.removeCustomEventListener(AwsTranscriptionEvents.VadDetected);
     ep.removeCustomEventListener(AzureTranscriptionEvents.Transcription);
     ep.removeCustomEventListener(AzureTranscriptionEvents.NoSpeechDetected);
+    ep.removeCustomEventListener(AzureTranscriptionEvents.VadDetected);
   }
 
   kill(cs) {
@@ -214,7 +218,8 @@ class TaskGather extends Task {
       Object.assign(opts, {
         GOOGLE_SPEECH_USE_ENHANCED: true,
         GOOGLE_SPEECH_SINGLE_UTTERANCE: true,
-        GOOGLE_SPEECH_MODEL: 'command_and_search'
+        GOOGLE_SPEECH_MODEL: 'command_and_search',
+        GOOGLE_SPEECH_ENABLE_AUTOMATIC_PUNCTUATION: this.punctuation
       });
       if (this.hints && this.hints.length > 1) {
         opts.GOOGLE_SPEECH_HINTS = this.hints.map((h) => h.trim()).join(',');
@@ -230,6 +235,7 @@ class TaskGather extends Task {
       }
       ep.addCustomEventListener(GoogleTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
       ep.addCustomEventListener(GoogleTranscriptionEvents.EndOfUtterance, this._onEndOfUtterance.bind(this, cs, ep));
+      ep.addCustomEventListener(GoogleTranscriptionEvents.VadDetected, this._onVadDetected.bind(this, cs, ep));
     }
     else if (['aws', 'polly'].includes(this.vendor)) {
       if (this.vocabularyName) opts.AWS_VOCABULARY_NAME = this.vocabularyName;
@@ -245,6 +251,7 @@ class TaskGather extends Task {
         });
       }
       ep.addCustomEventListener(AwsTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
+      ep.addCustomEventListener(AwsTranscriptionEvents.VadDetected, this._onVadDetected.bind(this, cs, ep));
     }
     else if ('microsoft' === this.vendor) {
       if (this.sttCredentials) {
@@ -257,13 +264,14 @@ class TaskGather extends Task {
         opts.AZURE_SPEECH_HINTS = this.hints.map((h) => h.trim()).join(',');
       }
       if (this.requestSnr) opts.AZURE_REQUEST_SNR = 1;
-      if (this.profanityOption !== 'raw') opts.AZURE_PROFANITY_OPTION = this.profanityOption;
+      if (this.profanityOption && this.profanityOption !== 'raw') opts.AZURE_PROFANITY_OPTION = this.profanityOption;
       if (this.azureServiceEndpoint) opts.AZURE_SERVICE_ENDPOINT = this.azureServiceEndpoint;
       if (this.initialSpeechTimeoutMs > 0) opts.AZURE_INITIAL_SPEECH_TIMEOUT_MS = this.initialSpeechTimeoutMs;
       opts.AZURE_USE_OUTPUT_FORMAT_DETAILED = 1;
 
       ep.addCustomEventListener(AzureTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
       ep.addCustomEventListener(AzureTranscriptionEvents.NoSpeechDetected, this._onNoSpeechDetected.bind(this, cs, ep));
+      ep.addCustomEventListener(AzureTranscriptionEvents.VadDetected, this._onVadDetected.bind(this, cs, ep));
     }
     await ep.set(opts)
       .catch((err) => this.logger.info(err, 'Error setting channel variables'));
@@ -375,7 +383,7 @@ class TaskGather extends Task {
     }
   }
   _onEndOfUtterance(cs, ep) {
-    this.logger.info('TaskGather:_onEndOfUtterance');
+    this.logger.debug('TaskGather:_onEndOfUtterance');
     if (this.bargein && this.minBargeinWordCount === 0) {
       this._killAudio(cs);
     }
@@ -385,6 +393,13 @@ class TaskGather extends Task {
     }
   }
 
+  _onVadDetected(cs, ep) {
+    if (this.bargein && this.minBargeinWordCount === 0) {
+      this.logger.debug('TaskGather:_onVadDetected');
+      this._killAudio(cs);
+    }
+  }
+
   _onNoSpeechDetected(cs, ep) {
     this._resolve('timeout');
   }
diff --git a/lib/utils/constants.json b/lib/utils/constants.json
index b76d9fcf..028f9e60 100644
--- a/lib/utils/constants.json
+++ b/lib/utils/constants.json
@@ -59,19 +59,22 @@
     "Transcription": "google_transcribe::transcription",
     "EndOfUtterance": "google_transcribe::end_of_utterance",
     "NoAudioDetected": "google_transcribe::no_audio_detected",
-    "MaxDurationExceeded": "google_transcribe::max_duration_exceeded"
+    "MaxDurationExceeded": "google_transcribe::max_duration_exceeded",
+    "VadDetected": "google_transcribe::vad_detected"
   },
   "AwsTranscriptionEvents": {
     "Transcription": "aws_transcribe::transcription",
     "EndOfTranscript": "aws_transcribe::end_of_transcript",
     "NoAudioDetected": "aws_transcribe::no_audio_detected",
-    "MaxDurationExceeded": "aws_transcribe::max_duration_exceeded"
+    "MaxDurationExceeded": "aws_transcribe::max_duration_exceeded",
+    "VadDetected": "aws_transcribe::vad_detected"
   },
   "AzureTranscriptionEvents": {
     "Transcription": "azure_transcribe::transcription",
     "StartOfUtterance": "azure_transcribe::start_of_utterance",
     "EndOfUtterance": "azure_transcribe::end_of_utterance",
-    "NoSpeechDetected": "azure_transcribe::no_speech_detected"
+    "NoSpeechDetected": "azure_transcribe::no_speech_detected",
+    "VadDetected": "azure_transcribe::vad_detected"
   },
   "ListenEvents": {
     "Connect": "mod_audio_fork::connect",
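For context, a minimal sketch of a gather verb payload that exercises the options this diff touches. It is illustrative only: recognizer.punctuation and recognizer.vad are read directly in the code above, while the remaining property names (verb, input, bargein, minBargeinWordCount, vendor, language) are assumptions about the application document schema and are not defined by this diff.

// Hypothetical gather verb document (assumed schema); only the recognizer
// fields consumed in gather.js above are confirmed by this change.
const gather = {
  verb: 'gather',
  input: ['speech'],
  bargein: true,
  minBargeinWordCount: 0,   // 0 => prompt audio is killed on VadDetected / EndOfUtterance
  recognizer: {
    vendor: 'google',
    language: 'en-US',
    punctuation: true,      // new flag; maps to GOOGLE_SPEECH_ENABLE_AUTOMATIC_PUNCTUATION
    vad: {enable: true, voiceMs: 250, mode: 2}  // per the code comment, recognizer connect is deferred until voice activity is detected
  }
};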