google stt fixes, including defaulting to phone_call model based on c… (#288)

* google stt fixes, including defaulting to phone_call model based on comparative model testing * lint error
2026-02-13 01:39:26 +00:00 · 2023-03-28 10:02:03 -04:00
parent efdea3e514
commit 6e945dde9a
2 changed files with 16 additions and 18 deletions
--- a/lib/tasks/gather.js
+++ b/lib/tasks/gather.js
@@ -98,14 +98,15 @@ class TaskGather extends Task {
    this._sonioxTranscripts = [];

    this.parentTask = parentTask;
+    this.partialTranscriptsCount = 0;
  }

  get name() { return TaskName.Gather; }

  get needsStt() { return this.input.includes('speech'); }

-  get wantsMultipleUtterances() {
-    return this.data.recognizer?.singleUtterance !== true;
+  get wantsSingleUtterance() {
+    return this.data.recognizer?.singleUtterance === true;
  }

  get earlyMedia() {
@@ -565,7 +566,8 @@ class TaskGather extends Task {
      return;
    }

-    if (this.earlyHintsMatch && evt.is_final === false) {
+    /* fast path: our first partial transcript exactly matches an early hint */
+    if (this.earlyHintsMatch && evt.is_final === false && this.partialTranscriptsCount++ === 0) {
      const transcript = evt.alternatives[0].transcript?.toLowerCase();
      const hints = this.data.recognizer?.hints || [];
      if (hints.find((h) => h.toLowerCase() === transcript)) {
@@ -670,15 +672,15 @@ class TaskGather extends Task {
    }

    /**
-     * By default, Gather asks google for a single utterance.  On getting an end of utterance event,
-     * the mod_google_transcribe plugin will send a WritesDone to the grpc stream, which will usually
-     * cause google to return a final transcript.  So even though we have not received a final
-     * transcript at this point (because otherwise resolved() would be true), we do not need to
-     * restart the recognizer - we should get the final transcript shortly.
-     * The exception is if the Gather was specifically configured to listen
-     * to multiple utterances, in which case we need to restart the recognizer.
+     * By default, Gather asks google for multiple utterances.
+     * The reason is that we can sometimes get an 'end_of_utterance' event without
+     * getting a transcription.  This can happen if someone coughs or mumbles.
+     * For that reason we don't ask for a single utterance, and we terminate the transcribe operation
+     * once we get a final transcript.
+     * However, if the user has specified singleUtterance, then we need to restart here
+     * since we don't have a final transcript yet.
     */
-    if (!this.resolved && !this.killed && !this._bufferedTranscripts.length && this.wantsMultipleUtterances) {
+    if (!this.resolved && !this.killed && !this._bufferedTranscripts.length && this.wantsSingleUtterance) {
      this._startTranscribing(ep);
    }
  }
--- a/lib/utils/transcription-utils.js
+++ b/lib/utils/transcription-utils.js
@@ -1,5 +1,4 @@
 const {
-  TaskName,
  AzureTranscriptionEvents,
  GoogleTranscriptionEvents,
  AwsTranscriptionEvents,
@@ -343,16 +342,13 @@ module.exports = (logger) => {
        ...(rOpts.profanityFilter && {GOOGLE_SPEECH_PROFANITY_FILTER: 1}),
        ...(rOpts.punctuation && {GOOGLE_SPEECH_ENABLE_AUTOMATIC_PUNCTUATION: 1}),
        ...(rOpts.words && {GOOGLE_SPEECH_ENABLE_WORD_TIME_OFFSETS: 1}),
-        ...((rOpts.singleUtterance /*|| task.name === TaskName.Gather*/) &&
-          {GOOGLE_SPEECH_SINGLE_UTTERANCE: 1}),
-        ...(rOpts.singleUtterance === false && {GOOGLE_SPEECH_SINGLE_UTTERANCE: 0}),
+        ...(rOpts.singleUtterance && {GOOGLE_SPEECH_SINGLE_UTTERANCE: 1}),
        ...(rOpts.diarization && {GOOGLE_SPEECH_SPEAKER_DIARIZATION: 1}),
        ...(rOpts.diarization && rOpts.diarizationMinSpeakers > 0 &&
          {GOOGLE_SPEECH_SPEAKER_DIARIZATION_MIN_SPEAKER_COUNT: rOpts.diarizationMinSpeakers}),
        ...(rOpts.diarization && rOpts.diarizationMaxSpeakers > 0 &&
          {GOOGLE_SPEECH_SPEAKER_DIARIZATION_MAX_SPEAKER_COUNT: rOpts.diarizationMaxSpeakers}),
-        ...(rOpts.enhancedModel === false && {GOOGLE_SPEECH_USE_ENHANCED: 0}),
-        ...(rOpts.enhancedModel !== false && {GOOGLE_SPEECH_USE_ENHANCED: 1}),
+        ...(rOpts.enhancedModel && {GOOGLE_SPEECH_USE_ENHANCED: 1}),
        ...(rOpts.profanityFilter === false && {GOOGLE_SPEECH_PROFANITY_FILTER: 0}),
        ...(rOpts.punctuation === false && {GOOGLE_SPEECH_ENABLE_AUTOMATIC_PUNCTUATION: 0}),
        ...(rOpts.words  == false && {GOOGLE_SPEECH_ENABLE_WORD_TIME_OFFSETS: 0}),
@@ -366,7 +362,7 @@ module.exports = (logger) => {
          {GOOGLE_SPEECH_ALTERNATIVE_LANGUAGE_CODES: [...new Set(rOpts.altLanguages)].join(',')}),
        ...(rOpts.interactionType &&
          {GOOGLE_SPEECH_METADATA_INTERACTION_TYPE: rOpts.interactionType}),
-        ...{GOOGLE_SPEECH_MODEL: rOpts.model || (task.name === TaskName.Gather ? 'command_and_search' : 'latest_long')},
+        ...{GOOGLE_SPEECH_MODEL: rOpts.model || 'phone_call'},
        ...(rOpts.naicsCode > 0 && {GOOGLE_SPEECH_METADATA_INDUSTRY_NAICS_CODE: rOpts.naicsCode}),
        GOOGLE_SPEECH_METADATA_RECORDING_DEVICE_TYPE: 'phone_line',
      };