deepgram: rework continuous asr, and resolve on speech_final not is_f… (#501)

* deepgram: rework continuous asr, and resolve on speech_final not is_final (wip) * wip * deepgram: empty final transcript should trigger resolve with speech if we have buffered transcripts * wip * fixes for deepgram compiling multiple transcripts * test deepgram utteranceEndMs * more handling of utteranceEndMs * wip * better handling of digit strings collected over multiple deepgram responses * wip * add support for deepgramOptions.shortUtterance which triggers off of is_final instead of speech_final * apply deepgram fixes to transcribe * cleanup continuous asr * more continuous asr fixes for deepgram * update to verb-specifications for handling SttTask properties * set log level for tests back to error
2025-12-20 08:40:38 +00:00 · 2023-10-30 13:57:25 -04:00
parent 67f8f7181a
commit f43a5c1491
4 changed files with 179 additions and 60 deletions
--- a/lib/utils/transcription-utils.js
+++ b/lib/utils/transcription-utils.js
@@ -52,6 +52,7 @@ const stickyVars = {
    'DEEPGRAM_SPEECH_SEARCH',
    'DEEPGRAM_SPEECH_REPLACE',
    'DEEPGRAM_SPEECH_ENDPOINTING',
+    'DEEPGRAM_SPEECH_UTTERANCE_END_MS',
    'DEEPGRAM_SPEECH_VAD_TURNOFF',
    'DEEPGRAM_SPEECH_TAG'
  ],
@@ -106,6 +107,53 @@ const stickyVars = {
  ]
 };

+/**
+ * Merge several buffered transcript events into a single final transcript.
+ *
+ * Chunks consisting only of digits (optionally spaced out and followed by a
+ * comma or period) are compacted, and consecutive digit chunks are joined
+ * without a separating space, so a number collected over several responses
+ * comes back as one digit string. All other chunks are joined with one space.
+ *
+ * @param {Array<object>} bufferedTranscripts - events to merge; each is read
+ *   through alternatives[0].transcript and alternatives[0].confidence
+ * @param {*} channel - value copied to channel_tag of the result
+ * @param {*} language - value copied to language_code of the result
+ * @returns {object} the sole buffered event, untouched, when exactly one was
+ *   buffered; otherwise a new event marked is_final with the combined
+ *   transcript, the mean confidence (0 for an empty buffer) and the original
+ *   events under vendor.evt
+ */
+const consolidateTranscripts = (bufferedTranscripts, channel, language) => {
+  if (bufferedTranscripts.length === 1) return bufferedTranscripts[0];
+
+  let totalConfidence = 0;
+  let transcript = '';
+  for (const evt of bufferedTranscripts) {
+    const alt = evt.alternatives[0];
+    totalConfidence += alt.confidence;
+
+    let chunk = alt.transcript;
+
+    // Only digits, whitespace and an optional trailing comma/period:
+    // drop the whitespace and the trailing punctuation.
+    if (/^[\d\s]+[,.]?$/.test(chunk)) {
+      chunk = chunk.replace(/\s/g, '').replace(/[,.]$/, '');
+    }
+
+    // An empty chunk contributes no text; appending it would leave a double
+    // space in the result and keep neighbouring digit chunks from joining.
+    if (!chunk) continue;
+
+    const joinsDigits = /\d$/.test(transcript) && /^\d/.test(chunk);
+    transcript += joinsDigits ? chunk : ` ${chunk}`;
+  }
+
+  // Guard the mean against an empty buffer, which would otherwise give NaN.
+  const confidence = bufferedTranscripts.length > 0 ?
+    totalConfidence / bufferedTranscripts.length :
+    0;
+
+  return {
+    language_code: language,
+    channel_tag: channel,
+    is_final: true,
+    alternatives: [{
+      transcript: transcript.trim(),
+      confidence
+    }],
+    vendor: {
+      name: 'deepgram',
+      evt: bufferedTranscripts
+    }
+  };
+};
+
+
 const compileSonioxTranscripts = (finalWordChunks, channel, language) => {
  const words = finalWordChunks.flat();
  const transcript = words.reduce((acc, word) => {
@@ -163,7 +211,7 @@ const normalizeSoniox = (evt, channel, language) => {
  };
 };

-const normalizeDeepgram = (evt, channel, language) => {
+const normalizeDeepgram = (evt, channel, language, shortUtterance) => {
  const copy = JSON.parse(JSON.stringify(evt));
  const alternatives = (evt.channel?.alternatives || [])
    .map((alt) => ({
@@ -171,10 +219,14 @@ const normalizeDeepgram = (evt, channel, language) => {
      transcript: alt.transcript,
    }));

+  /**
+   * note difference between is_final and speech_final in Deepgram:
+   * https://developers.deepgram.com/docs/understand-endpointing-interim-results
+   */
  return {
    language_code: language,
    channel_tag: channel,
-    is_final: evt.is_final,
+    is_final: shortUtterance ? evt.is_final : evt.speech_final,
    alternatives: [alternatives[0]],
    vendor: {
      name: 'deepgram',
@@ -325,12 +377,12 @@ const normalizeAws = (evt, channel, language) => {


 module.exports = (logger) => {
-  const normalizeTranscription = (evt, vendor, channel, language) => {
+  const normalizeTranscription = (evt, vendor, channel, language, shortUtterance) => {

    //logger.debug({ evt, vendor, channel, language }, 'normalizeTranscription');
    switch (vendor) {
      case 'deepgram':
-        return normalizeDeepgram(evt, channel, language);
+        return normalizeDeepgram(evt, channel, language, shortUtterance);
      case 'microsoft':
        return normalizeMicrosoft(evt, channel, language);
      case 'google':
@@ -536,6 +588,8 @@ module.exports = (logger) => {
          {DEEPGRAM_SPEECH_KEYWORDS: deepgramOptions.keywords.join(',')},
        ...('endpointing' in deepgramOptions) &&
          {DEEPGRAM_SPEECH_ENDPOINTING: deepgramOptions.endpointing},
+        ...(deepgramOptions.utteranceEndMs) &&
+          {DEEPGRAM_SPEECH_UTTERANCE_END_MS: deepgramOptions.utteranceEndMs},
        ...(deepgramOptions.vadTurnoff) &&
          {DEEPGRAM_SPEECH_VAD_TURNOFF: deepgramOptions.vadTurnoff},
        ...(deepgramOptions.tag) &&
@@ -743,6 +797,7 @@ module.exports = (logger) => {
    setChannelVarsForStt,
    removeSpeechListeners,
    setSpeechCredentialsAtRuntime,
-    compileSonioxTranscripts
+    compileSonioxTranscripts,
+    consolidateTranscripts
  };
 };