From f71f0ac69a7c120f946e7c17acf3af3a807faf47 Mon Sep 17 00:00:00 2001 From: Dave Horton Date: Wed, 15 Jan 2025 19:12:15 -0500 Subject: [PATCH] Fix/speechmatics (#1042) * add speechmatics options * wip * speechmatics does not do endpointing for us so we need to flip on continuousAsr * speechmatics: continuousAsr should be at least equal to max_delay, if set --- lib/tasks/gather.js | 13 ++++++++++++- lib/utils/transcription-utils.js | 32 +++++++++++++++++++++++++++++++- package-lock.json | 15 +++++++-------- package.json | 2 +- 4 files changed, 51 insertions(+), 11 deletions(-) diff --git a/lib/tasks/gather.js b/lib/tasks/gather.js index e89c9a74..8f1ecc47 100644 --- a/lib/tasks/gather.js +++ b/lib/tasks/gather.js @@ -24,6 +24,7 @@ const makeTask = require('./make_task'); const assert = require('assert'); const SttTask = require('./stt-task'); const { SpeechCredentialError } = require('../utils/error'); +const SPEECHMATICS_DEFAULT_ASR_TIMEOUT = 1200; class TaskGather extends SttTask { constructor(logger, opts, parentTask) { @@ -162,6 +163,16 @@ class TaskGather extends SttTask { this.logger.debug({hints: this.data.recognizer.hints, hintsBoost: this.data.recognizer.hintsBoost}, 'Gather:exec - applying global sttHints'); } + + // special case for speechmatics: they don't do endpointing so we need to enable continuous ASR + if (this.vendor === 'speechmatics' && !this.isContinuousAsr) { + const maxDelay = this.recognizer?.speechmaticsOptions?.transcription_config?.max_delay; + if (maxDelay) this.asrTimeout = Math.min(SPEECHMATICS_DEFAULT_ASR_TIMEOUT, maxDelay * 1000); + else this.asrTimeout = SPEECHMATICS_DEFAULT_ASR_TIMEOUT; + this.isContinuousAsr = true; + this.logger.debug(`Gather:exec - auto-enabling continuous ASR for speechmatics w/ timeout ${this.asrTimeout}`); + } + if (!this.isContinuousAsr && cs.isContinuousAsr) { this.isContinuousAsr = true; this.asrTimeout = cs.asrTimeout * 1000; @@ -832,7 +843,7 @@ class TaskGather extends SttTask { const t = 
evt.alternatives[0].transcript; if (t) { /* remove trailing punctuation */ - if (/[,;:\.!\?]$/.test(t)) { + if (this.vendor !== 'speechmatics' && /[,;:\.!\?]$/.test(t)) { this.logger.debug('TaskGather:_onTranscription - removing trailing punctuation'); evt.alternatives[0].transcript = t.slice(0, -1); } diff --git a/lib/utils/transcription-utils.js b/lib/utils/transcription-utils.js index fa095add..70aa1b1a 100644 --- a/lib/utils/transcription-utils.js +++ b/lib/utils/transcription-utils.js @@ -209,7 +209,7 @@ const consolidateTranscripts = (bufferedTranscripts, channel, language, vendor) const lastChar = acc.alternatives[0].transcript.slice(-1); const firstChar = newTranscript.charAt(0); - if (lastChar.match(/\d/) && firstChar.match(/\d/)) { + if (vendor === 'speechmatics' || (lastChar.match(/\d/) && firstChar.match(/\d/))) { acc.alternatives[0].transcript += newTranscript; } else { acc.alternatives[0].transcript += ` ${newTranscript}`; @@ -957,6 +957,36 @@ module.exports = (logger) => { SPEECHMATICS_TRANSLATION_PARTIALS: speechmaticsOptions.translation_config.enable_partials ? 
1 : 0 } ), + ...(speechmaticsOptions.transcription_config?.domain && + {SPEECHMATICS_DOMAIN: speechmaticsOptions.transcription_config.domain}), + ...{SPEECHMATICS_MAX_DELAY: speechmaticsOptions.transcription_config?.max_delay || 0.7}, + ...{SPEECHMATICS_MAX_DELAY_MODE: speechmaticsOptions.transcription_config?.max_delay_mode || 'flexible'}, + ...(speechmaticsOptions.transcription_config?.diarization && + {SPEECHMATICS_DIARIZATION: speechmaticsOptions.transcription_config.diarization}), + ...(speechmaticsOptions.transcription_config?.speaker_diarization_config?.speaker_sensitivity && + {SPEECHMATICS_DIARIZATION_SPEAKER_SENSITIVITY: + speechmaticsOptions.transcription_config.speaker_diarization_config.speaker_sensitivity}), + ...(speechmaticsOptions.transcription_config?.speaker_diarization_config?.max_speakers && + {SPEECHMATICS_DIARIZATION_MAX_SPEAKERS: + speechmaticsOptions.transcription_config.speaker_diarization_config.max_speakers}), + ...(speechmaticsOptions.transcription_config?.output_locale && + {SPEECHMATICS_OUTPUT_LOCALE: speechmaticsOptions.transcription_config.output_locale}), + ...(speechmaticsOptions.transcription_config?.punctuation_overrides?.permitted_marks && + {SPEECHMATICS_PUNCTUATION_ALLOWED: + speechmaticsOptions.transcription_config.punctuation_overrides.permitted_marks.join(',')}), + ...(speechmaticsOptions.transcription_config?.punctuation_overrides?.sensitivity && + {SPEECHMATICS_PUNCTUATION_SENSITIVITY: + speechmaticsOptions.transcription_config?.punctuation_overrides?.sensitivity}), + ...(speechmaticsOptions.transcription_config?.operating_point && + {SPEECHMATICS_OPERATING_POINT: speechmaticsOptions.transcription_config.operating_point}), + ...(speechmaticsOptions.transcription_config?.enable_entities && + {SPEECHMATICS_ENABLE_ENTTIES: speechmaticsOptions.transcription_config.enable_entities}), + ...(speechmaticsOptions.transcription_config?.audio_filtering_config?.volume_threshold && + {SPEECHMATICS_VOLUME_THRESHOLD: + 
speechmaticsOptions.transcription_config.audio_filtering_config.volume_threshold}), + ...(speechmaticsOptions.transcription_config?.transcript_filtering_config?.remove_disfluencies && + {SPEECHMATICS_REMOVE_DISFLUENCIES: + speechmaticsOptions.transcription_config.transcript_filtering_config.remove_disfluencies}) }; } else if (vendor.startsWith('custom:')) { diff --git a/package-lock.json b/package-lock.json index 9e13786c..2b06f205 100644 --- a/package-lock.json +++ b/package-lock.json @@ -18,7 +18,7 @@ "@jambonz/speech-utils": "^0.2.1", "@jambonz/stats-collector": "^0.1.10", "@jambonz/time-series": "^0.2.13", - "@jambonz/verb-specifications": "^0.0.92", + "@jambonz/verb-specifications": "^0.0.93", "@opentelemetry/api": "^1.8.0", "@opentelemetry/exporter-jaeger": "^1.23.0", "@opentelemetry/exporter-trace-otlp-http": "^0.50.0", @@ -1671,10 +1671,9 @@ } }, "node_modules/@jambonz/verb-specifications": { - "version": "0.0.92", - "resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.92.tgz", - "integrity": "sha512-zb1y5Hq+FqGYleYYKZafEIHyhhlH3VHapTJh3N0s+2xdy8I2Gf17zJpUc45mhV/4ficlT3SSwETsBbMt20Hwog==", - "license": "MIT", + "version": "0.0.93", + "resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.93.tgz", + "integrity": "sha512-Ml1+fT+cNSm4sEkd6zQpG3g1WmBeoXRtgcXVDRn980gCjBKuegcaXHaaCod6ddyWOpj3YiA51PnVEpfpaXU76A==", "dependencies": { "debug": "^4.3.4", "pino": "^8.8.0" @@ -10939,9 +10938,9 @@ } }, "@jambonz/verb-specifications": { - "version": "0.0.92", - "resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.92.tgz", - "integrity": "sha512-zb1y5Hq+FqGYleYYKZafEIHyhhlH3VHapTJh3N0s+2xdy8I2Gf17zJpUc45mhV/4ficlT3SSwETsBbMt20Hwog==", + "version": "0.0.93", + "resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.93.tgz", + "integrity": 
"sha512-Ml1+fT+cNSm4sEkd6zQpG3g1WmBeoXRtgcXVDRn980gCjBKuegcaXHaaCod6ddyWOpj3YiA51PnVEpfpaXU76A==", "requires": { "debug": "^4.3.4", "pino": "^8.8.0" diff --git a/package.json b/package.json index f8d6b86a..214ed4d5 100644 --- a/package.json +++ b/package.json @@ -33,7 +33,7 @@ "@jambonz/realtimedb-helpers": "^0.8.8", "@jambonz/speech-utils": "^0.2.1", "@jambonz/stats-collector": "^0.1.10", - "@jambonz/verb-specifications": "^0.0.92", + "@jambonz/verb-specifications": "^0.0.93", "@jambonz/time-series": "^0.2.13", "@opentelemetry/api": "^1.8.0", "@opentelemetry/exporter-jaeger": "^1.23.0",