From f84b3793e1ba6f12fdd5c4ac290b10583fb78389 Mon Sep 17 00:00:00 2001 From: Dave Horton Date: Sat, 12 Oct 2024 18:42:53 -0400 Subject: [PATCH] Feat/speechmatics (#932) * wip * initial working version of speechmatics * linting --- lib/session/call-session.js | 28 +++++++++++---- lib/tasks/gather.js | 36 +++++++++++++++++-- lib/tasks/stt-task.js | 4 ++- lib/tasks/transcribe.js | 41 +++++++++++++++++++++- lib/utils/constants.json | 8 +++++ lib/utils/db-utils.js | 26 ++++++++++---- lib/utils/transcription-utils.js | 59 +++++++++++++++++++++++++++----- 7 files changed, 175 insertions(+), 27 deletions(-) diff --git a/lib/session/call-session.js b/lib/session/call-session.js index 1cbd9950..113f28b8 100644 --- a/lib/session/call-session.js +++ b/lib/session/call-session.js @@ -963,42 +963,56 @@ class CallSession extends Emitter { speech_credential_sid: credential.speech_credential_sid, cobalt_server_uri: credential.cobalt_server_uri }; - } else if ('elevenlabs' === vendor) { + } + else if ('elevenlabs' === vendor) { return { api_key: credential.api_key, model_id: credential.model_id, options: credential.options }; - } else if ('playht' === vendor) { + } + else if ('playht' === vendor) { return { api_key: credential.api_key, user_id: credential.user_id, voice_engine: credential.voice_engine, options: credential.options }; - } else if ('rimelabs' === vendor) { + } + else if ('rimelabs' === vendor) { return { api_key: credential.api_key, model_id: credential.model_id, options: credential.options }; - } else if ('assemblyai' === vendor) { + } + else if ('assemblyai' === vendor) { return { speech_credential_sid: credential.speech_credential_sid, api_key: credential.api_key }; - } else if ('whisper' === vendor) { + } + else if ('whisper' === vendor) { return { api_key: credential.api_key, model_id: credential.model_id }; - } else if ('verbio' === vendor) { + } + else if ('verbio' === vendor) { return { client_id: credential.client_id, client_secret: credential.client_secret, 
engine_version: credential.engine_version }; - } else if (vendor.startsWith('custom:')) { + } + else if ('speechmatics' === vendor) { + this.logger.info({credential}, 'CallSession:getSpeechCredentials - speechmatics credential'); + return { + api_key: credential.api_key, + speechmatics_stt_uri: credential.speechmatics_stt_uri, + }; + } + else if (vendor.startsWith('custom:')) { return { speech_credential_sid: credential.speech_credential_sid, auth_token: credential.auth_token, diff --git a/lib/tasks/gather.js b/lib/tasks/gather.js index dcab1abc..96a7b8a8 100644 --- a/lib/tasks/gather.js +++ b/lib/tasks/gather.js @@ -12,7 +12,8 @@ const { JambonzTranscriptionEvents, AssemblyAiTranscriptionEvents, VadDetection, - VerbioTranscriptionEvents + VerbioTranscriptionEvents, + SpeechmaticsTranscriptionEvents } = require('../utils/constants.json'); const { JAMBONES_GATHER_EARLY_HINTS_MATCH, @@ -514,6 +515,24 @@ class TaskGather extends SttTask { this.addCustomEventListener(ep, AssemblyAiTranscriptionEvents.ConnectFailure, this._onVendorConnectFailure.bind(this, cs, ep)); break; + + case 'speechmatics': + this.bugname = `${this.bugname_prefix}speechmatics_transcribe`; + this.addCustomEventListener( + ep, SpeechmaticsTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep)); + this.addCustomEventListener(ep, SpeechmaticsTranscriptionEvents.Info, + this._onSpeechmaticsInfo.bind(this, cs, ep)); + this.addCustomEventListener(ep, SpeechmaticsTranscriptionEvents.RecognitionStarted, + this._onSpeechmaticsRecognitionStarted.bind(this, cs, ep)); + this.addCustomEventListener(ep, SpeechmaticsTranscriptionEvents.Connect, + this._onVendorConnect.bind(this, cs, ep)); + this.addCustomEventListener(ep, SpeechmaticsTranscriptionEvents.ConnectFailure, + this._onVendorConnectFailure.bind(this, cs, ep)); + this.addCustomEventListener(ep, SpeechmaticsTranscriptionEvents.Error, + this._onSpeechmaticsErrror.bind(this, cs, ep)); + + break; + default: if 
(this.vendor.startsWith('custom:')) { this.bugname = `${this.bugname_prefix}${this.vendor}_transcribe`; @@ -752,7 +771,7 @@ class TaskGather extends SttTask { evt = this.normalizeTranscription(evt, this.vendor, 1, this.language, this.shortUtterance, this.data.recognizer.punctuation); - //this.logger.debug({evt, bugname, finished, vendor: this.vendor}, 'Gather:_onTranscription normalized transcript'); + this.logger.debug({evt, bugname, finished, vendor: this.vendor}, 'Gather:_onTranscription normalized transcript'); if (evt.alternatives.length === 0) { this.logger.info({evt}, 'TaskGather:_onTranscription - got empty transcript, continue listening'); @@ -1006,12 +1025,25 @@ class TaskGather extends SttTask { } } + async _onSpeechmaticsErrror(cs, _ep, evt) { + // eslint-disable-next-line no-unused-vars + const {message, ...e} = evt; + this._onVendorError(cs, _ep, {error: JSON.stringify(e)}); + } + async _onVendorError(cs, _ep, evt) { super._onVendorError(cs, _ep, evt); if (!(await this._startFallback(cs, _ep, evt))) { this._resolve('stt-error', evt); } } + async _onSpeechmaticsRecognitionStarted(_cs, _ep, evt) { + this.logger.debug({evt}, 'TaskGather:_onSpeechmaticsRecognitionStarted'); + } + + async _onSpeechmaticsInfo(_cs, _ep, evt) { + this.logger.debug({evt}, 'TaskGather:_onSpeechmaticsInfo'); + } _onVadDetected(cs, ep) { if (this.bargein && this.minBargeinWordCount === 0) { diff --git a/lib/tasks/stt-task.js b/lib/tasks/stt-task.js index 8ba70bdf..918d5f50 100644 --- a/lib/tasks/stt-task.js +++ b/lib/tasks/stt-task.js @@ -17,12 +17,14 @@ class SttTask extends Task { normalizeTranscription, setSpeechCredentialsAtRuntime, compileSonioxTranscripts, - consolidateTranscripts + consolidateTranscripts, + updateSpeechmaticsPayload } = require('../utils/transcription-utils')(logger); this.setChannelVarsForStt = setChannelVarsForStt; this.normalizeTranscription = normalizeTranscription; this.compileSonioxTranscripts = compileSonioxTranscripts; this.consolidateTranscripts = 
consolidateTranscripts; + this.updateSpeechmaticsPayload = updateSpeechmaticsPayload; this.eventHandlers = []; this.isHandledByPrimaryProvider = true; /** diff --git a/lib/tasks/transcribe.js b/lib/tasks/transcribe.js index 51d97881..b7dd7d17 100644 --- a/lib/tasks/transcribe.js +++ b/lib/tasks/transcribe.js @@ -12,7 +12,9 @@ const { NvidiaTranscriptionEvents, JambonzTranscriptionEvents, TranscribeStatus, - AssemblyAiTranscriptionEvents + AssemblyAiTranscriptionEvents, + VerbioTranscriptionEvents, + SpeechmaticsTranscriptionEvents } = require('../utils/constants.json'); const { normalizeJambones } = require('@jambonz/verb-specifications'); const SttTask = require('./stt-task'); @@ -237,6 +239,13 @@ class TaskTranscribe extends SttTask { this.addCustomEventListener(ep, SonioxTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep, channel)); break; + + case 'verbio': + this.bugname = `${this.bugname_prefix}verbio_transcribe`; + this.addCustomEventListener( + ep, VerbioTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep, channel)); + break; + case 'cobalt': this.bugname = `${this.bugname_prefix}cobalt_transcribe`; this.addCustomEventListener(ep, CobaltTranscriptionEvents.Transcription, @@ -294,6 +303,22 @@ this._onVendorConnectFailure.bind(this, cs, ep, channel)); break; + case 'speechmatics': + this.bugname = `${this.bugname_prefix}speechmatics_transcribe`; + this.addCustomEventListener( + ep, SpeechmaticsTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep, channel)); + this.addCustomEventListener(ep, SpeechmaticsTranscriptionEvents.Info, + this._onSpeechmaticsInfo.bind(this, cs, ep)); + this.addCustomEventListener(ep, SpeechmaticsTranscriptionEvents.RecognitionStarted, + this._onSpeechmaticsRecognitionStarted.bind(this, cs, ep)); + this.addCustomEventListener(ep, SpeechmaticsTranscriptionEvents.Connect, + this._onVendorConnect.bind(this, cs, ep)); + this.addCustomEventListener(ep, 
SpeechmaticsTranscriptionEvents.ConnectFailure, + this._onVendorConnectFailure.bind(this, cs, ep, channel)); + this.addCustomEventListener(ep, SpeechmaticsTranscriptionEvents.Error, + this._onSpeechmaticsError.bind(this, cs, ep)); + break; + default: if (this.vendor.startsWith('custom:')) { this.bugname = `${this.bugname_prefix}${this.vendor}_transcribe`; @@ -644,6 +669,20 @@ class TaskTranscribe extends SttTask { } } + async _onSpeechmaticsRecognitionStarted(_cs, _ep, evt) { + this.logger.debug({evt}, 'TaskTranscribe:_onSpeechmaticsRecognitionStarted'); + } + + async _onSpeechmaticsInfo(_cs, _ep, evt) { + this.logger.debug({evt}, 'TaskTranscribe:_onSpeechmaticsInfo'); + } + + async _onSpeechmaticsError(cs, _ep, evt) { + // eslint-disable-next-line no-unused-vars + const {message, ...e} = evt; + this._onVendorError(cs, _ep, {error: JSON.stringify(e)}); + } + _startAsrTimer(channel) { if (this.vendor === 'deepgram') return; // no need assert(this.isContinuousAsr); diff --git a/lib/utils/constants.json b/lib/utils/constants.json index ba1feac9..a13e376d 100644 --- a/lib/utils/constants.json +++ b/lib/utils/constants.json @@ -126,6 +126,14 @@ "NoSpeechDetected": "azure_transcribe::no_speech_detected", "VadDetected": "azure_transcribe::vad_detected" }, + "SpeechmaticsTranscriptionEvents": { + "Transcription": "speechmatics_transcribe::transcription", + "Info": "speechmatics_transcribe::info", + "RecognitionStarted": "speechmatics_transcribe::recognition_started", + "ConnectFailure": "speechmatics_transcribe::connect_failed", + "Connect": "speechmatics_transcribe::connect", + "Error": "speechmatics_transcribe::error" + }, "JambonzTranscriptionEvents": { "Transcription": "jambonz_transcribe::transcription", "ConnectFailure": "jambonz_transcribe::connect_failed", diff --git a/lib/utils/db-utils.js b/lib/utils/db-utils.js index 9c70a3dc..54246840 100644 --- a/lib/utils/db-utils.js +++ b/lib/utils/db-utils.js @@ -91,35 +91,47 @@ const speechMapper = (cred) => { else if ('cobalt' === 
obj.vendor) { const o = JSON.parse(decrypt(credential)); obj.cobalt_server_uri = o.cobalt_server_uri; - } else if ('elevenlabs' === obj.vendor) { + } + else if ('elevenlabs' === obj.vendor) { const o = JSON.parse(decrypt(credential)); obj.api_key = o.api_key; obj.model_id = o.model_id; obj.options = o.options; - } else if ('playht' === obj.vendor) { + } + else if ('playht' === obj.vendor) { const o = JSON.parse(decrypt(credential)); obj.api_key = o.api_key; obj.user_id = o.user_id; obj.voice_engine = o.voice_engine; obj.options = o.options; - } else if ('rimelabs' === obj.vendor) { + } + else if ('rimelabs' === obj.vendor) { const o = JSON.parse(decrypt(credential)); obj.api_key = o.api_key; obj.model_id = o.model_id; obj.options = o.options; - } else if ('assemblyai' === obj.vendor) { + } + else if ('assemblyai' === obj.vendor) { const o = JSON.parse(decrypt(credential)); obj.api_key = o.api_key; - } else if ('whisper' === obj.vendor) { + } + else if ('whisper' === obj.vendor) { const o = JSON.parse(decrypt(credential)); obj.api_key = o.api_key; obj.model_id = o.model_id; - } else if ('verbio' === obj.vendor) { + } + else if ('verbio' === obj.vendor) { const o = JSON.parse(decrypt(credential)); obj.client_id = o.client_id; obj.client_secret = o.client_secret; obj.engine_version = o.engine_version; - } else if (obj.vendor.startsWith('custom:')) { + } + else if ('speechmatics' === obj.vendor) { + const o = JSON.parse(decrypt(credential)); + obj.api_key = o.api_key; + obj.speechmatics_stt_uri = o.speechmatics_stt_uri; + } + else if (obj.vendor.startsWith('custom:')) { const o = JSON.parse(decrypt(credential)); obj.auth_token = o.auth_token; obj.custom_stt_url = o.custom_stt_url; diff --git a/lib/utils/transcription-utils.js b/lib/utils/transcription-utils.js index d50d3bc4..9d70f720 100644 --- a/lib/utils/transcription-utils.js +++ b/lib/utils/transcription-utils.js @@ -1,7 +1,4 @@ -const { - TaskName, -} = require('./constants.json'); - +const {TaskName} = 
require('./constants.json'); const stickyVars = { google: [ 'GOOGLE_SPEECH_HINTS', @@ -100,6 +97,12 @@ const stickyVars = { assemblyai: [ 'ASSEMBLYAI_API_KEY', 'ASSEMBLYAI_WORD_BOOST' + ], + speechmatics: [ + 'SPEECHMATICS_API_KEY', + 'SPEECHMATICS_HOST', + 'SPEECHMATICS_PATH', + 'SPEECHMATICS_SPEECH_HINTS', ] }; @@ -466,12 +469,37 @@ const normalizeAssemblyAi = (evt, channel, language) => { } ], vendor: { - name: 'ASSEMBLYAI', + name: 'assemblyai', evt: copy } }; }; +const normalizeSpeechmatics = (evt, channel, language) => { + const copy = JSON.parse(JSON.stringify(evt)); + const is_final = evt.message === 'AddTranscript'; + const words = evt.results?.filter((r) => r.type === 'word') || []; + const confidence = words.length > 0 ? + words.reduce((acc, word) => acc + word.alternatives[0].confidence, 0) / words.length : + 0; + + const alternative = { + confidence, + transcript: evt.metadata?.transcript + }; + const obj = { + language_code: language, + channel_tag: channel, + is_final, + alternatives: [alternative], + vendor: { + name: 'speechmatics', + evt: copy + } + }; + return obj; +}; + module.exports = (logger) => { const normalizeTranscription = (evt, vendor, channel, language, shortUtterance, punctuation) => { @@ -499,6 +527,8 @@ module.exports = (logger) => { return normalizeAssemblyAi(evt, channel, language, shortUtterance); case 'verbio': return normalizeVerbio(evt, channel, language); + case 'speechmatics': + return normalizeSpeechmatics(evt, channel, language); default: if (vendor.startsWith('custom:')) { return normalizeCustom(evt, channel, language, vendor); @@ -828,7 +858,8 @@ module.exports = (logger) => { ...(cobaltOptions.enableConfusionNetwork && {COBALT_ENABLE_CONFUSION_NETWORK: 1}), ...(cobaltOptions.compiledContextData && {COBALT_COMPILED_CONTEXT_DATA: cobaltOptions.compiledContextData}), }; - } else if ('assemblyai' === vendor) { + } + else if ('assemblyai' === vendor) { opts = { ...opts, ...(sttCredentials.api_key) && @@ -836,7 +867,8 @@ 
module.exports = (logger) => { ...(rOpts.hints?.length > 0 && {ASSEMBLYAI_WORD_BOOST: JSON.stringify(rOpts.hints)}) }; - } else if ('verbio' === vendor) { + } + else if ('verbio' === vendor) { const {verbioOptions = {}} = rOpts; opts = { ...opts, @@ -855,7 +887,16 @@ module.exports = (logger) => { ...(verbioOptions.speech_incomplete_timeout && {VERBIO_SPEECH_INCOMPLETE_TIMEOUT: verbioOptions.speech_incomplete_timeout}), }; - } else if (vendor.startsWith('custom:')) { + } + else if ('speechmatics' === vendor) { + opts = { + ...opts, + ...(sttCredentials.api_key) && {SPEECHMATICS_API_KEY: sttCredentials.api_key}, + ...(sttCredentials.speechmatics_stt_uri) && {SPEECHMATICS_HOST: sttCredentials.speechmatics_stt_uri}, + ...(rOpts.hints?.length > 0 && {SPEECHMATICS_SPEECH_HINTS: rOpts.hints.join(',')}), + }; + } + else if (vendor.startsWith('custom:')) { let {options = {}} = rOpts.customOptions || {}; const {sampleRate} = rOpts.customOptions || {}; const {auth_token, custom_stt_url} = sttCredentials; @@ -923,6 +964,6 @@ module.exports = (logger) => { setChannelVarsForStt, setSpeechCredentialsAtRuntime, compileSonioxTranscripts, - consolidateTranscripts + consolidateTranscripts, }; };