Feat/speechmatics (#932)

* wip

* initial working version of speechmatics

* linting
This commit is contained in:
Dave Horton
2024-10-12 18:42:53 -04:00
committed by GitHub
parent 84b7456c2d
commit f84b3793e1
7 changed files with 175 additions and 27 deletions

View File

@@ -963,42 +963,56 @@ class CallSession extends Emitter {
speech_credential_sid: credential.speech_credential_sid, speech_credential_sid: credential.speech_credential_sid,
cobalt_server_uri: credential.cobalt_server_uri cobalt_server_uri: credential.cobalt_server_uri
}; };
} else if ('elevenlabs' === vendor) { }
else if ('elevenlabs' === vendor) {
return { return {
api_key: credential.api_key, api_key: credential.api_key,
model_id: credential.model_id, model_id: credential.model_id,
options: credential.options options: credential.options
}; };
} else if ('playht' === vendor) { }
else if ('playht' === vendor) {
return { return {
api_key: credential.api_key, api_key: credential.api_key,
user_id: credential.user_id, user_id: credential.user_id,
voice_engine: credential.voice_engine, voice_engine: credential.voice_engine,
options: credential.options options: credential.options
}; };
} else if ('rimelabs' === vendor) { }
else if ('rimelabs' === vendor) {
return { return {
api_key: credential.api_key, api_key: credential.api_key,
model_id: credential.model_id, model_id: credential.model_id,
options: credential.options options: credential.options
}; };
} else if ('assemblyai' === vendor) { }
else if ('assemblyai' === vendor) {
return { return {
speech_credential_sid: credential.speech_credential_sid, speech_credential_sid: credential.speech_credential_sid,
api_key: credential.api_key api_key: credential.api_key
}; };
} else if ('whisper' === vendor) { }
else if ('whisper' === vendor) {
return { return {
api_key: credential.api_key, api_key: credential.api_key,
model_id: credential.model_id model_id: credential.model_id
}; };
} else if ('verbio' === vendor) { }
else if ('verbio' === vendor) {
return { return {
client_id: credential.client_id, client_id: credential.client_id,
client_secret: credential.client_secret, client_secret: credential.client_secret,
engine_version: credential.engine_version engine_version: credential.engine_version
}; };
} else if (vendor.startsWith('custom:')) { }
else if ('speechmatics' === vendor) {
this.logger.info({credential}, 'CallSession:getSpeechCredentials - speechmatics credential');
return {
api_key: credential.api_key,
speechmatics_stt_uri: credential.speechmatics_stt_uri,
};
}
else if (vendor.startsWith('custom:')) {
return { return {
speech_credential_sid: credential.speech_credential_sid, speech_credential_sid: credential.speech_credential_sid,
auth_token: credential.auth_token, auth_token: credential.auth_token,

View File

@@ -12,7 +12,8 @@ const {
JambonzTranscriptionEvents, JambonzTranscriptionEvents,
AssemblyAiTranscriptionEvents, AssemblyAiTranscriptionEvents,
VadDetection, VadDetection,
VerbioTranscriptionEvents VerbioTranscriptionEvents,
SpeechmaticsTranscriptionEvents
} = require('../utils/constants.json'); } = require('../utils/constants.json');
const { const {
JAMBONES_GATHER_EARLY_HINTS_MATCH, JAMBONES_GATHER_EARLY_HINTS_MATCH,
@@ -514,6 +515,24 @@ class TaskGather extends SttTask {
this.addCustomEventListener(ep, AssemblyAiTranscriptionEvents.ConnectFailure, this.addCustomEventListener(ep, AssemblyAiTranscriptionEvents.ConnectFailure,
this._onVendorConnectFailure.bind(this, cs, ep)); this._onVendorConnectFailure.bind(this, cs, ep));
break; break;
case 'speechmatics':
this.bugname = `${this.bugname_prefix}speechmatics_transcribe`;
this.addCustomEventListener(
ep, SpeechmaticsTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
this.addCustomEventListener(ep, SpeechmaticsTranscriptionEvents.Info,
this._onSpeechmaticsInfo.bind(this, cs, ep));
this.addCustomEventListener(ep, SpeechmaticsTranscriptionEvents.RecognitionStarted,
this._onSpeechmaticsRecognitionStarted.bind(this, cs, ep));
this.addCustomEventListener(ep, SpeechmaticsTranscriptionEvents.Connect,
this._onVendorConnect.bind(this, cs, ep));
this.addCustomEventListener(ep, SpeechmaticsTranscriptionEvents.ConnectFailure,
this._onVendorConnectFailure.bind(this, cs, ep));
this.addCustomEventListener(ep, SpeechmaticsTranscriptionEvents.Error,
this._onSpeechmaticsErrror.bind(this, cs, ep));
break;
default: default:
if (this.vendor.startsWith('custom:')) { if (this.vendor.startsWith('custom:')) {
this.bugname = `${this.bugname_prefix}${this.vendor}_transcribe`; this.bugname = `${this.bugname_prefix}${this.vendor}_transcribe`;
@@ -752,7 +771,7 @@ class TaskGather extends SttTask {
evt = this.normalizeTranscription(evt, this.vendor, 1, this.language, evt = this.normalizeTranscription(evt, this.vendor, 1, this.language,
this.shortUtterance, this.data.recognizer.punctuation); this.shortUtterance, this.data.recognizer.punctuation);
//this.logger.debug({evt, bugname, finished, vendor: this.vendor}, 'Gather:_onTranscription normalized transcript'); this.logger.debug({evt, bugname, finished, vendor: this.vendor}, 'Gather:_onTranscription normalized transcript');
if (evt.alternatives.length === 0) { if (evt.alternatives.length === 0) {
this.logger.info({evt}, 'TaskGather:_onTranscription - got empty transcript, continue listening'); this.logger.info({evt}, 'TaskGather:_onTranscription - got empty transcript, continue listening');
@@ -1006,12 +1025,25 @@ class TaskGather extends SttTask {
} }
} }
async _onSpeechmaticsErrror(cs, _ep, evt) {
// eslint-disable-next-line no-unused-vars
const {message, ...e} = evt;
this._onVendorError(cs, _ep, {error: JSON.stringify(e)});
}
async _onVendorError(cs, _ep, evt) { async _onVendorError(cs, _ep, evt) {
super._onVendorError(cs, _ep, evt); super._onVendorError(cs, _ep, evt);
if (!(await this._startFallback(cs, _ep, evt))) { if (!(await this._startFallback(cs, _ep, evt))) {
this._resolve('stt-error', evt); this._resolve('stt-error', evt);
} }
} }
async _onSpeechmaticsRecognitionStarted(_cs, _ep, evt) {
this.logger.debug({evt}, 'TaskGather:_onSpeechmaticsRecognitionStarted');
}
async _onSpeechmaticsInfo(_cs, _ep, evt) {
this.logger.debug({evt}, 'TaskGather:_onSpeechmaticsInfo');
}
_onVadDetected(cs, ep) { _onVadDetected(cs, ep) {
if (this.bargein && this.minBargeinWordCount === 0) { if (this.bargein && this.minBargeinWordCount === 0) {

View File

@@ -17,12 +17,14 @@ class SttTask extends Task {
normalizeTranscription, normalizeTranscription,
setSpeechCredentialsAtRuntime, setSpeechCredentialsAtRuntime,
compileSonioxTranscripts, compileSonioxTranscripts,
consolidateTranscripts consolidateTranscripts,
updateSpeechmaticsPayload
} = require('../utils/transcription-utils')(logger); } = require('../utils/transcription-utils')(logger);
this.setChannelVarsForStt = setChannelVarsForStt; this.setChannelVarsForStt = setChannelVarsForStt;
this.normalizeTranscription = normalizeTranscription; this.normalizeTranscription = normalizeTranscription;
this.compileSonioxTranscripts = compileSonioxTranscripts; this.compileSonioxTranscripts = compileSonioxTranscripts;
this.consolidateTranscripts = consolidateTranscripts; this.consolidateTranscripts = consolidateTranscripts;
this.updateSpeechmaticsPayload = updateSpeechmaticsPayload;
this.eventHandlers = []; this.eventHandlers = [];
this.isHandledByPrimaryProvider = true; this.isHandledByPrimaryProvider = true;
/** /**

View File

@@ -12,7 +12,9 @@ const {
NvidiaTranscriptionEvents, NvidiaTranscriptionEvents,
JambonzTranscriptionEvents, JambonzTranscriptionEvents,
TranscribeStatus, TranscribeStatus,
AssemblyAiTranscriptionEvents AssemblyAiTranscriptionEvents,
VerbioTranscriptionEvents,
SpeechmaticsTranscriptionEvents
} = require('../utils/constants.json'); } = require('../utils/constants.json');
const { normalizeJambones } = require('@jambonz/verb-specifications'); const { normalizeJambones } = require('@jambonz/verb-specifications');
const SttTask = require('./stt-task'); const SttTask = require('./stt-task');
@@ -237,6 +239,13 @@ class TaskTranscribe extends SttTask {
this.addCustomEventListener(ep, SonioxTranscriptionEvents.Transcription, this.addCustomEventListener(ep, SonioxTranscriptionEvents.Transcription,
this._onTranscription.bind(this, cs, ep, channel)); this._onTranscription.bind(this, cs, ep, channel));
break; break;
case 'verbio':
this.bugname = `${this.bugname_prefix}verbio_transcribe`;
this.addCustomEventListener(
ep, VerbioTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
break;
case 'cobalt': case 'cobalt':
this.bugname = `${this.bugname_prefix}cobalt_transcribe`; this.bugname = `${this.bugname_prefix}cobalt_transcribe`;
this.addCustomEventListener(ep, CobaltTranscriptionEvents.Transcription, this.addCustomEventListener(ep, CobaltTranscriptionEvents.Transcription,
@@ -294,6 +303,22 @@ class TaskTranscribe extends SttTask {
this._onVendorConnectFailure.bind(this, cs, ep, channel)); this._onVendorConnectFailure.bind(this, cs, ep, channel));
break; break;
case 'speechmatics':
this.bugname = `${this.bugname_prefix}speechmatics_transcribe`;
this.addCustomEventListener(
ep, SpeechmaticsTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
this.addCustomEventListener(ep, SpeechmaticsTranscriptionEvents.Info,
this._onSpeechmaticsInfo.bind(this, cs, ep));
this.addCustomEventListener(ep, SpeechmaticsTranscriptionEvents.RecognitionStarted,
this._onSpeechmaticsRecognitionStarted.bind(this, cs, ep));
this.addCustomEventListener(ep, SpeechmaticsTranscriptionEvents.Connect,
this._onVendorConnect.bind(this, cs, ep));
this.addCustomEventListener(ep, SpeechmaticsTranscriptionEvents.ConnectFailure,
this._onVendorConnectFailure.bind(this, cs, ep));
this.addCustomEventListener(ep, SpeechmaticsTranscriptionEvents.Error,
this._onSpeechmaticsError.bind(this, cs, ep));
break;
default: default:
if (this.vendor.startsWith('custom:')) { if (this.vendor.startsWith('custom:')) {
this.bugname = `${this.bugname_prefix}${this.vendor}_transcribe`; this.bugname = `${this.bugname_prefix}${this.vendor}_transcribe`;
@@ -644,6 +669,20 @@ class TaskTranscribe extends SttTask {
} }
} }
async _onSpeechmaticsRecognitionStarted(_cs, _ep, evt) {
this.logger.debug({evt}, 'TaskGather:_onSpeechmaticsRecognitionStarted');
}
async _onSpeechmaticsInfo(_cs, _ep, evt) {
this.logger.debug({evt}, 'TaskGather:_onSpeechmaticsInfo');
}
async _onSpeechmaticsErrror(cs, _ep, evt) {
// eslint-disable-next-line no-unused-vars
const {message, ...e} = evt;
this._onVendorError(cs, _ep, {error: JSON.stringify(e)});
}
_startAsrTimer(channel) { _startAsrTimer(channel) {
if (this.vendor === 'deepgram') return; // no need if (this.vendor === 'deepgram') return; // no need
assert(this.isContinuousAsr); assert(this.isContinuousAsr);

View File

@@ -126,6 +126,14 @@
"NoSpeechDetected": "azure_transcribe::no_speech_detected", "NoSpeechDetected": "azure_transcribe::no_speech_detected",
"VadDetected": "azure_transcribe::vad_detected" "VadDetected": "azure_transcribe::vad_detected"
}, },
"SpeechmaticsTranscriptionEvents": {
"Transcription": "speechmatics_transcribe::transcription",
"Info": "speechmatics_transcribe::info",
"RecognitionStarted": "speechmatics_transcribe::recognition_started",
"ConnectFailure": "speechmatics_transcribe::connect_failed",
"Connect": "speechmatics_transcribe::connect",
"Error": "speechmatics_transcribe::error"
},
"JambonzTranscriptionEvents": { "JambonzTranscriptionEvents": {
"Transcription": "jambonz_transcribe::transcription", "Transcription": "jambonz_transcribe::transcription",
"ConnectFailure": "jambonz_transcribe::connect_failed", "ConnectFailure": "jambonz_transcribe::connect_failed",

View File

@@ -91,35 +91,47 @@ const speechMapper = (cred) => {
else if ('cobalt' === obj.vendor) { else if ('cobalt' === obj.vendor) {
const o = JSON.parse(decrypt(credential)); const o = JSON.parse(decrypt(credential));
obj.cobalt_server_uri = o.cobalt_server_uri; obj.cobalt_server_uri = o.cobalt_server_uri;
} else if ('elevenlabs' === obj.vendor) { }
else if ('elevenlabs' === obj.vendor) {
const o = JSON.parse(decrypt(credential)); const o = JSON.parse(decrypt(credential));
obj.api_key = o.api_key; obj.api_key = o.api_key;
obj.model_id = o.model_id; obj.model_id = o.model_id;
obj.options = o.options; obj.options = o.options;
} else if ('playht' === obj.vendor) { }
else if ('playht' === obj.vendor) {
const o = JSON.parse(decrypt(credential)); const o = JSON.parse(decrypt(credential));
obj.api_key = o.api_key; obj.api_key = o.api_key;
obj.user_id = o.user_id; obj.user_id = o.user_id;
obj.voice_engine = o.voice_engine; obj.voice_engine = o.voice_engine;
obj.options = o.options; obj.options = o.options;
} else if ('rimelabs' === obj.vendor) { }
else if ('rimelabs' === obj.vendor) {
const o = JSON.parse(decrypt(credential)); const o = JSON.parse(decrypt(credential));
obj.api_key = o.api_key; obj.api_key = o.api_key;
obj.model_id = o.model_id; obj.model_id = o.model_id;
obj.options = o.options; obj.options = o.options;
} else if ('assemblyai' === obj.vendor) { }
else if ('assemblyai' === obj.vendor) {
const o = JSON.parse(decrypt(credential)); const o = JSON.parse(decrypt(credential));
obj.api_key = o.api_key; obj.api_key = o.api_key;
} else if ('whisper' === obj.vendor) { }
else if ('whisper' === obj.vendor) {
const o = JSON.parse(decrypt(credential)); const o = JSON.parse(decrypt(credential));
obj.api_key = o.api_key; obj.api_key = o.api_key;
obj.model_id = o.model_id; obj.model_id = o.model_id;
} else if ('verbio' === obj.vendor) { }
else if ('verbio' === obj.vendor) {
const o = JSON.parse(decrypt(credential)); const o = JSON.parse(decrypt(credential));
obj.client_id = o.client_id; obj.client_id = o.client_id;
obj.client_secret = o.client_secret; obj.client_secret = o.client_secret;
obj.engine_version = o.engine_version; obj.engine_version = o.engine_version;
} else if (obj.vendor.startsWith('custom:')) { }
else if ('speechmatics' === obj.vendor) {
const o = JSON.parse(decrypt(credential));
obj.api_key = o.api_key;
obj.speechmatics_stt_uri = o.speechmatics_stt_uri;
}
else if (obj.vendor.startsWith('custom:')) {
const o = JSON.parse(decrypt(credential)); const o = JSON.parse(decrypt(credential));
obj.auth_token = o.auth_token; obj.auth_token = o.auth_token;
obj.custom_stt_url = o.custom_stt_url; obj.custom_stt_url = o.custom_stt_url;

View File

@@ -1,7 +1,4 @@
const { const {TaskName} = require('./constants.json');
TaskName,
} = require('./constants.json');
const stickyVars = { const stickyVars = {
google: [ google: [
'GOOGLE_SPEECH_HINTS', 'GOOGLE_SPEECH_HINTS',
@@ -100,6 +97,12 @@ const stickyVars = {
assemblyai: [ assemblyai: [
'ASSEMBLYAI_API_KEY', 'ASSEMBLYAI_API_KEY',
'ASSEMBLYAI_WORD_BOOST' 'ASSEMBLYAI_WORD_BOOST'
],
speechmatics: [
'SPEECHMATICS_API_KEY',
'SPEECHMATICS_HOST',
'SPEECHMATICS_PATH',
'SPEECHMATICS_SPEECH_HINTS',
] ]
}; };
@@ -466,12 +469,37 @@ const normalizeAssemblyAi = (evt, channel, language) => {
} }
], ],
vendor: { vendor: {
name: 'ASSEMBLYAI', name: 'assemblyai',
evt: copy evt: copy
} }
}; };
}; };
/**
 * Convert a Speechmatics transcript event into the normalized transcription
 * object used for the other vendors in this module.
 *
 * - `is_final` is true only when `evt.message` is 'AddTranscript'.
 * - `confidence` is the mean confidence of the first alternative of every
 *   result whose type is 'word'; results without a numeric confidence are
 *   ignored instead of throwing or producing NaN. It is 0 when no usable
 *   word confidence exists.
 * - `transcript` is taken from `evt.metadata.transcript` (undefined if absent).
 *
 * @param {object} evt - raw event received from the speechmatics module
 * @param {number} channel - channel tag to report
 * @param {string} language - language code to report
 * @returns {object} normalized transcription, with a deep copy of the raw
 *   event under `vendor.evt`
 */
const normalizeSpeechmatics = (evt, channel, language) => {
  // JSON round-trip so later changes to evt cannot alter the reported vendor payload
  const copy = JSON.parse(JSON.stringify(evt));
  const is_final = evt.message === 'AddTranscript';

  // collect only the confidences that are actually present and numeric
  const confidences = (evt.results ?? [])
    .filter((r) => r?.type === 'word')
    .map((word) => word.alternatives?.[0]?.confidence)
    .filter((c) => typeof c === 'number' && Number.isFinite(c));
  const confidence = confidences.length > 0 ?
    confidences.reduce((acc, c) => acc + c, 0) / confidences.length :
    0;

  const alternative = {
    confidence,
    transcript: evt.metadata?.transcript
  };
  return {
    language_code: language,
    channel_tag: channel,
    is_final,
    alternatives: [alternative],
    vendor: {
      name: 'speechmatics',
      evt: copy
    }
  };
};
module.exports = (logger) => { module.exports = (logger) => {
const normalizeTranscription = (evt, vendor, channel, language, shortUtterance, punctuation) => { const normalizeTranscription = (evt, vendor, channel, language, shortUtterance, punctuation) => {
@@ -499,6 +527,8 @@ module.exports = (logger) => {
return normalizeAssemblyAi(evt, channel, language, shortUtterance); return normalizeAssemblyAi(evt, channel, language, shortUtterance);
case 'verbio': case 'verbio':
return normalizeVerbio(evt, channel, language); return normalizeVerbio(evt, channel, language);
case 'speechmatics':
return normalizeSpeechmatics(evt, channel, language);
default: default:
if (vendor.startsWith('custom:')) { if (vendor.startsWith('custom:')) {
return normalizeCustom(evt, channel, language, vendor); return normalizeCustom(evt, channel, language, vendor);
@@ -828,7 +858,8 @@ module.exports = (logger) => {
...(cobaltOptions.enableConfusionNetwork && {COBALT_ENABLE_CONFUSION_NETWORK: 1}), ...(cobaltOptions.enableConfusionNetwork && {COBALT_ENABLE_CONFUSION_NETWORK: 1}),
...(cobaltOptions.compiledContextData && {COBALT_COMPILED_CONTEXT_DATA: cobaltOptions.compiledContextData}), ...(cobaltOptions.compiledContextData && {COBALT_COMPILED_CONTEXT_DATA: cobaltOptions.compiledContextData}),
}; };
} else if ('assemblyai' === vendor) { }
else if ('assemblyai' === vendor) {
opts = { opts = {
...opts, ...opts,
...(sttCredentials.api_key) && ...(sttCredentials.api_key) &&
@@ -836,7 +867,8 @@ module.exports = (logger) => {
...(rOpts.hints?.length > 0 && ...(rOpts.hints?.length > 0 &&
{ASSEMBLYAI_WORD_BOOST: JSON.stringify(rOpts.hints)}) {ASSEMBLYAI_WORD_BOOST: JSON.stringify(rOpts.hints)})
}; };
} else if ('verbio' === vendor) { }
else if ('verbio' === vendor) {
const {verbioOptions = {}} = rOpts; const {verbioOptions = {}} = rOpts;
opts = { opts = {
...opts, ...opts,
@@ -855,7 +887,16 @@ module.exports = (logger) => {
...(verbioOptions.speech_incomplete_timeout && ...(verbioOptions.speech_incomplete_timeout &&
{VERBIO_SPEECH_INCOMPLETE_TIMEOUT: verbioOptions.speech_incomplete_timeout}), {VERBIO_SPEECH_INCOMPLETE_TIMEOUT: verbioOptions.speech_incomplete_timeout}),
}; };
} else if (vendor.startsWith('custom:')) { }
else if ('speechmatics' === vendor) {
opts = {
...opts,
...(sttCredentials.api_key) && {SPEECHMATICS_API_KEY: sttCredentials.api_key},
...(sttCredentials.speechmatics_stt_uri) && {SPEECHMATICS_HOST: sttCredentials.speechmatics_stt_uri},
...(rOpts.hints?.length > 0 && {SPEECHMATICS_SPEECH_HINTS: rOpts.hints.join(',')}),
};
}
else if (vendor.startsWith('custom:')) {
let {options = {}} = rOpts.customOptions || {}; let {options = {}} = rOpts.customOptions || {};
const {sampleRate} = rOpts.customOptions || {}; const {sampleRate} = rOpts.customOptions || {};
const {auth_token, custom_stt_url} = sttCredentials; const {auth_token, custom_stt_url} = sttCredentials;
@@ -923,6 +964,6 @@ module.exports = (logger) => {
setChannelVarsForStt, setChannelVarsForStt,
setSpeechCredentialsAtRuntime, setSpeechCredentialsAtRuntime,
compileSonioxTranscripts, compileSonioxTranscripts,
consolidateTranscripts consolidateTranscripts,
}; };
}; };