feat: update speech-ultil version 1.0.1 (#275)

* feat: update speech-ultil version 1.0.1

* feat: update speech-ultil version 1.0.1

* more fixes for custom stt

* more fixes

* fixes

* update drachtio-fsmrf

* pass url to mod_jambonz_transcribe

* transcription utils: handle custom results

* handle custom speech vendor errors

* add support for hints to custom speech

* change to custom speech options

* send hints as an array for custom speech

* update latest speech-utils

* transcribe: changes to support soniox

* bugfix: soniox transcribe

---------

Co-authored-by: Quan HL <quanluuhoang8@gmail.com>
Co-authored-by: Dave Horton <daveh@beachdognet.com>
This commit is contained in:
Hoan Luu Huu
2023-03-13 06:38:36 +07:00
committed by GitHub
parent f71d3aed8b
commit 7be3c64116
9 changed files with 2075 additions and 129 deletions

View File

@@ -110,6 +110,12 @@
"NoSpeechDetected": "azure_transcribe::no_speech_detected",
"VadDetected": "azure_transcribe::vad_detected"
},
"JambonzTranscriptionEvents": {
"Transcription": "jambonz_transcribe::transcription",
"ConnectFailure": "jambonz_transcribe::connect_failed",
"Connect": "jambonz_transcribe::connect",
"Error": "jambonz_transcribe::error"
},
"ListenEvents": {
"Connect": "mod_audio_fork::connect",
"ConnectFailure": "mod_audio_fork::connect_failed",

View File

@@ -66,6 +66,12 @@ const speechMapper = (cred) => {
const o = JSON.parse(decrypt(credential));
obj.api_key = o.api_key;
}
else if (obj.vendor.startsWith('custom:')) {
const o = JSON.parse(decrypt(credential));
obj.auth_token = o.auth_token;
obj.custom_stt_url = o.custom_stt_url;
obj.custom_tts_url = o.custom_tts_url;
}
} catch (err) {
console.log(err);
}
@@ -83,53 +89,13 @@ module.exports = (logger, srf) => {
const [r2] = await pp.query(sqlSpeechCredentials, account_sid);
const speech = r2.map(speechMapper);
/* search at the service provider level if we don't find it at the account level */
const haveGoogle = speech.find((s) => s.vendor === 'google');
const haveAws = speech.find((s) => s.vendor === 'aws');
const haveMicrosoft = speech.find((s) => s.vendor === 'microsoft');
const haveWellsaid = speech.find((s) => s.vendor === 'wellsaid');
const haveNuance = speech.find((s) => s.vendor === 'nuance');
const haveDeepgram = speech.find((s) => s.vendor === 'deepgram');
const haveSoniox = speech.find((s) => s.vendor === 'soniox');
const haveIbm = speech.find((s) => s.vendor === 'ibm');
if (!haveGoogle || !haveAws || !haveMicrosoft || !haveWellsaid ||
!haveNuance || !haveIbm || !haveDeepgram || !haveSoniox) {
const [r3] = await pp.query(sqlSpeechCredentialsForSP, account_sid);
if (r3.length) {
if (!haveGoogle) {
const google = r3.find((s) => s.vendor === 'google');
if (google) speech.push(speechMapper(google));
}
if (!haveAws) {
const aws = r3.find((s) => s.vendor === 'aws');
if (aws) speech.push(speechMapper(aws));
}
if (!haveMicrosoft) {
const ms = r3.find((s) => s.vendor === 'microsoft');
if (ms) speech.push(speechMapper(ms));
}
if (!haveWellsaid) {
const wellsaid = r3.find((s) => s.vendor === 'wellsaid');
if (wellsaid) speech.push(speechMapper(wellsaid));
}
if (!haveNuance) {
const nuance = r3.find((s) => s.vendor === 'nuance');
if (nuance) speech.push(speechMapper(nuance));
}
if (!haveDeepgram) {
const deepgram = r3.find((s) => s.vendor === 'deepgram');
if (deepgram) speech.push(speechMapper(deepgram));
}
if (!haveSoniox) {
const soniox = r3.find((s) => s.vendor === 'soniox');
if (soniox) speech.push(speechMapper(soniox));
}
if (!haveIbm) {
const ibm = r3.find((s) => s.vendor === 'ibm');
if (ibm) speech.push(speechMapper(ibm));
}
/* add service provider creds unless we have that vendor at the account level */
const [r3] = await pp.query(sqlSpeechCredentialsForSP, account_sid);
r3.forEach((s) => {
if (!speech.find((s2) => s2.vendor === s.vendor)) {
speech.push(speechMapper(s));
}
}
});
return {
...r[0],

View File

@@ -138,7 +138,6 @@ function installSrfLocals(srf, logger) {
retrieveCall,
listCalls,
deleteCall,
synthAudio,
createHash,
retrieveHash,
deleteKey,
@@ -151,11 +150,17 @@ function installSrfLocals(srf, logger) {
pushBack,
popFront,
removeFromList,
lengthOfList,
getListPosition,
lengthOfList,
} = require('@jambonz/realtimedb-helpers')({
host: process.env.JAMBONES_REDIS_HOST,
port: process.env.JAMBONES_REDIS_PORT || 6379
}, logger, tracer);
const {
synthAudio,
getNuanceAccessToken,
getIbmAccessToken,
} = require('@jambonz/realtimedb-helpers')({
} = require('@jambonz/speech-utils')({
host: process.env.JAMBONES_REDIS_HOST,
port: process.env.JAMBONES_REDIS_PORT || 6379
}, logger, tracer);

View File

@@ -6,7 +6,8 @@ const {
NuanceTranscriptionEvents,
DeepgramTranscriptionEvents,
SonioxTranscriptionEvents,
NvidiaTranscriptionEvents
NvidiaTranscriptionEvents,
JambonzTranscriptionEvents
} = require('./constants');
const stickyVars = {
@@ -223,6 +224,15 @@ const normalizeGoogle = (evt, channel, language) => {
};
};
const normalizeCustom = (evt, channel, language) => {
return {
language_code: language,
channel_tag: channel,
is_final: evt.is_final,
alternatives: [evt.alternatives[0]]
};
};
const normalizeNuance = (evt, channel, language) => {
const copy = JSON.parse(JSON.stringify(evt));
return {
@@ -302,6 +312,9 @@ module.exports = (logger) => {
case 'soniox':
return normalizeSoniox(evt, channel, language);
default:
if (vendor.startsWith('custom:')) {
return normalizeCustom(evt, channel, language);
}
logger.error(`Unknown vendor ${vendor}`);
return evt;
}
@@ -311,6 +324,7 @@ module.exports = (logger) => {
let opts = {};
const {enable, voiceMs = 0, mode = -1} = rOpts.vad || {};
const vad = {enable, voiceMs, mode};
const vendor = rOpts.vendor;
/* voice activity detection works across vendors */
opts = {
@@ -320,7 +334,7 @@ module.exports = (logger) => {
...(vad.enable && typeof vad.mode === 'number' && {RECOGNIZER_VAD_MODE: vad.mode}),
};
if ('google' === rOpts.vendor) {
if ('google' === vendor) {
opts = {
...opts,
...(sttCredentials &&
@@ -372,7 +386,7 @@ module.exports = (logger) => {
{GOOGLE_SPEECH_METADATA_INDUSTRY_NAICS_CODE: rOpts.naicsCode}),
};
}
else if (['aws', 'polly'].includes(rOpts.vendor)) {
else if (['aws', 'polly'].includes(vendor)) {
opts = {
...opts,
...(rOpts.vocabularyName && {AWS_VOCABULARY_NAME: rOpts.vocabularyName}),
@@ -385,7 +399,7 @@ module.exports = (logger) => {
}),
};
}
else if ('microsoft' === rOpts.vendor) {
else if ('microsoft' === vendor) {
opts = {
...opts,
...(rOpts.hints.length > 0 && typeof rOpts.hints[0] === 'string' &&
@@ -410,7 +424,7 @@ module.exports = (logger) => {
{AZURE_SERVICE_ENDPOINT_ID: sttCredentials.custom_stt_endpoint})
};
}
else if ('nuance' === rOpts.vendor) {
else if ('nuance' === vendor) {
/**
* Note: all nuance options are in recognizer.nuanceOptions, should migrate
* other vendor settings to similar nested structure
@@ -461,7 +475,7 @@ module.exports = (logger) => {
{NUANCE_RESOURCES: JSON.stringify(nuanceOptions.resources)},
};
}
else if ('deepgram' === rOpts.vendor) {
else if ('deepgram' === vendor) {
const {deepgramOptions = {}} = rOpts;
opts = {
...opts,
@@ -505,7 +519,7 @@ module.exports = (logger) => {
{DEEPGRAM_SPEECH_TAG: deepgramOptions.tag}
};
}
else if ('soniox' === rOpts.vendor) {
else if ('soniox' === vendor) {
const {sonioxOptions = {}} = rOpts;
const {storage = {}} = sonioxOptions;
opts = {
@@ -528,7 +542,7 @@ module.exports = (logger) => {
...(storage?.id && storage?.disableSearch && {SONIOX_STORAGE_DISABLE_SEARCH: 1})
};
}
else if ('ibm' === rOpts.vendor) {
else if ('ibm' === vendor) {
const {ibmOptions = {}} = rOpts;
opts = {
...opts,
@@ -552,7 +566,7 @@ module.exports = (logger) => {
{IBM_SPEECH_WATSON_LEARNING_OPT_OUT: ibmOptions.watsonLearningOptOut}
};
}
else if ('nvidia' === rOpts.vendor) {
else if ('nvidia' === vendor) {
const {nvidiaOptions = {}} = rOpts;
opts = {
...opts,
@@ -581,11 +595,29 @@ module.exports = (logger) => {
{NVIDIA_CUSTOM_CONFIGURATION: JSON.stringify(nvidiaOptions.customConfiguration)}),
};
}
else if (vendor.startsWith('custom:')) {
let {options = {}} = rOpts;
const {auth_token, custom_stt_url} = sttCredentials;
options = {
...options,
...(rOpts.hints.length > 0 && typeof rOpts.hints[0] === 'string' &&
{hints: rOpts.hints}),
...(rOpts.hints.length > 0 && typeof rOpts.hints[0] === 'object' &&
{hints: JSON.stringify(rOpts.hints)}),
...(typeof rOpts.hintsBoost === 'number' && {hintsBoost: rOpts.hintsBoost})
};
stickyVars[rOpts.vendor].forEach((key) => {
opts = {
...opts,
JAMBONZ_STT_API_KEY: auth_token,
JAMBONZ_STT_URL: custom_stt_url,
...(Object.keys(options).length > 0 && {JAMBONZ_STT_OPTIONS: JSON.stringify(options)}),
};
}
(stickyVars[vendor] || []).forEach((key) => {
if (!opts[key]) opts[key] = '';
});
//logger.debug({opts}, 'recognizer channel vars');
return opts;
};
@@ -604,7 +636,6 @@ module.exports = (logger) => {
ep.removeCustomEventListener(NuanceTranscriptionEvents.Transcription);
ep.removeCustomEventListener(NuanceTranscriptionEvents.TranscriptionComplete);
ep.removeCustomEventListener(NuanceTranscriptionEvents.StartOfSpeech);
ep.removeCustomEventListener(NuanceTranscriptionEvents.Error);
ep.removeCustomEventListener(NuanceTranscriptionEvents.VadDetected);
ep.removeCustomEventListener(DeepgramTranscriptionEvents.Transcription);
@@ -612,13 +643,17 @@ module.exports = (logger) => {
ep.removeCustomEventListener(DeepgramTranscriptionEvents.ConnectFailure);
ep.removeCustomEventListener(SonioxTranscriptionEvents.Transcription);
ep.removeCustomEventListener(SonioxTranscriptionEvents.Error);
ep.removeCustomEventListener(NvidiaTranscriptionEvents.Transcription);
ep.removeCustomEventListener(NvidiaTranscriptionEvents.TranscriptionComplete);
ep.removeCustomEventListener(NvidiaTranscriptionEvents.StartOfSpeech);
ep.removeCustomEventListener(NvidiaTranscriptionEvents.Error);
ep.removeCustomEventListener(NvidiaTranscriptionEvents.VadDetected);
ep.removeCustomEventListener(JambonzTranscriptionEvents.Transcription);
ep.removeCustomEventListener(JambonzTranscriptionEvents.Connect);
ep.removeCustomEventListener(JambonzTranscriptionEvents.ConnectFailure);
ep.removeCustomEventListener(JambonzTranscriptionEvents.Error);
};
const setSpeechCredentialsAtRuntime = (recognizer) => {