mirror of
https://github.com/jambonz/jambonz-feature-server.git
synced 2025-12-20 16:50:39 +00:00
initial changes for soniox (#270)
* initial changes for soniox * changes to gather for soniox * parse soniox stt results * handle <end> token for soniox * soniox: handle empty array of words * support for soniox hints * add soniox storage options * update to verb specs * add support for transcribe * compile soniox transcripts * gather: kill no input timer for soniox when we get interim results * fix buffering of soniox transcripts * fix for compiling soniox transcript * another fix for compiling soniox transcript * another fix * handling of <end> token * fix soniox bug * gather: fixes for soniox continuous asr * fix undefined variable reference * fix prev commit * bugfix: allow verb_status requests * gather: for soniox no need to restart transcription after final transcription received * update verb specs * update verb specs, fixes for continuous asr:
This commit is contained in:
@@ -86,6 +86,10 @@
|
||||
"ConnectFailure": "deepgram_transcribe::connect_failed",
|
||||
"Connect": "deepgram_transcribe::connect"
|
||||
},
|
||||
"SonioxTranscriptionEvents": {
|
||||
"Transcription": "soniox_transcribe::transcription",
|
||||
"Error": "soniox_transcribe::error"
|
||||
},
|
||||
"IbmTranscriptionEvents": {
|
||||
"Transcription": "ibm_transcribe::transcription",
|
||||
"ConnectFailure": "ibm_transcribe::connect_failed",
|
||||
@@ -147,6 +151,7 @@
|
||||
"queue:status",
|
||||
"dial:confirm",
|
||||
"verb:hook",
|
||||
"verb:status",
|
||||
"jambonz:error"
|
||||
],
|
||||
"RecordState": {
|
||||
|
||||
@@ -62,6 +62,10 @@ const speechMapper = (cred) => {
|
||||
const o = JSON.parse(decrypt(credential));
|
||||
obj.api_key = o.api_key;
|
||||
}
|
||||
else if ('soniox' === obj.vendor) {
|
||||
const o = JSON.parse(decrypt(credential));
|
||||
obj.api_key = o.api_key;
|
||||
}
|
||||
} catch (err) {
|
||||
console.log(err);
|
||||
}
|
||||
@@ -86,8 +90,10 @@ module.exports = (logger, srf) => {
|
||||
const haveWellsaid = speech.find((s) => s.vendor === 'wellsaid');
|
||||
const haveNuance = speech.find((s) => s.vendor === 'nuance');
|
||||
const haveDeepgram = speech.find((s) => s.vendor === 'deepgram');
|
||||
const haveSoniox = speech.find((s) => s.vendor === 'soniox');
|
||||
const haveIbm = speech.find((s) => s.vendor === 'ibm');
|
||||
if (!haveGoogle || !haveAws || !haveMicrosoft || !haveWellsaid || !haveNuance || !haveIbm || !haveDeepgram) {
|
||||
if (!haveGoogle || !haveAws || !haveMicrosoft || !haveWellsaid ||
|
||||
!haveNuance || !haveIbm || !haveDeepgram || !haveSoniox) {
|
||||
const [r3] = await pp.query(sqlSpeechCredentialsForSP, account_sid);
|
||||
if (r3.length) {
|
||||
if (!haveGoogle) {
|
||||
@@ -114,6 +120,10 @@ module.exports = (logger, srf) => {
|
||||
const deepgram = r3.find((s) => s.vendor === 'deepgram');
|
||||
if (deepgram) speech.push(speechMapper(deepgram));
|
||||
}
|
||||
if (!haveSoniox) {
|
||||
const soniox = r3.find((s) => s.vendor === 'soniox');
|
||||
if (soniox) speech.push(speechMapper(soniox));
|
||||
}
|
||||
if (!haveIbm) {
|
||||
const ibm = r3.find((s) => s.vendor === 'ibm');
|
||||
if (ibm) speech.push(speechMapper(ibm));
|
||||
|
||||
@@ -5,6 +5,7 @@ const {
|
||||
AwsTranscriptionEvents,
|
||||
NuanceTranscriptionEvents,
|
||||
DeepgramTranscriptionEvents,
|
||||
SonioxTranscriptionEvents,
|
||||
NvidiaTranscriptionEvents
|
||||
} = require('./constants');
|
||||
|
||||
@@ -88,9 +89,70 @@ const stickyVars = {
|
||||
],
|
||||
nvidia: [
|
||||
'NVIDIA_HINTS'
|
||||
],
|
||||
soniox: [
|
||||
'SONIOX_PROFANITY_FILTER',
|
||||
'SONIOX_MODEL'
|
||||
]
|
||||
};
|
||||
|
||||
/**
 * Combine buffered chunks of soniox words into a single final transcription.
 *
 * @param {Array<Array<{text: string, confidence: number}>>} finalWordChunks -
 *   arrays of word tokens; they are flattened in order before compiling
 * @param {*} channel - value reported as `channel_tag`
 * @param {*} language - value reported as `language_code`
 * @returns {object} transcription with `is_final: true`, a single alternative
 *   ({transcript, confidence}) and the flattened word list under `vendor.evt`
 */
const compileSonioxTranscripts = (finalWordChunks, channel, language) => {
  /* punctuation that is appended to the preceding word without a space */
  const attached = [',', '.', '?', '!'];
  /* tokens that must not contribute to the confidence average */
  const notWords = [',', '.', '!', '?', ';', '<end>'];

  const words = finalWordChunks.flat();
  const transcript = words.reduce((acc, word) => {
    if (word.text === '<end>') return acc;
    if (attached.includes(word.text)) return `${acc}${word.text}`;
    return `${acc} ${word.text}`;
  }, '').trim();

  /* compare against each punctuation mark individually; testing membership in
     [',.!?;'] (one 5-character string) never matches a single-character token */
  const realWords = words.filter((word) => !notWords.includes(word.text));

  /* guard the average so an utterance with no real words reports 0, not NaN */
  const confidence = realWords.length > 0 ?
    realWords.reduce((acc, word) => acc + word.confidence, 0) / realWords.length :
    0;
  const alternatives = [{transcript, confidence}];

  return {
    language_code: language,
    channel_tag: channel,
    is_final: true,
    alternatives,
    vendor: {
      name: 'soniox',
      evt: words
    }
  };
};
|
||||
|
||||
/**
 * Normalize a soniox transcription event into the common transcription shape.
 *
 * @param {object} evt - soniox event; `evt.words` is read as an array of
 *   tokens carrying `text`, `is_final` and `confidence`
 * @param {*} channel - value reported as `channel_tag`
 * @param {*} language - value reported as `language_code`
 * @returns {object} transcription with `is_final`, a single alternative
 *   ({transcript, confidence}) and soniox-specific details under `vendor`
 */
const normalizeSoniox = (evt, channel, language) => {
  /* punctuation that is appended to the preceding word without a space */
  const attached = [',', '.', '?', '!'];
  /* tokens that must not contribute to the confidence average */
  const notWords = [',', '.', '!', '?', ';', '<end>'];

  const copy = JSON.parse(JSON.stringify(evt));

  /* treat a missing word list as empty rather than throwing */
  const allWords = evt.words || [];

  /* an <end> token indicates the end of an utterance */
  const endTokenPos = allWords.findIndex((w) => w.text === '<end>');
  const endpointReached = endTokenPos !== -1;
  const words = endpointReached ? allWords.slice(0, endTokenPos) : allWords;

  /* note: we can safely ignore words after the <end> token as they will be returned again */
  const finalWords = words.filter((word) => word.is_final);
  const nonFinalWords = words.filter((word) => !word.is_final);

  const is_final = endpointReached && finalWords.length > 0;
  const transcript = words.reduce((acc, word) => {
    if (attached.includes(word.text)) return `${acc}${word.text}`;
    return `${acc} ${word.text}`;
  }, '').trim();

  /* compare against each punctuation mark individually; testing membership in
     [',.!?;'] (one 5-character string) never matches a single-character token */
  const realWords = words.filter((word) => !notWords.includes(word.text));

  /* guard the average so an event with no real words reports 0, not NaN */
  const confidence = realWords.length > 0 ?
    realWords.reduce((acc, word) => acc + word.confidence, 0) / realWords.length :
    0;
  const alternatives = [{transcript, confidence}];

  return {
    language_code: language,
    channel_tag: channel,
    is_final,
    alternatives,
    vendor: {
      name: 'soniox',
      endpointReached,
      evt: copy,
      finalWords,
      nonFinalWords
    }
  };
};
|
||||
|
||||
const normalizeDeepgram = (evt, channel, language) => {
|
||||
const copy = JSON.parse(JSON.stringify(evt));
|
||||
const alternatives = (evt.channel?.alternatives || [])
|
||||
@@ -237,6 +299,8 @@ module.exports = (logger) => {
|
||||
return normalizeIbm(evt, channel, language);
|
||||
case 'nvidia':
|
||||
return normalizeNvidia(evt, channel, language);
|
||||
case 'soniox':
|
||||
return normalizeSoniox(evt, channel, language);
|
||||
default:
|
||||
logger.error(`Unknown vendor ${vendor}`);
|
||||
return evt;
|
||||
@@ -441,6 +505,29 @@ module.exports = (logger) => {
|
||||
{DEEPGRAM_SPEECH_TAG: deepgramOptions.tag}
|
||||
};
|
||||
}
|
||||
else if ('soniox' === rOpts.vendor) {
|
||||
const {sonioxOptions = {}} = rOpts;
|
||||
const {storage = {}} = sonioxOptions;
|
||||
opts = {
|
||||
...opts,
|
||||
...(sttCredentials.api_key) &&
|
||||
{SONIOX_API_KEY: sttCredentials.api_key},
|
||||
...(rOpts.hints.length > 0 && typeof rOpts.hints[0] === 'string' &&
|
||||
{SONIOX_HINTS: rOpts.hints.join(',')}),
|
||||
...(rOpts.hints.length > 0 && typeof rOpts.hints[0] === 'object' &&
|
||||
{SONIOX_HINTS: JSON.stringify(rOpts.hints)}),
|
||||
...(typeof rOpts.hintsBoost === 'number' &&
|
||||
{SONIOX_HINTS_BOOST: rOpts.hintsBoost}),
|
||||
...(sonioxOptions.model) &&
|
||||
{SONIOX_MODEL: sonioxOptions.model},
|
||||
...((sonioxOptions.profanityFilter || rOpts.profanityFilter) && {SONIOX_PROFANITY_FILTER: 1}),
|
||||
...(storage?.id && {SONIOX_STORAGE_ID: storage.id}),
|
||||
...(storage?.id && storage?.title && {SONIOX_STORAGE_TITLE: storage.title}),
|
||||
...(storage?.id && storage?.disableStoreAudio && {SONIOX_STORAGE_DISABLE_AUDIO: 1}),
|
||||
...(storage?.id && storage?.disableStoreTranscript && {SONIOX_STORAGE_DISABLE_TRANSCRIPT: 1}),
|
||||
...(storage?.id && storage?.disableSearch && {SONIOX_STORAGE_DISABLE_SEARCH: 1})
|
||||
};
|
||||
}
|
||||
else if ('ibm' === rOpts.vendor) {
|
||||
const {ibmOptions = {}} = rOpts;
|
||||
opts = {
|
||||
@@ -524,6 +611,9 @@ module.exports = (logger) => {
|
||||
ep.removeCustomEventListener(DeepgramTranscriptionEvents.Connect);
|
||||
ep.removeCustomEventListener(DeepgramTranscriptionEvents.ConnectFailure);
|
||||
|
||||
ep.removeCustomEventListener(SonioxTranscriptionEvents.Transcription);
|
||||
ep.removeCustomEventListener(SonioxTranscriptionEvents.Error);
|
||||
|
||||
ep.removeCustomEventListener(NvidiaTranscriptionEvents.Transcription);
|
||||
ep.removeCustomEventListener(NvidiaTranscriptionEvents.TranscriptionComplete);
|
||||
ep.removeCustomEventListener(NvidiaTranscriptionEvents.StartOfSpeech);
|
||||
@@ -534,8 +624,9 @@ module.exports = (logger) => {
|
||||
const setSpeechCredentialsAtRuntime = (recognizer) => {
|
||||
if (!recognizer) return;
|
||||
if (recognizer.vendor === 'nuance') {
|
||||
const {clientId, secret} = recognizer.nuanceOptions || {};
|
||||
const {clientId, secret, kryptonEndpoint} = recognizer.nuanceOptions || {};
|
||||
if (clientId && secret) return {client_id: clientId, secret};
|
||||
if (kryptonEndpoint) return {krypton_endpoint: kryptonEndpoint};
|
||||
}
|
||||
else if (recognizer.vendor === 'nvidia') {
|
||||
const {rivaUri} = recognizer.nvidiaOptions || {};
|
||||
@@ -545,6 +636,10 @@ module.exports = (logger) => {
|
||||
const {apiKey} = recognizer.deepgramOptions || {};
|
||||
if (apiKey) return {api_key: apiKey};
|
||||
}
|
||||
else if (recognizer.vendor === 'soniox') {
|
||||
const {apiKey} = recognizer.sonioxOptions || {};
|
||||
if (apiKey) return {api_key: apiKey};
|
||||
}
|
||||
else if (recognizer.vendor === 'ibm') {
|
||||
const {ttsApiKey, ttsRegion, sttApiKey, sttRegion, instanceId} = recognizer.ibmOptions || {};
|
||||
if (ttsApiKey || sttApiKey) return {
|
||||
@@ -561,6 +656,7 @@ module.exports = (logger) => {
|
||||
normalizeTranscription,
|
||||
setChannelVarsForStt,
|
||||
removeSpeechListeners,
|
||||
setSpeechCredentialsAtRuntime
|
||||
setSpeechCredentialsAtRuntime,
|
||||
compileSonioxTranscripts
|
||||
};
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user