Feature/nvidia speech (#261)

* initial changes for nvidia speech

* allow nvidia speech credentials to be set at runtime

* update drachtio-fsmrf

* fix handling of nvidia-specific options

* fix nvidia custom config

* fix nvidia word time offsets

* fix nvidia custom configuration

* normalize nvidia transcripts

* update to @jambonz/realtime-dbhelpers with nvidia tts support
This commit is contained in:
Dave Horton
2023-02-12 14:06:01 -05:00
committed by GitHub
parent 50057deca9
commit 0fdcb3a6d6
7 changed files with 222 additions and 96 deletions

View File

@@ -5,6 +5,7 @@ const {
AwsTranscriptionEvents,
NuanceTranscriptionEvents,
DeepgramTranscriptionEvents,
NvidiaTranscriptionEvents
} = require('./constants');
const stickyVars = {
@@ -84,6 +85,9 @@ const stickyVars = {
'IBM_SPEECH_BASE_MODEL_VERSION',
'IBM_SPEECH_WATSON_METADATA',
'IBM_SPEECH_WATSON_LEARNING_OPT_OUT'
],
nvidia: [
'NVIDIA_HINTS'
]
};
@@ -107,6 +111,25 @@ const normalizeDeepgram = (evt, channel, language) => {
};
};
const normalizeNvidia = (evt, channel, language) => {
const copy = JSON.parse(JSON.stringify(evt));
const alternatives = (evt.alternatives || [])
.map((alt) => ({
confidence: alt.confidence,
transcript: alt.transcript,
}));
return {
language_code: language,
channel_tag: channel,
is_final: evt.is_final,
alternatives,
vendor: {
name: 'nvidia',
evt: copy
}
};
};
const normalizeIbm = (evt, channel, language) => {
const copy = JSON.parse(JSON.stringify(evt));
//const idx = evt.result_index;
@@ -212,6 +235,8 @@ module.exports = (logger) => {
return normalizeNuance(evt, channel, language);
case 'ibm':
return normalizeIbm(evt, channel, language);
case 'nvidia':
return normalizeNvidia(evt, channel, language);
default:
logger.error(`Unknown vendor ${vendor}`);
return evt;
@@ -440,6 +465,36 @@ module.exports = (logger) => {
{IBM_SPEECH_WATSON_LEARNING_OPT_OUT: ibmOptions.watsonLearningOptOut}
};
}
else if ('nvidia' === rOpts.vendor) {
const {nvidiaOptions = {}} = rOpts;
opts = {
...opts,
...((nvidiaOptions.profanityFilter || rOpts.profanityFilter) && {NVIDIA_PROFANITY_FILTER: 1}),
...(!(nvidiaOptions.profanityFilter || rOpts.profanityFilter) && {NVIDIA_PROFANITY_FILTER: 0}),
...((nvidiaOptions.punctuation || rOpts.punctuation) && {NVIDIA_PUNCTUATION: 1}),
...(!(nvidiaOptions.punctuation || rOpts.punctuation) && {NVIDIA_PUNCTUATION: 0}),
...((rOpts.words || nvidiaOptions.wordTimeOffsets) && {NVIDIA_WORD_TIME_OFFSETS: 1}),
...(!(rOpts.words || nvidiaOptions.wordTimeOffsets) && {NVIDIA_WORD_TIME_OFFSETS: 0}),
...(nvidiaOptions.maxAlternatives && {NVIDIA_MAX_ALTERNATIVES: nvidiaOptions.maxAlternatives}),
...(!nvidiaOptions.maxAlternatives && {NVIDIA_MAX_ALTERNATIVES: 1}),
...(rOpts.model && {NVIDIA_MODEL: rOpts.model}),
...(nvidiaOptions.rivaUri && {NVIDIA_RIVA_URI: nvidiaOptions.rivaUri}),
...(nvidiaOptions.verbatimTranscripts && {NVIDIA_VERBATIM_TRANSCRIPTS: 1}),
...(rOpts.diarization && {NVIDIA_SPEAKER_DIARIZATION: 1}),
...(rOpts.diarization && rOpts.diarizationMaxSpeakers > 0 &&
{NVIDIA_DIARIZATION_SPEAKER_COUNT: rOpts.diarizationMaxSpeakers}),
...(rOpts.separateRecognitionPerChannel && {NVIDIA_SEPARATE_RECOGNITION_PER_CHANNEL: 1}),
...(rOpts.hints.length > 0 && typeof rOpts.hints[0] === 'string' &&
{NVIDIA_HINTS: rOpts.hints.join(',')}),
...(rOpts.hints.length > 0 && typeof rOpts.hints[0] === 'object' &&
{NVIDIA_HINTS: JSON.stringify(rOpts.hints)}),
...(typeof rOpts.hintsBoost === 'number' &&
{NVIDIA_HINTS_BOOST: rOpts.hintsBoost}),
...(nvidiaOptions.customConfiguration &&
{NVIDIA_CUSTOM_CONFIGURATION: JSON.stringify(nvidiaOptions.customConfiguration)}),
};
}
stickyVars[rOpts.vendor].forEach((key) => {
if (!opts[key]) opts[key] = '';
});
@@ -468,6 +523,12 @@ module.exports = (logger) => {
ep.removeCustomEventListener(DeepgramTranscriptionEvents.Transcription);
ep.removeCustomEventListener(DeepgramTranscriptionEvents.Connect);
ep.removeCustomEventListener(DeepgramTranscriptionEvents.ConnectFailure);
ep.removeCustomEventListener(NvidiaTranscriptionEvents.Transcription);
ep.removeCustomEventListener(NvidiaTranscriptionEvents.TranscriptionComplete);
ep.removeCustomEventListener(NvidiaTranscriptionEvents.StartOfSpeech);
ep.removeCustomEventListener(NvidiaTranscriptionEvents.Error);
ep.removeCustomEventListener(NvidiaTranscriptionEvents.VadDetected);
};
const setSpeechCredentialsAtRuntime = (recognizer) => {
@@ -476,6 +537,10 @@ module.exports = (logger) => {
const {clientId, secret} = recognizer.nuanceOptions || {};
if (clientId && secret) return {client_id: clientId, secret};
}
else if (recognizer.vendor === 'nvidia') {
const {rivaUri} = recognizer.nvidiaOptions || {};
if (rivaUri) return {riva_uri: rivaUri};
}
else if (recognizer.vendor === 'deepgram') {
const {apiKey} = recognizer.deepgramOptions || {};
if (apiKey) return {api_key: apiKey};