Feature/nvidia speech (#261)

* initial changes for nvidia speech

* allow nvidia speech credentials to be set at runtime

* update drachtio-fsmrf

* fix handling of nvidia-specific options

* fix nvidia custom config

* fix nvidia word time offsets

* fix nvidia custom configuration

* normalize nvidia transcripts

* update to @jambonz/realtime-dbhelpers with nvidia tts support
Dave Horton, 2023-02-12 14:06:01 -05:00 (committed via GitHub)
parent 50057deca9, commit 0fdcb3a6d6
7 changed files with 222 additions and 96 deletions


@@ -7,7 +7,8 @@ const {
AwsTranscriptionEvents,
AzureTranscriptionEvents,
DeepgramTranscriptionEvents,
IbmTranscriptionEvents
IbmTranscriptionEvents,
NvidiaTranscriptionEvents
} = require('../utils/constants');
const makeTask = require('./make_task');
@@ -397,6 +398,25 @@ class TaskGather extends Task {
this._onIbmError.bind(this, cs, ep));
break;
case 'nvidia':
this.bugname = 'nvidia_transcribe';
ep.addCustomEventListener(NvidiaTranscriptionEvents.Transcription,
this._onTranscription.bind(this, cs, ep));
ep.addCustomEventListener(NvidiaTranscriptionEvents.StartOfSpeech,
this._onStartOfSpeech.bind(this, cs, ep));
ep.addCustomEventListener(NvidiaTranscriptionEvents.TranscriptionComplete,
this._onTranscriptionComplete.bind(this, cs, ep));
ep.addCustomEventListener(NvidiaTranscriptionEvents.VadDetected,
this._onVadDetected.bind(this, cs, ep));
ep.addCustomEventListener(NvidiaTranscriptionEvents.Error,
this._onNvidiaError.bind(this, cs, ep));
/* assumed supported by nvidia: stall recognition timers until the prompt finishes playing */
if ((this.sayTask || this.playTask) && this.listenDuringPrompt) {
opts.NVIDIA_STALL_TIMERS = 1;
}
break;
default:
this.notifyError({ msg: 'ASR error', details:`Invalid vendor ${this.vendor}`});
this.notifyTaskDone();
@@ -612,6 +632,9 @@ class TaskGather extends Task {
return this._resolve('timeout');
}
}
_onNvidiaError(cs, ep, evt) {
this.logger.info({evt}, 'TaskGather:_onNvidiaError');
}
_onDeepgramConnect(_cs, _ep) {
this.logger.debug('TaskGather:_onDeepgramConnect');
}
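
For context on the listener wiring in the 'nvidia' case above: ep.addCustomEventListener subscribes the task to a named custom event emitted by the media server for this endpoint. A minimal sketch of that flow, using a plain Node EventEmitter as a hypothetical stand-in for the drachtio-fsmrf endpoint (event name strings taken from the constants added later in this commit; handlers are illustrative only):

```js
const {EventEmitter} = require('events');

// hypothetical stand-in for the drachtio-fsmrf endpoint object
const ep = new EventEmitter();
ep.addCustomEventListener = (name, handler) => ep.on(name, handler);

const NvidiaTranscriptionEvents = {
  Transcription: 'nvidia_transcribe::transcription',
  Error: 'nvidia_transcribe::error'
};

// subscribe, as the gather task does in its 'nvidia' case
ep.addCustomEventListener(NvidiaTranscriptionEvents.Transcription,
  (evt) => console.log('transcription event', evt));
ep.addCustomEventListener(NvidiaTranscriptionEvents.Error,
  (evt) => console.error('riva error', evt));

// simulate an event arriving from the media server
ep.emit(NvidiaTranscriptionEvents.Transcription,
  {is_final: true, alternatives: [{transcript: 'hello', confidence: 0.9}]});
```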


@@ -467,7 +467,7 @@
"properties": {
"vendor": {
"type": "string",
"enum": ["google", "aws", "polly", "microsoft", "nuance", "ibm", "default"]
"enum": ["google", "aws", "polly", "microsoft", "nuance", "ibm", "nvidia", "default"]
},
"language": "string",
"voice": "string",
@@ -488,7 +488,7 @@
"properties": {
"vendor": {
"type": "string",
"enum": ["google", "aws", "microsoft", "nuance", "deepgram", "ibm", "default"]
"enum": ["google", "aws", "microsoft", "nuance", "deepgram", "ibm", "nvidia", "default"]
},
"language": "string",
"vad": "#vad",
@@ -555,12 +555,26 @@
"asrTimeout": "number",
"nuanceOptions": "#nuanceOptions",
"deepgramOptions": "#deepgramOptions",
"ibmOptions": "#ibmOptions"
"ibmOptions": "#ibmOptions",
"nvidiaOptions": "#nvidiaOptions"
},
"required": [
"vendor"
]
},
"nvidiaOptions": {
"properties": {
"rivaUri": "string",
"maxAlternatives": "number",
"profanityFilter": "boolean",
"punctuation": "boolean",
"wordTimeOffsets": "boolean",
"verbatimTranscripts": "boolean",
"customConfiguration": "object"
},
"required": [
]
},
"ibmOptions": {
"properties": {
"sttApiKey": "string",

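The schema additions above expose an nvidiaOptions block on recognizers. As a hedged usage sketch only (field names follow the schema fragments above; the verb shape and every value here are illustrative, and the rivaUri host:port form is an assumption):

```js
// hypothetical gather verb payload exercising the new recognizer fields
const gather = {
  verb: 'gather',
  input: ['speech'],
  recognizer: {
    vendor: 'nvidia',
    language: 'en-US',
    nvidiaOptions: {
      rivaUri: 'riva.example.com:50051', // assumed grpc host:port of the Riva server
      punctuation: true,
      wordTimeOffsets: true,
      maxAlternatives: 2
    }
  }
};
```
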

@@ -7,7 +7,8 @@ const {
AwsTranscriptionEvents,
NuanceTranscriptionEvents,
DeepgramTranscriptionEvents,
IbmTranscriptionEvents
IbmTranscriptionEvents,
NvidiaTranscriptionEvents
} = require('../utils/constants');
const normalizeJambones = require('../utils/normalize-jambones');
@@ -207,6 +208,20 @@ class TaskTranscribe extends Task {
this._onIbmError.bind(this, cs, ep, channel));
break;
case 'nvidia':
this.bugname = 'nvidia_transcribe';
ep.addCustomEventListener(NvidiaTranscriptionEvents.Transcription,
this._onTranscription.bind(this, cs, ep));
ep.addCustomEventListener(NvidiaTranscriptionEvents.StartOfSpeech,
this._onStartOfSpeech.bind(this, cs, ep));
ep.addCustomEventListener(NvidiaTranscriptionEvents.TranscriptionComplete,
this._onTranscriptionComplete.bind(this, cs, ep));
ep.addCustomEventListener(NvidiaTranscriptionEvents.VadDetected,
this._onVadDetected.bind(this, cs, ep));
ep.addCustomEventListener(NvidiaTranscriptionEvents.Error,
this._onNvidiaError.bind(this, cs, ep));
break;
default:
throw new Error(`Invalid vendor ${this.vendor}`);
}
@@ -311,6 +326,9 @@ class TaskTranscribe extends Task {
return this._resolve('timeout');
}
}
_onNvidiaError(cs, ep, evt) {
this.logger.info({evt}, 'TaskTranscribe:_onNvidiaError');
}
_onDeepgramConnect(_cs, _ep) {
this.logger.debug('TaskTranscribe:_onDeepgramConnect');
}


@@ -74,6 +74,13 @@
"Error": "nuance_transcribe::error",
"VadDetected": "nuance_transcribe::vad_detected"
},
"NvidiaTranscriptionEvents": {
"Transcription": "nvidia_transcribe::transcription",
"StartOfSpeech": "nvidia_transcribe::start_of_speech",
"TranscriptionComplete": "nvidia_transcribe::end_of_transcription",
"Error": "nvidia_transcribe::error",
"VadDetected": "nvidia_transcribe::vad_detected"
},
"DeepgramTranscriptionEvents": {
"Transcription": "deepgram_transcribe::transcription",
"ConnectFailure": "deepgram_transcribe::connect_failed",


@@ -5,6 +5,7 @@ const {
AwsTranscriptionEvents,
NuanceTranscriptionEvents,
DeepgramTranscriptionEvents,
NvidiaTranscriptionEvents
} = require('./constants');
const stickyVars = {
@@ -84,6 +85,9 @@ const stickyVars = {
'IBM_SPEECH_BASE_MODEL_VERSION',
'IBM_SPEECH_WATSON_METADATA',
'IBM_SPEECH_WATSON_LEARNING_OPT_OUT'
],
nvidia: [
'NVIDIA_HINTS'
]
};
@@ -107,6 +111,25 @@ const normalizeDeepgram = (evt, channel, language) => {
};
};
const normalizeNvidia = (evt, channel, language) => {
const copy = JSON.parse(JSON.stringify(evt));
const alternatives = (evt.alternatives || [])
.map((alt) => ({
confidence: alt.confidence,
transcript: alt.transcript,
}));
return {
language_code: language,
channel_tag: channel,
is_final: evt.is_final,
alternatives,
vendor: {
name: 'nvidia',
evt: copy
}
};
};
const normalizeIbm = (evt, channel, language) => {
const copy = JSON.parse(JSON.stringify(evt));
//const idx = evt.result_index;
@@ -212,6 +235,8 @@ module.exports = (logger) => {
return normalizeNuance(evt, channel, language);
case 'ibm':
return normalizeIbm(evt, channel, language);
case 'nvidia':
return normalizeNvidia(evt, channel, language);
default:
logger.error(`Unknown vendor ${vendor}`);
return evt;
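
To make the normalized payload concrete, a hedged example of what normalizeNvidia above produces; the raw event fields shown are illustrative, not a documented Riva schema:

```js
// hypothetical raw event arriving on nvidia_transcribe::transcription
const evt = {
  is_final: true,
  alternatives: [{transcript: 'hello world', confidence: 0.92}]
};

// given channel 1 and language 'en-US', normalizeNvidia(evt, 1, 'en-US')
// returns the vendor-neutral shape used by the rest of the app:
const normalized = {
  language_code: 'en-US',
  channel_tag: 1,
  is_final: true,
  alternatives: [{confidence: 0.92, transcript: 'hello world'}],
  vendor: {name: 'nvidia', evt: {...evt}} // the real code keeps a deep copy of the raw event
};
```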
@@ -440,6 +465,36 @@ module.exports = (logger) => {
{IBM_SPEECH_WATSON_LEARNING_OPT_OUT: ibmOptions.watsonLearningOptOut}
};
}
else if ('nvidia' === rOpts.vendor) {
const {nvidiaOptions = {}} = rOpts;
opts = {
...opts,
...((nvidiaOptions.profanityFilter || rOpts.profanityFilter) && {NVIDIA_PROFANITY_FILTER: 1}),
...(!(nvidiaOptions.profanityFilter || rOpts.profanityFilter) && {NVIDIA_PROFANITY_FILTER: 0}),
...((nvidiaOptions.punctuation || rOpts.punctuation) && {NVIDIA_PUNCTUATION: 1}),
...(!(nvidiaOptions.punctuation || rOpts.punctuation) && {NVIDIA_PUNCTUATION: 0}),
...((rOpts.words || nvidiaOptions.wordTimeOffsets) && {NVIDIA_WORD_TIME_OFFSETS: 1}),
...(!(rOpts.words || nvidiaOptions.wordTimeOffsets) && {NVIDIA_WORD_TIME_OFFSETS: 0}),
...(nvidiaOptions.maxAlternatives && {NVIDIA_MAX_ALTERNATIVES: nvidiaOptions.maxAlternatives}),
...(!nvidiaOptions.maxAlternatives && {NVIDIA_MAX_ALTERNATIVES: 1}),
...(rOpts.model && {NVIDIA_MODEL: rOpts.model}),
...(nvidiaOptions.rivaUri && {NVIDIA_RIVA_URI: nvidiaOptions.rivaUri}),
...(nvidiaOptions.verbatimTranscripts && {NVIDIA_VERBATIM_TRANSCRIPTS: 1}),
...(rOpts.diarization && {NVIDIA_SPEAKER_DIARIZATION: 1}),
...(rOpts.diarization && rOpts.diarizationMaxSpeakers > 0 &&
{NVIDIA_DIARIZATION_SPEAKER_COUNT: rOpts.diarizationMaxSpeakers}),
...(rOpts.separateRecognitionPerChannel && {NVIDIA_SEPARATE_RECOGNITION_PER_CHANNEL: 1}),
...(rOpts.hints.length > 0 && typeof rOpts.hints[0] === 'string' &&
{NVIDIA_HINTS: rOpts.hints.join(',')}),
...(rOpts.hints.length > 0 && typeof rOpts.hints[0] === 'object' &&
{NVIDIA_HINTS: JSON.stringify(rOpts.hints)}),
...(typeof rOpts.hintsBoost === 'number' &&
{NVIDIA_HINTS_BOOST: rOpts.hintsBoost}),
...(nvidiaOptions.customConfiguration &&
{NVIDIA_CUSTOM_CONFIGURATION: JSON.stringify(nvidiaOptions.customConfiguration)}),
};
}
stickyVars[rOpts.vendor].forEach((key) => {
if (!opts[key]) opts[key] = '';
});
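
To show how the nvidia branch above maps recognizer settings onto channel variables, a hedged sketch for one hypothetical recognizer; only the NVIDIA_* keys contributed by this branch are listed, not the vendor-agnostic variables set elsewhere in the function:

```js
// hypothetical recognizer options arriving on a gather/transcribe verb
const rOpts = {
  vendor: 'nvidia',
  hints: ['jambonz', 'riva'],
  hintsBoost: 20,
  nvidiaOptions: {rivaUri: 'riva.example.com:50051', punctuation: true}
};

/* per the branch above, opts would gain:
{
  NVIDIA_PROFANITY_FILTER: 0,
  NVIDIA_PUNCTUATION: 1,
  NVIDIA_WORD_TIME_OFFSETS: 0,
  NVIDIA_MAX_ALTERNATIVES: 1,
  NVIDIA_RIVA_URI: 'riva.example.com:50051',
  NVIDIA_HINTS: 'jambonz,riva',
  NVIDIA_HINTS_BOOST: 20
}
*/
```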
@@ -468,6 +523,12 @@ module.exports = (logger) => {
ep.removeCustomEventListener(DeepgramTranscriptionEvents.Transcription);
ep.removeCustomEventListener(DeepgramTranscriptionEvents.Connect);
ep.removeCustomEventListener(DeepgramTranscriptionEvents.ConnectFailure);
ep.removeCustomEventListener(NvidiaTranscriptionEvents.Transcription);
ep.removeCustomEventListener(NvidiaTranscriptionEvents.TranscriptionComplete);
ep.removeCustomEventListener(NvidiaTranscriptionEvents.StartOfSpeech);
ep.removeCustomEventListener(NvidiaTranscriptionEvents.Error);
ep.removeCustomEventListener(NvidiaTranscriptionEvents.VadDetected);
};
const setSpeechCredentialsAtRuntime = (recognizer) => {
@@ -476,6 +537,10 @@ module.exports = (logger) => {
const {clientId, secret} = recognizer.nuanceOptions || {};
if (clientId && secret) return {client_id: clientId, secret};
}
else if (recognizer.vendor === 'nvidia') {
const {rivaUri} = recognizer.nvidiaOptions || {};
if (rivaUri) return {riva_uri: rivaUri};
}
else if (recognizer.vendor === 'deepgram') {
const {apiKey} = recognizer.deepgramOptions || {};
if (apiKey) return {api_key: apiKey};
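
A closing illustration of the runtime-credential path added above: when a verb supplies its own recognizer carrying nvidiaOptions.rivaUri, that URI comes back as the riva_uri credential. Values are hypothetical:

```js
// hypothetical recognizer supplied at runtime in the verb itself
const recognizer = {
  vendor: 'nvidia',
  language: 'en-US',
  nvidiaOptions: {rivaUri: 'riva.example.com:50051'}
};

// per the nvidia branch above, setSpeechCredentialsAtRuntime(recognizer)
// would return {riva_uri: 'riva.example.com:50051'}
```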