mirror of
https://github.com/jambonz/jambonz-feature-server.git
synced 2025-12-20 08:40:38 +00:00
Feature/nvidia speech (#261)
* initial changes for nvidia speech * allow nvidia speech credentials to be set at runtime * update drachtio-fsmrf * fix handling of nvidia-specific options * fix nvidia custom config * fix nvidia word time offsets * fix nvidia custom configuration * normalize nvidia transcripts * update to @jambonz/realtime-dbhelpers with nvidia tts support
This commit is contained in:
@@ -7,7 +7,8 @@ const {
|
||||
AwsTranscriptionEvents,
|
||||
AzureTranscriptionEvents,
|
||||
DeepgramTranscriptionEvents,
|
||||
IbmTranscriptionEvents
|
||||
IbmTranscriptionEvents,
|
||||
NvidiaTranscriptionEvents
|
||||
} = require('../utils/constants');
|
||||
|
||||
const makeTask = require('./make_task');
|
||||
@@ -397,6 +398,25 @@ class TaskGather extends Task {
|
||||
this._onIbmError.bind(this, cs, ep));
|
||||
break;
|
||||
|
||||
case 'nvidia':
|
||||
this.bugname = 'nvidia_transcribe';
|
||||
ep.addCustomEventListener(NvidiaTranscriptionEvents.Transcription,
|
||||
this._onTranscription.bind(this, cs, ep));
|
||||
ep.addCustomEventListener(NvidiaTranscriptionEvents.StartOfSpeech,
|
||||
this._onStartOfSpeech.bind(this, cs, ep));
|
||||
ep.addCustomEventListener(NvidiaTranscriptionEvents.TranscriptionComplete,
|
||||
this._onTranscriptionComplete.bind(this, cs, ep));
|
||||
ep.addCustomEventListener(NvidiaTranscriptionEvents.VadDetected,
|
||||
this._onVadDetected.bind(this, cs, ep));
|
||||
ep.addCustomEventListener(NvidiaTranscriptionEvents.Error,
|
||||
this._onNvidiaError.bind(this, cs, ep));
|
||||
|
||||
/* I think nvidia has this (??) - stall timers until prompt finishes playing */
|
||||
if ((this.sayTask || this.playTask) && this.listenDuringPrompt) {
|
||||
opts.NVIDIA_STALL_TIMERS = 1;
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
this.notifyError({ msg: 'ASR error', details:`Invalid vendor ${this.vendor}`});
|
||||
this.notifyTaskDone();
|
||||
@@ -612,6 +632,9 @@ class TaskGather extends Task {
|
||||
return this._resolve('timeout');
|
||||
}
|
||||
}
|
||||
_onNvidiaError(cs, ep, evt) {
|
||||
this.logger.info({evt}, 'TaskGather:_onNvidiaError');
|
||||
}
|
||||
_onDeepgramConnect(_cs, _ep) {
|
||||
this.logger.debug('TaskGather:_onDeepgramConnect');
|
||||
}
|
||||
|
||||
@@ -467,7 +467,7 @@
|
||||
"properties": {
|
||||
"vendor": {
|
||||
"type": "string",
|
||||
"enum": ["google", "aws", "polly", "microsoft", "nuance", "ibm", "default"]
|
||||
"enum": ["google", "aws", "polly", "microsoft", "nuance", "ibm", "nvidia", "default"]
|
||||
},
|
||||
"language": "string",
|
||||
"voice": "string",
|
||||
@@ -488,7 +488,7 @@
|
||||
"properties": {
|
||||
"vendor": {
|
||||
"type": "string",
|
||||
"enum": ["google", "aws", "microsoft", "nuance", "deepgram", "ibm", "default"]
|
||||
"enum": ["google", "aws", "microsoft", "nuance", "deepgram", "ibm", "nvidia", "default"]
|
||||
},
|
||||
"language": "string",
|
||||
"vad": "#vad",
|
||||
@@ -555,12 +555,26 @@
|
||||
"asrTimeout": "number",
|
||||
"nuanceOptions": "#nuanceOptions",
|
||||
"deepgramOptions": "#deepgramOptions",
|
||||
"ibmOptions": "#ibmOptions"
|
||||
"ibmOptions": "#ibmOptions",
|
||||
"nvidiaOptions": "#nvidiaOptions"
|
||||
},
|
||||
"required": [
|
||||
"vendor"
|
||||
]
|
||||
},
|
||||
"nvidiaOptions": {
|
||||
"properties": {
|
||||
"rivaUri": "string",
|
||||
"maxAlternatives": "number",
|
||||
"profanityFilter": "boolean",
|
||||
"punctuation": "boolean",
|
||||
"wordTimeOffsets": "boolean",
|
||||
"verbatimTranscripts": "boolean",
|
||||
"customConfiguration": "object"
|
||||
},
|
||||
"required": [
|
||||
]
|
||||
},
|
||||
"ibmOptions": {
|
||||
"properties": {
|
||||
"sttApiKey": "string",
|
||||
|
||||
@@ -7,7 +7,8 @@ const {
|
||||
AwsTranscriptionEvents,
|
||||
NuanceTranscriptionEvents,
|
||||
DeepgramTranscriptionEvents,
|
||||
IbmTranscriptionEvents
|
||||
IbmTranscriptionEvents,
|
||||
NvidiaTranscriptionEvents
|
||||
} = require('../utils/constants');
|
||||
const normalizeJambones = require('../utils/normalize-jambones');
|
||||
|
||||
@@ -207,6 +208,20 @@ class TaskTranscribe extends Task {
|
||||
this._onIbmError.bind(this, cs, ep, channel));
|
||||
break;
|
||||
|
||||
case 'nvidia':
|
||||
this.bugname = 'nvidia_transcribe';
|
||||
ep.addCustomEventListener(NvidiaTranscriptionEvents.Transcription,
|
||||
this._onTranscription.bind(this, cs, ep));
|
||||
ep.addCustomEventListener(NvidiaTranscriptionEvents.StartOfSpeech,
|
||||
this._onStartOfSpeech.bind(this, cs, ep));
|
||||
ep.addCustomEventListener(NvidiaTranscriptionEvents.TranscriptionComplete,
|
||||
this._onTranscriptionComplete.bind(this, cs, ep));
|
||||
ep.addCustomEventListener(NvidiaTranscriptionEvents.VadDetected,
|
||||
this._onVadDetected.bind(this, cs, ep));
|
||||
ep.addCustomEventListener(NvidiaTranscriptionEvents.Error,
|
||||
this._onNvidiaError.bind(this, cs, ep));
|
||||
break;
|
||||
|
||||
default:
|
||||
throw new Error(`Invalid vendor ${this.vendor}`);
|
||||
}
|
||||
@@ -311,6 +326,9 @@ class TaskTranscribe extends Task {
|
||||
return this._resolve('timeout');
|
||||
}
|
||||
}
|
||||
_onNvidiaError(cs, ep, evt) {
|
||||
this.logger.info({evt}, 'TaskGather:_onNvidiaError');
|
||||
}
|
||||
_onDeepgramConnect(_cs, _ep) {
|
||||
this.logger.debug('TaskTranscribe:_onDeepgramConnect');
|
||||
}
|
||||
|
||||
@@ -74,6 +74,13 @@
|
||||
"Error": "nuance_transcribe::error",
|
||||
"VadDetected": "nuance_transcribe::vad_detected"
|
||||
},
|
||||
"NvidiaTranscriptionEvents": {
|
||||
"Transcription": "nvidia_transcribe::transcription",
|
||||
"StartOfSpeech": "nvidia_transcribe::start_of_speech",
|
||||
"TranscriptionComplete": "nvidia_transcribe::end_of_transcription",
|
||||
"Error": "nvidia_transcribe::error",
|
||||
"VadDetected": "nvidia_transcribe::vad_detected"
|
||||
},
|
||||
"DeepgramTranscriptionEvents": {
|
||||
"Transcription": "deepgram_transcribe::transcription",
|
||||
"ConnectFailure": "deepgram_transcribe::connect_failed",
|
||||
|
||||
@@ -5,6 +5,7 @@ const {
|
||||
AwsTranscriptionEvents,
|
||||
NuanceTranscriptionEvents,
|
||||
DeepgramTranscriptionEvents,
|
||||
NvidiaTranscriptionEvents
|
||||
} = require('./constants');
|
||||
|
||||
const stickyVars = {
|
||||
@@ -84,6 +85,9 @@ const stickyVars = {
|
||||
'IBM_SPEECH_BASE_MODEL_VERSION',
|
||||
'IBM_SPEECH_WATSON_METADATA',
|
||||
'IBM_SPEECH_WATSON_LEARNING_OPT_OUT'
|
||||
],
|
||||
nvidia: [
|
||||
'NVIDIA_HINTS'
|
||||
]
|
||||
};
|
||||
|
||||
@@ -107,6 +111,25 @@ const normalizeDeepgram = (evt, channel, language) => {
|
||||
};
|
||||
};
|
||||
|
||||
const normalizeNvidia = (evt, channel, language) => {
|
||||
const copy = JSON.parse(JSON.stringify(evt));
|
||||
const alternatives = (evt.alternatives || [])
|
||||
.map((alt) => ({
|
||||
confidence: alt.confidence,
|
||||
transcript: alt.transcript,
|
||||
}));
|
||||
return {
|
||||
language_code: language,
|
||||
channel_tag: channel,
|
||||
is_final: evt.is_final,
|
||||
alternatives,
|
||||
vendor: {
|
||||
name: 'nvidia',
|
||||
evt: copy
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
const normalizeIbm = (evt, channel, language) => {
|
||||
const copy = JSON.parse(JSON.stringify(evt));
|
||||
//const idx = evt.result_index;
|
||||
@@ -212,6 +235,8 @@ module.exports = (logger) => {
|
||||
return normalizeNuance(evt, channel, language);
|
||||
case 'ibm':
|
||||
return normalizeIbm(evt, channel, language);
|
||||
case 'nvidia':
|
||||
return normalizeNvidia(evt, channel, language);
|
||||
default:
|
||||
logger.error(`Unknown vendor ${vendor}`);
|
||||
return evt;
|
||||
@@ -440,6 +465,36 @@ module.exports = (logger) => {
|
||||
{IBM_SPEECH_WATSON_LEARNING_OPT_OUT: ibmOptions.watsonLearningOptOut}
|
||||
};
|
||||
}
|
||||
else if ('nvidia' === rOpts.vendor) {
|
||||
const {nvidiaOptions = {}} = rOpts;
|
||||
opts = {
|
||||
...opts,
|
||||
...((nvidiaOptions.profanityFilter || rOpts.profanityFilter) && {NVIDIA_PROFANITY_FILTER: 1}),
|
||||
...(!(nvidiaOptions.profanityFilter || rOpts.profanityFilter) && {NVIDIA_PROFANITY_FILTER: 0}),
|
||||
...((nvidiaOptions.punctuation || rOpts.punctuation) && {NVIDIA_PUNCTUATION: 1}),
|
||||
...(!(nvidiaOptions.punctuation || rOpts.punctuation) && {NVIDIA_PUNCTUATION: 0}),
|
||||
...((rOpts.words || nvidiaOptions.wordTimeOffsets) && {NVIDIA_WORD_TIME_OFFSETS: 1}),
|
||||
...(!(rOpts.words || nvidiaOptions.wordTimeOffsets) && {NVIDIA_WORD_TIME_OFFSETS: 0}),
|
||||
...(nvidiaOptions.maxAlternatives && {NVIDIA_MAX_ALTERNATIVES: nvidiaOptions.maxAlternatives}),
|
||||
...(!nvidiaOptions.maxAlternatives && {NVIDIA_MAX_ALTERNATIVES: 1}),
|
||||
...(rOpts.model && {NVIDIA_MODEL: rOpts.model}),
|
||||
...(nvidiaOptions.rivaUri && {NVIDIA_RIVA_URI: nvidiaOptions.rivaUri}),
|
||||
...(nvidiaOptions.verbatimTranscripts && {NVIDIA_VERBATIM_TRANSCRIPTS: 1}),
|
||||
...(rOpts.diarization && {NVIDIA_SPEAKER_DIARIZATION: 1}),
|
||||
...(rOpts.diarization && rOpts.diarizationMaxSpeakers > 0 &&
|
||||
{NVIDIA_DIARIZATION_SPEAKER_COUNT: rOpts.diarizationMaxSpeakers}),
|
||||
...(rOpts.separateRecognitionPerChannel && {NVIDIA_SEPARATE_RECOGNITION_PER_CHANNEL: 1}),
|
||||
...(rOpts.hints.length > 0 && typeof rOpts.hints[0] === 'string' &&
|
||||
{NVIDIA_HINTS: rOpts.hints.join(',')}),
|
||||
...(rOpts.hints.length > 0 && typeof rOpts.hints[0] === 'object' &&
|
||||
{NVIDIA_HINTS: JSON.stringify(rOpts.hints)}),
|
||||
...(typeof rOpts.hintsBoost === 'number' &&
|
||||
{NVIDIA_HINTS_BOOST: rOpts.hintsBoost}),
|
||||
...(nvidiaOptions.customConfiguration &&
|
||||
{NVIDIA_CUSTOM_CONFIGURATION: JSON.stringify(nvidiaOptions.customConfiguration)}),
|
||||
};
|
||||
}
|
||||
|
||||
stickyVars[rOpts.vendor].forEach((key) => {
|
||||
if (!opts[key]) opts[key] = '';
|
||||
});
|
||||
@@ -468,6 +523,12 @@ module.exports = (logger) => {
|
||||
ep.removeCustomEventListener(DeepgramTranscriptionEvents.Transcription);
|
||||
ep.removeCustomEventListener(DeepgramTranscriptionEvents.Connect);
|
||||
ep.removeCustomEventListener(DeepgramTranscriptionEvents.ConnectFailure);
|
||||
|
||||
ep.removeCustomEventListener(NvidiaTranscriptionEvents.Transcription);
|
||||
ep.removeCustomEventListener(NvidiaTranscriptionEvents.TranscriptionComplete);
|
||||
ep.removeCustomEventListener(NvidiaTranscriptionEvents.StartOfSpeech);
|
||||
ep.removeCustomEventListener(NvidiaTranscriptionEvents.Error);
|
||||
ep.removeCustomEventListener(NvidiaTranscriptionEvents.VadDetected);
|
||||
};
|
||||
|
||||
const setSpeechCredentialsAtRuntime = (recognizer) => {
|
||||
@@ -476,6 +537,10 @@ module.exports = (logger) => {
|
||||
const {clientId, secret} = recognizer.nuanceOptions || {};
|
||||
if (clientId && secret) return {client_id: clientId, secret};
|
||||
}
|
||||
else if (recognizer.vendor === 'nvidia') {
|
||||
const {rivaUri} = recognizer.nvidiaOptions || {};
|
||||
if (rivaUri) return {riva_uri: rivaUri};
|
||||
}
|
||||
else if (recognizer.vendor === 'deepgram') {
|
||||
const {apiKey} = recognizer.deepgramOptions || {};
|
||||
if (apiKey) return {api_key: apiKey};
|
||||
|
||||
Reference in New Issue
Block a user