mirror of
https://github.com/jambonz/speech-utils.git
synced 2026-07-04 19:31:49 +00:00
feat(nvidia): NVCF cloud support for one-shot Riva/nvidia TTS
The nvidia/riva one-shot synth path was self-hosted-only (insecure gRPC to
riva_server_uri). Add NVCF cloud: when credentials.api_key is set, createRivaClient
dials grpc.nvcf.nvidia.com:443 over TLS with per-RPC metadata (function-id +
Bearer api key) baked into the channel credentials; function-id defaults to
ai-magpie-tts-multilingual, overridable via credentials.function_id.
- createRivaClient(uri, {apiKey, functionId}) — connects to NVCF cloud when apiKey
is present; otherwise insecure self-hosted gRPC to uri (unchanged).
- synthNvidia: pass api_key/function_id to the gRPC synth (caching path). In the
say: path, emit NVIDIA_API_KEY (plus NVIDIA_FUNCTION_ID when function_id is set)
for cloud, so mediajam's nvidia dialect uses NVCF (it already reads both). The
self-hosted say: path is unchanged.
- assert now accepts riva_server_uri (self-hosted) OR api_key (cloud).
Closes the 'one-shot say TTS cloud' gap; pairs with the webapp nvidia api_key
field. Requires a version bump + publish.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
+12
-5
@@ -96,7 +96,8 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
|
|||||||
else if ('nvidia' === vendor) {
|
else if ('nvidia' === vendor) {
|
||||||
assert.ok(voice, 'synthAudio requires voice when nvidia is used');
|
assert.ok(voice, 'synthAudio requires voice when nvidia is used');
|
||||||
assert.ok(language, 'synthAudio requires language when nvidia is used');
|
assert.ok(language, 'synthAudio requires language when nvidia is used');
|
||||||
assert.ok(credentials.riva_server_uri, 'synthAudio requires riva_server_uri in credentials when nvidia is used');
|
assert.ok(credentials.riva_server_uri || credentials.api_key,
|
||||||
|
'synthAudio requires riva_server_uri (self-hosted) or api_key (NVCF cloud) in credentials when nvidia is used');
|
||||||
}
|
}
|
||||||
else if ('wellsaid' === vendor) {
|
else if ('wellsaid' === vendor) {
|
||||||
language = 'en-US'; // WellSaid only supports English atm
|
language = 'en-US'; // WellSaid only supports English atm
|
||||||
@@ -682,10 +683,16 @@ const synthWellSaid = async(logger, {credentials, stats, language, voice, gender
|
|||||||
const synthNvidia = async(client, logger, {
|
const synthNvidia = async(client, logger, {
|
||||||
credentials, stats, language, voice, model, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
|
credentials, stats, language, voice, model, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
|
||||||
}) => {
|
}) => {
|
||||||
const {riva_server_uri} = credentials;
|
const {riva_server_uri, api_key, function_id} = credentials;
|
||||||
if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
|
if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
|
||||||
let params = '';
|
let params = '{';
|
||||||
params += `{riva_server_uri=${riva_server_uri}`;
|
if (api_key) {
|
||||||
|
/* NVCF cloud: mediajam connects to grpc.nvcf.nvidia.com using these */
|
||||||
|
params += `NVIDIA_API_KEY=${api_key}`;
|
||||||
|
if (function_id) params += `,NVIDIA_FUNCTION_ID=${function_id}`;
|
||||||
|
} else {
|
||||||
|
params += `riva_server_uri=${riva_server_uri}`;
|
||||||
|
}
|
||||||
params += `,playback_id=${key}`;
|
params += `,playback_id=${key}`;
|
||||||
params += `,voice=${voice}`;
|
params += `,voice=${voice}`;
|
||||||
params += `,language=${language}`;
|
params += `,language=${language}`;
|
||||||
@@ -701,7 +708,7 @@ const synthNvidia = async(client, logger, {
|
|||||||
let rivaClient, request;
|
let rivaClient, request;
|
||||||
const sampleRate = 8000;
|
const sampleRate = 8000;
|
||||||
try {
|
try {
|
||||||
rivaClient = await createRivaClient(riva_server_uri);
|
rivaClient = await createRivaClient(riva_server_uri, {apiKey: api_key, functionId: function_id});
|
||||||
request = new SynthesizeSpeechRequest();
|
request = new SynthesizeSpeechRequest();
|
||||||
request.setVoiceName(voice);
|
request.setVoiceName(voice);
|
||||||
request.setLanguageCode(language);
|
request.setLanguageCode(language);
|
||||||
|
|||||||
+18
-3
@@ -38,9 +38,24 @@ function makeAwsKey(awsAccessKeyId) {
|
|||||||
return `aws:${hash.digest('hex')}`;
|
return `aws:${hash.digest('hex')}`;
|
||||||
}
|
}
|
||||||
|
|
||||||
const createRivaClient = async(rivaUri) => {
|
// NVCF cloud TTS function-id default: ai-magpie-tts-multilingual (public)
|
||||||
const client = new RivaSpeechSynthesisClient(rivaUri, grpc.credentials.createInsecure());
|
const NVIDIA_TTS_FUNCTION_ID = '877104f7-e885-42b9-8de8-f6e4c6303969';
|
||||||
return client;
|
|
||||||
|
const createRivaClient = async(rivaUri, {apiKey, functionId} = {}) => {
|
||||||
|
if (apiKey) {
|
||||||
|
/* NVCF cloud: TLS to grpc.nvcf.nvidia.com:443 with per-RPC metadata
|
||||||
|
(function-id + Bearer api key) baked into the channel credentials */
|
||||||
|
const callCreds = grpc.credentials.createFromMetadataGenerator((_params, cb) => {
|
||||||
|
const md = new grpc.Metadata();
|
||||||
|
md.add('function-id', functionId || NVIDIA_TTS_FUNCTION_ID);
|
||||||
|
md.add('authorization', `Bearer ${apiKey}`);
|
||||||
|
cb(null, md);
|
||||||
|
});
|
||||||
|
const creds = grpc.credentials.combineChannelCredentials(
|
||||||
|
grpc.credentials.createSsl(), callCreds);
|
||||||
|
return new RivaSpeechSynthesisClient('grpc.nvcf.nvidia.com:443', creds);
|
||||||
|
}
|
||||||
|
return new RivaSpeechSynthesisClient(rivaUri, grpc.credentials.createInsecure());
|
||||||
};
|
};
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
|
|||||||
Reference in New Issue
Block a user