From 3ba1a143586ecf1aa8ea25b59adba133a8838701 Mon Sep 17 00:00:00 2001 From: Dave Horton Date: Wed, 17 Jun 2026 17:25:44 -0400 Subject: [PATCH] feat(nvidia): NVCF cloud support for one-shot Riva/nvidia TTS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The nvidia/riva one-shot synth path was self-hosted-only (insecure gRPC to riva_server_uri). Add NVCF cloud support: when credentials.api_key is set, createRivaClient dials grpc.nvcf.nvidia.com:443 over TLS with per-RPC metadata (function-id + Bearer api key) baked into the channel credentials; function-id defaults to ai-magpie-tts-multilingual, overridable via credentials.function_id. - createRivaClient(uri, {apiKey, functionId}) — cloud when apiKey is present (the uri is then ignored), else insecure self-hosted (unchanged). - synthNvidia: pass api_key/function_id to the gRPC synth (caching path); in the say: path, emit NVIDIA_API_KEY (plus NVIDIA_FUNCTION_ID when function_id is set) for cloud so mediajam's nvidia dialect uses NVCF (it already reads those). The self-hosted say: path is unchanged. - synthAudio's assert now accepts riva_server_uri (self-hosted) OR api_key (cloud); api_key takes precedence when both are set. Closes the 'one-shot say TTS cloud' gap; pairs with the webapp nvidia api_key field. Requires a version bump + publish. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- lib/synth-audio.js | 17 ++++++++++++----- lib/utils.js | 21 ++++++++++++++++++--- 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/lib/synth-audio.js b/lib/synth-audio.js index e97ce7c..cdbb0cc 100644 --- a/lib/synth-audio.js +++ b/lib/synth-audio.js @@ -96,7 +96,8 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc else if ('nvidia' === vendor) { assert.ok(voice, 'synthAudio requires voice when nvidia is used'); assert.ok(language, 'synthAudio requires language when nvidia is used'); - assert.ok(credentials.riva_server_uri, 'synthAudio requires riva_server_uri in credentials when nvidia is used'); + assert.ok(credentials.riva_server_uri || credentials.api_key, + 'synthAudio requires riva_server_uri (self-hosted) or api_key (NVCF cloud) in credentials when nvidia is used'); } else if ('wellsaid' === vendor) { language = 'en-US'; // WellSaid only supports English atm @@ -682,10 +683,16 @@ const synthWellSaid = async(logger, {credentials, stats, language, voice, gender const synthNvidia = async(client, logger, { credentials, stats, language, voice, model, key, text, renderForCaching, disableTtsStreaming, disableTtsCache }) => { - const {riva_server_uri} = credentials; + const {riva_server_uri, api_key, function_id} = credentials; if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) { - let params = ''; - params += `{riva_server_uri=${riva_server_uri}`; + let params = '{'; + if (api_key) { + /* NVCF cloud: mediajam connects to grpc.nvcf.nvidia.com using these */ + params += `NVIDIA_API_KEY=${api_key}`; + if (function_id) params += `,NVIDIA_FUNCTION_ID=${function_id}`; + } else { + params += `riva_server_uri=${riva_server_uri}`; + } params += `,playback_id=${key}`; params += `,voice=${voice}`; params += `,language=${language}`; @@ -701,7 +708,7 @@ const synthNvidia = async(client, logger, { let rivaClient, request; const sampleRate = 8000; try { - 
rivaClient = await createRivaClient(riva_server_uri); + rivaClient = await createRivaClient(riva_server_uri, {apiKey: api_key, functionId: function_id}); request = new SynthesizeSpeechRequest(); request.setVoiceName(voice); request.setLanguageCode(language); diff --git a/lib/utils.js b/lib/utils.js index 1d64f08..bc986c4 100644 --- a/lib/utils.js +++ b/lib/utils.js @@ -38,9 +38,24 @@ function makeAwsKey(awsAccessKeyId) { return `aws:${hash.digest('hex')}`; } -const createRivaClient = async(rivaUri) => { - const client = new RivaSpeechSynthesisClient(rivaUri, grpc.credentials.createInsecure()); - return client; +// NVCF cloud TTS function-id default: ai-magpie-tts-multilingual (public) +const NVIDIA_TTS_FUNCTION_ID = '877104f7-e885-42b9-8de8-f6e4c6303969'; + +const createRivaClient = async(rivaUri, {apiKey, functionId} = {}) => { + if (apiKey) { + /* NVCF cloud: TLS to grpc.nvcf.nvidia.com:443 with per-RPC metadata + (function-id + Bearer api key) baked into the channel credentials */ + const callCreds = grpc.credentials.createFromMetadataGenerator((_params, cb) => { + const md = new grpc.Metadata(); + md.add('function-id', functionId || NVIDIA_TTS_FUNCTION_ID); + md.add('authorization', `Bearer ${apiKey}`); + cb(null, md); + }); + const creds = grpc.credentials.combineChannelCredentials( + grpc.credentials.createSsl(), callCreds); + return new RivaSpeechSynthesisClient('grpc.nvcf.nvidia.com:443', creds); + } + return new RivaSpeechSynthesisClient(rivaUri, grpc.credentials.createInsecure()); }; module.exports = {