diff --git a/lib/add-file-to-cache.js b/lib/add-file-to-cache.js index 0f70f3c..fb610cb 100644 --- a/lib/add-file-to-cache.js +++ b/lib/add-file-to-cache.js @@ -1,6 +1,7 @@ const fs = require('fs/promises'); const {noopLogger, makeSynthKey} = require('./utils'); -const EXPIRES = (process.env.JAMBONES_TTS_CACHE_DURATION_MINS || 4 * 60) * 60; // cache tts for 4 hours +const {JAMBONES_TTS_CACHE_DURATION_MINS} = require('./config'); +const EXPIRES = JAMBONES_TTS_CACHE_DURATION_MINS; async function addFileToCache(client, logger, path, {account_sid, vendor, language, voice, deploymentId, engine, text}) { diff --git a/lib/config.js b/lib/config.js new file mode 100644 index 0000000..d175e63 --- /dev/null +++ b/lib/config.js @@ -0,0 +1,17 @@ +const JAMBONES_TTS_TRIM_SILENCE = process.env.JAMBONES_TTS_TRIM_SILENCE; +const JAMBONES_DISABLE_TTS_STREAMING = process.env.JAMBONES_DISABLE_TTS_STREAMING; + +const JAMBONES_HTTP_PROXY_IP = process.env.JAMBONES_HTTP_PROXY_IP; +const JAMBONES_HTTP_PROXY_PORT = process.env.JAMBONES_HTTP_PROXY_PORT; +const JAMBONES_TTS_CACHE_DURATION_MINS = (parseInt(process.env.JAMBONES_TTS_CACHE_DURATION_MINS) || 4 * 60) * 60; // cache tts for 4 hours + +const TMP_FOLDER = '/tmp'; + +module.exports = { + JAMBONES_TTS_TRIM_SILENCE, + JAMBONES_DISABLE_TTS_STREAMING, + JAMBONES_HTTP_PROXY_IP, + JAMBONES_HTTP_PROXY_PORT, + JAMBONES_TTS_CACHE_DURATION_MINS, + TMP_FOLDER +}; diff --git a/lib/synth-audio.js b/lib/synth-audio.js index 4da42b4..4183469 100644 --- a/lib/synth-audio.js +++ b/lib/synth-audio.js @@ -19,7 +19,8 @@ const { createNuanceClient, createKryptonClient, createRivaClient, - noopLogger + noopLogger, + makeFilePath } = require('./utils'); const getNuanceAccessToken = require('./get-nuance-access-token'); const { @@ -36,8 +37,13 @@ const { const {SynthesizeSpeechRequest} = require('../stubs/riva/proto/riva_tts_pb'); const {AudioEncoding} = require('../stubs/riva/proto/riva_audio_pb'); const debug = 
require('debug')('jambonz:realtimedb-helpers'); -const EXPIRES = (process.env.JAMBONES_TTS_CACHE_DURATION_MINS || 4 * 60) * 60; // cache tts for 4 hours -const TMP_FOLDER = '/tmp'; +const { + JAMBONES_DISABLE_TTS_STREAMING, + JAMBONES_HTTP_PROXY_IP, + JAMBONES_HTTP_PROXY_PORT, + JAMBONES_TTS_CACHE_DURATION_MINS, +} = require('./config'); +const EXPIRES = JAMBONES_TTS_CACHE_DURATION_MINS; const OpenAI = require('openai'); const getAwsAuthToken = require('./get-aws-sts-token'); @@ -149,19 +155,7 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc text }); let filePath; - if (['nuance', 'nvidia'].includes(vendor) || - ( - (process.env.JAMBONES_TTS_TRIM_SILENCE || !process.env.JAMBONES_DISABLE_TTS_STREAMING) && - ['microsoft', 'azure'].includes(vendor) - ) || - ( - !process.env.JAMBONES_DISABLE_TTS_STREAMING && - ['elevenlabs', 'deepgram', 'rimelabs'].includes(vendor) - ) - ) { - filePath = `${TMP_FOLDER}/${key.replace('tts:', `tts-${salt || ''}`)}.r8`; - } - else filePath = `${TMP_FOLDER}/${key.replace('tts:', `tts-${salt || ''}`)}.mp3`; + filePath = makeFilePath(vendor, key, salt); debug(`synth key is ${key}`); let cached; if (!disableTtsCache) { @@ -444,7 +438,7 @@ const synthMicrosoft = async(logger, { content = `${words}`; logger.info({content}, 'synthMicrosoft'); } - if (!process.env.JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) { + if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) { let params = ''; params += `{api_key=${apiKey}`; params += `,language=${language}`; @@ -454,8 +448,8 @@ const synthMicrosoft = async(logger, { if (region) params += `,region=${region}`; if (custom_tts_endpoint) params += `,endpointId=${custom_tts_endpoint}`; if (custom_tts_endpoint_url) params += `,endpoint=${custom_tts_endpoint_url}`; - if (process.env.JAMBONES_HTTP_PROXY_IP) params += `,http_proxy_ip=${process.env.JAMBONES_HTTP_PROXY_IP}`; - if (process.env.JAMBONES_HTTP_PROXY_PORT) 
params += `,http_proxy_port=${process.env.JAMBONES_HTTP_PROXY_PORT}`; + if (JAMBONES_HTTP_PROXY_IP) params += `,http_proxy_ip=${JAMBONES_HTTP_PROXY_IP}`; + if (JAMBONES_HTTP_PROXY_PORT) params += `,http_proxy_port=${JAMBONES_HTTP_PROXY_PORT}`; params += '}'; return { filePath: `say:${params}${content.replace(/\n/g, ' ')}`, @@ -484,10 +478,10 @@ const synthMicrosoft = async(logger, { SpeechSynthesisOutputFormat.Raw8Khz16BitMonoPcm : SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3; - if (process.env.JAMBONES_HTTP_PROXY_IP && process.env.JAMBONES_HTTP_PROXY_PORT) { + if (JAMBONES_HTTP_PROXY_IP && JAMBONES_HTTP_PROXY_PORT) { logger.debug( - `synthMicrosoft: using proxy ${process.env.JAMBONES_HTTP_PROXY_IP}:${process.env.JAMBONES_HTTP_PROXY_PORT}`); - speechConfig.setProxy(process.env.JAMBONES_HTTP_PROXY_IP, process.env.JAMBONES_HTTP_PROXY_PORT); + `synthMicrosoft: using proxy ${JAMBONES_HTTP_PROXY_IP}:${JAMBONES_HTTP_PROXY_PORT}`); + speechConfig.setProxy(JAMBONES_HTTP_PROXY_IP, JAMBONES_HTTP_PROXY_PORT); } const synthesizer = new SpeechSynthesizer(speechConfig); @@ -673,7 +667,7 @@ const synthElevenlabs = async(logger, { const opts = !!options && Object.keys(options).length !== 0 ? options : JSON.parse(credOpts || '{}'); /* default to using the streaming interface, unless disabled by env var OR we want just a cache file */ - if (!process.env.JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) { + if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) { let params = ''; params += `{api_key=${api_key}`; params += ',vendor=elevenlabs'; @@ -726,7 +720,7 @@ const synthPlayHT = async(logger, { const opts = !!options && Object.keys(options).length !== 0 ? 
options : JSON.parse(credOpts || '{}'); /* default to using the streaming interface, unless disabled by env var OR we want just a cache file */ - if (!process.env.JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) { + if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) { let params = ''; params += `{api_key=${api_key}`; params += `,user_id=${user_id}`; @@ -781,7 +775,7 @@ const synthRimelabs = async(logger, { const opts = !!options && Object.keys(options).length !== 0 ? options : JSON.parse(credOpts || '{}'); /* default to using the streaming interface, unless disabled by env var OR we want just a cache file */ - if (!process.env.JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) { + if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) { let params = ''; params += `{api_key=${api_key}`; params += `,model_id=${model_id}`; @@ -823,7 +817,7 @@ const synthRimelabs = async(logger, { const synthWhisper = async(logger, {credentials, stats, voice, text, renderForCaching, disableTtsStreaming}) => { const {api_key, model_id, baseURL, timeout, speed} = credentials; /* if the env is set to stream then bag out, unless we are specifically rendering to generate a cache file */ - if (!process.env.JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) { + if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) { let params = ''; params += `{api_key=${api_key}`; params += `,model_id=${model_id}`; @@ -862,7 +856,7 @@ const synthWhisper = async(logger, {credentials, stats, voice, text, renderForCa const synthDeepgram = async(logger, {credentials, stats, model, text, renderForCaching, disableTtsStreaming}) => { const {api_key} = credentials; - if (!process.env.JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) { + if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) { let 
params = ''; params += `{api_key=${api_key}`; params += ',vendor=deepgram'; diff --git a/lib/utils.js b/lib/utils.js index 7884b1f..89b90c2 100644 --- a/lib/utils.js +++ b/lib/utils.js @@ -7,6 +7,7 @@ const HTTP_TIMEOUT = 5000; const NUANCE_AUTH_ENDPOINT = 'tts.api.nuance.com:443'; const grpc = require('@grpc/grpc-js'); const formurlencoded = require('form-urlencoded'); +const { JAMBONES_DISABLE_TTS_STREAMING, JAMBONES_TTS_TRIM_SILENCE, TMP_FOLDER } = require('./config'); const debug = require('debug')('jambonz:realtimedb-helpers'); /** @@ -19,7 +20,44 @@ const debug = require('debug')('jambonz:realtimedb-helpers'); function makeSynthKey({account_sid = '', vendor, language, voice, engine = '', text}) { const hash = crypto.createHash('sha1'); hash.update(`${language}:${vendor}:${voice}:${engine}:${text}`); - return `tts${account_sid ? (':' + account_sid) : ''}:${hash.digest('hex')}`; + const hexHashKey = hash.digest('hex'); + const accountKey = account_sid ? `:${account_sid}` : ''; + const extension = getFileExtension(vendor); + const key = `tts${accountKey}:${extension}:${hexHashKey}`; + return key; +} + +function makeFilePath(vendor, key, salt = '') { + const extension = getFileExtension(vendor); + return `${TMP_FOLDER}/${key.replace('tts:', `tts-${salt}`)}.${extension}`; +} + +function getFileExtension(vendor) { + const mp3Extension = 'mp3'; + const r8Extension = 'r8'; + + switch (vendor) { + case 'azure': + case 'microsoft': + if (!JAMBONES_DISABLE_TTS_STREAMING || JAMBONES_TTS_TRIM_SILENCE) { + return r8Extension; + } else { + return mp3Extension; + } + case 'deepgram': + case 'elevenlabs': + case 'rimelabs': + if (!JAMBONES_DISABLE_TTS_STREAMING) { + return r8Extension; + } else { + return mp3Extension; + } + case 'nuance': + case 'nvidia': + return r8Extension; + default: + return mp3Extension; + } } const noopLogger = { @@ -123,5 +161,6 @@ module.exports = { createRivaClient, makeBasicAuthHeader, NUANCE_AUTH_ENDPOINT, - noopLogger + noopLogger, + makeFilePath 
}; diff --git a/package.json b/package.json index 32504e8..765bb55 100644 --- a/package.json +++ b/package.json @@ -12,6 +12,7 @@ "test": "NODE_ENV=test node test/ ", "coverage": "nyc --reporter html --report-dir ./coverage npm run test", "jslint": "eslint index.js lib", + "jslint:fix": "eslint --fix '**/*.js'", "build": "./build_stubs.sh" }, "repository": { diff --git a/stubs/nuance/synthesizer_grpc_pb.js b/stubs/nuance/synthesizer_grpc_pb.js index 18b5c22..de0b045 100644 --- a/stubs/nuance/synthesizer_grpc_pb.js +++ b/stubs/nuance/synthesizer_grpc_pb.js @@ -62,9 +62,9 @@ function deserialize_nuance_tts_v1_UnarySynthesisResponse(buffer_arg) { // // The Synthesizer service offers these functionalities: -// - GetVoices: Queries the list of available voices, with filters to reduce the search space. -// - Synthesize: Synthesizes audio from input text and parameters, and returns an audio stream. -// - UnarySynthesize: Synthesizes audio from input text and parameters, and returns a single audio response. +// - GetVoices: Queries the list of available voices, with filters to reduce the search space. +// - Synthesize: Synthesizes audio from input text and parameters, and returns an audio stream. +// - UnarySynthesize: Synthesizes audio from input text and parameters, and returns a single audio response. 
var SynthesizerService = exports.SynthesizerService = { getVoices: { path: '/nuance.tts.v1.Synthesizer/GetVoices', diff --git a/stubs/riva/proto/riva_audio_grpc_pb.js b/stubs/riva/proto/riva_audio_grpc_pb.js index 97b3a24..51b4d69 100644 --- a/stubs/riva/proto/riva_audio_grpc_pb.js +++ b/stubs/riva/proto/riva_audio_grpc_pb.js @@ -1 +1 @@ -// GENERATED CODE -- NO SERVICES IN PROTO \ No newline at end of file +// GENERATED CODE -- NO SERVICES IN PROTO diff --git a/stubs/riva/proto/riva_tts_grpc_pb.js b/stubs/riva/proto/riva_tts_grpc_pb.js index 85b20d4..cc1dd31 100644 --- a/stubs/riva/proto/riva_tts_grpc_pb.js +++ b/stubs/riva/proto/riva_tts_grpc_pb.js @@ -57,7 +57,7 @@ function deserialize_nvidia_riva_tts_SynthesizeSpeechResponse(buffer_arg) { var RivaSpeechSynthesisService = exports.RivaSpeechSynthesisService = { // Used to request text-to-speech from the service. Submit a request containing the // desired text and configuration, and receive audio bytes in the requested format. -synthesize: { + synthesize: { path: '/nvidia.riva.tts.RivaSpeechSynthesis/Synthesize', requestStream: false, responseStream: false, @@ -69,9 +69,9 @@ synthesize: { responseDeserialize: deserialize_nvidia_riva_tts_SynthesizeSpeechResponse, }, // Used to request text-to-speech returned via stream as it becomes available. -// Submit a SynthesizeSpeechRequest with desired text and configuration, -// and receive stream of bytes in the requested format. -synthesizeOnline: { + // Submit a SynthesizeSpeechRequest with desired text and configuration, + // and receive stream of bytes in the requested format. + synthesizeOnline: { path: '/nvidia.riva.tts.RivaSpeechSynthesis/SynthesizeOnline', requestStream: false, responseStream: true, @@ -83,7 +83,7 @@ synthesizeOnline: { responseDeserialize: deserialize_nvidia_riva_tts_SynthesizeSpeechResponse, }, // Enables clients to request the configuration of the current Synthesize service, or a specific model within the service. 
-getRivaSynthesisConfig: { + getRivaSynthesisConfig: { path: '/nvidia.riva.tts.RivaSpeechSynthesis/GetRivaSynthesisConfig', requestStream: false, responseStream: false,