feat: add nuance, riva, ibm

Quan HL
2023-02-21 08:49:15 +07:00
parent deee11b3f3
commit 1fed5aefab
23 changed files with 6779 additions and 0 deletions

13
config/test.json Normal file

@@ -0,0 +1,13 @@
{
"logging": {
"level": "error"
},
"redis": {
"host": "127.0.0.1",
"port": 3379
},
"redis-auth": {
"host": "127.0.0.1",
"port": 3380
}
}

index.js Normal file

@@ -0,0 +1,32 @@
const {noopLogger} = require('./lib/utils');
const promisify = require('@jambonz/promisify-redis');
const redis = promisify(require('redis'));
module.exports = (opts, logger) => {
const {host = '127.0.0.1', port = 6379, tls = false} = opts;
logger = logger || noopLogger;
const url = process.env.JAMBONES_REDIS_USERNAME && process.env.JAMBONES_REDIS_PASSWORD ?
`${process.env.JAMBONES_REDIS_USERNAME}:${process.env.JAMBONES_REDIS_PASSWORD}@${host}:${port}` :
`${host}:${port}`;
const client = redis.createClient(tls ? `rediss://${url}` : `redis://${url}`);
['ready', 'connect', 'reconnecting', 'error', 'end', 'warning']
.forEach((event) => {
client.on(event, (...args) => {
if ('error' === event) {
if (process.env.NODE_ENV === 'test' && args[0]?.code === 'ECONNREFUSED') return;
logger.error({...args}, '@jambonz/realtimedb-helpers - redis error');
}
else logger.debug({args}, `redis event ${event}`);
});
});
return {
client,
purgeTtsCache: require('./lib/purge-tts-cache').bind(null, client, logger),
synthAudio: require('./lib/synth-audio').bind(null, client, logger),
getNuanceAccessToken: require('./lib/get-nuance-access-token').bind(null, client, logger),
getIbmAccessToken: require('./lib/get-ibm-access-token').bind(null, client, logger),
getTtsVoices: require('./lib/get-tts-voices').bind(null, client, logger),
};
};
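
For context, a consumer initializes the module with redis options and a logger, then calls the pre-bound helpers. A minimal usage sketch, mirroring the test wiring below (the no-op stats sink is the same one the tests use; credentials values are placeholders):

const logger = require('pino')();
const {client, synthAudio} = require('..')({host: '127.0.0.1', port: 6379}, logger);
const stats = {increment: () => {}, histogram: () => {}};

(async() => {
  const {filePath, servedFromCache} = await synthAudio(stats, {
    vendor: 'google',
    credentials: {credentials: {client_email: '...', private_key: '...'}},
    language: 'en-GB',
    gender: 'MALE',
    text: 'This is a test. This is only a test',
  });
  logger.info({filePath, servedFromCache}, 'synthesized audio');
  client.quit();
})();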

lib/get-ibm-access-token.js Normal file

@@ -0,0 +1,48 @@
const formurlencoded = require('form-urlencoded');
const {Pool} = require('undici');
const pool = new Pool('https://iam.cloud.ibm.com');
const {makeIbmKey, noopLogger} = require('./utils');
const debug = require('debug')('jambonz:realtimedb-helpers');
const HTTP_TIMEOUT = 5000;
async function getIbmAccessToken(client, logger, apiKey) {
logger = logger || noopLogger;
try {
const key = makeIbmKey(apiKey);
const access_token = await client.getAsync(key);
if (access_token) return {access_token, servedFromCache: true};
/* access token not found in cache, so fetch it from Ibm */
const payload = {
grant_type: 'urn:ibm:params:oauth:grant-type:apikey',
apikey: apiKey
};
const {statusCode, headers, body} = await pool.request({
path: '/identity/token',
method: 'POST',
headers: {
'Content-Type': 'application/x-www-form-urlencoded'
},
body: formurlencoded(payload),
timeout: HTTP_TIMEOUT,
followRedirects: false
});
if (200 !== statusCode) {
const json = await body.json();
logger.debug({statusCode, headers, body: json}, 'error fetching access token from IBM');
const err = new Error();
err.statusCode = statusCode;
throw err;
}
const json = await body.json();
await client.setAsync(key, json.access_token, 'EX', json.expires_in - 30);
return {...json, servedFromCache: false};
} catch (err) {
debug(err, 'getIbmAccessToken: Error retrieving IBM access token');
logger.error(err, 'getIbmAccessToken: Error retrieving IBM access token');
throw err;
}
}
module.exports = getIbmAccessToken;
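
Via the index wiring, client and logger are pre-bound, so callers pass only the api key. A short sketch mirroring test/ibm.js below (assumes an async context):

const {client, getIbmAccessToken} = require('..')(opts, logger);
const {access_token, servedFromCache} = await getIbmAccessToken(process.env.IBM_API_KEY);
// a second call within expires_in - 30 seconds is served from the redis cache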

lib/get-nuance-access-token.js Normal file

@@ -0,0 +1,49 @@
const formurlencoded = require('form-urlencoded');
const {Pool} = require('undici');
const pool = new Pool('https://auth.crt.nuance.com');
const {makeNuanceKey, makeBasicAuthHeader, noopLogger} = require('./utils');
const debug = require('debug')('jambonz:realtimedb-helpers');
const HTTP_TIMEOUT = 5000;
async function getNuanceAccessToken(client, logger, clientId, secret, scope) {
logger = logger || noopLogger;
try {
const key = makeNuanceKey(clientId, secret, scope);
const access_token = await client.getAsync(key);
if (access_token) return {access_token, servedFromCache: true};
/* access token not found in cache, so fetch it from Nuance */
const payload = {
grant_type: 'client_credentials',
scope
};
const auth = makeBasicAuthHeader(clientId, secret);
const {statusCode, headers, body} = await pool.request({
path: '/oauth2/token',
method: 'POST',
headers: {
...auth,
'Content-Type': 'application/x-www-form-urlencoded'
},
body: formurlencoded(payload),
timeout: HTTP_TIMEOUT,
followRedirects: false
});
if (200 !== statusCode) {
logger.debug({statusCode, headers, body: await body.text()}, 'error fetching access token from Nuance');
const err = new Error();
err.statusCode = statusCode;
throw err;
}
const json = await body.json();
await client.setAsync(key, json.access_token, 'EX', json.expires_in - 30);
return {...json, servedFromCache: false};
} catch (err) {
debug(err, `getNuanceAccessToken: Error retrieving Nuance access token for client_id ${clientId}`);
logger.error(err, `getNuanceAccessToken: Error retrieving Nuance access token for client_id ${clientId}`);
throw err;
}
}
module.exports = getNuanceAccessToken;

111
lib/get-tts-voices.js Normal file

@@ -0,0 +1,111 @@
const assert = require('assert');
const {noopLogger, createNuanceClient} = require('./utils');
const getNuanceAccessToken = require('./get-nuance-access-token');
const {GetVoicesRequest, Voice} = require('../stubs/nuance/synthesizer_pb');
const TextToSpeechV1 = require('ibm-watson/text-to-speech/v1');
const { IamAuthenticator } = require('ibm-watson/auth');
const getIbmVoices = async(client, logger, credentials) => {
const {tts_region, tts_api_key} = credentials;
const textToSpeech = new TextToSpeechV1({
authenticator: new IamAuthenticator({
apikey: tts_api_key,
}),
serviceUrl: `https://api.${tts_region}.text-to-speech.watson.cloud.ibm.com`
});
const voices = await textToSpeech.listVoices();
return voices;
};
const getNuanceVoices = async(client, logger, credentials) => {
const {client_id: clientId, secret} = credentials;
return new Promise(async(resolve, reject) => {
/* get a nuance access token */
let token, nuanceClient;
try {
const access_token = await getNuanceAccessToken(client, logger, clientId, secret, 'tts');
token = access_token.access_token;
nuanceClient = await createNuanceClient(token);
} catch (err) {
logger.error({err}, 'getTtsVoices: error retrieving access token');
return reject(err);
}
/* retrieve all voices */
const v = new Voice();
const request = new GetVoicesRequest();
request.setVoice(v);
nuanceClient.getVoices(request, (err, response) => {
if (err) {
logger.error({err, clientId}, 'getTtsVoices: error retrieving voices');
return reject(err);
}
/* return all the voices that are not restricted and eliminate duplicates */
const voices = response.getVoicesList()
.map((v) => {
return {
language: v.getLanguage(),
name: v.getName(),
model: v.getModel(),
gender: v.getGender() === 1 ? 'male' : 'female',
restricted: v.getRestricted()
};
});
const filtered = voices
.filter((v) => v.restricted === false)
.map((v) => {
delete v.restricted;
return v;
})
.sort((a, b) => {
if (a.language < b.language) return -1;
if (a.language > b.language) return 1;
if (a.name < b.name) return -1;
return 1;
});
const arr = [...new Set(filtered.map((v) => JSON.stringify(v)))]
.map((v) => JSON.parse(v));
resolve(arr);
});
});
};
/**
* Retrieve the list of text-to-speech voices offered by a vendor.
*
* @param {*} client - redis client
* @param {*} logger - pino logger
* @param {object} opts - options
* @param {string} opts.vendor - 'nuance' or 'ibm'
* @param {object} opts.credentials - vendor-specific credentials
* @returns for nuance, an array of unrestricted, de-duplicated voices sorted by
* language and name; for ibm, the raw listVoices response
*/
async function getTtsVoices(client, logger, {vendor, credentials}) {
logger = logger || noopLogger;
assert.ok(['nuance', 'ibm'].includes(vendor),
`getTtsVoices not supported for vendor ${vendor}`);
switch (vendor) {
case 'nuance':
return getNuanceVoices(client, logger, credentials);
case 'ibm':
return getIbmVoices(client, logger, credentials);
default:
break;
}
}
module.exports = getTtsVoices;

46
lib/purge-tts-cache.js Normal file

@@ -0,0 +1,46 @@
const {noopLogger, makeSynthKey} = require('./utils');
const debug = require('debug')('jambonz:realtimedb-helpers');
/**
* Scan TTS Cache and purge records, use specific settings to purge just one
* @param {object} opts - options
* @param {boolean} opts.all - purge all records, or only the one matching the options below; true by default
* @param {string} opts.vendor - 'google' or 'aws' ('polly' is an alias for 'aws')
* @param {string} opts.language - language code
* @param {string} opts.voice - voice identifier
* @param {string} opts.text - text or ssml to synthesize
* @returns {object} result - {error, purgedCount}
*/
async function purgeTtsCache(client, logger, {all, vendor, language, voice, deploymentId, engine, text} = {all: true}) {
logger = logger || noopLogger;
let purgedCount = 0, error;
try {
if (all) {
const keys = await client.keysAsync('tts:*');
purgedCount = keys.length ? await client.delAsync(keys) : 0;
} else {
const key = makeSynthKey({
vendor,
language: language || '',
voice: voice || deploymentId,
engine,
text,
});
purgedCount = await client.delAsync(key);
if (purgedCount === 0) error = 'Specified item not found';
}
} catch (err) {
debug(err, 'purgeTtsCache: Error');
logger.error(err, 'purgeTtsCache: Error');
error = err.message ?? 'Unknown Error';
}
logger.info(`purgeTtsCache: purged ${purgedCount} records`);
return {error, purgedCount};
}
module.exports = purgeTtsCache;
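
Usage follows the two modes in the JSDoc: purge the whole cache, or purge one record keyed by the same fields makeSynthKey hashes. A sketch mirroring the cache tests below (the engine value here is illustrative):

const {client, purgeTtsCache} = require('..')(opts, logger);

// purge everything under tts:*
const {purgedCount} = await purgeTtsCache();

// purge a single cached result; error is set if no matching key exists
const {error} = await purgeTtsCache({
  all: false,
  vendor: 'aws',
  language: 'en-US',
  voice: 'Joey',
  engine: 'standard',
  text: 'This is a test. This is only a test',
});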

436
lib/synth-audio.js Normal file

@@ -0,0 +1,436 @@
const assert = require('assert');
const fs = require('fs');
const bent = require('bent');
const ttsGoogle = require('@google-cloud/text-to-speech');
//const Polly = require('aws-sdk/clients/polly');
const { PollyClient, SynthesizeSpeechCommand } = require('@aws-sdk/client-polly');
const sdk = require('microsoft-cognitiveservices-speech-sdk');
const TextToSpeechV1 = require('ibm-watson/text-to-speech/v1');
const { IamAuthenticator } = require('ibm-watson/auth');
const {
AudioConfig,
ResultReason,
SpeechConfig,
SpeechSynthesizer,
CancellationDetails,
SpeechSynthesisOutputFormat
} = sdk;
const {makeSynthKey, createNuanceClient, noopLogger, createRivaClient} = require('./utils');
const getNuanceAccessToken = require('./get-nuance-access-token');
const {
SynthesisRequest,
Voice,
AudioFormat,
AudioParameters,
PCM,
Input,
Text,
SSML,
EventParameters
} = require('../stubs/nuance/synthesizer_pb');
const {SynthesizeSpeechRequest} = require('../stubs/riva/proto/riva_tts_pb');
const {AudioEncoding} = require('../stubs/riva/proto/riva_audio_pb');
const debug = require('debug')('jambonz:realtimedb-helpers');
const EXPIRES = 3600 * 24; // cache tts for 24 hours
const TMP_FOLDER = '/tmp';
/**
* Synthesize speech to an mp3 file, and also cache the generated speech
* in redis (base64 format) for 24 hours so as to avoid unnecessarily paying
* time and again for speech synthesis of the same text.
* It is the responsibility of the caller to unlink the mp3 file after use.
*
* @param {*} client - redis client
* @param {*} logger - pino logger
* @param {object} opts - options
* @param {string} opts.vendor - 'google', 'aws' ('polly' is an alias for 'aws'), 'microsoft', 'wellsaid', 'nuance', 'nvidia' or 'ibm'
* @param {string} opts.language - language code
* @param {string} opts.voice - voice identifier
* @param {string} opts.text - text or ssml to synthesize
* @param {boolean} opts.disableTtsCache - disable TTS Cache retrieval
* @returns object containing filepath to an mp3 file in the /tmp folder containing
* the synthesized audio, and a variable indicating whether it was served from cache
*/
async function synthAudio(client, logger, stats, {
vendor, language, voice, gender, text, engine, salt, model, credentials, deploymentId, disableTtsCache
}) {
let audioBuffer;
let servedFromCache = false;
let rtt;
logger = logger || noopLogger;
assert.ok(['google', 'aws', 'polly', 'microsoft', 'wellsaid', 'nuance', 'nvidia', 'ibm'].includes(vendor),
`synthAudio supported vendors are google, aws, microsoft, wellsaid, nuance, nvidia and ibm, not ${vendor}`);
if ('google' === vendor) {
assert.ok(language, 'synthAudio requires language when google is used');
}
else if (['aws', 'polly'].includes(vendor)) {
assert.ok(voice, 'synthAudio requires voice when aws polly is used');
}
else if ('microsoft' === vendor) {
assert.ok(language || deploymentId, 'synthAudio requires language when microsoft is used');
assert.ok(voice || deploymentId, 'synthAudio requires voice when microsoft is used');
}
else if ('nuance' === vendor) {
assert.ok(voice, 'synthAudio requires voice when nuance is used');
assert.ok(credentials.client_id, 'synthAudio requires client_id in credentials when nuance is used');
assert.ok(credentials.secret, 'synthAudio requires secret in credentials when nuance is used');
}
else if ('nvidia' === vendor) {
assert.ok(voice, 'synthAudio requires voice when nvidia is used');
assert.ok(language, 'synthAudio requires language when nvidia is used');
assert.ok(credentials.riva_uri, 'synthAudio requires riva_uri in credentials when nvidia is used');
}
else if ('ibm' === vendor) {
assert.ok(voice, 'synthAudio requires voice when ibm is used');
assert.ok(credentials.tts_region, 'synthAudio requires tts_region in credentials when ibm watson is used');
assert.ok(credentials.tts_api_key, 'synthAudio requires tts_api_key in credentials when ibm watson is used');
}
else if ('wellsaid' === vendor) {
language = 'en-US'; // WellSaid only supports English atm
assert.ok(voice, 'synthAudio requires voice when wellsaid is used');
assert.ok(!text.startsWith('<speak'), 'wellsaid does not support SSML tags');
}
const key = makeSynthKey({
vendor,
language: language || '',
voice: voice || deploymentId,
engine,
text
});
let filePath;
if (['nuance', 'nvidia'].includes(vendor)) {
filePath = `${TMP_FOLDER}/${key.replace('tts:', `tts-${salt || ''}`)}.r8`;
}
else filePath = `${TMP_FOLDER}/${key.replace('tts:', `tts-${salt || ''}`)}.mp3`;
debug(`synth key is ${key}`);
let cached;
if (!disableTtsCache) {
cached = await client.getAsync(key);
}
if (cached) {
// found in cache - extend the expiry and use it
debug('result WAS found in cache');
servedFromCache = true;
stats.increment('tts.cache.requests', ['found:yes']);
audioBuffer = Buffer.from(cached, 'base64');
client.expireAsync(key, EXPIRES).catch((err) => logger.info(err, 'Error setting expires'));
}
if (!cached) {
// not found in cache - go get it from speech vendor and add to cache
debug('result was NOT found in cache');
stats.increment('tts.cache.requests', ['found:no']);
let vendorLabel = vendor;
const startAt = process.hrtime();
switch (vendor) {
case 'google':
audioBuffer = await synthGoogle(logger, {credentials, stats, language, voice, gender, text});
break;
case 'aws':
case 'polly':
vendorLabel = 'aws';
audioBuffer = await synthPolly(logger, {credentials, stats, language, voice, text, engine});
break;
case 'azure':
case 'microsoft':
vendorLabel = 'microsoft';
audioBuffer = await synthMicrosoft(logger, {credentials, stats, language, voice, text, deploymentId, filePath});
break;
case 'nuance':
model = model || 'enhanced';
audioBuffer = await synthNuance(client, logger, {credentials, stats, voice, model, text});
break;
case 'nvidia':
audioBuffer = await synthNvidia(client, logger, {credentials, stats, language, voice, model, text});
break;
case 'ibm':
audioBuffer = await synthIbm(logger, {credentials, stats, voice, text});
break;
case 'wellsaid':
audioBuffer = await synthWellSaid(logger, {credentials, stats, language, voice, text, filePath});
break;
default:
assert.fail(`synthAudio: unsupported speech vendor ${vendor}`);
}
const diff = process.hrtime(startAt);
const time = diff[0] * 1e3 + diff[1] * 1e-6;
rtt = time.toFixed(0);
stats.histogram('tts.response_time', rtt, [`vendor:${vendorLabel}`]);
debug(`tts rtt time for ${text.length} chars on ${vendorLabel}: ${rtt}`);
logger.info(`tts rtt time for ${text.length} chars on ${vendorLabel}: ${rtt}`);
client.setexAsync(key, EXPIRES, audioBuffer.toString('base64'))
.catch((err) => logger.error(err, `error calling setex on key ${key}`));
if ('microsoft' === vendorLabel) return {filePath, servedFromCache, rtt};
}
return new Promise((resolve, reject) => {
fs.writeFile(filePath, audioBuffer, (err) => {
if (err) return reject(err);
resolve({filePath, servedFromCache, rtt});
});
});
}
const synthPolly = async(logger, {credentials, stats, language, voice, engine, text}) => {
try {
const {accessKeyId, secretAccessKey, region} = credentials;
const polly = new PollyClient({region, credentials: {accessKeyId, secretAccessKey}});
const opts = {
Engine: engine,
OutputFormat: 'mp3',
Text: text,
LanguageCode: language,
TextType: text.startsWith('<speak>') ? 'ssml' : 'text',
VoiceId: voice
};
const command = new SynthesizeSpeechCommand(opts);
const data = await polly.send(command);
const chunks = [];
return new Promise((resolve, reject) => {
data.AudioStream
.on('error', (err) => {
logger.info({err}, 'synthAudio: Error synthesizing speech using aws polly');
stats.increment('tts.count', ['vendor:aws', 'accepted:no']);
reject(err);
})
.on('data', (chunk) => {
chunks.push(chunk);
})
.on('end', () => resolve(Buffer.concat(chunks)));
});
} catch (err) {
logger.info({err}, 'synthAudio: Error synthesizing speech using aws polly');
stats.increment('tts.count', ['vendor:aws', 'accepted:no']);
throw err;
}
};
const synthGoogle = async(logger, {credentials, stats, language, voice, gender, text}) => {
const client = new ttsGoogle.TextToSpeechClient(credentials);
const opts = {
voice: {
name: voice,
languageCode: language,
ssmlGender: gender || 'SSML_VOICE_GENDER_UNSPECIFIED'
},
audioConfig: {audioEncoding: 'MP3'}
};
Object.assign(opts, {input: text.startsWith('<speak>') ? {ssml: text} : {text}});
try {
const responses = await client.synthesizeSpeech(opts);
stats.increment('tts.count', ['vendor:google', 'accepted:yes']);
client.close();
return responses[0].audioContent;
} catch (err) {
logger.info({err, opts}, 'synthAudio: Error synthesizing speech using google');
stats.increment('tts.count', ['vendor:google', 'accepted:no']);
client && client.close();
throw err;
}
};
const synthIbm = async(logger, {credentials, stats, voice, text}) => {
const {tts_api_key, tts_region} = credentials;
const params = {
text,
voice,
accept: 'audio/mp3'
};
try {
const textToSpeech = new TextToSpeechV1({
authenticator: new IamAuthenticator({
apikey: tts_api_key,
}),
serviceUrl: `https://api.${tts_region}.text-to-speech.watson.cloud.ibm.com`
});
const r = await textToSpeech.synthesize(params);
const chunks = [];
for await (const chunk of r.result) {
chunks.push(chunk);
}
return Buffer.concat(chunks);
} catch (err) {
logger.info({err, params}, 'synthAudio: Error synthesizing speech using ibm');
stats.increment('tts.count', ['vendor:ibm', 'accepted:no']);
throw new Error(err.statusText || err.message);
}
};
const synthMicrosoft = async(logger, {
credentials,
stats,
language,
voice,
text,
filePath
}) => {
try {
const {api_key: apiKey, region, use_custom_tts, custom_tts_endpoint} = credentials;
let content = text;
const speechConfig = SpeechConfig.fromSubscription(apiKey, region);
speechConfig.speechSynthesisLanguage = language;
speechConfig.speechSynthesisVoiceName = voice;
if (use_custom_tts && custom_tts_endpoint) {
speechConfig.endpointId = custom_tts_endpoint;
/**
* Note: it seems that to use custom voice ssml is required with the voice attribute
* Otherwise sending plain text we get "Voice does not match"
*/
if (!content.startsWith('<speak')) content = `<speak>${text}</speak>`;
}
speechConfig.speechSynthesisOutputFormat = SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3;
const config = AudioConfig.fromAudioFileOutput(filePath);
const synthesizer = new SpeechSynthesizer(speechConfig, config);
if (content.startsWith('<speak>')) {
/* microsoft enforces some properties and uses voice xml element so if the user did not supply do it for them */
const words = content.slice(7, -8).trim().replace(/(\r\n|\n|\r)/gm, ' ');
// eslint-disable-next-line max-len
content = `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${language}"><voice name="${voice}">${words}</voice></speak>`;
logger.info({content}, 'synthMicrosoft');
}
return new Promise((resolve, reject) => {
const speakAsync = content.startsWith('<speak') ?
synthesizer.speakSsmlAsync.bind(synthesizer) :
synthesizer.speakTextAsync.bind(synthesizer);
speakAsync(
content,
async(result) => {
switch (result.reason) {
case ResultReason.Canceled:
const cancellation = CancellationDetails.fromResult(result);
logger.info({reason: cancellation.errorDetails}, 'synthAudio: (Microsoft) synthesis canceled');
synthesizer.close();
reject(cancellation.errorDetails);
break;
case ResultReason.SynthesizingAudioCompleted:
stats.increment('tts.count', ['vendor:microsoft', 'accepted:yes']);
synthesizer.close();
fs.readFile(filePath, (err, data) => {
if (err) return reject(err);
resolve(data);
});
break;
default:
logger.info({result}, 'synthAudio: (Microsoft) unexpected result');
break;
}
},
(err) => {
logger.info({err}, 'synthAudio: (Microsoft) error synthesizing');
stats.increment('tts.count', ['vendor:microsoft', 'accepted:no']);
synthesizer.close();
reject(err);
});
});
} catch (err) {
logger.info({err}, 'synthAudio: Error synthesizing speech using Microsoft');
stats.increment('tts.count', ['vendor:microsoft', 'accepted:no']);
throw err;
}
};
const synthWellSaid = async(logger, {credentials, stats, language, voice, gender, text}) => {
const {api_key} = credentials;
try {
const post = bent('https://api.wellsaidlabs.com', 'POST', 'buffer', {
'X-Api-Key': api_key,
'Accept': 'audio/mpeg',
'Content-Type': 'application/json'
});
const mp3 = await post('/v1/tts/stream', {
text,
speaker_id: voice
});
return mp3;
} catch (err) {
logger.info({err}, 'synthWellSaid returned error');
throw err;
}
};
const synthNuance = async(client, logger, {credentials, stats, voice, model, text}) => {
/* get a nuance access token */
const {client_id, secret} = credentials;
const {access_token} = await getNuanceAccessToken(client, logger, client_id, secret, 'tts');
const nuanceClient = await createNuanceClient(access_token);
const v = new Voice();
const p = new AudioParameters();
const f = new AudioFormat();
const pcm = new PCM();
const params = new EventParameters();
const request = new SynthesisRequest();
const input = new Input();
if (text.startsWith('<speak')) {
const ssml = new SSML();
ssml.setText(text);
input.setSsml(ssml);
}
else {
const t = new Text();
t.setText(text);
input.setText(t);
}
pcm.setSampleRateHz(8000);
f.setPcm(pcm);
p.setAudioFormat(f);
v.setName(voice);
v.setModel(model);
request.setVoice(v);
request.setAudioParams(p);
request.setInput(input);
request.setEventParams(params);
request.setUserId('jambonz');
return new Promise((resolve, reject) => {
nuanceClient.unarySynthesize(request, (err, response) => {
if (err) {
logger.info({err}, 'synthNuance returned error');
return reject(err);
}
const status = response.getStatus();
const code = status.getCode();
if (code !== 200) {
const message = status.getMessage();
const details = status.getDetails();
return reject({code, message, details});
}
resolve(Buffer.from(response.getAudio()));
});
});
};
const synthNvidia = async(client, logger, {credentials, stats, language, voice, model, text}) => {
const {riva_uri} = credentials;
const rivaClient = await createRivaClient(riva_uri);
const request = new SynthesizeSpeechRequest();
request.setVoiceName(voice);
request.setLanguageCode(language);
request.setSampleRateHz(8000);
request.setEncoding(AudioEncoding.LINEAR_PCM);
request.setText(text);
return new Promise((resolve, reject) => {
logger.debug(`synthNvidia: language ${language} voice ${voice} model ${model} text ${text}`);
rivaClient.synthesize(request, (err, response) => {
if (err) {
logger.info({err}, 'synthNvidia returned error');
return reject(err);
}
resolve(Buffer.from(response.getAudio()));
});
});
};
module.exports = synthAudio;

25
lib/utils.js Normal file

@@ -0,0 +1,25 @@
const crypto = require('crypto');
/**
* Future TODO: cache recently used connections to providers
* to avoid connection overhead during a call.
* Will need to periodically age them out to avoid memory leaks.
*/
//const nuanceClientMap = new Map();
function makeSynthKey({vendor, language, voice, engine = '', text}) {
const hash = crypto.createHash('sha1');
hash.update(`${language}:${vendor}:${voice}:${engine}:${text}`);
return `tts:${hash.digest('hex')}`;
}
const noopLogger = {
info: () => {},
debug: () => {},
error: () => {}
};
module.exports = {
makeSynthKey,
noopLogger
};
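
Note: the other modules also import makeNuanceKey, makeIbmKey, makeBasicAuthHeader, createNuanceClient and createRivaClient from lib/utils, so this 25-line diff appears truncated. A sketch of what the key and auth helpers might look like, modeled on makeSynthKey above — the exact key layout is an assumption, and the gRPC client factories are omitted since their wiring against the generated stubs is not shown here:

// Sketch only: hashed cache keys for vendor access tokens (assumed key layout)
function makeNuanceKey(clientId, secret, scope) {
  const hash = crypto.createHash('sha1');
  hash.update(`${clientId}:${secret}:${scope}`);
  return `nuance:${hash.digest('hex')}`;
}
function makeIbmKey(apiKey) {
  const hash = crypto.createHash('sha1');
  hash.update(apiKey);
  return `ibm:${hash.digest('hex')}`;
}
// Sketch only: HTTP Basic authorization header for the Nuance OAuth2 token endpoint
function makeBasicAuthHeader(username, password) {
  const creds = Buffer.from(`${username}:${password}`).toString('base64');
  return {Authorization: `Basic ${creds}`};
}

These would be exported alongside makeSynthKey and noopLogger.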

1902
package-lock.json generated

File diff suppressed because it is too large

package.json

@@ -24,6 +24,7 @@
},
"homepage": "https://github.com/jambonz/speech-utils#readme",
"dependencies": {
"@aws-sdk/client-polly": "^3.269.0",
"@google-cloud/text-to-speech": "^4.2.0",
"@grpc/grpc-js": "^1.8.7",
"@jambonz/realtimedb-helpers": "^0.6.3",
@@ -36,6 +37,7 @@
"undici": "^5.18.0"
},
"devDependencies": {
"config": "^3.3.9",
"eslint": "^8.33.0",
"eslint-plugin-promise": "^6.1.1",
"nyc": "^15.1.0",

stubs/riva/proto/riva_audio.proto Normal file

@@ -0,0 +1,36 @@
// SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: MIT
syntax = "proto3";
package nvidia.riva;
option cc_enable_arenas = true;
option go_package = "nvidia.com/riva_speech";
/*
* AudioEncoding specifies the encoding of the audio bytes in the encapsulating message.
*/
enum AudioEncoding {
// Not specified.
ENCODING_UNSPECIFIED = 0;
// Uncompressed 16-bit signed little-endian samples (Linear PCM).
LINEAR_PCM = 1;
// `FLAC` (Free Lossless Audio
// Codec) is the recommended encoding because it is
// lossless--therefore recognition is not compromised--and
// requires only about half the bandwidth of `LINEAR16`. `FLAC` stream
// encoding supports 16-bit and 24-bit samples, however, not all fields in
// `STREAMINFO` are supported.
FLAC = 2;
// 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
MULAW = 3;
OGGOPUS = 4;
// 8-bit samples that compand 13-bit audio samples using G.711 PCMU/a-law.
ALAW = 20;
}

stubs/riva/proto/riva_tts.proto Normal file

@@ -0,0 +1,77 @@
// SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: MIT
syntax = "proto3";
package nvidia.riva.tts;
option cc_enable_arenas = true;
option go_package = "nvidia.com/riva_speech";
import "riva/proto/riva_audio.proto";
service RivaSpeechSynthesis {
// Used to request text-to-speech from the service. Submit a request containing the
// desired text and configuration, and receive audio bytes in the requested format.
rpc Synthesize(SynthesizeSpeechRequest) returns (SynthesizeSpeechResponse) {}
// Used to request text-to-speech returned via stream as it becomes available.
// Submit a SynthesizeSpeechRequest with desired text and configuration,
// and receive stream of bytes in the requested format.
rpc SynthesizeOnline(SynthesizeSpeechRequest) returns (stream SynthesizeSpeechResponse) {}
//Enables clients to request the configuration of the current Synthesize service, or a specific model within the service.
rpc GetRivaSynthesisConfig(RivaSynthesisConfigRequest) returns (RivaSynthesisConfigResponse) {}
}
message RivaSynthesisConfigRequest {
//If model is specified only return config for model, otherwise return all configs.
string model_name = 1;
}
message RivaSynthesisConfigResponse {
message Config {
string model_name = 1;
map<string,string> parameters = 2;
}
repeated Config model_config = 1;
}
message SynthesizeSpeechRequest {
string text = 1;
string language_code = 2;
// audio encoding params
AudioEncoding encoding = 3;
// The sample rate in hertz (Hz) of the audio output requested through `SynthesizeSpeechRequest` messages.
// Models produce an output at a fixed rate. The sample rate enables you to resample the generated audio output if required.
// You use the sample rate to up-sample or down-sample the audio for various scenarios. For example, the sample rate can be set to 8kHz (kilohertz) if the output
// audio is desired for a low bandwidth application.
// The sample rate values below 8kHz will not produce any meaningful output. Also, up-sampling too much will increase the
// size of the output without improving the output audio quality.
int32 sample_rate_hz = 4;
// voice params
string voice_name = 5;
}
message SynthesizeSpeechResponseMetadata {
// Currently experimental API addition that returns the input text
// after preprocessing has been completed as well as the predicted
// duration for each token.
// Note: this message is subject to future breaking changes, and potential
// removal.
string text = 1;
string processed_text = 2;
repeated float predicted_durations = 8;
}
message SynthesizeSpeechResponse {
bytes audio = 1;
SynthesizeSpeechResponseMetadata meta = 2;
}
/*
*
*/

stubs/riva/proto/riva_audio_grpc_pb.js Normal file

@@ -0,0 +1 @@
// GENERATED CODE -- NO SERVICES IN PROTO

stubs/riva/proto/riva_audio_pb.js Normal file

@@ -0,0 +1,31 @@
// source: riva/proto/riva_audio.proto
/**
* @fileoverview
* @enhanceable
* @suppress {missingRequire} reports error on implicit type usages.
* @suppress {messageConventions} JS Compiler reports an error if a variable or
* field starts with 'MSG_' and isn't a translatable message.
* @public
*/
// GENERATED CODE -- DO NOT EDIT!
/* eslint-disable */
// @ts-nocheck
var jspb = require('google-protobuf');
var goog = jspb;
var global = Function('return this')();
goog.exportSymbol('proto.nvidia.riva.AudioEncoding', null, global);
/**
* @enum {number}
*/
proto.nvidia.riva.AudioEncoding = {
ENCODING_UNSPECIFIED: 0,
LINEAR_PCM: 1,
FLAC: 2,
MULAW: 3,
OGGOPUS: 4,
ALAW: 20
};
goog.object.extend(exports, proto.nvidia.riva);

stubs/riva/proto/riva_tts_grpc_pb.js Normal file

@@ -0,0 +1,99 @@
// GENERATED CODE -- DO NOT EDIT!
// Original file comments:
// SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: MIT
//
'use strict';
var grpc = require('@grpc/grpc-js');
var riva_proto_riva_tts_pb = require('../../riva/proto/riva_tts_pb.js');
var riva_proto_riva_audio_pb = require('../../riva/proto/riva_audio_pb.js');
function serialize_nvidia_riva_tts_RivaSynthesisConfigRequest(arg) {
if (!(arg instanceof riva_proto_riva_tts_pb.RivaSynthesisConfigRequest)) {
throw new Error('Expected argument of type nvidia.riva.tts.RivaSynthesisConfigRequest');
}
return Buffer.from(arg.serializeBinary());
}
function deserialize_nvidia_riva_tts_RivaSynthesisConfigRequest(buffer_arg) {
return riva_proto_riva_tts_pb.RivaSynthesisConfigRequest.deserializeBinary(new Uint8Array(buffer_arg));
}
function serialize_nvidia_riva_tts_RivaSynthesisConfigResponse(arg) {
if (!(arg instanceof riva_proto_riva_tts_pb.RivaSynthesisConfigResponse)) {
throw new Error('Expected argument of type nvidia.riva.tts.RivaSynthesisConfigResponse');
}
return Buffer.from(arg.serializeBinary());
}
function deserialize_nvidia_riva_tts_RivaSynthesisConfigResponse(buffer_arg) {
return riva_proto_riva_tts_pb.RivaSynthesisConfigResponse.deserializeBinary(new Uint8Array(buffer_arg));
}
function serialize_nvidia_riva_tts_SynthesizeSpeechRequest(arg) {
if (!(arg instanceof riva_proto_riva_tts_pb.SynthesizeSpeechRequest)) {
throw new Error('Expected argument of type nvidia.riva.tts.SynthesizeSpeechRequest');
}
return Buffer.from(arg.serializeBinary());
}
function deserialize_nvidia_riva_tts_SynthesizeSpeechRequest(buffer_arg) {
return riva_proto_riva_tts_pb.SynthesizeSpeechRequest.deserializeBinary(new Uint8Array(buffer_arg));
}
function serialize_nvidia_riva_tts_SynthesizeSpeechResponse(arg) {
if (!(arg instanceof riva_proto_riva_tts_pb.SynthesizeSpeechResponse)) {
throw new Error('Expected argument of type nvidia.riva.tts.SynthesizeSpeechResponse');
}
return Buffer.from(arg.serializeBinary());
}
function deserialize_nvidia_riva_tts_SynthesizeSpeechResponse(buffer_arg) {
return riva_proto_riva_tts_pb.SynthesizeSpeechResponse.deserializeBinary(new Uint8Array(buffer_arg));
}
var RivaSpeechSynthesisService = exports.RivaSpeechSynthesisService = {
// Used to request text-to-speech from the service. Submit a request containing the
// desired text and configuration, and receive audio bytes in the requested format.
synthesize: {
path: '/nvidia.riva.tts.RivaSpeechSynthesis/Synthesize',
requestStream: false,
responseStream: false,
requestType: riva_proto_riva_tts_pb.SynthesizeSpeechRequest,
responseType: riva_proto_riva_tts_pb.SynthesizeSpeechResponse,
requestSerialize: serialize_nvidia_riva_tts_SynthesizeSpeechRequest,
requestDeserialize: deserialize_nvidia_riva_tts_SynthesizeSpeechRequest,
responseSerialize: serialize_nvidia_riva_tts_SynthesizeSpeechResponse,
responseDeserialize: deserialize_nvidia_riva_tts_SynthesizeSpeechResponse,
},
// Used to request text-to-speech returned via stream as it becomes available.
// Submit a SynthesizeSpeechRequest with desired text and configuration,
// and receive stream of bytes in the requested format.
synthesizeOnline: {
path: '/nvidia.riva.tts.RivaSpeechSynthesis/SynthesizeOnline',
requestStream: false,
responseStream: true,
requestType: riva_proto_riva_tts_pb.SynthesizeSpeechRequest,
responseType: riva_proto_riva_tts_pb.SynthesizeSpeechResponse,
requestSerialize: serialize_nvidia_riva_tts_SynthesizeSpeechRequest,
requestDeserialize: deserialize_nvidia_riva_tts_SynthesizeSpeechRequest,
responseSerialize: serialize_nvidia_riva_tts_SynthesizeSpeechResponse,
responseDeserialize: deserialize_nvidia_riva_tts_SynthesizeSpeechResponse,
},
// Enables clients to request the configuration of the current Synthesize service, or a specific model within the service.
getRivaSynthesisConfig: {
path: '/nvidia.riva.tts.RivaSpeechSynthesis/GetRivaSynthesisConfig',
requestStream: false,
responseStream: false,
requestType: riva_proto_riva_tts_pb.RivaSynthesisConfigRequest,
responseType: riva_proto_riva_tts_pb.RivaSynthesisConfigResponse,
requestSerialize: serialize_nvidia_riva_tts_RivaSynthesisConfigRequest,
requestDeserialize: deserialize_nvidia_riva_tts_RivaSynthesisConfigRequest,
responseSerialize: serialize_nvidia_riva_tts_RivaSynthesisConfigResponse,
responseDeserialize: deserialize_nvidia_riva_tts_RivaSynthesisConfigResponse,
},
};
exports.RivaSpeechSynthesisClient = grpc.makeGenericClientConstructor(RivaSpeechSynthesisService);

stubs/riva/proto/riva_tts_pb.js generated

File diff suppressed because it is too large

12
test/docker_start.js Normal file

@@ -0,0 +1,12 @@
const test = require('tape').test ;
const exec = require('child_process').exec ;
test('starting docker network..', (t) => {
exec(`docker-compose -f ${__dirname}/docker-compose-testbed.yaml up -d`, (err, stdout, stderr) => {
setTimeout(() => {
t.end(err);
}, 2000);
});
});

12
test/docker_stop.js Normal file

@@ -0,0 +1,12 @@
const test = require('tape').test ;
const exec = require('child_process').exec ;
test('stopping docker network..', (t) => {
t.timeoutAfter(10000);
exec(`docker-compose -f ${__dirname}/docker-compose-testbed.yaml down`, (err, stdout, stderr) => {
//console.log(`stderr: ${stderr}`);
process.exit(0);
});
t.end() ;
});

78
test/ibm.js Normal file

@@ -0,0 +1,78 @@
const test = require('tape').test ;
const config = require('config');
const opts = config.get('redis');
const fs = require('fs');
const logger = require('pino')({level: 'error'});
process.on('unhandledRejection', (reason, p) => {
console.log('Unhandled Rejection at: Promise', p, 'reason:', reason);
});
const stats = {
increment: () => {},
histogram: () => {}
};
test('IBM - get access token', async(t) => {
const fn = require('..');
const {client, getIbmAccessToken} = fn(opts, logger);
if (!process.env.IBM_API_KEY ) {
t.pass('skipping IBM test since no IBM api_key provided');
t.end();
client.quit();
return;
}
try {
let obj = await getIbmAccessToken(process.env.IBM_API_KEY);
//console.log({obj}, 'received access token from IBM');
t.ok(obj.access_token && !obj.servedFromCache, 'successfully received access token from IBM');
obj = await getIbmAccessToken(process.env.IBM_API_KEY);
//console.log({obj}, 'received access token from IBM - second request');
t.ok(obj.access_token && obj.servedFromCache, 'successfully received access token from cache');
await client.flushallAsync();
t.end();
}
catch (err) {
console.error(err);
t.end(err);
}
client.quit();
});
test('IBM - retrieve tts voices test', async(t) => {
const fn = require('..');
const {client, getTtsVoices} = fn(opts, logger);
if (!process.env.IBM_TTS_API_KEY || !process.env.IBM_TTS_REGION) {
t.pass('skipping IBM test since no IBM api_key and/or region provided');
t.end();
client.quit();
return;
}
try {
const opts = {
vendor: 'ibm',
credentials: {
tts_api_key: process.env.IBM_TTS_API_KEY,
tts_region: process.env.IBM_TTS_REGION
}
};
const obj = await getTtsVoices(opts);
const {voices} = obj.result;
//console.log(JSON.stringify(voices));
t.ok(voices.length > 0 && voices[0].language,
`GetVoices: successfully retrieved ${voices.length} voices from IBM`);
await client.flushallAsync();
t.end();
}
catch (err) {
console.error(err);
t.end(err);
}
client.quit();
});

5
test/index.js Normal file

@@ -0,0 +1,5 @@
require('./docker_start');
require('./synth');
require('./nuance');
require('./ibm');
require('./docker_stop');

50
test/nuance.js Normal file

@@ -0,0 +1,50 @@
const test = require('tape').test ;
const config = require('config');
const opts = config.get('redis');
const fs = require('fs');
const logger = require('pino')({level: 'error'});
process.on('unhandledRejection', (reason, p) => {
console.log('Unhandled Rejection at: Promise', p, 'reason:', reason);
});
const stats = {
increment: () => {},
histogram: () => {}
};
test('Nuance tests', async(t) => {
const fn = require('..');
const {client, getTtsVoices} = fn(opts, logger);
if (!process.env.NUANCE_CLIENT_ID || !process.env.NUANCE_SECRET ) {
t.pass('skipping Nuance test since no Nuance client_id and secret provided');
t.end();
client.quit();
return;
}
try {
const opts = {
vendor: 'nuance',
credentials: {
client_id: process.env.NUANCE_CLIENT_ID,
secret: process.env.NUANCE_SECRET
}
};
let voices = await getTtsVoices(opts);
//console.log(`received ${voices.length} voices from Nuance`);
//console.log(JSON.stringify(voices));
t.ok(voices.length > 0 && voices[0].language,
`GetVoices: successfully retrieved ${voices.length} voices from Nuance`);
await client.flushallAsync();
t.end();
}
catch (err) {
console.error(err);
t.end(err);
}
client.quit();
});

382
test/synth.js Normal file

@@ -0,0 +1,382 @@
const test = require('tape').test;
const config = require('config');
const opts = config.get('redis');
const fs = require('fs');
const {makeSynthKey} = require('../lib/utils');
const logger = require('pino')();
process.on('unhandledRejection', (reason, p) => {
console.log('Unhandled Rejection at: Promise', p, 'reason:', reason);
});
const stats = {
increment: () => {
},
histogram: () => {
},
};
test('Google speech synth tests', async(t) => {
const fn = require('..');
const {synthAudio, client} = fn(opts, logger);
if (!process.env.GCP_FILE && !process.env.GCP_JSON_KEY) {
t.pass('skipping google speech synth tests since neither GCP_FILE nor GCP_JSON_KEY provided');
return t.end();
}
try {
const str = process.env.GCP_JSON_KEY || fs.readFileSync(process.env.GCP_FILE);
const creds = JSON.parse(str);
let opts = await synthAudio(stats, {
vendor: 'google',
credentials: {
credentials: {
client_email: creds.client_email,
private_key: creds.private_key,
},
},
language: 'en-GB',
gender: 'MALE',
text: 'This is a test. This is only a test',
salt: 'foo.bar',
});
t.ok(!opts.servedFromCache, `successfully synthesized google audio to ${opts.filePath}`);
opts = await synthAudio(stats, {
vendor: 'google',
credentials: {
credentials: {
client_email: creds.client_email,
private_key: creds.private_key,
},
},
language: 'en-GB',
gender: 'MALE',
text: 'This is a test. This is only a test',
});
t.ok(opts.servedFromCache, `successfully retrieved cached google audio from ${opts.filePath}`);
opts = await synthAudio(stats, {
vendor: 'google',
credentials: {
credentials: {
client_email: creds.client_email,
private_key: creds.private_key,
},
},
disableTtsCache: true,
language: 'en-GB',
gender: 'MALE',
text: 'This is a test. This is only a test',
});
t.ok(!opts.servedFromCache, `successfully synthesized google audio regardless of current cache to ${opts.filePath}`);
} catch (err) {
console.error(err);
t.end(err);
}
client.quit();
});
test('AWS speech synth tests', async(t) => {
const fn = require('..');
const {synthAudio, client} = fn(opts, logger);
if (!process.env.AWS_ACCESS_KEY_ID || !process.env.AWS_SECRET_ACCESS_KEY || !process.env.AWS_REGION) {
t.pass('skipping AWS speech synth tests since AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, or AWS_REGION not provided');
return t.end();
}
try {
let opts = await synthAudio(stats, {
vendor: 'aws',
credentials: {
accessKeyId: process.env.AWS_ACCESS_KEY_ID,
secretAccessKey: process.env.AWS_SECRET_ACCESS_KEY,
region: process.env.AWS_REGION,
},
language: 'en-US',
voice: 'Joey',
text: 'This is a test. This is only a test',
});
t.ok(!opts.servedFromCache, `successfully synthesized aws audio to ${opts.filePath}`);
opts = await synthAudio(stats, {
vendor: 'aws',
credentials: {
accessKeyId: process.env.AWS_ACCESS_KEY_ID,
secretAccessKey: process.env.AWS_SECRET_ACCESS_KEY,
region: process.env.AWS_REGION,
},
language: 'en-US',
voice: 'Joey',
text: 'This is a test. This is only a test',
});
t.ok(opts.servedFromCache, `successfully retrieved aws audio from cache ${opts.filePath}`);
} catch (err) {
console.error(err);
t.end(err);
}
client.quit();
});
test('Azure speech synth tests', async(t) => {
const fn = require('..');
const {synthAudio, client} = fn(opts, logger);
if (!process.env.MICROSOFT_API_KEY || !process.env.MICROSOFT_REGION) {
t.pass('skipping Microsoft speech synth tests since MICROSOFT_API_KEY or MICROSOFT_REGION not provided');
return t.end();
}
try {
const longText = `Henry is best known for his six marriages, including his efforts to have his first marriage
(to Catherine of Aragon) annulled. His disagreement with Pope Clement VII about such an
annulment led Henry to initiate the English Reformation,
separating the Church of England from papal authority. He appointed himself Supreme Head of the Church of England
and dissolved convents and monasteries, for which he was excommunicated.
Henry is also known as "the father of the Royal Navy," as he invested heavily in the navy,
increasing its size from a few to more than 50 ships, and established the Navy Board.`;
let opts = await synthAudio(stats, {
vendor: 'microsoft',
credentials: {
api_key: process.env.MICROSOFT_API_KEY,
region: process.env.MICROSOFT_REGION,
},
language: 'en-US',
voice: 'en-US-ChristopherNeural',
text: longText,
});
t.ok(!opts.servedFromCache, `successfully synthesized microsoft audio to ${opts.filePath}`);
opts = await synthAudio(stats, {
vendor: 'microsoft',
credentials: {
api_key: process.env.MICROSOFT_API_KEY,
region: process.env.MICROSOFT_REGION,
},
language: 'en-US',
voice: 'en-US-ChristopherNeural',
text: longText,
});
t.ok(opts.servedFromCache, `successfully retrieved microsoft audio from cache ${opts.filePath}`);
} catch (err) {
console.error(err);
t.end(err);
}
client.quit();
});
test('Azure custom voice speech synth tests', async(t) => {
const fn = require('..');
const {synthAudio, client} = fn(opts, logger);
if (!process.env.MICROSOFT_CUSTOM_API_KEY || !process.env.MICROSOFT_DEPLOYMENT_ID || !process.env.MICROSOFT_CUSTOM_REGION) {
t.pass('skipping Microsoft speech synth custom voice tests since MICROSOFT_CUSTOM_API_KEY or MICROSOFT_DEPLOYMENT_ID or MICROSOFT_CUSTOM_REGION not provided');
return t.end();
}
try {
const text = 'Hi, this is my custom voice. How does it sound to you? Do I have a future as a virtual bot?';
let opts = await synthAudio(stats, {
vendor: 'microsoft',
credentials: {
api_key: process.env.MICROSOFT_CUSTOM_API_KEY,
region: process.env.MICROSOFT_CUSTOM_REGION,
use_custom_tts: true,
custom_tts_endpoint: process.env.MICROSOFT_DEPLOYMENT_ID,
},
language: 'en-US',
voice: process.env.MICROSOFT_CUSTOM_VOICE,
text,
});
t.ok(!opts.servedFromCache, `successfully synthesized microsoft audio to ${opts.filePath}`);
opts = await synthAudio(stats, {
vendor: 'microsoft',
credentials: {
api_key: process.env.MICROSOFT_CUSTOM_API_KEY,
region: process.env.MICROSOFT_CUSTOM_REGION,
use_custom_tts: true,
custom_tts_endpoint: process.env.MICROSOFT_DEPLOYMENT_ID,
},
language: 'en-US',
voice: process.env.MICROSOFT_CUSTOM_VOICE,
text,
});
t.ok(opts.servedFromCache, `successfully retrieved microsoft custom voice audio from cache ${opts.filePath}`);
} catch (err) {
console.error(err);
t.end(err);
}
client.quit();
});
test('Nuance speech synth tests', async(t) => {
const fn = require('..');
const {synthAudio, client} = fn(opts, logger);
if (!process.env.NUANCE_CLIENT_ID || !process.env.NUANCE_SECRET) {
t.pass('skipping Nuance speech synth tests since NUANCE_CLIENT_ID or NUANCE_SECRET not provided');
return t.end();
}
try {
let opts = await synthAudio(stats, {
vendor: 'nuance',
credentials: {
client_id: process.env.NUANCE_CLIENT_ID,
secret: process.env.NUANCE_SECRET,
},
language: 'en-US',
voice: 'Evan',
text: 'This is a test. This is only a test',
});
t.ok(!opts.servedFromCache, `successfully synthesized nuance audio to ${opts.filePath}`);
opts = await synthAudio(stats, {
vendor: 'nuance',
credentials: {
client_id: process.env.NUANCE_CLIENT_ID,
secret: process.env.NUANCE_SECRET,
},
language: 'en-US',
voice: 'Evan',
text: 'This is a test. This is only a test',
});
t.ok(opts.servedFromCache, `successfully retrieved nuance audio from cache ${opts.filePath}`);
} catch (err) {
console.error(err);
t.end(err);
}
client.quit();
});
test('Nvidia speech synth tests', async(t) => {
const fn = require('..');
const {synthAudio, client} = fn(opts, logger);
if (!process.env.RIVA_URI) {
t.pass('skipping Nvidia speech synth tests since RIVA_URI not provided');
return t.end();
}
try {
let opts = await synthAudio(stats, {
vendor: 'nvidia',
credentials: {
riva_uri: process.env.RIVA_URI,
},
language: 'en-US',
voice: 'English-US.Female-1',
text: 'This is a test. This is only a test',
});
t.ok(!opts.servedFromCache, `successfully synthesized nvidia audio to ${opts.filePath}`);
opts = await synthAudio(stats, {
vendor: 'nvidia',
credentials: {
riva_uri: process.env.RIVA_URI,
},
language: 'en-US',
voice: 'English-US.Female-1',
text: 'This is a test. This is only a test',
});
t.ok(opts.servedFromCache, `successfully retrieved nvidia audio from cache ${opts.filePath}`);
} catch (err) {
console.error(err);
t.end(err);
}
client.quit();
});
test('IBM watson speech synth tests', async(t) => {
const fn = require('..');
const {synthAudio, client} = fn(opts, logger);
if (!process.env.IBM_TTS_API_KEY || !process.env.IBM_TTS_REGION) {
t.pass('skipping IBM Watson speech synth tests since IBM_TTS_API_KEY or IBM_TTS_REGION not provided');
return t.end();
}
const text = `<speak> Hi there and welcome to jambones! jambones is the <sub alias="seapass">CPaaS</sub> designed with the needs of communication service providers in mind. This is an example of simple text-to-speech, but there is so much more you can do. Try us out!</speak>`;
try {
let opts = await synthAudio(stats, {
vendor: 'ibm',
credentials: {
tts_api_key: process.env.IBM_TTS_API_KEY,
tts_region: process.env.IBM_TTS_REGION,
},
language: 'en-US',
voice: 'en-US_AllisonV2Voice',
text,
});
t.ok(!opts.servedFromCache, `successfully synthesized ibm audio to ${opts.filePath}`);
opts = await synthAudio(stats, {
vendor: 'ibm',
credentials: {
tts_api_key: process.env.IBM_TTS_API_KEY,
tts_region: process.env.IBM_TTS_REGION,
},
language: 'en-US',
voice: 'en-US_AllisonV2Voice',
text,
});
t.ok(opts.servedFromCache, `successfully retrieved ibm audio from cache ${opts.filePath}`);
} catch (err) {
console.error(JSON.stringify(err));
t.end(err);
}
client.quit();
});
test('TTS Cache tests', async(t) => {
const fn = require('..');
const {purgeTtsCache, client} = fn(opts, logger);
try {
// save some random tts keys to cache
const minRecords = 8;
for (const i in Array(minRecords).fill(0)) {
await client.setAsync(makeSynthKey({vendor: i, language: i, voice: i, engine: i, text: i}), i);
}
const {purgedCount} = await purgeTtsCache();
t.ok(purgedCount >= minRecords, `successfully purged at least ${minRecords} tts records from cache`);
const cached = (await client.keysAsync('tts:*')).length;
t.equal(cached, 0, `successfully purged all tts records from cache`);
} catch (err) {
console.error(JSON.stringify(err));
t.end(err);
}
try {
// save some random tts keys to cache
for (const i in Array(10).fill(0)) {
await client.setAsync(makeSynthKey({vendor: i, language: i, voice: i, engine: i, text: i}), i);
}
// save a specific key to tts cache
const opts = {vendor: 'aws', language: 'en-US', voice: 'MALE', engine: 'Engine', text: 'Hello World!'};
await client.setAsync(makeSynthKey(opts), opts.text);
const {purgedCount} = await purgeTtsCache({all: false, ...opts});
t.ok(purgedCount === 1, `successfully purged one specific tts record from cache`);
// returns error for unknown key
const {purgedCount: purgedCountWhenErrored, error} = await purgeTtsCache({
all: false,
vendor: 'non-existing',
language: 'non-existing',
voice: 'non-existing',
});
t.ok(purgedCountWhenErrored === 0, `purged no records when specified key was not found`);
t.ok(error, `error returned when specified key was not found`);
// make sure other tts keys are still there
const cached = (await client.keysAsync('tts:*')).length;
t.ok(cached >= 1, `successfully kept all non-specified tts records in cache`);
} catch (err) {
console.error(JSON.stringify(err));
t.end(err);
}
client.quit();
});

2054
test/tmp/redis.conf Normal file

File diff suppressed because it is too large