diff --git a/index.js b/index.js index dad893f..560e05f 100644 --- a/index.js +++ b/index.js @@ -14,6 +14,7 @@ module.exports = (opts, logger) => { purgeTtsCache: require('./lib/purge-tts-cache').bind(null, client, logger), addFileToCache: require('./lib/add-file-to-cache').bind(null, client, logger), synthAudio: require('./lib/synth-audio').bind(null, client, createHash, retrieveHash, logger), + getVerbioAccessToken: require('./lib/get-verbio-token').bind(null, client, logger), getNuanceAccessToken: require('./lib/get-nuance-access-token').bind(null, client, logger), getIbmAccessToken: require('./lib/get-ibm-access-token').bind(null, client, logger), getAwsAuthToken: require('./lib/get-aws-sts-token').bind(null, logger, createHash, retrieveHash), diff --git a/lib/config.js b/lib/config.js index d175e63..9482d90 100644 --- a/lib/config.js +++ b/lib/config.js @@ -7,11 +7,14 @@ const JAMBONES_TTS_CACHE_DURATION_MINS = (parseInt(process.env.JAMBONES_TTS_CACH const TMP_FOLDER = '/tmp'; +const HTTP_TIMEOUT = 5000; + module.exports = { JAMBONES_TTS_TRIM_SILENCE, JAMBONES_DISABLE_TTS_STREAMING, JAMBONES_HTTP_PROXY_IP, JAMBONES_HTTP_PROXY_PORT, JAMBONES_TTS_CACHE_DURATION_MINS, - TMP_FOLDER + TMP_FOLDER, + HTTP_TIMEOUT }; diff --git a/lib/get-ibm-access-token.js b/lib/get-ibm-access-token.js index 3ebaef5..1635eeb 100644 --- a/lib/get-ibm-access-token.js +++ b/lib/get-ibm-access-token.js @@ -2,8 +2,8 @@ const formurlencoded = require('form-urlencoded'); const {Pool} = require('undici'); const pool = new Pool('https://iam.cloud.ibm.com'); const {makeIbmKey, noopLogger} = require('./utils'); +const { HTTP_TIMEOUT } = require('./config'); const debug = require('debug')('jambonz:realtimedb-helpers'); -const HTTP_TIMEOUT = 5000; async function getIbmAccessToken(client, logger, apiKey) { logger = logger || noopLogger; diff --git a/lib/get-nuance-access-token.js b/lib/get-nuance-access-token.js index 1cc1383..519516d 100644 --- a/lib/get-nuance-access-token.js +++ 
/**
 * List the TTS voices available to a Verbio account.
 *
 * Obtains an access token through getVerbioAccessToken and issues
 * GET /api/v1/voices against the Verbio REST pool.
 *
 * @param {object} client - redis client, forwarded to getVerbioAccessToken
 * @param {object} logger - logger exposing info(); getTtsVoices substitutes
 *   noopLogger before calling, this function does not
 * @param {object} credentials - object holding client_id and client_secret
 * @returns {Promise<*>} the parsed JSON body of the voices response
 * @throws {Error} carrying `statusCode` (and `responseBody`) when Verbio
 *   answers with anything other than 200; token and transport errors are
 *   logged and rethrown unchanged
 */
const getVerbioVoices = async(client, logger, credentials) => {
  try {
    const token = await getVerbioAccessToken(client, logger, credentials);
    const {statusCode, body} = await verbioVoicePool.request({
      path: '/api/v1/voices',
      method: 'GET',
      headers: {
        'Authorization': `Bearer ${token.access_token}`,
        'User-Agent': 'jambonz'
      },
      timeout: HTTP_TIMEOUT,
      followRedirects: false
    });
    if (200 !== statusCode) {
      // Without this check an error payload (e.g. a 401 body) would be handed
      // back to the caller as if it were the list of voices.
      const err = new Error(`error fetching voices from Verbio, status code ${statusCode}`);
      err.statusCode = statusCode;
      err.responseBody = await body.text();
      throw err;
    }
    return await body.json();
  } catch (err) {
    logger.info({err}, 'getVerbioVoices - failed to list voices for Verbio');
    throw err;
  }
};

/**
 * Retrieve a Verbio access token, serving it from redis when one is cached.
 *
 * On a cache miss the token is requested with POST /api/v1/token and stored
 * under makeVerbioKey(client_id). The response's `expiration_time` is treated
 * as a Unix timestamp in seconds; the cache entry is set to expire 30 seconds
 * before it.
 *
 * @param {object} client - redis client exposing get() and set()
 * @param {object} [logger] - logger; defaults to noopLogger
 * @param {object} credentials - object holding client_id and client_secret
 * @returns {Promise<object>} `{access_token, servedFromCache: true}` on a
 *   cache hit, otherwise the token response spread together with
 *   `servedFromCache: false`
 * @throws {Error} carrying `statusCode` when Verbio answers with a non-200
 *   status; any other failure is logged and rethrown
 */
async function getVerbioAccessToken(client, logger, credentials) {
  logger = logger || noopLogger;
  const { client_id, client_secret } = credentials;
  try {
    const key = makeVerbioKey(client_id);
    const cachedToken = await client.get(key);
    if (cachedToken) {
      return {access_token: cachedToken, servedFromCache: true};
    }

    const payload = {
      client_id,
      client_secret
    };

    const {statusCode, headers, body} = await pool.request({
      path: '/api/v1/token',
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'User-Agent': 'jambonz'
      },
      body: JSON.stringify(payload),
      timeout: HTTP_TIMEOUT,
      followRedirects: false
    });

    if (200 !== statusCode) {
      logger.debug({statusCode, headers, body: await body.text()}, 'error fetching access token from Verbio');
      const err = new Error(`error fetching access token from Verbio, status code ${statusCode}`);
      err.statusCode = statusCode;
      throw err;
    }
    const json = await body.json();
    const expiry = Math.floor(json.expiration_time - Date.now() / 1000 - 30);
    // Only cache when there is a usable lifetime left: a zero, negative or NaN
    // EX value is rejected by redis, which would discard a token that was
    // fetched successfully.
    if (Number.isFinite(expiry) && expiry > 0) {
      await client.set(key, json.access_token, 'EX', expiry);
    }
    return {...json, servedFromCache: false};
  } catch (err) {
    debug(err, `getVerbioAccessToken: Error retrieving Verbio access token for client_id ${client_id}`);
    logger.error(err, `getVerbioAccessToken: Error retrieving Verbio access token for client_id ${client_id}`);
    throw err;
  }
}
/**
 * Synthesize speech with Verbio.
 *
 * When streaming is permitted (JAMBONES_DISABLE_TTS_STREAMING unset and
 * neither renderForCaching nor disableTtsStreaming set) no audio is fetched;
 * a `say:{...}` descriptor carrying the access token and voice is returned
 * instead. Otherwise the text is posted to /api/v1/synthesize and the
 * response buffer is returned.
 *
 * @param {object} client - redis client, forwarded to getVerbioAccessToken
 * @param {object} logger - logger exposing info()
 * @param {object} options - credentials, stats, voice, text,
 *   renderForCaching, disableTtsStreaming
 * @returns {Promise<object>} `{filePath, servedFromCache, rtt}` in streaming
 *   mode, otherwise whatever the HTTP request resolves to (requested as 'buffer')
 * @throws {Error} when text exceeds 2000 characters, or when the token or
 *   synthesis request fails
 */
const synthVerbio = async(client, logger, {credentials, stats, voice, text, renderForCaching, disableTtsStreaming}) => {
  //https://doc.speechcenter.verbio.com/#tag/Text-To-Speech-REST-API
  if (text.length > 2000) {
    throw new Error('Verbio cannot synthesize for the text length larger than 2000 characters');
  }
  const auth = await getVerbioAccessToken(client, logger, credentials);

  const streamingBlocked = process.env.JAMBONES_DISABLE_TTS_STREAMING || renderForCaching || disableTtsStreaming;
  if (!streamingBlocked) {
    const params = [
      `access_token=${auth.access_token}`,
      'vendor=verbio',
      `voice=${voice}`,
      'write_cache_file=1'
    ].join(',');
    const spokenText = text.replace(/\n/g, ' ');

    return {
      filePath: `say:{${params}}${spokenText}`,
      servedFromCache: false,
      rtt: 0
    };
  }

  try {
    const requestHeaders = {
      'Authorization': `Bearer ${auth.access_token}`,
      'User-Agent': 'jambonz',
      'Content-Type': 'application/json'
    };
    const synthesize = bent('https://us.rest.speechcenter.verbio.com', 'POST', 'buffer', requestHeaders);
    const payload = {
      voice_id: voice,
      output_sample_rate: '8k',
      output_encoding: 'pcm16',
      text
    };
    return await synthesize('/api/v1/synthesize', payload);
  } catch (err) {
    logger.info({err}, 'synth Verbio returned error');
    stats.increment('tts.count', ['vendor:verbio', 'accepted:no']);
    throw err;
  }
};

/**
 * Build the redis key under which a Verbio access token is cached.
 *
 * @param {string} client_id - Verbio client id
 * @returns {string} `verbio:` followed by the hex SHA-1 digest of client_id
 */
function makeVerbioKey(client_id) {
  const digest = crypto.createHash('sha1').update(client_id).digest('hex');
  return `verbio:${digest}`;
}
// Integration test: fetches a Verbio access token and then the voice list.
// Skipped (reported as a pass) unless both VERBIO_CLIENT_ID and
// VERBIO_CLIENT_SECRET are present in the environment.
test('Verbio - get Access key and voices', async(t) => {
  const fn = require('..');
  const {client, getTtsVoices, getVerbioAccessToken} = fn(opts, logger);
  const {VERBIO_CLIENT_ID: client_id, VERBIO_CLIENT_SECRET: client_secret} = process.env;

  if (!(client_id && client_secret)) {
    t.pass('skipping Verbio test since no Verbio Keys provided');
    t.end();
    client.quit();
    return;
  }

  const credentials = {client_id, client_secret};
  try {
    const tokenInfo = await getVerbioAccessToken(credentials);
    t.ok(tokenInfo.access_token, 'successfully received access token not from cache');

    const voices = await getTtsVoices({vendor: 'verbio', credentials});
    t.ok(voices && voices.length != 0, 'successfully received verbio voices');
  } catch (err) {
    console.error(err);
    t.end(err);
  }
  // release the redis connection whether or not the assertions succeeded
  client.quit();
});
a/test/synth.js b/test/synth.js index 4112dfe..1c80ff5 100644 --- a/test/synth.js +++ b/test/synth.js @@ -670,6 +670,40 @@ test('whisper speech synth tests', async(t) => { language: 'en-US', voice: 'alloy', text, + renderForCaching: true + }); + t.ok(!opts.servedFromCache, `successfully synthesized whisper audio to ${opts.filePath}`); + + } catch (err) { + console.error(JSON.stringify(err)); + t.end(err); + } + client.quit(); +}); + +test('Verbio speech synth tests', async(t) => { + const fn = require('..'); + const {synthAudio, client} = fn(opts, logger); + + if (!process.env.VERBIO_CLIENT_ID || !process.env.VERBIO_CLIENT_SECRET) { + t.pass('skipping Verbio Synthesize test since no Verbio Keys provided'); + t.end(); + client.quit(); + return; + } + + const text = 'Hi there and welcome to jambones!'; + try { + let opts = await synthAudio(stats, { + vendor: 'verbio', + credentials: { + client_id: process.env.VERBIO_CLIENT_ID, + client_secret: process.env.VERBIO_CLIENT_SECRET + }, + language: 'en-US', + voice: 'tommy_en-us', + text, + renderForCaching: true }); t.ok(!opts.servedFromCache, `successfully synthesized whisper audio to ${opts.filePath}`);