merge latest main into feature branch

2025-12-19 03:37:49 +00:00 · 2024-05-28 18:44:41 +02:00
parent 39d54050cc e099bbb58f
commit f13fc84853
12 changed files with 207 additions and 10 deletions
--- a/index.js
+++ b/index.js
@@ -14,6 +14,7 @@ module.exports = (opts, logger) => {
    purgeTtsCache: require('./lib/purge-tts-cache').bind(null, client, logger),
    addFileToCache: require('./lib/add-file-to-cache').bind(null, client, logger),
    synthAudio: require('./lib/synth-audio').bind(null, client, createHash, retrieveHash, logger),
+    getVerbioAccessToken: require('./lib/get-verbio-token').bind(null, client, logger),
    getNuanceAccessToken: require('./lib/get-nuance-access-token').bind(null, client, logger),
    getIbmAccessToken: require('./lib/get-ibm-access-token').bind(null, client, logger),
    getAwsAuthToken: require('./lib/get-aws-sts-token').bind(null, logger, createHash, retrieveHash),
--- a/lib/config.js
+++ b/lib/config.js
@@ -7,11 +7,14 @@ const JAMBONES_TTS_CACHE_DURATION_MINS = (parseInt(process.env.JAMBONES_TTS_CACH

 const TMP_FOLDER = '/tmp';

+const HTTP_TIMEOUT = 5000;
+
 module.exports = {
  JAMBONES_TTS_TRIM_SILENCE,
  JAMBONES_DISABLE_TTS_STREAMING,
  JAMBONES_HTTP_PROXY_IP,
  JAMBONES_HTTP_PROXY_PORT,
  JAMBONES_TTS_CACHE_DURATION_MINS,
-  TMP_FOLDER
+  TMP_FOLDER,
+  HTTP_TIMEOUT
 };
--- a/lib/get-ibm-access-token.js
+++ b/lib/get-ibm-access-token.js
@@ -2,8 +2,8 @@ const formurlencoded = require('form-urlencoded');
 const {Pool} = require('undici');
 const pool = new Pool('https://iam.cloud.ibm.com');
 const {makeIbmKey, noopLogger} = require('./utils');
+const { HTTP_TIMEOUT } = require('./config');
 const debug = require('debug')('jambonz:realtimedb-helpers');
-const HTTP_TIMEOUT = 5000;

 async function getIbmAccessToken(client, logger, apiKey) {
  logger = logger || noopLogger;
--- a/lib/get-nuance-access-token.js
+++ b/lib/get-nuance-access-token.js
@@ -2,8 +2,8 @@ const formurlencoded = require('form-urlencoded');
 const {Pool} = require('undici');
 const pool = new Pool('https://auth.crt.nuance.com');
 const {makeNuanceKey, makeBasicAuthHeader, noopLogger} = require('./utils');
+const { HTTP_TIMEOUT } = require('./config');
 const debug = require('debug')('jambonz:realtimedb-helpers');
-const HTTP_TIMEOUT = 5000;

 async function getNuanceAccessToken(client, logger, clientId, secret, scope) {
  logger = logger || noopLogger;
--- a/lib/get-tts-voices.js
+++ b/lib/get-tts-voices.js
@@ -1,12 +1,16 @@
 const assert = require('assert');
 const {noopLogger, createNuanceClient, createKryptonClient} = require('./utils');
 const getNuanceAccessToken = require('./get-nuance-access-token');
+const getVerbioAccessToken = require('./get-verbio-token');
 const {GetVoicesRequest, Voice} = require('../stubs/nuance/synthesizer_pb');
 const TextToSpeechV1 = require('ibm-watson/text-to-speech/v1');
 const { IamAuthenticator } = require('ibm-watson/auth');
 const ttsGoogle = require('@google-cloud/text-to-speech');
 const { PollyClient, DescribeVoicesCommand } = require('@aws-sdk/client-polly');
 const getAwsAuthToken = require('./get-aws-sts-token');
+const {Pool} = require('undici');
+const { HTTP_TIMEOUT } = require('./config');
+const verbioVoicePool = new Pool('https://us.rest.speechcenter.verbio.com');

 const getIbmVoices = async(client, logger, credentials) => {
  const {tts_region, tts_api_key} = credentials;
@@ -117,6 +121,26 @@ const getAwsVoices = async(_client, createHash, retrieveHash, logger, credential
  }
 };

+/**
+ * Fetch the list of TTS voices from the Verbio REST API.
+ *
+ * @param {object} client - passed through to getVerbioAccessToken
+ * @param {object} logger - logger exposing info()
+ * @param {object} credentials - Verbio credentials, passed through to getVerbioAccessToken
+ * @returns {Promise<*>} the parsed JSON body of GET /api/v1/voices
+ * @throws {Error} rethrows token/request failures; a non-200 response
+ * raises an Error carrying a statusCode property
+ */
+const getVerbioVoices = async(client, logger, credentials) => {
+  try {
+    const {access_token} = await getVerbioAccessToken(client, logger, credentials);
+    const {statusCode, body} = await verbioVoicePool.request({
+      path: '/api/v1/voices',
+      method: 'GET',
+      headers: {
+        'Authorization': `Bearer ${access_token}`,
+        'User-Agent': 'jambonz'
+      },
+      timeout: HTTP_TIMEOUT,
+      followRedirects: false
+    });
+    if (200 !== statusCode) {
+      // Without this check an error payload (e.g. 401) would be returned to
+      // the caller as if it were the voice list.
+      const detail = await body.text();
+      const err = new Error(`Verbio voices request failed with status ${statusCode}: ${detail}`);
+      err.statusCode = statusCode;
+      throw err;
+    }
+    return await body.json();
+  } catch (err) {
+    logger.info({err}, 'getVerbioVoices - failed to list voices for Verbio');
+    throw err;
+  }
+};
+
 /**
 * Synthesize speech to an mp3 file, and also cache the generated speech
 * in redis (base64 format) for 24 hours so as to avoid unnecessarily paying
@@ -136,7 +160,7 @@ const getAwsVoices = async(_client, createHash, retrieveHash, logger, credential
 async function getTtsVoices(client, createHash, retrieveHash, logger, {vendor, credentials}) {
  logger = logger || noopLogger;

-  assert.ok(['nuance', 'ibm', 'google', 'aws', 'polly'].includes(vendor),
+  assert.ok(['nuance', 'ibm', 'google', 'aws', 'polly', 'verbio'].includes(vendor),
    `getTtsVoices not supported for vendor ${vendor}`);

  switch (vendor) {
@@ -149,6 +173,8 @@ async function getTtsVoices(client, createHash, retrieveHash, logger, {vendor, c
    case 'aws':
    case 'polly':
      return getAwsVoices(client, createHash, retrieveHash, logger, credentials);
+    case 'verbio':
+      return getVerbioVoices(client, logger, credentials);
    default:
      break;
  }
--- a/lib/get-verbio-token.js
+++ b/lib/get-verbio-token.js
@@ -0,0 +1,51 @@
+const {Pool} = require('undici');
+const { noopLogger, makeVerbioKey } = require('./utils');
+const { HTTP_TIMEOUT } = require('./config');
+const pool = new Pool('https://auth.speechcenter.verbio.com:444');
+const debug = require('debug')('jambonz:realtimedb-helpers');
+
+/**
+ * Obtain a Verbio access token, serving it from the cache when one is stored.
+ *
+ * On a cache miss the token is requested from POST /api/v1/token and stored
+ * under the key produced by makeVerbioKey(client_id).
+ *
+ * @param {object} client - cache client exposing get(key) and set(key, value, 'EX', seconds)
+ * @param {object} [logger] - logger; noopLogger is used when omitted
+ * @param {object} credentials - must contain client_id and client_secret
+ * @returns {Promise<object>} {access_token, servedFromCache: true} on a cache hit,
+ * otherwise the token response JSON plus servedFromCache: false
+ * @throws {Error} on request failure; a non-200 response raises an Error
+ * carrying a statusCode property
+ */
+async function getVerbioAccessToken(client, logger, credentials) {
+  logger = logger || noopLogger;
+  const { client_id, client_secret } = credentials;
+  try {
+    const key = makeVerbioKey(client_id);
+    const access_token = await client.get(key);
+    if (access_token) {
+      return {access_token, servedFromCache: true};
+    }
+
+    const payload = {
+      client_id,
+      client_secret
+    };
+
+    const {statusCode, headers, body} =  await pool.request({
+      path: '/api/v1/token',
+      method: 'POST',
+      headers: {
+        'Content-Type': 'application/json',
+        'User-Agent': 'jambonz'
+      },
+      body: JSON.stringify(payload),
+      timeout: HTTP_TIMEOUT,
+      followRedirects: false
+    });
+
+    if (200 !== statusCode) {
+      logger.debug({statusCode, headers, body: await body.text()}, 'error fetching access token from Verbio');
+      const err = new Error(`error fetching access token from Verbio, status ${statusCode}`);
+      err.statusCode = statusCode;
+      throw err;
+    }
+    const json = await body.json();
+    // The arithmetic treats expiration_time as an absolute time in epoch
+    // seconds (NOTE(review): confirm against the Verbio API docs); the extra
+    // 30 seconds make the cached copy expire before the token itself does.
+    const expiry =  Math.floor(json.expiration_time - Date.now() / 1000 - 30);
+    // A zero, negative or NaN TTL is rejected by SET ... EX, which would turn
+    // a successful token fetch into a failure; skip caching in that case.
+    if (Number.isFinite(expiry) && expiry > 0) {
+      await client.set(key, json.access_token, 'EX', expiry);
+    }
+    return {...json, servedFromCache: false};
+  } catch (err) {
+    debug(err, `getVerbioAccessToken: Error retrieving Verbio access token for client_id ${client_id}`);
+    logger.error(err, `getVerbioAccessToken: Error retrieving Verbio access token for client_id ${client_id}`);
+    throw err;
+  }
+}
+
+module.exports = getVerbioAccessToken;
--- a/lib/synth-audio.js
+++ b/lib/synth-audio.js
@@ -23,6 +23,7 @@ const {
  makeFilePath
 } = require('./utils');
 const getNuanceAccessToken = require('./get-nuance-access-token');
+const getVerbioAccessToken = require('./get-verbio-token');
 const {
  SynthesisRequest,
  Voice,
@@ -92,7 +93,7 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
  logger = logger || noopLogger;

  assert.ok(['google', 'aws', 'polly', 'microsoft', 'wellsaid', 'nuance', 'nvidia', 'ibm', 'elevenlabs',
-    'whisper', 'deepgram', 'playht', 'rimelabs'].includes(vendor) ||
+    'whisper', 'deepgram', 'playht', 'rimelabs', 'verbio'].includes(vendor) ||
  vendor.startsWith('custom'),
  `synthAudio supported vendors are google, aws, microsoft, nuance, nvidia and wellsaid ..etc, not ${vendor}`);
  if ('google' === vendor) {
@@ -145,6 +146,10 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
    assert.ok(credentials.api_key, 'synthAudio requires api_key when whisper is used');
  } else  if (vendor.startsWith('custom')) {
    assert.ok(credentials.custom_tts_url, `synthAudio requires custom_tts_url in credentials when ${vendor} is used`);
+  } else if ('verbio' === vendor) {
+    assert.ok(voice, 'synthAudio requires voice when verbio is used');
+    assert.ok(credentials.client_id, 'synthAudio requires client_id when verbio is used');
+    assert.ok(credentials.client_secret, 'synthAudio requires client_secret when verbio is used');
  }
  const key = makeSynthKey({
    account_sid,
@@ -223,6 +228,11 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
        audioBuffer = await synthWhisper(logger, {
          credentials, stats, voice, text, renderForCaching, disableTtsStreaming});        
        break;
+      case 'verbio':
+        audioBuffer = await synthVerbio(client, logger, {
+          credentials, stats, voice, text, renderForCaching, disableTtsStreaming});
+        if (audioBuffer?.filePath) return audioBuffer;
+        break;
      case 'deepgram':
        audioBuffer = await synthDeepgram(logger, {credentials, stats, model, text,
          renderForCaching, disableTtsStreaming});        
@@ -808,6 +818,46 @@ const synthRimelabs = async(logger, {
    throw err;
  }
 };
+/**
+ * Synthesize speech through Verbio.
+ *
+ * When streaming is not disabled (by the JAMBONES_DISABLE_TTS_STREAMING env
+ * variable, renderForCaching or disableTtsStreaming) no audio is fetched here:
+ * a `say:{...}` descriptor embedding the access token and voice is returned
+ * instead. Otherwise the text is posted to /api/v1/synthesize and the
+ * response buffer is returned.
+ *
+ * @param {object} client - passed through to getVerbioAccessToken
+ * @param {object} logger - logger exposing info()
+ * @param {object} opts - credentials, stats, voice, text, renderForCaching, disableTtsStreaming
+ * @returns {Promise<object|Buffer>} streaming descriptor or synthesized audio
+ * @throws {Error} when text exceeds 2000 characters, or on token/synthesis failure
+ */
+const synthVerbio = async(client, logger, {credentials, stats, voice, text, renderForCaching, disableTtsStreaming}) => {
+  //https://doc.speechcenter.verbio.com/#tag/Text-To-Speech-REST-API
+  if (text.length > 2000) {
+    throw new Error('Verbio cannot synthesize for the text length larger than 2000 characters');
+  }
+  const {access_token} = await getVerbioAccessToken(client, logger, credentials);
+
+  const streamingDisabled = process.env.JAMBONES_DISABLE_TTS_STREAMING ||
+    renderForCaching ||
+    disableTtsStreaming;
+  if (!streamingDisabled) {
+    const params = [
+      `access_token=${access_token}`,
+      'vendor=verbio',
+      `voice=${voice}`,
+      'write_cache_file=1'
+    ].join(',');
+
+    return {
+      filePath: `say:{${params}}${text.replace(/\n/g, ' ')}`,
+      servedFromCache: false,
+      rtt: 0
+    };
+  }
+
+  const requestHeaders = {
+    'Authorization': `Bearer ${access_token}`,
+    'User-Agent': 'jambonz',
+    'Content-Type': 'application/json'
+  };
+  const requestBody = {
+    voice_id: voice,
+    output_sample_rate: '8k',
+    output_encoding: 'pcm16',
+    text
+  };
+  try {
+    const post = bent('https://us.rest.speechcenter.verbio.com', 'POST', 'buffer', requestHeaders);
+    return await post('/api/v1/synthesize', requestBody);
+  } catch (err) {
+    logger.info({err}, 'synth Verbio returned error');
+    stats.increment('tts.count', ['vendor:verbio', 'accepted:no']);
+    throw err;
+  }
+};

 const synthWhisper = async(logger, {credentials, stats, voice, text, renderForCaching, disableTtsStreaming}) => {
  const {api_key, model_id, baseURL, timeout, speed} = credentials;
--- a/lib/utils.js
+++ b/lib/utils.js
@@ -3,11 +3,10 @@ const {SynthesizerClient} = require('../stubs/nuance/synthesizer_grpc_pb');
 const {RivaSpeechSynthesisClient} = require('../stubs/riva/proto/riva_tts_grpc_pb');
 const {Pool} = require('undici');
 const pool = new Pool('https://auth.crt.nuance.com');
-const HTTP_TIMEOUT = 5000;
 const NUANCE_AUTH_ENDPOINT = 'tts.api.nuance.com:443';
 const grpc = require('@grpc/grpc-js');
 const formurlencoded = require('form-urlencoded');
-const { JAMBONES_DISABLE_TTS_STREAMING, JAMBONES_TTS_TRIM_SILENCE, TMP_FOLDER } = require('./config');
+const { JAMBONES_DISABLE_TTS_STREAMING, JAMBONES_TTS_TRIM_SILENCE, TMP_FOLDER, HTTP_TIMEOUT } = require('./config');

 const debug = require('debug')('jambonz:realtimedb-helpers');
 /**
@@ -87,6 +86,12 @@ function makeAwsKey(awsAccessKeyId) {
  return `aws:${hash.digest('hex')}`;
 }

+/**
+ * Build the cache key used for a Verbio access token.
+ *
+ * @param {string} client_id - Verbio client id; only its SHA-1 digest appears in the key
+ * @returns {string} `verbio:` followed by the hex SHA-1 digest of client_id
+ */
+function makeVerbioKey(client_id) {
+  const digest = crypto.createHash('sha1').update(client_id).digest('hex');
+  return `verbio:${digest}`;
+}
+
 function makeNuanceKey(clientId, secret, scope) {
  const hash = crypto.createHash('sha1');
  hash.update(`${clientId}:${secret}:${scope}`);
@@ -155,6 +160,7 @@ module.exports = {
  makeNuanceKey,
  makeIbmKey,
  makeAwsKey,
+  makeVerbioKey,
  getNuanceAccessToken,
  createNuanceClient,
  createKryptonClient,
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,12 +1,12 @@
 {
  "name": "@jambonz/speech-utils",
-  "version": "0.1.0",
+  "version": "0.1.1",
  "lockfileVersion": 2,
  "requires": true,
  "packages": {
    "": {
      "name": "@jambonz/speech-utils",
-      "version": "0.1.0",
+      "version": "0.1.1",
      "license": "MIT",
      "dependencies": {
        "@aws-sdk/client-polly": "^3.496.0",
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
  "name": "@jambonz/speech-utils",
-  "version": "0.1.0",
+  "version": "0.1.1",
  "description": "TTS-related speech utilities for jambonz",
  "main": "index.js",
  "author": "Dave Horton",
--- a/test/list-voices.js
+++ b/test/list-voices.js
@@ -12,6 +12,32 @@ const stats = {
  histogram: () => {}
 };

+// Integration test against the live Verbio service: fetches an access token
+// and then the voice list. Skipped unless both VERBIO_CLIENT_ID and
+// VERBIO_CLIENT_SECRET are present in the environment.
+test('Verbio - get Access key and voices', async(t) => {
+  const fn = require('..');
+  const {client, getTtsVoices, getVerbioAccessToken} = fn(opts, logger);
+  if (!process.env.VERBIO_CLIENT_ID || !process.env.VERBIO_CLIENT_SECRET) {
+    t.pass('skipping Verbio test since no Verbio Keys provided');
+    t.end();
+    client.quit();
+    return;
+  }
+
+  try {
+    const credentials = {
+      client_id: process.env.VERBIO_CLIENT_ID,
+      client_secret: process.env.VERBIO_CLIENT_SECRET
+    };
+    const obj = await getVerbioAccessToken(credentials);
+    t.ok(obj.access_token, 'successfully received access token not from cache');
+    const voices = await getTtsVoices({vendor: 'verbio', credentials});
+    t.ok(voices && voices.length !== 0, 'successfully received verbio voices');
+  } catch (err) {
+    console.error(err);
+    t.end(err);
+  } finally {
+    // Close the client even if the error handler above throws.
+    client.quit();
+  }
+});
+
 test('IBM - create access key', async(t) => {
  const fn = require('..');
  const {client, getIbmAccessToken} = fn(opts, logger);
--- a/test/synth.js
+++ b/test/synth.js
@@ -670,6 +670,40 @@ test('whisper speech synth tests', async(t) => {
      language: 'en-US',
      voice: 'alloy',
      text,
+      renderForCaching: true
+    });
+    t.ok(!opts.servedFromCache, `successfully synthesized whisper audio to ${opts.filePath}`);
+
+  } catch (err) {
+    console.error(JSON.stringify(err));
+    t.end(err);
+  }
+  client.quit();
+});
+
+test('Verbio speech synth tests', async(t) => {
+  const fn = require('..');
+  const {synthAudio, client} = fn(opts, logger);
+
+  if (!process.env.VERBIO_CLIENT_ID || !process.env.VERBIO_CLIENT_SECRET) {
+    t.pass('skipping Verbio Synthesize test since no Verbio Keys provided');
+    t.end();
+    client.quit();
+    return;
+  }
+
+  const text = 'Hi there and welcome to jambones!';
+  try {
+    let opts = await synthAudio(stats, {
+      vendor: 'verbio',
+      credentials: {
+        client_id: process.env.VERBIO_CLIENT_ID,
+        client_secret: process.env.VERBIO_CLIENT_SECRET
+      },
+      language: 'en-US',
+      voice: 'tommy_en-us',
+      text,
+      renderForCaching: true
    });
    t.ok(!opts.servedFromCache, `successfully synthesized whisper audio to ${opts.filePath}`);