support verbio speech (#323)

* support verbio speech * wip * update speech version * update verb specification
2026-01-25 02:08:24 +00:00 · 2024-05-29 18:35:40 +07:00
parent ffe9cb23eb
commit d33d0aa519
8 changed files with 212 additions and 15 deletions
--- a/app.js
+++ b/app.js
@@ -54,6 +54,7 @@ const {
  getTtsSize,
  purgeTtsCache,
  getAwsAuthToken,
+  getVerbioAccessToken,
  synthAudio
 } = require('@jambonz/speech-utils')({}, logger);
 const {
@@ -99,6 +100,7 @@ app.locals = {
  getTtsVoices,
  getTtsSize,
  getAwsAuthToken,
+  getVerbioAccessToken,
  purgeTtsCache,
  synthAudio,
  lookupAppBySid,
--- a/db/jambones.sqs
+++ b/db/jambones.sqs
@@ -551,7 +551,7 @@
        </location>
        <size>
            <width>293.00</width>
-            <height>560.00</height>
+            <height>540.00</height>
        </size>
        <zorder>6</zorder>
        <SQLField>
@@ -3111,11 +3111,11 @@
        <SourceSidebarWidth><![CDATA[312.000000]]></SourceSidebarWidth>
        <SQLEditorFileFormatVersion><![CDATA[4]]></SQLEditorFileFormatVersion>
        <uid><![CDATA[58C99A00-06C9-478C-A667-C63842E088F3]]></uid>
-        <windowHeight><![CDATA[870.000000]]></windowHeight>
-        <windowLocationX><![CDATA[-1164.000000]]></windowLocationX>
-        <windowLocationY><![CDATA[1131.000000]]></windowLocationY>
-        <windowScrollOrigin><![CDATA[{0.5, 0}]]></windowScrollOrigin>
-        <windowWidth><![CDATA[1512.000000]]></windowWidth>
+        <windowHeight><![CDATA[1027.000000]]></windowHeight>
+        <windowLocationX><![CDATA[1728.000000]]></windowLocationX>
+        <windowLocationY><![CDATA[65.000000]]></windowLocationY>
+        <windowScrollOrigin><![CDATA[{1, 0}]]></windowScrollOrigin>
+        <windowWidth><![CDATA[1675.000000]]></windowWidth>
    </SQLDocumentInfo>
    <AllowsIndexRenamingOnInsert><![CDATA[1]]></AllowsIndexRenamingOnInsert>
    <defaultLabelExpanded><![CDATA[1]]></defaultLabelExpanded>
--- a/lib/routes/api/speech-credentials.js
+++ b/lib/routes/api/speech-credentials.js
@@ -8,7 +8,9 @@ const {parseAccountSid, parseServiceProviderSid, parseSpeechCredentialSid} = req
 const {decryptCredential, testWhisper, testDeepgramTTS,
  getLanguagesAndVoicesForVendor,
  testPlayHT,
-  testRimelabs} = require('../../utils/speech-utils');
+  testRimelabs,
+  testVerbioTts,
+  testVerbioStt} = require('../../utils/speech-utils');
 const {DbErrorUnprocessableRequest, DbErrorForbidden, DbErrorBadRequest} = require('../../utils/errors');
 const {
  testGoogleTts,
@@ -116,6 +118,7 @@ const encryptCredential = (obj) => {
    role_arn,
    region,
    client_id,
+    client_secret,
    secret,
    nuance_tts_uri,
    nuance_stt_uri,
@@ -140,6 +143,7 @@ const encryptCredential = (obj) => {
    model_id,
    user_id,
    voice_engine,
+    engine_version,
    options
  } = obj;

@@ -255,6 +259,13 @@ const encryptCredential = (obj) => {
      const whisperData = JSON.stringify({api_key, model_id});
      return encrypt(whisperData);

+    case 'verbio':
+      // all three fields are mandatory; each message names the field that its assert checks
+      assert(engine_version, 'invalid verbio speech credential: engine_version is required');
+      assert(client_id, 'invalid verbio speech credential: client_id is required');
+      assert(client_secret, 'invalid verbio speech credential: client_secret is required');
+      return encrypt(JSON.stringify({client_id, client_secret, engine_version}));
+
    default:
      if (vendor.startsWith('custom:')) {
        const customData = JSON.stringify({auth_token, custom_stt_url, custom_tts_url});
@@ -501,7 +512,7 @@ router.put('/:sid', async(req, res) => {
 * Test a credential
 */
 router.get('/:sid/test', async(req, res) => {
-  const {logger, synthAudio} = req.app.locals;
+  const {logger, synthAudio, getVerbioAccessToken} = req.app.locals;
  try {
    const sid = parseSpeechCredentialSid(req);
    const creds = await SpeechCredential.retrieve(sid);
@@ -672,8 +683,7 @@ router.get('/:sid/test', async(req, res) => {
          SpeechCredential.sttTestResult(sid, false);
        }
      }
-    }
-    else if (cred.vendor === 'deepgram') {
+    } else if (cred.vendor === 'deepgram') {
      const {api_key} = credential;
      if (cred.use_for_tts) {
        try {
@@ -803,6 +813,27 @@ router.get('/:sid/test', async(req, res) => {
          SpeechCredential.ttsTestResult(sid, false);
        }
      }
+    } else if (cred.vendor === 'verbio') {
+      if (cred.use_for_tts) {
+        try {
+          await testVerbioTts(logger, synthAudio, credential);
+          results.tts.status = 'ok';
+          SpeechCredential.ttsTestResult(sid, true);
+        } catch (err) {
+          results.tts = {status: 'fail', reason: err.message};
+          SpeechCredential.ttsTestResult(sid, false);
+        }
+      }
+      if (cred.use_for_stt) {
+        try {
+          await testVerbioStt(logger, getVerbioAccessToken, credential);
+          results.stt.status = 'ok';
+          SpeechCredential.sttTestResult(sid, true);
+        } catch (err) {
+          results.stt = {status: 'fail', reason: err.message};
+          SpeechCredential.sttTestResult(sid, false);
+        }
+      }
    }

    res.status(200).json(results);
--- a/lib/utils/speech-data/stt-verbio.js
+++ b/lib/utils/speech-data/stt-verbio.js
@@ -0,0 +1,14 @@
+module.exports = [ // Verbio STT languages; `version`, when set, limits the entry to that engine version
+  { name: 'US English', value: 'en-US' },
+  { name: 'British English', value: 'en-GB' },
+  { name: 'LATAM Spanish', value: 'es-419' },
+  { name: 'Spanish', value: 'es' },
+  { name: 'Catalan', value: 'ca-ES', version: 'v2' },
+  { name: 'Brazilian Portuguese', value: 'pt-BR' },
+  { name: 'French', value: 'fr', version: 'v1' },
+  { name: 'Canadian French', value: 'fr-CA', version: 'v1' },
+  { name: 'German', value: 'de', version: 'v1' },
+  { name: 'Italian', value: 'it', version: 'v1' },
+  { name: 'Turkish', value: 'tr', version: 'v1' },
+  { name: 'Japanese', value: 'ja', version: 'v1' },
+];
--- a/lib/utils/speech-data/tts-verbio.js
+++ b/lib/utils/speech-data/tts-verbio.js
@@ -0,0 +1,62 @@
+/**
+ * Static list of Verbio TTS languages and voices, used by speech-utils when the
+ * live voice lookup fails.
+ * Shape: [{value: <language code>, name, voices: [{value: <voice id>, name}]}].
+ * Each language appears once, with all of its voices grouped under it.
+ */
+module.exports = [
+  {
+    value: 'en-US',
+    name: 'US English',
+    voices: [
+      {
+        value: 'tommy_en_us',
+        name: 'Tommy-Male',
+      },
+    ],
+  },
+  {
+    value: 'es-ES',
+    name: 'Castilian Spanish',
+    voices: [
+      {
+        value: 'david_es_es',
+        name: 'David-Male',
+      },
+    ],
+  },
+  {
+    value: 'es-PE',
+    name: 'Peruvian Spanish',
+    voices: [
+      {
+        value: 'miguel_es_pe',
+        name: 'Miguel-Male',
+      },
+      {
+        value: 'luz_es_pe',
+        name: 'Luz-Female',
+      },
+    ],
+  },
+  {
+    value: 'pt-BR',
+    name: 'Brazilian Portuguese',
+    voices: [
+      {
+        value: 'bel_pt_br',
+        name: 'Bel-Female',
+      },
+    ],
+  },
+  {
+    value: 'ca-ES',
+    name: 'Catalan',
+    voices: [
+      {
+        value: 'anna_ca',
+        name: 'Anna-Female',
+      },
+    ],
+  },
+];
--- a/lib/utils/speech-utils.js
+++ b/lib/utils/speech-utils.js
@@ -18,6 +18,7 @@ const TtsNvidiaLanguagesVoices = require('./speech-data/tts-nvidia');
 const TtsElevenlabsLanguagesVoices = require('./speech-data/tts-elevenlabs');
 const TtsWhisperLanguagesVoices = require('./speech-data/tts-whisper');
 const TtsPlayHtLanguagesVoices = require('./speech-data/tts-playht');
+const TtsVerbioLanguagesVoices = require('./speech-data/tts-verbio');

 const TtsModelDeepgram = require('./speech-data/tts-model-deepgram');
 const TtsModelElevenLabs = require('./speech-data/tts-model-elevenlabs');
@@ -35,6 +36,7 @@ const SttNvidiaLanguagesVoices = require('./speech-data/stt-nvidia');
 const SttCobaltLanguagesVoices = require('./speech-data/stt-cobalt');
 const SttSonioxLanguagesVoices = require('./speech-data/stt-soniox');
 const SttAssemblyaiLanguagesVoices = require('./speech-data/stt-assemblyai');
+const SttVerbioLanguagesVoices = require('./speech-data/stt-verbio');

 const testSonioxStt = async(logger, credentials) => {
  const api_key = credentials;
@@ -366,6 +368,43 @@ const testWellSaidStt = async(logger, credentials) => {
  return true;
 };

+/**
+ * Smoke-test Verbio TTS credentials by synthesizing one short English phrase.
+ * `synthAudio` is called as synthAudio(stats, opts); `credentials` is passed through as-is.
+ * Logs at info level and rethrows if synthAudio rejects.
+ */
+const testVerbioTts = async(logger, synthAudio, credentials) => {
+  const stats = {increment: () => {}, histogram: () => {}}; // no-op metrics stub
+  try {
+    await synthAudio(stats, {
+      vendor: 'verbio',
+      credentials,
+      language: 'en-US',
+      voice: 'tommy_en_us', // same voice id as listed in speech-data/tts-verbio.js
+      text: 'Hi there and welcome to jambones!'
+    });
+  } catch (err) {
+    logger.info({err}, 'synth Verbio returned error');
+    throw err;
+  }
+};
+// Smoke-test Verbio STT: get an access token, then POST data/test_audio.wav to the recognize endpoint.
+// NOTE(review): version=V1 is hard-coded; credentials.engine_version is not consulted here - confirm.
+// Any failure (token fetch or recognize request) is logged at info level and rethrown.
+const testVerbioStt = async(logger, getVerbioAccessToken, credentials) => {
+  try {
+    const {access_token} = await getVerbioAccessToken(credentials);
+    const headers = {'Authorization': `Bearer ${access_token}`, 'User-Agent': 'jambonz', 'Content-Type': 'audio/wav'};
+    const post = bent('https://us.rest.speechcenter.verbio.com', 'POST', 'json', headers);
+    const json = await post('/api/v1/recognize?language=en-US&version=V1',
+      fs.readFileSync(`${__dirname}/../../data/test_audio.wav`));
+    logger.debug({json}, 'successfully speech to text from verbio');
+  } catch (err) {
+    logger.info({err}, 'testVerbioStt returned error');
+    throw err;
+  }
+};
+
 const testAssemblyStt = async(logger, credentials) => {
  const {api_key} = credentials;

@@ -512,6 +551,11 @@ function decryptCredential(obj, credential, logger, isObscureKey = true) {
    const o = JSON.parse(decrypt(credential));
    obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
    obj.model_id = o.model_id;
+  } else if ('verbio' === obj.vendor) {
+    const o = JSON.parse(decrypt(credential));
+    obj.client_id = o.client_id;
+    obj.client_secret = isObscureKey ? obscureKey(o.client_secret) : o.client_secret;
+    obj.engine_version = o.engine_version;
  }
 }

@@ -568,6 +612,8 @@ async function getLanguagesAndVoicesForVendor(logger, vendor, credential, getTts
      return await getLanguagesVoicesForAssemblyAI(credential, getTtsVoices, logger);
    case 'whisper':
      return await getLanguagesVoicesForWhisper(credential, getTtsVoices, logger);
+    case 'verbio':
+      return await getLanguagesVoicesForVerbio(credential, getTtsVoices, logger);
    default:
      logger.info(`invalid vendor ${vendor}, return empty result`);
      throw new Error(`Invalid vendor ${vendor}`);
@@ -816,6 +862,23 @@ async function getLanguagesVoicesForWhisper(credential) {
  return tranform(TtsWhisperLanguagesVoices, undefined, TtsModelWhisper);
 }

+/**
+ * Build the Verbio language/voice catalogue: STT entries filtered by engine version, TTS voices
+ * fetched via getTtsVoices, with the bundled static list as fallback when that lookup fails.
+ * `credentials` may be null/undefined; then only STT entries without a `version` are kept.
+ */
+async function getLanguagesVoicesForVerbio(credentials, getTtsVoices, logger) {
+  const engineVersion = credentials && credentials.engine_version;
+  const stt = SttVerbioLanguagesVoices.filter((v) => !v.version || v.version === engineVersion);
+  try {
+    const data = await getTtsVoices({vendor: 'verbio', credentials});
+    return tranform(parseVerbioLanguagesVoices(data), stt, undefined);
+  } catch (err) {
+    logger.info({err}, 'there is error while fetching verbio speech voices');
+    return tranform(TtsVerbioLanguagesVoices, stt, undefined);
+  }
+}
+
 function tranform(tts, stt, models) {
  return {
    ...(tts && {tts}),
@@ -943,6 +1006,29 @@ function parseMicrosoftLanguagesVoices(data) {
  }, []);
 }

+/**
+ * Group a flat list of Verbio voices by language code.
+ * Each input record is read for `language`, `voice_id` and `name`.
+ * Returns [{value, name, voices: [{value, name}]}] in first-seen language order;
+ * `value` and `name` of a language entry are both the raw language code.
+ */
+function parseVerbioLanguagesVoices(data) {
+  // language code -> its output entry; Map iteration keeps insertion order
+  const byLanguage = new Map();
+  data.forEach(({language, voice_id, name}) => {
+    let entry = byLanguage.get(language);
+    if (!entry) {
+      entry = {value: language, name: language, voices: []};
+      byLanguage.set(language, entry);
+    }
+    entry.voices.push({
+      value: voice_id,
+      name,
+    });
+  });
+  return Array.from(byLanguage.values());
+}
+
 module.exports = {
  testGoogleTts,
  testGoogleStt,
@@ -966,5 +1052,7 @@ module.exports = {
  getSpeechCredential,
  decryptCredential,
  testWhisper,
+  testVerbioTts,
+  testVerbioStt,
  getLanguagesAndVoicesForVendor
 };
--- a/package-lock.json
+++ b/package-lock.json
@@ -21,7 +21,7 @@
        "@jambonz/realtimedb-helpers": "^0.8.9",
        "@jambonz/speech-utils": "^0.1.3",
        "@jambonz/time-series": "^0.2.8",
-        "@jambonz/verb-specifications": "^0.0.69",
+        "@jambonz/verb-specifications": "^0.0.72",
        "@soniox/soniox-node": "^1.2.2",
        "argon2": "^0.40.1",
        "assemblyai": "^4.3.4",
@@ -2082,9 +2082,9 @@
      }
    },
    "node_modules/@jambonz/verb-specifications": {
-      "version": "0.0.69",
-      "resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.69.tgz",
-      "integrity": "sha512-DWnz7XRkCzpzyCVJH7NtScv+wSlUC414/EO8j/gPZs3RT4WBW1OBXwXpfjURHcSrDG7lycz+tfA+2WoUdW/W+g==",
+      "version": "0.0.72",
+      "resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.72.tgz",
+      "integrity": "sha512-sjA+/LQP2p1zE02UByy9OaAaSxbfQNxQ6D0pwYoMG42U8n+8Det+GFM/9+oFVnbNjUH9bvgT8vrR57U0lU4Cpw==",
      "dependencies": {
        "debug": "^4.3.4",
        "pino": "^8.8.0"
--- a/package.json
+++ b/package.json
@@ -31,7 +31,7 @@
    "@jambonz/realtimedb-helpers": "^0.8.9",
    "@jambonz/speech-utils": "^0.1.3",
    "@jambonz/time-series": "^0.2.8",
-    "@jambonz/verb-specifications": "^0.0.69",
+    "@jambonz/verb-specifications": "^0.0.72",
    "@soniox/soniox-node": "^1.2.2",
    "argon2": "^0.40.1",
    "assemblyai": "^4.3.4",