support mod_playht_tts (#304)

* support mod_playht_tts * wip * wip * wip * wip * wip * update speech utils version
2026-01-25 02:08:24 +00:00 · 2024-04-08 21:21:29 +07:00
parent 40de2c5945
commit e2c1383723
10 changed files with 146 additions and 17 deletions
--- a/db/jambones-sql.sql
+++ b/db/jambones-sql.sql
@@ -162,7 +162,7 @@ regex VARCHAR(32) NOT NULL COMMENT 'regex-based pattern match against dialed num
 description VARCHAR(1024),
 priority INTEGER NOT NULL COMMENT 'lower priority routes are attempted first',
 PRIMARY KEY (lcr_route_sid)
-) COMMENT='An ordered list of  digit patterns in an LCR table.  The pat';
+) COMMENT='An ordered list of  digit patterns in an LCR table.  The patterns are tested in sequence until one matches';

 CREATE TABLE lcr
 (
@@ -173,7 +173,7 @@ default_carrier_set_entry_sid CHAR(36) COMMENT 'default carrier/route to use whe
 service_provider_sid CHAR(36),
 account_sid CHAR(36),
 PRIMARY KEY (lcr_sid)
-) COMMENT='An LCR (least cost routing) table that is used by a service ';
+) COMMENT='An LCR (least cost routing) table that is used by a service provider or account to make decisions about routing outbound calls when multiple carriers are available.';

 CREATE TABLE password_settings
 (
@@ -496,7 +496,7 @@ messaging_hook_sid CHAR(36) COMMENT 'webhook to call for inbound SMS/MMS ',
 app_json TEXT,
 speech_synthesis_vendor VARCHAR(64) NOT NULL DEFAULT 'google',
 speech_synthesis_language VARCHAR(12) NOT NULL DEFAULT 'en-US',
-speech_synthesis_voice VARCHAR(64),
+speech_synthesis_voice VARCHAR(256),
 speech_synthesis_label VARCHAR(64),
 speech_recognizer_vendor VARCHAR(64) NOT NULL DEFAULT 'google',
 speech_recognizer_language VARCHAR(64) NOT NULL DEFAULT 'en-US',
@@ -504,7 +504,7 @@ speech_recognizer_label VARCHAR(64),
 use_for_fallback_speech BOOLEAN DEFAULT false,
 fallback_speech_synthesis_vendor VARCHAR(64),
 fallback_speech_synthesis_language VARCHAR(12),
-fallback_speech_synthesis_voice VARCHAR(64),
+fallback_speech_synthesis_voice VARCHAR(256),
 fallback_speech_synthesis_label VARCHAR(64),
 fallback_speech_recognizer_vendor VARCHAR(64),
 fallback_speech_recognizer_language VARCHAR(64),
--- a/db/jambones.sqs
+++ b/db/jambones.sqs
@@ -2568,7 +2568,7 @@
        </SQLField>
        <SQLField>
            <name><![CDATA[speech_synthesis_voice]]></name>
-            <type><![CDATA[VARCHAR(64)]]></type>
+            <type><![CDATA[VARCHAR(256)]]></type>
            <notNull><![CDATA[0]]></notNull>
            <uid><![CDATA[929D66F0-64B9-4D7C-AB4B-24F131E1178F]]></uid>
        </SQLField>
@@ -2618,7 +2618,7 @@
        </SQLField>
        <SQLField>
            <name><![CDATA[fallback_speech_synthesis_voice]]></name>
-            <type><![CDATA[VARCHAR(64)]]></type>
+            <type><![CDATA[VARCHAR(256)]]></type>
            <notNull><![CDATA[0]]></notNull>
            <uid><![CDATA[6A0E92C9-32B9-4179-A893-3DADF5DD7728]]></uid>
        </SQLField>
@@ -3108,7 +3108,7 @@
        <RightSidebarWidth><![CDATA[1235.000000]]></RightSidebarWidth>
        <sidebarIndex><![CDATA[2]]></sidebarIndex>
        <snapToGrid><![CDATA[0]]></snapToGrid>
-        <SourceSidebarWidth><![CDATA[0.000000]]></SourceSidebarWidth>
+        <SourceSidebarWidth><![CDATA[312.000000]]></SourceSidebarWidth>
        <SQLEditorFileFormatVersion><![CDATA[4]]></SQLEditorFileFormatVersion>
        <uid><![CDATA[58C99A00-06C9-478C-A667-C63842E088F3]]></uid>
        <windowHeight><![CDATA[870.000000]]></windowHeight>
--- a/db/upgrade-jambonz-db.js
+++ b/db/upgrade-jambonz-db.js
@@ -194,6 +194,8 @@ const sql = {
  ],
  9000: [
    'ALTER TABLE sip_gateways ADD COLUMN send_options_ping BOOLEAN NOT NULL DEFAULT 0',
+    'ALTER TABLE applications MODIFY COLUMN speech_synthesis_voice VARCHAR(256)',
+    'ALTER TABLE applications MODIFY COLUMN fallback_speech_synthesis_voice VARCHAR(256)',
  ]
 };

--- a/lib/routes/api/speech-credentials.js
+++ b/lib/routes/api/speech-credentials.js
@@ -6,7 +6,8 @@ const sysError = require('../error');
 const {decrypt, encrypt} = require('../../utils/encrypt-decrypt');
 const {parseAccountSid, parseServiceProviderSid, parseSpeechCredentialSid} = require('./utils');
 const {decryptCredential, testWhisper, testDeepgramTTS,
-  getLanguagesAndVoicesForVendor} = require('../../utils/speech-utils');
+  getLanguagesAndVoicesForVendor,
+  testPlayHT} = require('../../utils/speech-utils');
 const {DbErrorUnprocessableRequest, DbErrorForbidden, DbErrorBadRequest} = require('../../utils/errors');
 const {
  testGoogleTts,
@@ -135,6 +136,8 @@ const encryptCredential = (obj) => {
    auth_token = '',
    cobalt_server_uri,
    model_id,
+    user_id,
+    voice_engine,
    options
  } = obj;

@@ -219,6 +222,13 @@ const encryptCredential = (obj) => {
      const elevenlabsData = JSON.stringify({api_key, model_id, options});
      return encrypt(elevenlabsData);

+    case 'playht':
+      // api_key, user_id and voice_engine are all mandatory; they are serialized
+      // together with the optional `options` and the JSON string is encrypted.
+      assert(api_key, 'invalid playht speech credential: api_key is required');
+      assert(user_id, 'invalid playht speech credential: user_id is required');
+      assert(voice_engine, 'invalid playht speech credential: voice_engine is required');
+      const playhtData = JSON.stringify({api_key, user_id, voice_engine, options});
+      return encrypt(playhtData);
+
    case 'assemblyai':
      assert(api_key, 'invalid assemblyai speech credential: api_key is required');
      const assemblyaiData = JSON.stringify({api_key});
@@ -418,6 +428,7 @@ router.put('/:sid', async(req, res) => {
          custom_tts_url,
          cobalt_server_uri,
          model_id,
+          voice_engine,
          options,
          deepgram_stt_uri,
          deepgram_stt_use_tls,
@@ -443,6 +454,7 @@ router.put('/:sid', async(req, res) => {
          custom_tts_url,
          cobalt_server_uri,
          model_id,
+          voice_engine,
          options,
          deepgram_stt_uri,
          deepgram_stt_use_tls,
@@ -724,6 +736,17 @@ router.get('/:sid/test', async(req, res) => {
          SpeechCredential.ttsTestResult(sid, false);
        }
      }
+    } else if (cred.vendor === 'playht') {
+      if (cred.use_for_tts) {
+        try {
+          await testPlayHT(logger, synthAudio, credential);
+          results.tts.status = 'ok';
+          SpeechCredential.ttsTestResult(sid, true);
+        } catch (err) {
+          results.tts = {status: 'fail', reason: err.message};
+          SpeechCredential.ttsTestResult(sid, false);
+        }
+      }
    } else if (cred.vendor === 'assemblyai') {
      const {api_key} = credential;
      if (cred.use_for_stt) {
--- a/lib/utils/speech-data/tts-model-playht.js
+++ b/lib/utils/speech-data/tts-model-playht.js
@@ -0,0 +1,6 @@
+// PlayHT voice engines offered as TTS models; each entry uses the engine id
+// both as its display name and as its value.
+module.exports = ['PlayHT2.0-turbo', 'PlayHT2.0', 'PlayHT1.0']
+  .map((engine) => ({ name: engine, value: engine }));
+
--- a/lib/utils/speech-utils.js
+++ b/lib/utils/speech-utils.js
@@ -21,6 +21,7 @@ const TtsWhisperLanguagesVoices = require('./speech-data/tts-whisper');
 const TtsModelDeepgram = require('./speech-data/tts-model-deepgram');
 const TtsModelElevenLabs = require('./speech-data/tts-model-elevenlabs');
 const TtsModelWhisper = require('./speech-data/tts-model-whisper');
+const TtsModelPlayHT = require('./speech-data/tts-model-playht');

 const SttGoogleLanguagesVoices = require('./speech-data/stt-google');
 const SttAwsLanguagesVoices = require('./speech-data/stt-aws');
@@ -240,6 +241,27 @@ const testElevenlabs = async(logger, credentials) => {
  }
 };

+/**
+ * Verify a PlayHT credential by synthesizing a short fixed phrase.
+ *
+ * @param {object} logger - logger exposing info(); used only when synthesis fails
+ * @param {Function} synthAudio - called as synthAudio(stats, request)
+ * @param {object} credentials - credential passed through untouched to synthAudio
+ * @returns {Promise<void>} resolves when synthAudio resolves
+ * @throws rethrows whatever synthAudio rejects with, after logging it
+ */
+const testPlayHT = async(logger, synthAudio, credentials) => {
+  // stats sink whose counters do nothing
+  const noopStats = {
+    increment: () => {},
+    histogram: () => {}
+  };
+  const request = {
+    vendor: 'playht',
+    credentials,
+    language: 'en-US',
+    voice: 's3://voice-cloning-zero-shot/d9ff78ba-d016-47f6-b0ef-dd630f59414e/female-cs/manifest.json',
+    text: 'Hi there and welcome to jambones!'
+  };
+
+  try {
+    await synthAudio(noopStats, request);
+  } catch (err) {
+    logger.info({err}, 'synth Playht returned error');
+    throw err;
+  }
+};
+
 const testWhisper = async(logger, synthAudio, credentials) => {
  try {
    await synthAudio({increment: () => {}, histogram: () => {}},
@@ -428,6 +450,12 @@ function decryptCredential(obj, credential, logger, isObscureKey = true) {
    obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
    obj.model_id = o.model_id;
    obj.options = o.options;
+  } else if ('playht' === obj.vendor) {
+    const o = JSON.parse(decrypt(credential));
+    obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
+    obj.user_id = o.user_id;
+    obj.voice_engine = o.voice_engine;
+    obj.options = o.options;
  } else if (obj.vendor.startsWith('custom:')) {
    const o = JSON.parse(decrypt(credential));
    obj.auth_token = isObscureKey ? obscureKey(o.auth_token) : o.auth_token;
@@ -488,6 +516,8 @@ async function getLanguagesAndVoicesForVendor(logger, vendor, credential, getTts
      return await getLanguagesVoicesForSoniox(credential, getTtsVoices, logger);
    case 'elevenlabs':
      return await getLanguagesVoicesForElevenlabs(credential, getTtsVoices, logger);
+    case 'playht':
+      return await getLanguagesVoicesForPlayHT(credential, getTtsVoices, logger);
    case 'assemblyai':
      return await getLanguagesVoicesForAssemblyAI(credential, getTtsVoices, logger);
    case 'whisper':
@@ -645,6 +675,49 @@ async function getLanguagesVoicesForElevenlabs(credential) {
  }
 }

+// Render one optional voice attribute as ' <attr>,' (leading space, trailing comma);
+// a falsy attribute contributes the empty string.
+const concat = (a) => {
+  if (!a) return '';
+  return ` ${a},`;
+};
+
+/**
+ * Build the PlayHT language/voice/model listing.
+ *
+ * With a credential, GET /api/v2/voices is requested from https://api.play.ht using the
+ * credential's api_key and user_id as headers, and the returned voices are grouped by
+ * language_code in first-seen order. Without a credential only the static model list
+ * is passed to tranform().
+ *
+ * @param {object} [credential] - decrypted credential; api_key and user_id are read
+ * @returns {Promise<*>} whatever tranform(ttsVoices, undefined, TtsModelPlayHT) returns
+ * @throws propagates any error raised by the HTTP request (there is no local catch)
+ */
+async function getLanguagesVoicesForPlayHT(credential) {
+  if (!credential) {
+    return tranform(undefined, undefined, TtsModelPlayHT);
+  }
+
+  const get = bent('https://api.play.ht', 'GET', 'json', {
+    'AUTHORIZATION' : credential.api_key,
+    'X-USER-ID': credential.user_id,
+    'Accept': 'application/json'
+  });
+
+  const voices = await get('/api/v2/voices');
+
+  const buildVoice = (d) => {
+    // Assemble the label from an array rather than a template literal wrapped across
+    // source lines: the wrapped literal embedded a newline in every name and left a
+    // dangling comma whenever the trailing attributes were empty.
+    const attrs = [d.accent, d.age, d.gender, d.loudness, d.style, d.tempo, d.texture]
+      .map(concat)
+      .join('');
+    // Each present attribute contributes ' <attr>,', so a non-empty attrs always ends
+    // with a comma; with no attributes at all the label is just the voice name.
+    const name = attrs ? `${d.name} -${attrs.slice(0, -1)}` : `${d.name}`;
+    return {
+      value: `${d.id}`,
+      name
+    };
+  };
+
+  // A Map keeps first-seen language order and avoids rescanning the list per voice.
+  const byLanguage = new Map();
+  for (const voice of voices) {
+    const languageCode = voice.language_code;
+    if (!byLanguage.has(languageCode)) {
+      byLanguage.set(languageCode, {
+        value: languageCode,
+        name: voice.language,
+        voices: []
+      });
+    }
+    byLanguage.get(languageCode).voices.push(buildVoice(voice));
+  }
+
+  return tranform([...byLanguage.values()], undefined, TtsModelPlayHT);
+}
+
 async function getLanguagesVoicesForAssemblyAI(credential) {
  return tranform(undefined, SttAssemblyaiLanguagesVoices);
 }
@@ -796,6 +869,7 @@ module.exports = {
  testIbmStt,
  testSonioxStt,
  testElevenlabs,
+  testPlayHT,
  testAssemblyStt,
  testDeepgramTTS,
  getSpeechCredential,
--- a/package-lock.json
+++ b/package-lock.json
@@ -19,7 +19,7 @@
        "@jambonz/lamejs": "^1.2.2",
        "@jambonz/mw-registrar": "^0.2.7",
        "@jambonz/realtimedb-helpers": "^0.8.8",
-        "@jambonz/speech-utils": "^0.0.49",
+        "@jambonz/speech-utils": "^0.0.50",
        "@jambonz/time-series": "^0.2.8",
        "@jambonz/verb-specifications": "^0.0.69",
        "@soniox/soniox-node": "^1.2.2",
@@ -2027,9 +2027,9 @@
      }
    },
    "node_modules/@jambonz/speech-utils": {
-      "version": "0.0.49",
-      "resolved": "https://registry.npmjs.org/@jambonz/speech-utils/-/speech-utils-0.0.49.tgz",
-      "integrity": "sha512-hIVdgiPJJN2WYm7qlcP8yZv+1w+z38sCmiFNf9xMAAV6pjrDCEeUjrwpLhaFkWqVmkNrHh9PHuzPlkLDNkzhIA==",
+      "version": "0.0.50",
+      "resolved": "https://registry.npmjs.org/@jambonz/speech-utils/-/speech-utils-0.0.50.tgz",
+      "integrity": "sha512-fcMaOuWrBVFh6FKiiurYhnQV71xXmnkyBQmp4OjNd1Zo8Ya+tZMdAJjyHtimjJdgiwJbwDnfdSwKSuz8G9CVkQ==",
      "dependencies": {
        "@aws-sdk/client-polly": "^3.496.0",
        "@aws-sdk/client-sts": "^3.496.0",
--- a/package.json
+++ b/package.json
@@ -29,7 +29,7 @@
    "@jambonz/lamejs": "^1.2.2",
    "@jambonz/mw-registrar": "^0.2.7",
    "@jambonz/realtimedb-helpers": "^0.8.8",
-    "@jambonz/speech-utils": "^0.0.49",
+    "@jambonz/speech-utils": "^0.0.50",
    "@jambonz/time-series": "^0.2.8",
    "@jambonz/verb-specifications": "^0.0.69",
    "@soniox/soniox-node": "^1.2.2",
--- a/test/docker-compose-testbed.yaml
+++ b/test/docker-compose-testbed.yaml
@@ -10,7 +10,7 @@ networks:

 services:  
  mysql:
-    platform: linux/x86_64
+    # platform: linux/x86_64
    image: mysql:5.7
    ports:
      - "3360:3306"
@@ -36,7 +36,7 @@ services:
        ipv4_address: 172.58.0.3
    
  influxdb:
-    platform: linux/x86_64
+    # platform: linux/x86_64
    image: influxdb:1.8
    ports:
      - "8086:8086"
--- a/test/speech-credentials.js
+++ b/test/speech-credentials.js
@@ -536,7 +536,7 @@ test('speech credentials tests', async(t) => {
        model_id: 'eleven_multilingual_v2'
      }
    });
-    t.ok(result.statusCode === 201, 'successfully added speech credential for Cobalt');
+    t.ok(result.statusCode === 201, 'successfully added speech credential for elevenlabs');
    const elevenlabs_sid = result.body.sid;

    /* delete the credential */
@@ -544,7 +544,31 @@ test('speech credentials tests', async(t) => {
      auth: authUser,
      resolveWithFullResponse: true,
    });
-    t.ok(result.statusCode === 204, 'successfully deleted speech credential for Cobalt');
+    t.ok(result.statusCode === 204, 'successfully deleted speech credential for elevenlabs');
+
+    /* add a credential for playht */
+    result = await request.post(`/Accounts/${account_sid}/SpeechCredentials`, {
+      resolveWithFullResponse: true,
+      auth: authUser,
+      json: true,
+      body: {
+        vendor: 'playht',
+        use_for_stt: false,
+        use_for_tts: true,
+        api_key: 'asdasdasdasddsadasda',
+        user_id: 'user_id',
+        voice_engine: 'PlayHT2.0-turbo'
+      }
+    });
+    t.ok(result.statusCode === 201, 'successfully added speech credential for playht');
+    const playht_sid = result.body.sid;
+
+    /* delete the credential */
+    result = await request.delete(`/Accounts/${account_sid}/SpeechCredentials/${playht_sid}`, {
+      auth: authUser,
+      resolveWithFullResponse: true,
+    });
+    t.ok(result.statusCode === 204, 'successfully deleted speech credential for playht');


    /* add a credential for custom voices google */