support inworld tts (#472)

* support inworld tts
* inworld tts voices
2025-12-19 05:47:46 +00:00 · 2025-06-27 22:12:00 +07:00
parent 0842793aea
commit 5421f1421f
7 changed files with 241 additions and 6 deletions
--- a/lib/routes/api/speech-credentials.js
+++ b/lib/routes/api/speech-credentials.js
@@ -14,7 +14,8 @@ const {decryptCredential, testWhisper, testDeepgramTTS,
  testSpeechmaticsStt,
  testCartesia,
  testVoxistStt,
-  testOpenAiStt} = require('../../utils/speech-utils');
+  testOpenAiStt,
+  testInworld} = require('../../utils/speech-utils');
 const {DbErrorUnprocessableRequest, DbErrorForbidden, DbErrorBadRequest} = require('../../utils/errors');
 const {
  testGoogleTts,
@@ -283,6 +284,12 @@ const encryptCredential = (obj) => {
      const rimelabsData = JSON.stringify({api_key, model_id, options});
      return encrypt(rimelabsData);

+    case 'inworld':
+      assert(api_key, 'invalid inworld speech credential: api_key is required');
+      assert(model_id, 'invalid inworld speech credential: model_id is required');
+      const inworldData = JSON.stringify({api_key, model_id, options});
+      return encrypt(inworldData);
+
    case 'assemblyai':
      assert(api_key, 'invalid assemblyai speech credential: api_key is required');
      const assemblyaiData = JSON.stringify({api_key});
@@ -874,6 +881,17 @@ router.get('/:sid/test', async(req, res) => {
          SpeechCredential.ttsTestResult(sid, false);
        }
      }
+    } else if (cred.vendor === 'inworld') {
+      if (cred.use_for_tts) {
+        try {
+          await testInworld(logger, synthAudio, credential);
+          results.tts.status = 'ok';
+          SpeechCredential.ttsTestResult(sid, true);
+        } catch (err) {
+          results.tts = {status: 'fail', reason: err.message};
+          SpeechCredential.ttsTestResult(sid, false);
+        }
+      }
    } else if (cred.vendor === 'rimelabs') {
      if (cred.use_for_tts) {
        try {
--- a/lib/utils/speech-data/tts-inworld.js
+++ b/lib/utils/speech-data/tts-inworld.js
@@ -0,0 +1,118 @@
+module.exports = [ // static Inworld voice list; getLanguagesVoicesForInworld serves it when no api_key is set
+  {
+    value: 'en',
+    name: 'English',
+    voices: [
+      { name: 'Alex', value: 'Alex' },
+      { name: 'Ashley', value: 'Ashley' },
+      { name: 'Craig', value: 'Craig' },
+      { name: 'Deborah', value: 'Deborah' },
+      { name: 'Dennis', value: 'Dennis' },
+      { name: 'Edward', value: 'Edward' },
+      { name: 'Elizabeth', value: 'Elizabeth' },
+      { name: 'Hades', value: 'Hades' },
+      { name: 'Julia', value: 'Julia' },
+      { name: 'Pixie', value: 'Pixie' },
+      { name: 'Mark', value: 'Mark' },
+      { name: 'Olivia', value: 'Olivia' },
+      { name: 'Priya', value: 'Priya' },
+      { name: 'Ronald', value: 'Ronald' },
+      { name: 'Sarah', value: 'Sarah' },
+      { name: 'Shaun', value: 'Shaun' },
+      { name: 'Theodore', value: 'Theodore' },
+      { name: 'Timothy', value: 'Timothy' },
+      { name: 'Wendy', value: 'Wendy' },
+      { name: 'Dominus', value: 'Dominus' },
+    ],
+  },
+  {
+    value: 'zh',
+    name: 'Chinese',
+    voices: [
+      { name: 'Yichen', value: 'Yichen' },
+      { name: 'Xiaoyin', value: 'Xiaoyin' },
+      { name: 'Xinyi', value: 'Xinyi' },
+      { name: 'Jing', value: 'Jing' },
+    ],
+  },
+  {
+    value: 'nl',
+    name: 'Dutch',
+    voices: [
+      { name: 'Erik', value: 'Erik' },
+      { name: 'Katrien', value: 'Katrien' },
+      { name: 'Lennart', value: 'Lennart' },
+      { name: 'Lore', value: 'Lore' },
+    ],
+  },
+  {
+    value: 'fr',
+    name: 'French',
+    voices: [
+      { name: 'Alain', value: 'Alain' },
+      { name: 'Hélène', value: 'Hélène' },
+      { name: 'Mathieu', value: 'Mathieu' },
+      { name: 'Étienne', value: 'Étienne' },
+    ],
+  },
+  {
+    value: 'de',
+    name: 'German',
+    voices: [
+      { name: 'Johanna', value: 'Johanna' },
+      { name: 'Josef', value: 'Josef' },
+    ],
+  },
+  {
+    value: 'it',
+    name: 'Italian',
+    voices: [
+      { name: 'Gianni', value: 'Gianni' },
+      { name: 'Orietta', value: 'Orietta' },
+    ],
+  },
+  {
+    value: 'ja',
+    name: 'Japanese',
+    voices: [
+      { name: 'Asuka', value: 'Asuka' },
+      { name: 'Satoshi', value: 'Satoshi' },
+    ],
+  },
+  {
+    value: 'ko',
+    name: 'Korean',
+    voices: [
+      { name: 'Hyunwoo', value: 'Hyunwoo' },
+      { name: 'Minji', value: 'Minji' },
+      { name: 'Seojun', value: 'Seojun' },
+      { name: 'Yoona', value: 'Yoona' },
+    ],
+  },
+  {
+    value: 'pl',
+    name: 'Polish',
+    voices: [
+      { name: 'Szymon', value: 'Szymon' },
+      { name: 'Wojciech', value: 'Wojciech' },
+    ],
+  },
+  {
+    value: 'pt',
+    name: 'Portuguese',
+    voices: [
+      { name: 'Heitor', value: 'Heitor' },
+      { name: 'Maitê', value: 'Maitê' },
+    ],
+  },
+  {
+    value: 'es',
+    name: 'Spanish',
+    voices: [
+      { name: 'Diego', value: 'Diego' },
+      { name: 'Lupita', value: 'Lupita' },
+      { name: 'Miguel', value: 'Miguel' },
+      { name: 'Rafael', value: 'Rafael' },
+    ],
+  },
+];
--- a/lib/utils/speech-data/tts-model-inworld.js
+++ b/lib/utils/speech-data/tts-model-inworld.js
@@ -0,0 +1,5 @@
+module.exports = [ // Inworld TTS model choices; getLanguagesVoicesForInworld passes them as tranform()'s third argument
+  { name: 'Llama Inworld TTS', value: 'inworld-tts-1' },
+  { name: 'Llama Inworld TTS Max', value: 'inworld-tts-1-max' },
+];
+
--- a/lib/utils/speech-utils.js
+++ b/lib/utils/speech-utils.js
@@ -19,6 +19,7 @@ const TtsElevenlabsLanguagesVoices = require('./speech-data/tts-elevenlabs');
 const TtsWhisperLanguagesVoices = require('./speech-data/tts-whisper');
 const TtsPlayHtLanguagesVoices = require('./speech-data/tts-playht');
 const TtsVerbioLanguagesVoices = require('./speech-data/tts-verbio');
+const TtsInworldLanguagesVoices = require('./speech-data/tts-inworld');
 const ttsCartesia = require('./speech-data/tts-cartesia');

 const TtsModelDeepgram = require('./speech-data/tts-model-deepgram');
@@ -28,6 +29,7 @@ const TtsModelWhisper = require('./speech-data/tts-model-whisper');
 const TtsModelPlayHT = require('./speech-data/tts-model-playht');
 const ttsLanguagesPlayHt = require('./speech-data/tts-languages-playht');
 const TtsModelRimelabs = require('./speech-data/tts-model-rimelabs');
+const TtsModelInworld = require('./speech-data/tts-model-inworld');
 const TtsModelCartesia = require('./speech-data/tts-model-cartesia');
 const TtsModelOpenai = require('./speech-data/tts-model-openai');

@@ -382,6 +384,28 @@ const testRimelabs = async(logger, synthAudio, credentials) => {
  }
 };

+/**
+ * Verify an inworld TTS credential by synthesizing a short English phrase with no-op stats hooks.
+ * A synthAudio failure is logged at info level and rethrown unchanged.
+ */
+const testInworld = async(logger, synthAudio, credentials) => {
+  const stats = {increment: () => {}, histogram: () => {}};
+  const request = {
+    vendor: 'inworld',
+    credentials,
+    language: 'en',
+    voice: 'Ashley',
+    text: 'Hi there and welcome to jambones!',
+    renderForCaching: true
+  };
+  try {
+    await synthAudio(stats, request);
+  } catch (err) {
+    logger.info({err}, 'synth inworld returned error');
+    throw err;
+  }
+};
+
 const testWhisper = async(logger, synthAudio, credentials) => {
  try {
    await synthAudio({increment: () => {}, histogram: () => {}},
@@ -683,6 +707,11 @@ function decryptCredential(obj, credential, logger, isObscureKey = true) {
    obj.model_id = o.model_id;
    obj.stt_model_id = o.stt_model_id;
    obj.options = o.options;
+  } else if ('inworld' === obj.vendor) {
+    const o = JSON.parse(decrypt(credential));
+    obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
+    obj.model_id = o.model_id;
+    obj.options = o.options;
  } else if ('rimelabs' === obj.vendor) {
    const o = JSON.parse(decrypt(credential));
    obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
@@ -765,6 +794,8 @@ async function getLanguagesAndVoicesForVendor(logger, vendor, credential, getTts
      return await getLanguagesVoicesForPlayHT(credential, getTtsVoices, logger);
    case 'rimelabs':
      return await getLanguagesVoicesForRimelabs(credential, getTtsVoices, logger);
+    case 'inworld':
+      return await getLanguagesVoicesForInworld(credential, getTtsVoices, logger);
    case 'assemblyai':
      return await getLanguagesVoicesForAssemblyAI(credential, getTtsVoices, logger);
    case 'voxist':
@@ -1130,6 +1161,46 @@ async function getLanguagesVoicesForRimelabs(credential) {
  return tranform(ttsVoices, undefined, TtsModelRimelabs);
 }

+/**
+ * List Inworld TTS languages/voices; falls back to the bundled static list when no api_key is available.
+ * @param {object} [credential] - credential object; only `api_key` is read
+ * @returns {Promise<*>} whatever tranform(ttsVoices, undefined, TtsModelInworld) produces
+ * @throws {Error} when the Inworld voices endpoint answers with a non-ok status
+ */
+async function getLanguagesVoicesForInworld(credential) {
+  const api_key = credential ? credential.api_key : null;
+  if (!api_key) {
+    return tranform(TtsInworldLanguagesVoices, undefined, TtsModelInworld);
+  }
+  const response = await fetch('https://api.inworld.ai/tts/v1/voices', {
+    headers: {
+      'Accept': 'application/json',
+      'Authorization': `Basic ${api_key}`
+    }
+  });
+  if (!response.ok) {
+    // this request lists voices (not models); report the HTTP status to ease troubleshooting
+    throw new Error(`failed to list voices: HTTP ${response.status}`);
+  }
+  const data = await response.json();
+  // group by language code (Map keeps first-seen order); tolerate missing voices/languages fields
+  const byLanguage = new Map();
+  for (const voice of (data && data.voices) || []) {
+    for (const languageCode of voice.languages || []) {
+      if (!byLanguage.has(languageCode)) {
+        byLanguage.set(languageCode, {
+          value: languageCode, name: capitalizeFirst(languageCode), voices: []
+        });
+      }
+      byLanguage.get(languageCode).voices.push({
+        name: voice.displayName || capitalizeFirst(voice.voiceId),
+        value: voice.voiceId
+      });
+    }
+  }
+  return tranform([...byLanguage.values()], undefined, TtsModelInworld);
+}
+
 async function getLanguagesVoicesForAssemblyAI(credential) {
  return tranform(undefined, SttAssemblyaiLanguagesVoices);
 }
@@ -1442,6 +1513,7 @@ module.exports = {
  testElevenlabs,
  testPlayHT,
  testRimelabs,
+  testInworld,
  testAssemblyStt,
  testDeepgramTTS,
  getSpeechCredential,
--- a/package-lock.json
+++ b/package-lock.json
@@ -20,7 +20,7 @@
        "@jambonz/lamejs": "^1.2.2",
        "@jambonz/mw-registrar": "^0.2.7",
        "@jambonz/realtimedb-helpers": "^0.8.14",
-        "@jambonz/speech-utils": "^0.2.10",
+        "@jambonz/speech-utils": "^0.2.13",
        "@jambonz/time-series": "^0.2.8",
        "@jambonz/verb-specifications": "^0.0.104",
        "@soniox/soniox-node": "^1.2.2",
@@ -4155,9 +4155,9 @@
      }
    },
    "node_modules/@jambonz/speech-utils": {
-      "version": "0.2.12",
-      "resolved": "https://registry.npmjs.org/@jambonz/speech-utils/-/speech-utils-0.2.12.tgz",
-      "integrity": "sha512-1xik/ZRUtPE2SOztxweGI+RTXUbiUXRShJ8G/l7VJJBkSWbfKKerYIRfHicAPumHicaUrbqSzZ6hr0eghv80KA==",
+      "version": "0.2.13",
+      "resolved": "https://registry.npmjs.org/@jambonz/speech-utils/-/speech-utils-0.2.13.tgz",
+      "integrity": "sha512-8ISTWTfz3fWtPmzPDsZG8zgnf6pTjLA1WasMAF/d/ktGswqVsbhoPcDh5ZyZ7BsEqOMLMIv2Hn0ESmrBuMn5kw==",
      "license": "MIT",
      "dependencies": {
        "23": "^0.0.0",
--- a/package.json
+++ b/package.json
@@ -31,7 +31,7 @@
    "@jambonz/lamejs": "^1.2.2",
    "@jambonz/mw-registrar": "^0.2.7",
    "@jambonz/realtimedb-helpers": "^0.8.14",
-    "@jambonz/speech-utils": "^0.2.10",
+    "@jambonz/speech-utils": "^0.2.13",
    "@jambonz/time-series": "^0.2.8",
    "@jambonz/verb-specifications": "^0.0.104",
    "@soniox/soniox-node": "^1.2.2",
--- a/test/speech-credentials.js
+++ b/test/speech-credentials.js
@@ -717,6 +717,28 @@ test('speech credentials tests', async(t) => {
    t.ok(result.statusCode === 204, 'successfully deleted speech credential for rimelabs');


+    result = await request.post(`/Accounts/${account_sid}/SpeechCredentials`, {
+      resolveWithFullResponse: true,
+      auth: authUser,
+      json: true,
+      body: {
+        vendor: 'inworld',
+        use_for_stt: false,
+        use_for_tts: true,
+        api_key: 'asdasdasdasddsadasda',
+        model_id: 'inworld-tts-1',
+      }
+    });
+    t.ok(result.statusCode === 201, 'successfully added speech credential for inworld');
+    const inworld_sid = result.body.sid;
+
+    /* delete the credential */
+    result = await request.delete(`/Accounts/${account_sid}/SpeechCredentials/${inworld_sid}`, {
+      auth: authUser,
+      resolveWithFullResponse: true,
+    });
+    t.ok(result.statusCode === 204, 'successfully deleted speech credential for inworld');
+
    /* add a credential for custom voices google */
    result = await request.post(`/Accounts/${account_sid}/SpeechCredentials`, {
      resolveWithFullResponse: true,