support inworld tts

2026-01-25 02:08:26 +00:00 · 2025-06-26 17:34:24 +07:00
parent db135ee5ad
commit c00d4f9be4
2 changed files with 103 additions and 1 deletions
--- a/lib/synth-audio.js
+++ b/lib/synth-audio.js
@@ -98,7 +98,7 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
  logger = logger || noopLogger;
  assert.ok(['google', 'aws', 'polly', 'microsoft', 'wellsaid', 'nuance', 'nvidia', 'ibm', 'elevenlabs',
-    'whisper', 'deepgram', 'playht', 'rimelabs', 'verbio', 'cartesia'].includes(vendor) ||
+    'whisper', 'deepgram', 'playht', 'rimelabs', 'verbio', 'cartesia', 'inworld'].includes(vendor) ||
  vendor.startsWith('custom'),
  `synthAudio supported vendors are google, aws, microsoft, nuance, nvidia and wellsaid ..etc, not ${vendor}`);
  if ('google' === vendor) {
@@ -141,6 +141,10 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
    assert.ok(credentials.api_key, 'synthAudio requires api_key when playht is used');
    assert.ok(credentials.user_id, 'synthAudio requires user_id when playht is used');
    assert.ok(credentials.voice_engine, 'synthAudio requires voice_engine when playht is used');
  } else if ('inworld' === vendor) {
    assert.ok(voice, 'synthAudio requires voice when inworld is used');
    assert.ok(credentials.api_key, 'synthAudio requires api_key when inworld is used');
    assert.ok(credentials.model_id, 'synthAudio requires model_id when inworld is used');
  } else if ('rimelabs' === vendor) {
    assert.ok(voice, 'synthAudio requires voice when rimelabs is used');
    assert.ok(credentials.api_key, 'synthAudio requires api_key when rimelabs is used');
@@ -238,6 +242,10 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
        audioData = await synthCartesia(logger, {
          credentials, options, stats, language, voice, text, renderForCaching, disableTtsStreaming});
        break;
      case 'inworld':
        audioData = await synthInworld(logger, {
          credentials, options, stats, language, voice, text, renderForCaching, disableTtsStreaming});
        break;
      case 'rimelabs':
        audioData = await synthRimelabs(logger, {
          credentials, options, stats, language, voice, text, renderForCaching, disableTtsStreaming});
@@ -954,6 +962,70 @@ const synthPlayHT = async(client, logger, {
  }
 };
 /**
  * Synthesize speech with Inworld TTS.
  *
  * When streaming is allowed (env var JAMBONES_DISABLE_TTS_STREAMING is not set, and neither
  * `renderForCaching` nor `disableTtsStreaming` is truthy) no HTTP request is made here: a
  * `say:{...}text` descriptor carrying the vendor parameters is returned instead.
  * Otherwise the Inworld REST endpoint is called and the decoded MP3 audio is returned.
  *
  * @param {object} logger - logger exposing `info()`
  * @param {object} params
  * @param {object} params.credentials - `api_key`, `model_id` and an optional JSON string `options`
  * @param {object} [params.options] - per-request options; when empty, `credentials.options` is used
  * @param {object} params.stats - stats client exposing `increment()`
  * @param {string} params.voice - Inworld voice id
  * @param {string} params.text - text to synthesize
  * @param {boolean} [params.renderForCaching] - force the non-streaming (HTTP) path
  * @param {boolean} [params.disableTtsStreaming] - force the non-streaming (HTTP) path
  * @returns {Promise<object>} `{filePath, servedFromCache, rtt}` for streaming, or
  *   `{audioContent, extension, sampleRate}` for the HTTP path
  * @throws rethrows any error raised by the HTTP request after logging it and incrementing
  *   the `tts.count` stat with `accepted:no`
  */
 const synthInworld = async(logger, {
  credentials, options, stats, voice, text, renderForCaching, disableTtsStreaming
 }) => {
  const {api_key, model_id, options: credOpts} = credentials;
  /* per-request options win when non-empty; otherwise use the JSON string stored with the credentials */
  const opts = !!options && Object.keys(options).length !== 0 ? options : JSON.parse(credOpts || '{}');
  const audioConfig = opts.audioConfig || {};

  /* default to using the streaming interface, unless disabled by env var OR we want just a cache file */
  if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
    const params = [
      `api_key=${api_key}`,
      `model_id=${model_id}`,
      'vendor=inworld',
      `voice=${voice}`,
      'write_cache_file=1'
    ];
    if (opts.temperature) params.push(`temperature=${opts.temperature}`);
    /* pitch and speakingRate live under audioConfig, so read them from there */
    if (audioConfig.pitch) params.push(`pitch=${audioConfig.pitch}`);
    if (audioConfig.speakingRate) params.push(`speakingRate=${audioConfig.speakingRate}`);

    return {
      filePath: `say:{${params.join(',')}}${text.replace(/\n/g, ' ').replace(/\r/g, ' ')}`,
      servedFromCache: false,
      rtt: 0
    };
  }

  try {
    const url = 'https://api.inworld.ai/tts/v1/voice';
    /* NOTE(review): reported to the caller as-is; the request itself does not ask for a sample rate */
    const sampleRate = 8000;
    const request = {
      method: 'POST',
      headers: {
        'Authorization': `Basic ${api_key}`,
        'Content-Type': 'application/json'
      },
      body: JSON.stringify({
        text,
        voiceId: voice,
        modelId: model_id,
        audioConfig: {
          ...audioConfig,
          /* always request MP3, overriding any caller-supplied encoding */
          audioEncoding: 'MP3',
        }
      })
    };
    const response = await fetch(url, request);
    if (!response.ok) {
      throw new Error(await response.text());
    }
    const json = await response.json();
    return {
      audioContent: Buffer.from(json.audioContent, 'base64'),
      extension: 'mp3',
      sampleRate
    };
  } catch (err) {
    logger.info({err}, 'synth inworld returned error');
    stats.increment('tts.count', ['vendor:inworld', 'accepted:no']);
    throw err;
  }
 };
 const synthRimelabs = async(logger, {
  credentials, options, stats, language, voice, text, renderForCaching, disableTtsStreaming
 }) => {
--- a/test/synth.js
+++ b/test/synth.js
@@ -720,6 +720,36 @@ test('Cartesia speech synth tests', async(t) => {
  client.quit();
 });
 /* Live Inworld synthesis test; skipped unless INWORLD_API_KEY is set in the environment. */
 test('inworld speech synth', async(t) => {
  const fn = require('..');
  const {synthAudio, client} = fn(opts, logger);
  if (!process.env.INWORLD_API_KEY) {
    t.pass('skipping inworld speech synth tests since INWORLD_API_KEY is not provided');
    /* the client was already created above, so release it on the skip path too */
    client.quit();
    return t.end();
  }
  const text = 'Hi there and welcome to jambones!';
  try {
    /* renderForCaching forces the non-streaming path so that audio is actually synthesized */
    const result = await synthAudio(stats, {
      vendor: 'inworld',
      credentials: {
        api_key: process.env.INWORLD_API_KEY,
        model_id: 'inworld-tts-1'
      },
      language: 'en',
      voice: 'Ashley',
      text,
      renderForCaching: true
    });
    t.ok(!result.servedFromCache, `successfully synthesized inworld audio to ${result.filePath}`);
  } catch (err) {
    console.error(JSON.stringify(err));
    t.end(err);
  } finally {
    /* always close the client, whether the synthesis succeeded or failed */
    client.quit();
  }
 });
 test('rimelabs speech synth tests mist', async(t) => {
  const fn = require('..');
  const {synthAudio, client} = fn(opts, logger);