support resemble ai

2025-12-19 03:37:49 +00:00 · 2025-08-10 15:29:01 +07:00
parent 043642ea5f
commit 49b23f5240
2 changed files with 92 additions and 1 deletions
--- a/lib/synth-audio.js
+++ b/lib/synth-audio.js
@@ -98,7 +98,7 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
  logger = logger || noopLogger;

  assert.ok(['google', 'aws', 'polly', 'microsoft', 'wellsaid', 'nuance', 'nvidia', 'ibm', 'elevenlabs',
-    'whisper', 'deepgram', 'playht', 'rimelabs', 'verbio', 'cartesia', 'inworld'].includes(vendor) ||
+    'whisper', 'deepgram', 'playht', 'rimelabs', 'verbio', 'cartesia', 'inworld', 'resemble'].includes(vendor) ||
  vendor.startsWith('custom'),
  `synthAudio supported vendors are google, aws, microsoft, nuance, nvidia and wellsaid ..etc, not ${vendor}`);
  if ('google' === vendor) {
@@ -166,6 +166,9 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
  } else if ('cartesia' === vendor) {
    assert.ok(credentials.api_key, 'synthAudio requires api_key when cartesia is used');
    assert.ok(credentials.model_id, 'synthAudio requires model_id when cartesia is used');
+  } else if (vendor === 'resemble') {
+    assert.ok(voice, 'synthAudio requires voice when resemble is used');
+    assert.ok(credentials.api_key, 'synthAudio requires api_key when resemble is used');
  }

  const key = makeSynthKey({
@@ -263,6 +266,10 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
        audioData = await synthDeepgram(logger, {credentials, stats, model, text,
          renderForCaching, disableTtsStreaming});
        break;
+      case 'resemble':
+        audioData = await synthResemble(logger, {
+          credentials, stats, voice, text, options, renderForCaching, disableTtsStreaming});
+        break;
      case vendor.startsWith('custom') ? vendor : 'cant_match_value':
        audioData = await synthCustomVendor(logger,
          {credentials, stats, language, voice, text});
@@ -1285,6 +1292,61 @@ const synthCartesia = async(logger, {

 };

+/**
+ * Synthesize speech through Resemble AI.
+ * Streaming mode (the default) returns a `say:` descriptor carrying the request parameters; otherwise
+ * the text is POSTed to `/synthesize` and the base64 `audio_content` of the JSON reply is decoded.
+ * @param {object} logger - logger; `info` is called on failure
+ * @param {object} opts - {credentials: {api_key, resemble_tts_uri}, options: {project_uuid, use_hd},
+ *   stats, voice, text, renderForCaching, disableTtsStreaming}
+ * @returns {Promise<object>} {filePath, servedFromCache, rtt} or {audioContent, extension, sampleRate}
+ * @throws rethrows any request or decoding error after logging it and counting it via stats
+ */
+const synthResemble = async(logger, {
+  credentials, options, stats, voice, text, renderForCaching, disableTtsStreaming
+}) => {
+  const {api_key, resemble_tts_uri} = credentials;
+  const {project_uuid, use_hd} = options || {};
+
+  /* default to using the streaming interface, unless disabled by env var OR we want just a cache file */
+  if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
+    let params = `{api_key=${api_key},vendor=resemble,voice=${voice},write_cache_file=1`;
+    if (project_uuid) params += `,project_uuid=${project_uuid}`;
+    if (use_hd) params += `,use_hd=${use_hd}`;
+    if (resemble_tts_uri) params += `,endpoint=${resemble_tts_uri}`;
+    params += '}';
+    return {
+      filePath: `say:${params}${text.replace(/\n/g, ' ').replace(/\r/g, ' ')}`,
+      servedFromCache: false,
+      rtt: 0
+    };
+  }
+
+  try {
+    const baseUrl = resemble_tts_uri || 'https://f.cluster.resemble.ai';
+    const post = bent(baseUrl, 'POST', 'buffer', {
+      'Authorization': `Bearer ${api_key}`,
+      'Content-Type': 'application/json'
+    });
+    const response = await post('/synthesize', {
+      voice_uuid: voice,
+      data: text,
+      sample_rate: 8000,
+      output_format: 'mp3',
+      ...(project_uuid && {project_uuid}),
+      ...(use_hd && {use_hd}),
+    });
+
+    const json = JSON.parse(response.toString('utf8'));
+    return {audioContent: Buffer.from(json.audio_content, 'base64'), extension: 'mp3', sampleRate: 8000};
+  } catch (err) {
+    /* report under this vendor's own name so logs and tts.count metrics are attributed correctly */
+    logger.info({err}, 'synth Resemble returned error');
+    stats.increment('tts.count', ['vendor:resemble', 'accepted:no']);
+    throw err;
+  }
+};
+
 const getFileExtFromMime = (mime) => {
  switch (mime) {
    case 'audio/wav':
--- a/test/synth.js
+++ b/test/synth.js
@@ -750,6 +750,35 @@ test('inworld speech synth', async(t) => {
  client.quit();
 });

+/* resemble: renders one utterance for caching; skipped unless RESEMBLE_API_KEY is set */
+test('resemble speech synth', async(t) => {
+  const fn = require('..');
+  const {synthAudio, client} = fn(opts, logger);
+
+  if (!process.env.RESEMBLE_API_KEY) {
+    t.pass('skipping resemble speech synth tests since RESEMBLE_API_KEY is not provided');
+    client.quit(); /* release the client on the skip path as well */
+    return t.end();
+  }
+  const text = '<speak prompt="Speak in an excited, upbeat tone">Hello from Resemble!</speak>';
+  try {
+    /* named `result` so the outer `opts` passed to fn() above is not shadowed */
+    const result = await synthAudio(stats, {
+      vendor: 'resemble',
+      credentials: {api_key: process.env.RESEMBLE_API_KEY},
+      language: 'en',
+      voice: '3f5fb9f1',
+      text,
+      renderForCaching: true
+    });
+    t.ok(!result.servedFromCache, `successfully synthesized resemble audio to ${result.filePath}`);
+  } catch (err) {
+    console.error(JSON.stringify(err));
+    t.end(err);
+  }
+  client.quit();
+});
+
 test('rimelabs speech synth tests mist', async(t) => {
  const fn = require('..');
  const {synthAudio, client} = fn(opts, logger);