support resemble TTS (#488)

* support resemble TTS * wip * wip * update speech utils version * update resemble voice list
2026-01-25 02:08:24 +00:00 · 2025-08-13 19:18:08 +07:00
parent 2b66a121a0
commit fd9dc77a58
6 changed files with 766 additions and 51 deletions
--- a/lib/utils/speech-utils.js
+++ b/lib/utils/speech-utils.js
@@ -21,6 +21,7 @@ const TtsPlayHtLanguagesVoices = require('./speech-data/tts-playht');
 const TtsVerbioLanguagesVoices = require('./speech-data/tts-verbio');
 const TtsInworldLanguagesVoices = require('./speech-data/tts-inworld');
 const ttsCartesia = require('./speech-data/tts-cartesia');
+const TtsResembleLanguagesVoices = require('./speech-data/tts-resemble');

 const TtsModelDeepgram = require('./speech-data/tts-model-deepgram');
 const TtsLanguagesDeepgram = require('./speech-data/tts-deepgram');
@@ -424,6 +425,24 @@ const testWhisper = async(logger, synthAudio, credentials) => {
  }
 };

+// Probe Resemble credentials: synthesize a fixed phrase, log and rethrow on error.
+const testResembleTTS = async(logger, synthAudio, credentials) => {
+  const request = {
+    vendor: 'resemble',
+    credentials,
+    language: 'en-US',
+    voice: '3f5fb9f1',
+    text: 'Hi there and welcome to jambones!',
+    renderForCaching: true
+  };
+  try {
+    await synthAudio({increment: () => {}, histogram: () => {}}, request);
+  } catch (err) {
+    logger.info({err}, 'synth resemble returned error');
+    throw err;
+  }
+};
+
 const testDeepgramTTS = async(logger, synthAudio, credentials) => {
  try {
    await synthAudio({increment: () => {}, histogram: () => {}},
@@ -729,6 +748,11 @@ function decryptCredential(obj, credential, logger, isObscureKey = true) {
    const o = JSON.parse(decrypt(credential));
    obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
    obj.service_version = o.service_version;
+  }  else if ('resemble' === obj.vendor) {
+    const o = JSON.parse(decrypt(credential));
+    obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
+    obj.resemble_tts_uri = o.resemble_tts_uri;
+    obj.resemble_tts_use_tls = o.resemble_tts_use_tls;
  } else if ('voxist' === obj.vendor) {
    const o = JSON.parse(decrypt(credential));
    obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
@@ -799,6 +823,8 @@ async function getLanguagesAndVoicesForVendor(logger, vendor, credential, getTts
      return await getLanguagesVoicesForRimelabs(credential, getTtsVoices, logger);
    case 'inworld':
      return await getLanguagesVoicesForInworld(credential, getTtsVoices, logger);
+    case 'resemble':
+      return await getLanguagesAndVoicesForResemble(credential, getTtsVoices, logger);
    case 'assemblyai':
      return await getLanguagesVoicesForAssemblyAI(credential, getTtsVoices, logger);
    case 'voxist':
@@ -1240,6 +1266,82 @@ async function getLanguagesVoicesForVerbio(credentials, getTtsVoices, logger) {
  }
 }

+/**
+ * List the Resemble TTS voices available to an account, grouped by language.
+ * Pages through the Resemble v2 voices endpoint using credential.api_key and
+ * keeps only voices that are finished and ready for text-to-speech. Falls
+ * back to the predefined TtsResembleLanguagesVoices list when no credential
+ * is supplied or when any step of the lookup fails (the failure is logged).
+ * @param {object} [credential] - decrypted credential; only api_key is read
+ * @param {*} getTtsVoices - not referenced by this vendor's lookup
+ * @param {object} logger - logger called as info(mergeObject, message)
+ * @returns {Promise<object>} result of tranform() over the language list
+ */
+async function getLanguagesAndVoicesForResemble(credential, getTtsVoices, logger) {
+  if (credential) {
+    try {
+      const {api_key} = credential;
+      const allVoices = [];
+      let numPages = 1;
+      // Fetch all pages of voices; each response reports the total page count
+      for (let page = 1; page <= numPages; page++) {
+        const response = await fetch(`https://app.resemble.ai/api/v2/voices?page=${page}&page_size=100`, {
+          headers: {
+            'Authorization': `Token token=${api_key}`,
+            'Accept': 'application/json'
+          }
+        });
+        if (!response.ok) {
+          throw new Error('failed to list voices');
+        }
+        const data = await response.json();
+        // A payload without an items array cannot be filtered; use the fallback
+        if (!data.success || !Array.isArray(data.items)) {
+          throw new Error('API returned unsuccessful response');
+        }
+        allVoices.push(...data.items);
+        numPages = data.num_pages;
+      }
+
+      // Keep only finished voices that support text_to_speech
+      const availableVoices = allVoices.filter((voice) =>
+        voice.status === 'finished' &&
+        voice.component_status?.text_to_speech?.status === 'ready'
+      );
+
+      // Group voices by language, preserving first-seen language order
+      const byLanguage = new Map();
+      for (const voice of availableVoices) {
+        const languageCode = voice.default_language || 'en-US';
+        if (!byLanguage.has(languageCode)) {
+          byLanguage.set(languageCode, {
+            value: languageCode,
+            name: capitalizeFirst(languageCode),
+            voices: []
+          });
+        }
+        byLanguage.get(languageCode).voices.push({
+          name: `${voice.name} (${voice.voice_type}) - ${voice.source}`,
+          value: voice.uuid
+        });
+      }
+
+      // Sort languages and voices
+      const ttsVoices = [...byLanguage.values()];
+      ttsVoices.sort((a, b) => a.name.localeCompare(b.name));
+      ttsVoices.forEach((lang) => {
+        lang.voices.sort((a, b) => a.name.localeCompare(b.name));
+      });
+      return tranform(ttsVoices);
+    } catch (err) {
+      // Use the {err} merge-object form (as testResembleTTS does) so the error is logged
+      logger.info({err}, 'Error while fetching Resemble languages, voices, return predefined values');
+    }
+  }
+
+  return tranform(TtsResembleLanguagesVoices);
+}
+
 function tranform(tts, stt, models, sttModels) {
  return {
    ...(tts && {tts}),
@@ -1528,5 +1630,6 @@ module.exports = {
  testSpeechmaticsStt,
  testCartesia,
  testVoxistStt,
-  testOpenAiStt
+  testOpenAiStt,
+  testResembleTTS
 };