support tts cartesia (#370)

* support tts cartesia * update speech utils * revert reset password * revert serve-integration
2026-01-25 02:08:24 +00:00 · 2024-12-19 21:19:28 +07:00
parent 8851b3fac0
commit 0eb8097e32
6 changed files with 544 additions and 7 deletions
--- a/lib/utils/speech-utils.js
+++ b/lib/utils/speech-utils.js
@@ -27,6 +27,7 @@ const TtsModelWhisper = require('./speech-data/tts-model-whisper');
 const TtsModelPlayHT = require('./speech-data/tts-model-playht');
 const ttsLanguagesPlayHt = require('./speech-data/tts-languages-playht');
 const TtsModelRimelabs = require('./speech-data/tts-model-rimelabs');
+const TtsModelCartesia = require('./speech-data/tts-model-cartesia');

 const SttGoogleLanguagesVoices = require('./speech-data/stt-google');
 const SttAwsLanguagesVoices = require('./speech-data/stt-aws');
@@ -40,6 +41,8 @@ const SttSonioxLanguagesVoices = require('./speech-data/stt-soniox');
 const SttSpeechmaticsLanguagesVoices = require('./speech-data/stt-speechmatics');
 const SttAssemblyaiLanguagesVoices = require('./speech-data/stt-assemblyai');
 const SttVerbioLanguagesVoices = require('./speech-data/stt-verbio');
+const ttsCartesia = require('./speech-data/tts-cartesia');
+const ttsModelCartesia = require('./speech-data/tts-model-cartesia');


 const testSonioxStt = async(logger, credentials) => {
@@ -606,6 +609,11 @@ function decryptCredential(obj, credential, logger, isObscureKey = true) {
    obj.user_id = o.user_id;
    obj.voice_engine = o.voice_engine;
    obj.options = o.options;
+  } else if ('cartesia' === obj.vendor) {
+    const o = JSON.parse(decrypt(credential));
+    obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
+    obj.model_id = o.model_id;
+    obj.options = o.options;
  } else if ('rimelabs' === obj.vendor) {
    const o = JSON.parse(decrypt(credential));
    obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
@@ -688,6 +696,8 @@ async function getLanguagesAndVoicesForVendor(logger, vendor, credential, getTts
      return await getLanguagesVoicesForVerbio(credential, getTtsVoices, logger);
    case 'speechmatics':
      return await getLanguagesVoicesForSpeechmatics(credential, getTtsVoices, logger);
+    case 'cartesia':
+      return await getLanguagesVoicesForCartesia(credential, getTtsVoices, logger);
    default:
      logger.info(`invalid vendor ${vendor}, return empty result`);
      throw new Error(`Invalid vendor ${vendor}`);
@@ -1143,6 +1153,95 @@ function parseVerbioLanguagesVoices(data) {
  }, []);
 }

+const fetchCartesiaVoices = async(credential) => {
+  if (credential) {
+    const get = bent('https://api.cartesia.ai', 'GET', 'json', {
+      'X-API-Key' : credential.api_key,
+      'Cartesia-Version': '2024-06-10',
+      'Accept': 'application/json'
+    });
+
+    const voices = await get('/voices');
+    return voices;
+  }
+};
+
+const testCartesia = async(logger, synthAudio, credentials) => {
+  try {
+    await synthAudio(
+      {
+        increment: () => {},
+        histogram: () => {}
+      },
+      {
+        vendor: 'cartesia',
+        credentials,
+        language: 'en',
+        voice: '694f9389-aac1-45b6-b726-9d9369183238',
+        text: 'Hi there and welcome to jambones!',
+        renderForCaching: true
+      }
+    );
+    // Test if Cartesia can fetch voices
+    await fetchCartesiaVoices(credentials);
+  } catch (err) {
+    logger.info({err}, 'synth cartesia returned error');
+    throw err;
+  }
+};
+
+async function getLanguagesVoicesForCartesia(credential) {
+  if (credential) {
+    const {model_id} = credential;
+    const {languages} = ttsModelCartesia.find((m) => m.value === model_id);
+    const voices = await fetchCartesiaVoices(credential);
+
+    const buildVoice = (d) => (
+      {
+        value: `${d.id}`,
+        name: `${d.name} - ${d.description}`
+      });
+    const languageMap = {
+      en: 'English',
+      fr: 'French',
+      de: 'German',
+      es: 'Spanish',
+      pt: 'Portuguese',
+      zh: 'Chinese',
+      ja: 'Japanese',
+      hi: 'Hindi',
+      it: 'Italian',
+      ko: 'Korean',
+      nl: 'Dutch',
+      pl: 'Polish',
+      ru: 'Russian',
+      sv: 'Swedish',
+      tr: 'Turkish',
+    };
+    const ttsVoices = voices.reduce((acc, voice) => {
+      if (!languages.includes(voice.language)) {
+        return acc;
+      }
+
+      const languageCode = voice.language;
+      const existingLanguage = acc.find((lang) => lang.value === languageCode);
+      if (existingLanguage) {
+        existingLanguage.voices.push(buildVoice(voice));
+      } else {
+        acc.push({
+          value: languageCode,
+          name: languageMap[languageCode],
+          voices: [buildVoice(voice)]
+        });
+      }
+      return acc;
+    }, []);
+
+    return tranform(ttsVoices, undefined, TtsModelCartesia);
+  }
+  return tranform(ttsCartesia, undefined, TtsModelCartesia);
+}
+
 module.exports = {
  testGoogleTts,
  testGoogleStt,
@@ -1169,5 +1268,6 @@ module.exports = {
  testVerbioTts,
  testVerbioStt,
  getLanguagesAndVoicesForVendor,
-  testSpeechmaticsStt
+  testSpeechmaticsStt,
+  testCartesia
 };