support tts cartesia (#370)

* support tts cartesia

* update speech utils

* revert reset password

* revert serve-integration
This commit is contained in:
Hoan Luu Huu
2024-12-19 21:19:28 +07:00
committed by GitHub
parent 8851b3fac0
commit 0eb8097e32
6 changed files with 544 additions and 7 deletions

View File

@@ -27,6 +27,7 @@ const TtsModelWhisper = require('./speech-data/tts-model-whisper');
const TtsModelPlayHT = require('./speech-data/tts-model-playht');
const ttsLanguagesPlayHt = require('./speech-data/tts-languages-playht');
const TtsModelRimelabs = require('./speech-data/tts-model-rimelabs');
const TtsModelCartesia = require('./speech-data/tts-model-cartesia');
const SttGoogleLanguagesVoices = require('./speech-data/stt-google');
const SttAwsLanguagesVoices = require('./speech-data/stt-aws');
@@ -40,6 +41,8 @@ const SttSonioxLanguagesVoices = require('./speech-data/stt-soniox');
const SttSpeechmaticsLanguagesVoices = require('./speech-data/stt-speechmatics');
const SttAssemblyaiLanguagesVoices = require('./speech-data/stt-assemblyai');
const SttVerbioLanguagesVoices = require('./speech-data/stt-verbio');
const ttsCartesia = require('./speech-data/tts-cartesia');
const ttsModelCartesia = require('./speech-data/tts-model-cartesia');
const testSonioxStt = async(logger, credentials) => {
@@ -606,6 +609,11 @@ function decryptCredential(obj, credential, logger, isObscureKey = true) {
obj.user_id = o.user_id;
obj.voice_engine = o.voice_engine;
obj.options = o.options;
} else if ('cartesia' === obj.vendor) {
const o = JSON.parse(decrypt(credential));
obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
obj.model_id = o.model_id;
obj.options = o.options;
} else if ('rimelabs' === obj.vendor) {
const o = JSON.parse(decrypt(credential));
obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
@@ -688,6 +696,8 @@ async function getLanguagesAndVoicesForVendor(logger, vendor, credential, getTts
return await getLanguagesVoicesForVerbio(credential, getTtsVoices, logger);
case 'speechmatics':
return await getLanguagesVoicesForSpeechmatics(credential, getTtsVoices, logger);
case 'cartesia':
return await getLanguagesVoicesForCartesia(credential, getTtsVoices, logger);
default:
logger.info(`invalid vendor ${vendor}, return empty result`);
throw new Error(`Invalid vendor ${vendor}`);
@@ -1143,6 +1153,95 @@ function parseVerbioLanguagesVoices(data) {
}, []);
}
/**
 * Retrieve the voice list from the Cartesia REST API.
 *
 * @param {object} credential - Cartesia credential; its `api_key` is sent
 *   in the `X-API-Key` request header.
 * @returns {Promise<*>} the parsed JSON body of `GET /voices`, or
 *   `undefined` when no credential is supplied.
 */
const fetchCartesiaVoices = async(credential) => {
  // Without a credential there is nothing to query; resolve to undefined.
  if (!credential) return;

  const headers = {
    'X-API-Key' : credential.api_key,
    'Cartesia-Version': '2024-06-10',
    'Accept': 'application/json'
  };
  const listVoices = bent('https://api.cartesia.ai', 'GET', 'json', headers);
  return await listVoices('/voices');
};
/**
 * Check a Cartesia credential by synthesizing a short phrase and then
 * listing voices with the same credential.
 *
 * @param {object} logger - logger exposing `info`.
 * @param {Function} synthAudio - async synthesis function, invoked as
 *   `synthAudio(stats, options)`.
 * @param {object} credentials - Cartesia credential under test.
 * @returns {Promise<void>} resolves when both calls succeed.
 * @throws rethrows whatever error either call raised, after logging it.
 */
const testCartesia = async(logger, synthAudio, credentials) => {
  // synthAudio is handed an object with increment/histogram methods;
  // no-ops are supplied here so nothing is recorded for this probe.
  const noopStats = {
    increment: () => {},
    histogram: () => {}
  };
  const synthRequest = {
    vendor: 'cartesia',
    credentials,
    language: 'en',
    voice: '694f9389-aac1-45b6-b726-9d9369183238',
    text: 'Hi there and welcome to jambones!',
    renderForCaching: true
  };

  try {
    await synthAudio(noopStats, synthRequest);
    // The voices endpoint must be reachable with this credential as well.
    await fetchCartesiaVoices(credentials);
  } catch (err) {
    logger.info({err}, 'synth cartesia returned error');
    throw err;
  }
};
/**
 * Build the language/voice catalogue for Cartesia TTS.
 *
 * With a credential, voices are fetched through `fetchCartesiaVoices`,
 * grouped by their `language` code and, when the credential's `model_id`
 * matches an entry of `ttsModelCartesia`, restricted to that entry's
 * `languages`. Without a credential the static `ttsCartesia` list is used.
 * Either way the result is passed through `tranform` together with
 * `TtsModelCartesia`.
 *
 * @param {object} [credential] - Cartesia credential (`api_key`, `model_id`).
 * @returns {Promise<*>} whatever `tranform` returns for the assembled list.
 */
async function getLanguagesVoicesForCartesia(credential) {
  if (!credential) {
    return tranform(ttsCartesia, undefined, TtsModelCartesia);
  }

  // Display names for the language codes; a code that is absent from this
  // table is shown as the raw code instead of an undefined name.
  const languageNames = {
    en: 'English',
    fr: 'French',
    de: 'German',
    es: 'Spanish',
    pt: 'Portuguese',
    zh: 'Chinese',
    ja: 'Japanese',
    hi: 'Hindi',
    it: 'Italian',
    ko: 'Korean',
    nl: 'Dutch',
    pl: 'Polish',
    ru: 'Russian',
    sv: 'Swedish',
    tr: 'Turkish',
  };

  // The credential may carry a model_id that is missing or not in the
  // model table. Destructuring the lookup result directly would throw a
  // TypeError in that case, so an unknown model simply disables the
  // language filter and every fetched voice is kept.
  const model = ttsModelCartesia.find((m) => m.value === credential.model_id);
  const allowedLanguages = model && Array.isArray(model.languages) ?
    new Set(model.languages) :
    null;

  const voices = await fetchCartesiaVoices(credential);

  // A Map keeps first-seen language order while avoiding a linear search
  // through the accumulated groups for every voice.
  const groups = new Map();
  for (const voice of voices) {
    const code = voice.language;
    if (allowedLanguages && !allowedLanguages.has(code)) continue;

    let group = groups.get(code);
    if (!group) {
      group = {
        value: code,
        name: languageNames[code] || code,
        voices: []
      };
      groups.set(code, group);
    }
    group.voices.push({
      value: `${voice.id}`,
      name: `${voice.name} - ${voice.description}`
    });
  }

  return tranform([...groups.values()], undefined, TtsModelCartesia);
}
module.exports = {
testGoogleTts,
testGoogleStt,
@@ -1169,5 +1268,6 @@ module.exports = {
testVerbioTts,
testVerbioStt,
getLanguagesAndVoicesForVendor,
testSpeechmaticsStt
testSpeechmaticsStt,
testCartesia
};