support resemble TTS (#488)

* support resemble TTS

* wip

* wip

* update speech utils version

* update resemble voice list
This commit is contained in:
Hoan Luu Huu
2025-08-13 19:18:08 +07:00
committed by GitHub
parent 2b66a121a0
commit fd9dc77a58
6 changed files with 766 additions and 51 deletions

View File

@@ -21,6 +21,7 @@ const TtsPlayHtLanguagesVoices = require('./speech-data/tts-playht');
const TtsVerbioLanguagesVoices = require('./speech-data/tts-verbio');
const TtsInworldLanguagesVoices = require('./speech-data/tts-inworld');
const ttsCartesia = require('./speech-data/tts-cartesia');
const TtsResembleLanguagesVoices = require('./speech-data/tts-resemble');
const TtsModelDeepgram = require('./speech-data/tts-model-deepgram');
const TtsLanguagesDeepgram = require('./speech-data/tts-deepgram');
@@ -424,6 +425,24 @@ const testWhisper = async(logger, synthAudio, credentials) => {
}
};
/**
 * Exercises a set of Resemble credentials by asking `synthAudio` to synthesize
 * a short, fixed English phrase with a hard-coded voice id.
 *
 * @param {object} logger - logger exposing `info(obj, msg)`; used only when synthesis fails
 * @param {Function} synthAudio - synthesis function, invoked as `synthAudio(stats, options)`
 * @param {object} credentials - credentials forwarded unchanged in the synthesis options
 * @returns {Promise<void>} resolves once `synthAudio` has completed
 * @throws rethrows whatever `synthAudio` throws, after logging it at info level
 */
const testResembleTTS = async(logger, synthAudio, credentials) => {
  // No-op stand-in for the first synthAudio argument
  // (presumably a stats/metrics client -- confirm against synthAudio's contract).
  const noopStats = {increment: () => {}, histogram: () => {}};
  const request = {
    vendor: 'resemble',
    credentials,
    language: 'en-US',
    voice: '3f5fb9f1',
    text: 'Hi there and welcome to jambones!',
    renderForCaching: true
  };

  try {
    await synthAudio(noopStats, request);
  } catch (err) {
    // Record the failure here, then let the caller decide how to report it.
    logger.info({err}, 'synth resemble returned error');
    throw err;
  }
};
const testDeepgramTTS = async(logger, synthAudio, credentials) => {
try {
await synthAudio({increment: () => {}, histogram: () => {}},
@@ -729,6 +748,11 @@ function decryptCredential(obj, credential, logger, isObscureKey = true) {
const o = JSON.parse(decrypt(credential));
obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
obj.service_version = o.service_version;
} else if ('resemble' === obj.vendor) {
const o = JSON.parse(decrypt(credential));
obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
obj.resemble_tts_uri = o.resemble_tts_uri;
obj.resemble_tts_use_tls = o.resemble_tts_use_tls;
} else if ('voxist' === obj.vendor) {
const o = JSON.parse(decrypt(credential));
obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
@@ -799,6 +823,8 @@ async function getLanguagesAndVoicesForVendor(logger, vendor, credential, getTts
return await getLanguagesVoicesForRimelabs(credential, getTtsVoices, logger);
case 'inworld':
return await getLanguagesVoicesForInworld(credential, getTtsVoices, logger);
case 'resemble':
return await getLanguagesAndVoicesForResemble(credential, getTtsVoices, logger);
case 'assemblyai':
return await getLanguagesVoicesForAssemblyAI(credential, getTtsVoices, logger);
case 'voxist':
@@ -1240,6 +1266,82 @@ async function getLanguagesVoicesForVerbio(credentials, getTtsVoices, logger) {
}
}
/**
 * Builds the TTS language/voice list for the Resemble vendor.
 *
 * When a credential is supplied, every page of the Resemble voice listing is
 * fetched, voices that are not usable for text-to-speech are dropped, and the
 * remainder is grouped by each voice's `default_language` ('en-US' when unset).
 * If no credential is supplied, or anything goes wrong while fetching, the
 * predefined `TtsResembleLanguagesVoices` list is returned instead.
 *
 * @param {object} [credential] - decrypted credential; only `api_key` is read
 * @param {Function} getTtsVoices - unused here; kept so the signature matches the other vendor helpers
 * @param {object} logger - logger exposing `info(obj, msg)`
 * @returns {Promise<object>} result of `tranform(...)` applied to the language/voice list
 */
async function getLanguagesAndVoicesForResemble(credential, getTtsVoices, logger) {
  if (credential) {
    try {
      const {api_key} = credential;
      const allVoices = [];
      let page = 1;
      let hasMorePages = true;

      // The listing is paginated: keep requesting until the reported page count is reached.
      while (hasMorePages) {
        const response = await fetch(`https://app.resemble.ai/api/v2/voices?page=${page}&page_size=100`, {
          headers: {
            'Authorization': `Token token=${api_key}`,
            'Accept': 'application/json'
          }
        });
        if (!response.ok) {
          throw new Error('failed to list voices');
        }
        const data = await response.json();
        if (!data.success) {
          throw new Error('API returned unsuccessful response');
        }
        // Fail explicitly (and fall back below) rather than letting a missing
        // list surface later as an unrelated TypeError.
        if (!Array.isArray(data.items)) {
          throw new Error('API response did not contain a list of voices');
        }
        allVoices.push(...data.items);

        // A missing num_pages makes this comparison false, which ends the loop.
        hasMorePages = page < data.num_pages;
        page++;
      }

      // Keep only finished voices whose text_to_speech component is ready.
      const availableVoices = allVoices.filter((voice) =>
        voice.status === 'finished' &&
        voice.component_status?.text_to_speech?.status === 'ready'
      );

      // Group voices by language; a Map gives constant-time lookup per voice.
      const byLanguage = new Map();
      for (const voice of availableVoices) {
        const languageCode = voice.default_language || 'en-US';
        const voiceEntry = {
          name: `${voice.name} (${voice.voice_type}) - ${voice.source}`,
          value: voice.uuid
        };
        const existingLanguage = byLanguage.get(languageCode);
        if (existingLanguage) {
          existingLanguage.voices.push(voiceEntry);
        } else {
          byLanguage.set(languageCode, {
            value: languageCode,
            name: capitalizeFirst(languageCode),
            voices: [voiceEntry]
          });
        }
      }
      const ttsVoices = [...byLanguage.values()];

      // Sort languages, and the voices within each language, by display name.
      ttsVoices.sort((a, b) => a.name.localeCompare(b.name));
      ttsVoices.forEach((lang) => {
        lang.voices.sort((a, b) => a.name.localeCompare(b.name));
      });

      return tranform(ttsVoices);
    } catch (err) {
      // Best effort: log the failure (error object first, as elsewhere in this
      // file) and fall through to the predefined list.
      logger.info({err}, 'Error while fetching Resemble languages, voices, return predefined values');
    }
  }
  return tranform(TtsResembleLanguagesVoices);
}
function tranform(tts, stt, models, sttModels) {
return {
...(tts && {tts}),
@@ -1528,5 +1630,6 @@ module.exports = {
testSpeechmaticsStt,
testCartesia,
testVoxistStt,
testOpenAiStt
testOpenAiStt,
testResembleTTS
};